From e74896e4bc71455341346b9ef755728d4161a3c7 Mon Sep 17 00:00:00 2001
From: sageweil
Date: Mon, 15 Oct 2007 17:01:41 +0000
Subject: [PATCH] remove branches/sage/pgs

git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1942 29311d96-e01e-0410-9327-a35deaab8ce9
---
 branches/sage/pgs/COPYING | 504 -- branches/sage/pgs/Makefile | 275 - branches/sage/pgs/README | 4 - branches/sage/pgs/TODO | 339 - branches/sage/pgs/cfuse.cc | 84 - branches/sage/pgs/client/Client.cc | 3079 --------- branches/sage/pgs/client/Client.h | 682 -- branches/sage/pgs/client/FileCache.cc | 264 - branches/sage/pgs/client/FileCache.h | 85 - branches/sage/pgs/client/SyntheticClient.cc | 1967 ------ branches/sage/pgs/client/SyntheticClient.h | 231 - branches/sage/pgs/client/Trace.cc | 126 - branches/sage/pgs/client/Trace.h | 76 - branches/sage/pgs/client/fuse.cc | 281 - branches/sage/pgs/client/fuse.h | 24 - .../sage/pgs/client/hadoop/CephFSInterface.cc | 789 --- .../sage/pgs/client/hadoop/CephFSInterface.h | 239 - branches/sage/pgs/client/ldceph.cc | 298 - branches/sage/pgs/client/msgthread.h | 26 - branches/sage/pgs/cmds.cc | 103 - branches/sage/pgs/cmon.cc | 129 - branches/sage/pgs/cmonctl.cc | 92 - branches/sage/pgs/common/Clock.cc | 20 - branches/sage/pgs/common/Clock.h | 104 - branches/sage/pgs/common/Cond.h | 119 - branches/sage/pgs/common/DecayCounter.h | 95 - branches/sage/pgs/common/LogType.h | 112 - branches/sage/pgs/common/Logger.cc | 217 - branches/sage/pgs/common/Logger.h | 75 - branches/sage/pgs/common/Mutex.h | 83 - branches/sage/pgs/common/Semaphore.h | 53 - branches/sage/pgs/common/Thread.h | 77 - branches/sage/pgs/common/ThreadPool.h | 139 - branches/sage/pgs/common/Timer.cc | 334 - branches/sage/pgs/common/Timer.h | 174 - branches/sage/pgs/config.cc | 903 --- branches/sage/pgs/config.h | 390 -- branches/sage/pgs/cosd.cc | 135 - branches/sage/pgs/crush/BinaryTree.h | 285 - branches/sage/pgs/crush/Bucket.h | 632 -- branches/sage/pgs/crush/Hash.h | 301 - branches/sage/pgs/crush/crush.h | 535 -- .../sage/pgs/crush/test/bucket_movement.cc | 166 - .../sage/pgs/crush/test/bucket_variance.cc | 199 - .../sage/pgs/crush/test/cluster_movement.cc | 217 - .../pgs/crush/test/cluster_movement_remove.cc | 229 - .../pgs/crush/test/cluster_movement_rush.cc | 218 - .../sage/pgs/crush/test/creeping_failure.cc | 276 - .../crush/test/creeping_failure_variance.cc | 281 - .../sage/pgs/crush/test/depth_variance.cc | 185 - branches/sage/pgs/crush/test/mixed.cc | 300 - branches/sage/pgs/crush/test/movement.cc | 223 - .../sage/pgs/crush/test/movement_failed.cc | 246 - branches/sage/pgs/crush/test/overload.cc | 335 - .../sage/pgs/crush/test/overload_variance.cc | 281 - branches/sage/pgs/crush/test/sizes.cc | 131 - branches/sage/pgs/crush/test/smallbucket.cc | 138 - branches/sage/pgs/crush/test/speed_bucket.cc | 86 - branches/sage/pgs/crush/test/speed_depth.cc | 174 - branches/sage/pgs/crush/test/speed_rush.cc | 145 - branches/sage/pgs/crush/test/t.cc | 25 - branches/sage/pgs/crush/test/testbucket.cc | 61 - branches/sage/pgs/crush/test/testnormal.cc | 51 - branches/sage/pgs/csyn.cc | 102 - branches/sage/pgs/doc/Commitdir.txt | 24 - branches/sage/pgs/doc/anchortable.txt | 54 - branches/sage/pgs/doc/bdb.txt | 48 - branches/sage/pgs/doc/caching.txt | 303 - branches/sage/pgs/doc/dentries.txt | 4 - branches/sage/pgs/doc/exports.txt | 72 - branches/sage/pgs/doc/file_modes.txt | 66 - branches/sage/pgs/doc/header.txt | 13 - branches/sage/pgs/doc/inos.txt | 11 - branches/sage/pgs/doc/journal.txt | 124 - branches/sage/pgs/doc/lazy_posix.txt | 53 -
branches/sage/pgs/doc/mds_locks.txt | 66 - branches/sage/pgs/doc/modeline.txt | 2 - branches/sage/pgs/doc/osd_outline.txt | 37 - branches/sage/pgs/doc/osd_replication.txt | 226 - .../sage/pgs/doc/shared_write_states_nogo.txt | 39 - branches/sage/pgs/doc/shutdown.txt | 13 - branches/sage/pgs/ebofs/Allocator.cc | 693 --- branches/sage/pgs/ebofs/Allocator.h | 86 - branches/sage/pgs/ebofs/BlockDevice.cc | 780 --- branches/sage/pgs/ebofs/BlockDevice.h | 339 - branches/sage/pgs/ebofs/BufferCache.cc | 1148 ---- branches/sage/pgs/ebofs/BufferCache.h | 710 --- branches/sage/pgs/ebofs/Cnode.h | 101 - branches/sage/pgs/ebofs/Ebofs.cc | 3458 ---------- branches/sage/pgs/ebofs/Ebofs.h | 360 -- branches/sage/pgs/ebofs/FileJournal.cc | 456 -- branches/sage/pgs/ebofs/FileJournal.h | 144 - branches/sage/pgs/ebofs/Journal.h | 46 - branches/sage/pgs/ebofs/Onode.h | 391 -- branches/sage/pgs/ebofs/Table.h | 899 --- branches/sage/pgs/ebofs/mkfs.ebofs.cc | 300 - branches/sage/pgs/ebofs/nodes.h | 584 -- branches/sage/pgs/ebofs/test.ebofs.cc | 226 - branches/sage/pgs/ebofs/types.h | 170 - branches/sage/pgs/fakefuse.cc | 157 - branches/sage/pgs/fakesyn.cc | 197 - branches/sage/pgs/include/Context.h | 153 - branches/sage/pgs/include/Distribution.h | 75 - branches/sage/pgs/include/blobhash.h | 45 - branches/sage/pgs/include/buffer.h | 982 --- branches/sage/pgs/include/encodable.h | 172 - branches/sage/pgs/include/error.h | 41 - branches/sage/pgs/include/filepath.h | 184 - branches/sage/pgs/include/frag.h | 255 - branches/sage/pgs/include/interval_set.h | 306 - branches/sage/pgs/include/lru.h | 323 - branches/sage/pgs/include/object.h | 103 - branches/sage/pgs/include/oldbuffer.h | 358 -- branches/sage/pgs/include/oldbufferlist.h | 682 -- branches/sage/pgs/include/rangeset.h | 253 - branches/sage/pgs/include/statlite.h | 72 - branches/sage/pgs/include/types.h | 336 - branches/sage/pgs/include/uofs.h | 51 - branches/sage/pgs/include/utime.h | 147 - branches/sage/pgs/jobs/alc.tp | 38 - branches/sage/pgs/jobs/alcdat/makedirs | 45 - branches/sage/pgs/jobs/alcdat/makedirs.big | 45 - branches/sage/pgs/jobs/alcdat/makedirs.tput | 46 - .../sage/pgs/jobs/alcdat/makefiles.shared | 32 - branches/sage/pgs/jobs/alcdat/openshared | 32 - branches/sage/pgs/jobs/alcdat/ossh.include | 45 - .../sage/pgs/jobs/alcdat/ossh.include.big | 46 - branches/sage/pgs/jobs/alcdat/ossh.lib | 45 - branches/sage/pgs/jobs/alcdat/ossh.lib.big | 46 - branches/sage/pgs/jobs/alcdat/striping | 48 - branches/sage/pgs/jobs/example | 56 - branches/sage/pgs/jobs/mds/log_striping | 36 - branches/sage/pgs/jobs/mds/makedir_lat | 33 - branches/sage/pgs/jobs/mds/makedirs | 40 - branches/sage/pgs/jobs/mds/opensshlib | 44 - branches/sage/pgs/jobs/meta1 | 19 - branches/sage/pgs/jobs/meta1.proc.sh | 14 - branches/sage/pgs/jobs/osd/ebofs | 51 - branches/sage/pgs/jobs/osd/mds_log | 43 - branches/sage/pgs/jobs/osd/osd_threads | 33 - branches/sage/pgs/jobs/osd/striping | 78 - branches/sage/pgs/jobs/osd/wr_lat2 | 44 - branches/sage/pgs/jobs/osd/write_sizes | 60 - branches/sage/pgs/jobs/rados/map_dist | 32 - branches/sage/pgs/jobs/rados/rep_lat | 43 - branches/sage/pgs/jobs/rados/wr_sizes | 50 - branches/sage/pgs/mds/Anchor.h | 108 - branches/sage/pgs/mds/AnchorClient.cc | 372 -- branches/sage/pgs/mds/AnchorClient.h | 95 - branches/sage/pgs/mds/AnchorTable.cc | 715 --- branches/sage/pgs/mds/AnchorTable.h | 127 - branches/sage/pgs/mds/CDentry.cc | 321 - branches/sage/pgs/mds/CDentry.h | 291 - branches/sage/pgs/mds/CDir.cc | 1423 ----- branches/sage/pgs/mds/CDir.h | 576 -- 
branches/sage/pgs/mds/CInode.cc | 590 -- branches/sage/pgs/mds/CInode.h | 659 -- branches/sage/pgs/mds/Capability.h | 246 - branches/sage/pgs/mds/ClientMap.cc | 121 - branches/sage/pgs/mds/ClientMap.h | 188 - branches/sage/pgs/mds/FileLock.h | 227 - branches/sage/pgs/mds/Hasher.cc | 1582 ----- branches/sage/pgs/mds/IdAllocator.cc | 198 - branches/sage/pgs/mds/IdAllocator.h | 77 - branches/sage/pgs/mds/LocalLock.h | 61 - branches/sage/pgs/mds/Locker.cc | 2781 --------- branches/sage/pgs/mds/Locker.h | 183 - branches/sage/pgs/mds/LogEvent.cc | 80 - branches/sage/pgs/mds/LogEvent.h | 104 - branches/sage/pgs/mds/MDBalancer.cc | 910 --- branches/sage/pgs/mds/MDBalancer.h | 110 - branches/sage/pgs/mds/MDCache.cc | 5541 ----------------- branches/sage/pgs/mds/MDCache.h | 625 -- branches/sage/pgs/mds/MDLog.cc | 476 -- branches/sage/pgs/mds/MDLog.h | 172 - branches/sage/pgs/mds/MDS.cc | 1320 ---- branches/sage/pgs/mds/MDS.h | 296 - branches/sage/pgs/mds/MDSMap.h | 343 - branches/sage/pgs/mds/Migrator.cc | 1988 ------ branches/sage/pgs/mds/Migrator.h | 259 - branches/sage/pgs/mds/Renamer.cc | 905 --- branches/sage/pgs/mds/Renamer.h | 99 - branches/sage/pgs/mds/ScatterLock.h | 174 - branches/sage/pgs/mds/Server.cc | 3762 ----------- branches/sage/pgs/mds/Server.h | 178 - branches/sage/pgs/mds/SimpleLock.h | 301 - branches/sage/pgs/mds/events/EAnchor.h | 82 - branches/sage/pgs/mds/events/EAnchorClient.h | 58 - branches/sage/pgs/mds/events/EExport.h | 64 - branches/sage/pgs/mds/events/EImportFinish.h | 60 - branches/sage/pgs/mds/events/EImportStart.h | 61 - branches/sage/pgs/mds/events/EMetaBlob.h | 445 -- branches/sage/pgs/mds/events/EOpen.h | 53 - branches/sage/pgs/mds/events/EPurgeFinish.h | 54 - branches/sage/pgs/mds/events/ESession.h | 64 - branches/sage/pgs/mds/events/ESlaveUpdate.h | 70 - branches/sage/pgs/mds/events/EString.h | 57 - branches/sage/pgs/mds/events/ESubtreeMap.h | 47 - branches/sage/pgs/mds/events/EUpdate.h | 50 - branches/sage/pgs/mds/journal.cc | 1007 --- branches/sage/pgs/mds/mdstypes.h | 584 -- branches/sage/pgs/messages/MAnchor.h | 74 - branches/sage/pgs/messages/MCacheExpire.h | 127 - branches/sage/pgs/messages/MClientFileCaps.h | 109 - branches/sage/pgs/messages/MClientMount.h | 40 - branches/sage/pgs/messages/MClientReconnect.h | 59 - branches/sage/pgs/messages/MClientReply.h | 294 - branches/sage/pgs/messages/MClientRequest.h | 315 - .../sage/pgs/messages/MClientRequestForward.h | 59 - branches/sage/pgs/messages/MClientSession.h | 62 - branches/sage/pgs/messages/MClientUnmount.h | 40 - branches/sage/pgs/messages/MDentryUnlink.h | 82 - branches/sage/pgs/messages/MDirUpdate.h | 71 - branches/sage/pgs/messages/MDiscover.h | 107 - branches/sage/pgs/messages/MDiscoverReply.h | 276 - branches/sage/pgs/messages/MExportDir.h | 68 - branches/sage/pgs/messages/MExportDirAck.h | 46 - branches/sage/pgs/messages/MExportDirCancel.h | 49 - .../sage/pgs/messages/MExportDirDiscover.h | 60 - .../sage/pgs/messages/MExportDirDiscoverAck.h | 60 - branches/sage/pgs/messages/MExportDirFinish.h | 46 - branches/sage/pgs/messages/MExportDirNotify.h | 85 - .../sage/pgs/messages/MExportDirNotifyAck.h | 50 - branches/sage/pgs/messages/MExportDirPrep.h | 189 - .../sage/pgs/messages/MExportDirPrepAck.h | 47 - .../sage/pgs/messages/MExportDirWarning.h | 50 - .../sage/pgs/messages/MExportDirWarningAck.h | 45 - branches/sage/pgs/messages/MGenericMessage.h | 45 - branches/sage/pgs/messages/MHeartbeat.h | 60 - branches/sage/pgs/messages/MInodeFileCaps.h | 57 - branches/sage/pgs/messages/MLock.h | 128 - 
branches/sage/pgs/messages/MMDSBeacon.h | 59 - branches/sage/pgs/messages/MMDSBoot.h | 39 - branches/sage/pgs/messages/MMDSCacheRejoin.h | 237 - branches/sage/pgs/messages/MMDSGetMap.h | 39 - branches/sage/pgs/messages/MMDSMap.h | 79 - branches/sage/pgs/messages/MMDSResolve.h | 66 - branches/sage/pgs/messages/MMDSResolveAck.h | 56 - branches/sage/pgs/messages/MMDSSlaveRequest.h | 150 - branches/sage/pgs/messages/MMonCommand.h | 54 - branches/sage/pgs/messages/MMonCommandAck.h | 46 - branches/sage/pgs/messages/MMonElection.h | 63 - .../sage/pgs/messages/MMonElectionCollect.h | 43 - .../sage/pgs/messages/MMonElectionRefresh.h | 52 - .../sage/pgs/messages/MMonElectionStatus.h | 51 - branches/sage/pgs/messages/MMonOSDMapInfo.h | 50 - branches/sage/pgs/messages/MMonOSDMapLease.h | 50 - .../sage/pgs/messages/MMonOSDMapLeaseAck.h | 45 - .../sage/pgs/messages/MMonOSDMapUpdateAck.h | 43 - .../pgs/messages/MMonOSDMapUpdateCommit.h | 43 - .../pgs/messages/MMonOSDMapUpdatePrepare.h | 53 - branches/sage/pgs/messages/MMonPaxos.h | 98 - branches/sage/pgs/messages/MOSDBoot.h | 51 - branches/sage/pgs/messages/MOSDFailure.h | 55 - branches/sage/pgs/messages/MOSDGetMap.h | 48 - branches/sage/pgs/messages/MOSDIn.h | 43 - branches/sage/pgs/messages/MOSDMap.h | 71 - branches/sage/pgs/messages/MOSDOp.h | 252 - branches/sage/pgs/messages/MOSDOpReply.h | 153 - branches/sage/pgs/messages/MOSDOut.h | 43 - branches/sage/pgs/messages/MOSDPGLog.h | 62 - branches/sage/pgs/messages/MOSDPGNotify.h | 55 - branches/sage/pgs/messages/MOSDPGPeer.h | 58 - branches/sage/pgs/messages/MOSDPGPeerAck.h | 70 - .../sage/pgs/messages/MOSDPGPeerRequest.h | 51 - branches/sage/pgs/messages/MOSDPGQuery.h | 52 - branches/sage/pgs/messages/MOSDPGRemove.h | 52 - branches/sage/pgs/messages/MOSDPGSummary.h | 66 - branches/sage/pgs/messages/MOSDPGUpdate.h | 65 - branches/sage/pgs/messages/MOSDPing.h | 58 - branches/sage/pgs/messages/MPing.h | 43 - branches/sage/pgs/messages/MPingAck.h | 42 - branches/sage/pgs/mkmonmap.cc | 68 - branches/sage/pgs/mon/ClientMonitor.cc | 237 - branches/sage/pgs/mon/ClientMonitor.h | 176 - branches/sage/pgs/mon/Elector.cc | 293 - branches/sage/pgs/mon/Elector.h | 92 - branches/sage/pgs/mon/MDSMonitor.cc | 544 -- branches/sage/pgs/mon/MDSMonitor.h | 96 - branches/sage/pgs/mon/MonMap.h | 105 - branches/sage/pgs/mon/Monitor.cc | 399 -- branches/sage/pgs/mon/Monitor.h | 149 - branches/sage/pgs/mon/MonitorStore.cc | 226 - branches/sage/pgs/mon/MonitorStore.h | 82 - branches/sage/pgs/mon/OSDMonitor.cc | 807 --- branches/sage/pgs/mon/OSDMonitor.h | 124 - branches/sage/pgs/mon/PGMap.h | 30 - branches/sage/pgs/mon/PGMonitor.cc | 58 - branches/sage/pgs/mon/PGMonitor.h | 52 - branches/sage/pgs/mon/Paxos.cc | 784 --- branches/sage/pgs/mon/Paxos.h | 250 - branches/sage/pgs/mon/PaxosService.cc | 136 - branches/sage/pgs/mon/PaxosService.h | 91 - branches/sage/pgs/mon/mon_types.h | 35 - branches/sage/pgs/msg/Dispatcher.cc | 28 - branches/sage/pgs/msg/Dispatcher.h | 34 - branches/sage/pgs/msg/FakeMessenger.cc | 409 -- branches/sage/pgs/msg/FakeMessenger.h | 97 - branches/sage/pgs/msg/HostMonitor.cc | 236 - branches/sage/pgs/msg/HostMonitor.h | 98 - branches/sage/pgs/msg/Message.cc | 345 - branches/sage/pgs/msg/Message.h | 259 - branches/sage/pgs/msg/Messenger.cc | 39 - branches/sage/pgs/msg/Messenger.h | 88 - branches/sage/pgs/msg/RWLock.h | 50 - branches/sage/pgs/msg/SerialMessenger.h | 29 - branches/sage/pgs/msg/SimpleMessenger.cc | 1221 ---- branches/sage/pgs/msg/SimpleMessenger.h | 300 - branches/sage/pgs/msg/mpistarter.cc | 63 
- branches/sage/pgs/msg/msg_types.h | 191 - branches/sage/pgs/msg/new_mpistarter.cc | 45 - branches/sage/pgs/msg/tcp.cc | 89 - branches/sage/pgs/msg/tcp.h | 39 - branches/sage/pgs/newsyn.cc | 433 -- branches/sage/pgs/osbdb/OSBDB.cc | 2171 ------- branches/sage/pgs/osbdb/OSBDB.h | 482 -- branches/sage/pgs/osd/Ager.cc | 333 - branches/sage/pgs/osd/Ager.h | 44 - branches/sage/pgs/osd/BDBMap.h | 137 - branches/sage/pgs/osd/Fake.h | 250 - branches/sage/pgs/osd/FakeStore.cc | 644 -- branches/sage/pgs/osd/FakeStore.h | 111 - .../sage/pgs/osd/FakeStoreBDBCollections.h | 169 - branches/sage/pgs/osd/OBFSStore.cc | 245 - branches/sage/pgs/osd/OBFSStore.h | 57 - branches/sage/pgs/osd/OSD.cc | 2276 ------- branches/sage/pgs/osd/OSD.h | 319 - branches/sage/pgs/osd/OSDMap.h | 515 -- branches/sage/pgs/osd/ObjectStore.cc | 151 - branches/sage/pgs/osd/ObjectStore.h | 539 -- branches/sage/pgs/osd/PG.cc | 1229 ---- branches/sage/pgs/osd/PG.h | 712 --- branches/sage/pgs/osd/RAID4PG.cc | 124 - branches/sage/pgs/osd/RAID4PG.h | 74 - branches/sage/pgs/osd/ReplicatedPG.cc | 1807 ------ branches/sage/pgs/osd/ReplicatedPG.h | 169 - branches/sage/pgs/osd/osd_types.h | 276 - branches/sage/pgs/osd/rush.cc | 231 - branches/sage/pgs/osd/rush.h | 61 - branches/sage/pgs/osd/tp.cc | 81 - branches/sage/pgs/osdc/Blinker.h | 92 - branches/sage/pgs/osdc/Filer.cc | 236 - branches/sage/pgs/osdc/Filer.h | 165 - branches/sage/pgs/osdc/Journaler.cc | 620 -- branches/sage/pgs/osdc/Journaler.h | 219 - branches/sage/pgs/osdc/ObjectCacher.cc | 1557 ----- branches/sage/pgs/osdc/ObjectCacher.h | 564 -- branches/sage/pgs/osdc/Objecter.cc | 852 --- branches/sage/pgs/osdc/Objecter.h | 200 - branches/sage/pgs/script/add_header.pl | 26 - branches/sage/pgs/script/adjusttabs.pl | 24 - branches/sage/pgs/script/check_cache_dumps.pl | 56 - branches/sage/pgs/script/clean_osd_cow.sh | 3 - branches/sage/pgs/script/clean_trace.pl | 8 - branches/sage/pgs/script/comb.pl | 113 - branches/sage/pgs/script/find_auth_pins.pl | 51 - branches/sage/pgs/script/find_bufferleaks.pl | 69 - .../sage/pgs/script/find_lost_bdev_ops.pl | 34 - branches/sage/pgs/script/find_lost_commit.pl | 38 - .../sage/pgs/script/find_lost_objecter.pl | 34 - branches/sage/pgs/script/find_pathpins.pl | 41 - branches/sage/pgs/script/find_requests.pl | 42 - branches/sage/pgs/script/find_waiters.pl | 46 - branches/sage/pgs/script/fix_modeline.pl | 29 - branches/sage/pgs/script/grepblock | 15 - branches/sage/pgs/script/merge_trace_rw.pl | 42 - branches/sage/pgs/script/profonly.pl | 12 - branches/sage/pgs/script/runset.pl | 380 -- branches/sage/pgs/script/sum.pl | 148 - branches/sage/pgs/test/fakemds.cc | 104 - branches/sage/pgs/test/gprof-helper.c | 120 - branches/sage/pgs/test/makedirs.cc | 38 - branches/sage/pgs/test/mpitest.cc | 111 - branches/sage/pgs/test/mttest.cc | 140 - branches/sage/pgs/test/rushconfig | 7 - branches/sage/pgs/test/rushtest.cc | 49 - branches/sage/pgs/test/rushtest.cc~ | 49 - branches/sage/pgs/test/testbucket.cc | 67 - branches/sage/pgs/test/testbuffers.cc | 40 - branches/sage/pgs/test/testcrush.cc | 266 - branches/sage/pgs/test/testfilepath.cc | 22 - branches/sage/pgs/test/testmpi.cc | 53 - branches/sage/pgs/test/testnewbuffers.cc | 91 - branches/sage/pgs/test/testos.cc | 343 - branches/sage/pgs/test/testosbdb.cc | 347 -- branches/sage/pgs/test/testtree.cc | 46 - branches/sage/pgs/test/testxattr.cc | 31 - branches/sage/pgs/valgrind.supp | 25 - 388 files changed, 103376 deletions(-) delete mode 100644 branches/sage/pgs/COPYING delete mode 100644 
branches/sage/pgs/Makefile delete mode 100644 branches/sage/pgs/README delete mode 100644 branches/sage/pgs/TODO delete mode 100644 branches/sage/pgs/cfuse.cc delete mode 100644 branches/sage/pgs/client/Client.cc delete mode 100644 branches/sage/pgs/client/Client.h delete mode 100644 branches/sage/pgs/client/FileCache.cc delete mode 100644 branches/sage/pgs/client/FileCache.h delete mode 100644 branches/sage/pgs/client/SyntheticClient.cc delete mode 100644 branches/sage/pgs/client/SyntheticClient.h delete mode 100644 branches/sage/pgs/client/Trace.cc delete mode 100644 branches/sage/pgs/client/Trace.h delete mode 100644 branches/sage/pgs/client/fuse.cc delete mode 100644 branches/sage/pgs/client/fuse.h delete mode 100644 branches/sage/pgs/client/hadoop/CephFSInterface.cc delete mode 100644 branches/sage/pgs/client/hadoop/CephFSInterface.h delete mode 100644 branches/sage/pgs/client/ldceph.cc delete mode 100644 branches/sage/pgs/client/msgthread.h delete mode 100644 branches/sage/pgs/cmds.cc delete mode 100644 branches/sage/pgs/cmon.cc delete mode 100644 branches/sage/pgs/cmonctl.cc delete mode 100644 branches/sage/pgs/common/Clock.cc delete mode 100644 branches/sage/pgs/common/Clock.h delete mode 100644 branches/sage/pgs/common/Cond.h delete mode 100644 branches/sage/pgs/common/DecayCounter.h delete mode 100644 branches/sage/pgs/common/LogType.h delete mode 100644 branches/sage/pgs/common/Logger.cc delete mode 100644 branches/sage/pgs/common/Logger.h delete mode 100755 branches/sage/pgs/common/Mutex.h delete mode 100644 branches/sage/pgs/common/Semaphore.h delete mode 100644 branches/sage/pgs/common/Thread.h delete mode 100644 branches/sage/pgs/common/ThreadPool.h delete mode 100644 branches/sage/pgs/common/Timer.cc delete mode 100644 branches/sage/pgs/common/Timer.h delete mode 100644 branches/sage/pgs/config.cc delete mode 100644 branches/sage/pgs/config.h delete mode 100644 branches/sage/pgs/cosd.cc delete mode 100644 branches/sage/pgs/crush/BinaryTree.h delete mode 100644 branches/sage/pgs/crush/Bucket.h delete mode 100644 branches/sage/pgs/crush/Hash.h delete mode 100644 branches/sage/pgs/crush/crush.h delete mode 100644 branches/sage/pgs/crush/test/bucket_movement.cc delete mode 100644 branches/sage/pgs/crush/test/bucket_variance.cc delete mode 100644 branches/sage/pgs/crush/test/cluster_movement.cc delete mode 100644 branches/sage/pgs/crush/test/cluster_movement_remove.cc delete mode 100644 branches/sage/pgs/crush/test/cluster_movement_rush.cc delete mode 100644 branches/sage/pgs/crush/test/creeping_failure.cc delete mode 100644 branches/sage/pgs/crush/test/creeping_failure_variance.cc delete mode 100644 branches/sage/pgs/crush/test/depth_variance.cc delete mode 100644 branches/sage/pgs/crush/test/mixed.cc delete mode 100644 branches/sage/pgs/crush/test/movement.cc delete mode 100644 branches/sage/pgs/crush/test/movement_failed.cc delete mode 100644 branches/sage/pgs/crush/test/overload.cc delete mode 100644 branches/sage/pgs/crush/test/overload_variance.cc delete mode 100644 branches/sage/pgs/crush/test/sizes.cc delete mode 100644 branches/sage/pgs/crush/test/smallbucket.cc delete mode 100644 branches/sage/pgs/crush/test/speed_bucket.cc delete mode 100644 branches/sage/pgs/crush/test/speed_depth.cc delete mode 100644 branches/sage/pgs/crush/test/speed_rush.cc delete mode 100644 branches/sage/pgs/crush/test/t.cc delete mode 100644 branches/sage/pgs/crush/test/testbucket.cc delete mode 100644 branches/sage/pgs/crush/test/testnormal.cc delete mode 100644 branches/sage/pgs/csyn.cc 
delete mode 100644 branches/sage/pgs/doc/Commitdir.txt delete mode 100644 branches/sage/pgs/doc/anchortable.txt delete mode 100644 branches/sage/pgs/doc/bdb.txt delete mode 100644 branches/sage/pgs/doc/caching.txt delete mode 100644 branches/sage/pgs/doc/dentries.txt delete mode 100644 branches/sage/pgs/doc/exports.txt delete mode 100644 branches/sage/pgs/doc/file_modes.txt delete mode 100644 branches/sage/pgs/doc/header.txt delete mode 100644 branches/sage/pgs/doc/inos.txt delete mode 100644 branches/sage/pgs/doc/journal.txt delete mode 100644 branches/sage/pgs/doc/lazy_posix.txt delete mode 100644 branches/sage/pgs/doc/mds_locks.txt delete mode 100644 branches/sage/pgs/doc/modeline.txt delete mode 100644 branches/sage/pgs/doc/osd_outline.txt delete mode 100644 branches/sage/pgs/doc/osd_replication.txt delete mode 100644 branches/sage/pgs/doc/shared_write_states_nogo.txt delete mode 100644 branches/sage/pgs/doc/shutdown.txt delete mode 100644 branches/sage/pgs/ebofs/Allocator.cc delete mode 100644 branches/sage/pgs/ebofs/Allocator.h delete mode 100644 branches/sage/pgs/ebofs/BlockDevice.cc delete mode 100644 branches/sage/pgs/ebofs/BlockDevice.h delete mode 100644 branches/sage/pgs/ebofs/BufferCache.cc delete mode 100644 branches/sage/pgs/ebofs/BufferCache.h delete mode 100644 branches/sage/pgs/ebofs/Cnode.h delete mode 100644 branches/sage/pgs/ebofs/Ebofs.cc delete mode 100644 branches/sage/pgs/ebofs/Ebofs.h delete mode 100644 branches/sage/pgs/ebofs/FileJournal.cc delete mode 100644 branches/sage/pgs/ebofs/FileJournal.h delete mode 100644 branches/sage/pgs/ebofs/Journal.h delete mode 100644 branches/sage/pgs/ebofs/Onode.h delete mode 100644 branches/sage/pgs/ebofs/Table.h delete mode 100644 branches/sage/pgs/ebofs/mkfs.ebofs.cc delete mode 100644 branches/sage/pgs/ebofs/nodes.h delete mode 100644 branches/sage/pgs/ebofs/test.ebofs.cc delete mode 100644 branches/sage/pgs/ebofs/types.h delete mode 100644 branches/sage/pgs/fakefuse.cc delete mode 100644 branches/sage/pgs/fakesyn.cc delete mode 100644 branches/sage/pgs/include/Context.h delete mode 100644 branches/sage/pgs/include/Distribution.h delete mode 100644 branches/sage/pgs/include/blobhash.h delete mode 100644 branches/sage/pgs/include/buffer.h delete mode 100644 branches/sage/pgs/include/encodable.h delete mode 100644 branches/sage/pgs/include/error.h delete mode 100644 branches/sage/pgs/include/filepath.h delete mode 100644 branches/sage/pgs/include/frag.h delete mode 100644 branches/sage/pgs/include/interval_set.h delete mode 100644 branches/sage/pgs/include/lru.h delete mode 100644 branches/sage/pgs/include/object.h delete mode 100644 branches/sage/pgs/include/oldbuffer.h delete mode 100644 branches/sage/pgs/include/oldbufferlist.h delete mode 100644 branches/sage/pgs/include/rangeset.h delete mode 100644 branches/sage/pgs/include/statlite.h delete mode 100644 branches/sage/pgs/include/types.h delete mode 100644 branches/sage/pgs/include/uofs.h delete mode 100644 branches/sage/pgs/include/utime.h delete mode 100644 branches/sage/pgs/jobs/alc.tp delete mode 100644 branches/sage/pgs/jobs/alcdat/makedirs delete mode 100644 branches/sage/pgs/jobs/alcdat/makedirs.big delete mode 100644 branches/sage/pgs/jobs/alcdat/makedirs.tput delete mode 100644 branches/sage/pgs/jobs/alcdat/makefiles.shared delete mode 100644 branches/sage/pgs/jobs/alcdat/openshared delete mode 100644 branches/sage/pgs/jobs/alcdat/ossh.include delete mode 100644 branches/sage/pgs/jobs/alcdat/ossh.include.big delete mode 100644 
branches/sage/pgs/jobs/alcdat/ossh.lib delete mode 100644 branches/sage/pgs/jobs/alcdat/ossh.lib.big delete mode 100644 branches/sage/pgs/jobs/alcdat/striping delete mode 100644 branches/sage/pgs/jobs/example delete mode 100644 branches/sage/pgs/jobs/mds/log_striping delete mode 100644 branches/sage/pgs/jobs/mds/makedir_lat delete mode 100644 branches/sage/pgs/jobs/mds/makedirs delete mode 100644 branches/sage/pgs/jobs/mds/opensshlib delete mode 100644 branches/sage/pgs/jobs/meta1 delete mode 100755 branches/sage/pgs/jobs/meta1.proc.sh delete mode 100644 branches/sage/pgs/jobs/osd/ebofs delete mode 100644 branches/sage/pgs/jobs/osd/mds_log delete mode 100644 branches/sage/pgs/jobs/osd/osd_threads delete mode 100644 branches/sage/pgs/jobs/osd/striping delete mode 100644 branches/sage/pgs/jobs/osd/wr_lat2 delete mode 100644 branches/sage/pgs/jobs/osd/write_sizes delete mode 100644 branches/sage/pgs/jobs/rados/map_dist delete mode 100644 branches/sage/pgs/jobs/rados/rep_lat delete mode 100644 branches/sage/pgs/jobs/rados/wr_sizes delete mode 100644 branches/sage/pgs/mds/Anchor.h delete mode 100644 branches/sage/pgs/mds/AnchorClient.cc delete mode 100644 branches/sage/pgs/mds/AnchorClient.h delete mode 100644 branches/sage/pgs/mds/AnchorTable.cc delete mode 100644 branches/sage/pgs/mds/AnchorTable.h delete mode 100644 branches/sage/pgs/mds/CDentry.cc delete mode 100644 branches/sage/pgs/mds/CDentry.h delete mode 100644 branches/sage/pgs/mds/CDir.cc delete mode 100644 branches/sage/pgs/mds/CDir.h delete mode 100644 branches/sage/pgs/mds/CInode.cc delete mode 100644 branches/sage/pgs/mds/CInode.h delete mode 100644 branches/sage/pgs/mds/Capability.h delete mode 100644 branches/sage/pgs/mds/ClientMap.cc delete mode 100644 branches/sage/pgs/mds/ClientMap.h delete mode 100644 branches/sage/pgs/mds/FileLock.h delete mode 100644 branches/sage/pgs/mds/Hasher.cc delete mode 100644 branches/sage/pgs/mds/IdAllocator.cc delete mode 100644 branches/sage/pgs/mds/IdAllocator.h delete mode 100644 branches/sage/pgs/mds/LocalLock.h delete mode 100644 branches/sage/pgs/mds/Locker.cc delete mode 100644 branches/sage/pgs/mds/Locker.h delete mode 100644 branches/sage/pgs/mds/LogEvent.cc delete mode 100644 branches/sage/pgs/mds/LogEvent.h delete mode 100644 branches/sage/pgs/mds/MDBalancer.cc delete mode 100644 branches/sage/pgs/mds/MDBalancer.h delete mode 100644 branches/sage/pgs/mds/MDCache.cc delete mode 100644 branches/sage/pgs/mds/MDCache.h delete mode 100644 branches/sage/pgs/mds/MDLog.cc delete mode 100644 branches/sage/pgs/mds/MDLog.h delete mode 100644 branches/sage/pgs/mds/MDS.cc delete mode 100644 branches/sage/pgs/mds/MDS.h delete mode 100644 branches/sage/pgs/mds/MDSMap.h delete mode 100644 branches/sage/pgs/mds/Migrator.cc delete mode 100644 branches/sage/pgs/mds/Migrator.h delete mode 100644 branches/sage/pgs/mds/Renamer.cc delete mode 100644 branches/sage/pgs/mds/Renamer.h delete mode 100644 branches/sage/pgs/mds/ScatterLock.h delete mode 100644 branches/sage/pgs/mds/Server.cc delete mode 100644 branches/sage/pgs/mds/Server.h delete mode 100644 branches/sage/pgs/mds/SimpleLock.h delete mode 100644 branches/sage/pgs/mds/events/EAnchor.h delete mode 100644 branches/sage/pgs/mds/events/EAnchorClient.h delete mode 100644 branches/sage/pgs/mds/events/EExport.h delete mode 100644 branches/sage/pgs/mds/events/EImportFinish.h delete mode 100644 branches/sage/pgs/mds/events/EImportStart.h delete mode 100644 branches/sage/pgs/mds/events/EMetaBlob.h delete mode 100644 branches/sage/pgs/mds/events/EOpen.h 
delete mode 100644 branches/sage/pgs/mds/events/EPurgeFinish.h delete mode 100644 branches/sage/pgs/mds/events/ESession.h delete mode 100644 branches/sage/pgs/mds/events/ESlaveUpdate.h delete mode 100644 branches/sage/pgs/mds/events/EString.h delete mode 100644 branches/sage/pgs/mds/events/ESubtreeMap.h delete mode 100644 branches/sage/pgs/mds/events/EUpdate.h delete mode 100644 branches/sage/pgs/mds/journal.cc delete mode 100644 branches/sage/pgs/mds/mdstypes.h delete mode 100644 branches/sage/pgs/messages/MAnchor.h delete mode 100644 branches/sage/pgs/messages/MCacheExpire.h delete mode 100644 branches/sage/pgs/messages/MClientFileCaps.h delete mode 100644 branches/sage/pgs/messages/MClientMount.h delete mode 100644 branches/sage/pgs/messages/MClientReconnect.h delete mode 100644 branches/sage/pgs/messages/MClientReply.h delete mode 100644 branches/sage/pgs/messages/MClientRequest.h delete mode 100644 branches/sage/pgs/messages/MClientRequestForward.h delete mode 100644 branches/sage/pgs/messages/MClientSession.h delete mode 100644 branches/sage/pgs/messages/MClientUnmount.h delete mode 100644 branches/sage/pgs/messages/MDentryUnlink.h delete mode 100644 branches/sage/pgs/messages/MDirUpdate.h delete mode 100644 branches/sage/pgs/messages/MDiscover.h delete mode 100644 branches/sage/pgs/messages/MDiscoverReply.h delete mode 100644 branches/sage/pgs/messages/MExportDir.h delete mode 100644 branches/sage/pgs/messages/MExportDirAck.h delete mode 100644 branches/sage/pgs/messages/MExportDirCancel.h delete mode 100644 branches/sage/pgs/messages/MExportDirDiscover.h delete mode 100644 branches/sage/pgs/messages/MExportDirDiscoverAck.h delete mode 100644 branches/sage/pgs/messages/MExportDirFinish.h delete mode 100644 branches/sage/pgs/messages/MExportDirNotify.h delete mode 100644 branches/sage/pgs/messages/MExportDirNotifyAck.h delete mode 100644 branches/sage/pgs/messages/MExportDirPrep.h delete mode 100644 branches/sage/pgs/messages/MExportDirPrepAck.h delete mode 100644 branches/sage/pgs/messages/MExportDirWarning.h delete mode 100644 branches/sage/pgs/messages/MExportDirWarningAck.h delete mode 100644 branches/sage/pgs/messages/MGenericMessage.h delete mode 100644 branches/sage/pgs/messages/MHeartbeat.h delete mode 100644 branches/sage/pgs/messages/MInodeFileCaps.h delete mode 100644 branches/sage/pgs/messages/MLock.h delete mode 100644 branches/sage/pgs/messages/MMDSBeacon.h delete mode 100644 branches/sage/pgs/messages/MMDSBoot.h delete mode 100644 branches/sage/pgs/messages/MMDSCacheRejoin.h delete mode 100644 branches/sage/pgs/messages/MMDSGetMap.h delete mode 100644 branches/sage/pgs/messages/MMDSMap.h delete mode 100644 branches/sage/pgs/messages/MMDSResolve.h delete mode 100644 branches/sage/pgs/messages/MMDSResolveAck.h delete mode 100644 branches/sage/pgs/messages/MMDSSlaveRequest.h delete mode 100644 branches/sage/pgs/messages/MMonCommand.h delete mode 100644 branches/sage/pgs/messages/MMonCommandAck.h delete mode 100644 branches/sage/pgs/messages/MMonElection.h delete mode 100644 branches/sage/pgs/messages/MMonElectionCollect.h delete mode 100644 branches/sage/pgs/messages/MMonElectionRefresh.h delete mode 100644 branches/sage/pgs/messages/MMonElectionStatus.h delete mode 100644 branches/sage/pgs/messages/MMonOSDMapInfo.h delete mode 100644 branches/sage/pgs/messages/MMonOSDMapLease.h delete mode 100644 branches/sage/pgs/messages/MMonOSDMapLeaseAck.h delete mode 100644 branches/sage/pgs/messages/MMonOSDMapUpdateAck.h delete mode 100644 
branches/sage/pgs/messages/MMonOSDMapUpdateCommit.h delete mode 100644 branches/sage/pgs/messages/MMonOSDMapUpdatePrepare.h delete mode 100644 branches/sage/pgs/messages/MMonPaxos.h delete mode 100644 branches/sage/pgs/messages/MOSDBoot.h delete mode 100644 branches/sage/pgs/messages/MOSDFailure.h delete mode 100644 branches/sage/pgs/messages/MOSDGetMap.h delete mode 100644 branches/sage/pgs/messages/MOSDIn.h delete mode 100644 branches/sage/pgs/messages/MOSDMap.h delete mode 100644 branches/sage/pgs/messages/MOSDOp.h delete mode 100644 branches/sage/pgs/messages/MOSDOpReply.h delete mode 100644 branches/sage/pgs/messages/MOSDOut.h delete mode 100644 branches/sage/pgs/messages/MOSDPGLog.h delete mode 100644 branches/sage/pgs/messages/MOSDPGNotify.h delete mode 100644 branches/sage/pgs/messages/MOSDPGPeer.h delete mode 100644 branches/sage/pgs/messages/MOSDPGPeerAck.h delete mode 100644 branches/sage/pgs/messages/MOSDPGPeerRequest.h delete mode 100644 branches/sage/pgs/messages/MOSDPGQuery.h delete mode 100644 branches/sage/pgs/messages/MOSDPGRemove.h delete mode 100644 branches/sage/pgs/messages/MOSDPGSummary.h delete mode 100644 branches/sage/pgs/messages/MOSDPGUpdate.h delete mode 100644 branches/sage/pgs/messages/MOSDPing.h delete mode 100644 branches/sage/pgs/messages/MPing.h delete mode 100644 branches/sage/pgs/messages/MPingAck.h delete mode 100644 branches/sage/pgs/mkmonmap.cc delete mode 100644 branches/sage/pgs/mon/ClientMonitor.cc delete mode 100644 branches/sage/pgs/mon/ClientMonitor.h delete mode 100644 branches/sage/pgs/mon/Elector.cc delete mode 100644 branches/sage/pgs/mon/Elector.h delete mode 100644 branches/sage/pgs/mon/MDSMonitor.cc delete mode 100644 branches/sage/pgs/mon/MDSMonitor.h delete mode 100644 branches/sage/pgs/mon/MonMap.h delete mode 100644 branches/sage/pgs/mon/Monitor.cc delete mode 100644 branches/sage/pgs/mon/Monitor.h delete mode 100644 branches/sage/pgs/mon/MonitorStore.cc delete mode 100644 branches/sage/pgs/mon/MonitorStore.h delete mode 100644 branches/sage/pgs/mon/OSDMonitor.cc delete mode 100644 branches/sage/pgs/mon/OSDMonitor.h delete mode 100644 branches/sage/pgs/mon/PGMap.h delete mode 100644 branches/sage/pgs/mon/PGMonitor.cc delete mode 100644 branches/sage/pgs/mon/PGMonitor.h delete mode 100644 branches/sage/pgs/mon/Paxos.cc delete mode 100644 branches/sage/pgs/mon/Paxos.h delete mode 100644 branches/sage/pgs/mon/PaxosService.cc delete mode 100644 branches/sage/pgs/mon/PaxosService.h delete mode 100644 branches/sage/pgs/mon/mon_types.h delete mode 100644 branches/sage/pgs/msg/Dispatcher.cc delete mode 100644 branches/sage/pgs/msg/Dispatcher.h delete mode 100644 branches/sage/pgs/msg/FakeMessenger.cc delete mode 100644 branches/sage/pgs/msg/FakeMessenger.h delete mode 100644 branches/sage/pgs/msg/HostMonitor.cc delete mode 100644 branches/sage/pgs/msg/HostMonitor.h delete mode 100644 branches/sage/pgs/msg/Message.cc delete mode 100644 branches/sage/pgs/msg/Message.h delete mode 100644 branches/sage/pgs/msg/Messenger.cc delete mode 100644 branches/sage/pgs/msg/Messenger.h delete mode 100644 branches/sage/pgs/msg/RWLock.h delete mode 100644 branches/sage/pgs/msg/SerialMessenger.h delete mode 100644 branches/sage/pgs/msg/SimpleMessenger.cc delete mode 100644 branches/sage/pgs/msg/SimpleMessenger.h delete mode 100644 branches/sage/pgs/msg/mpistarter.cc delete mode 100644 branches/sage/pgs/msg/msg_types.h delete mode 100644 branches/sage/pgs/msg/new_mpistarter.cc delete mode 100644 branches/sage/pgs/msg/tcp.cc delete mode 100644 
branches/sage/pgs/msg/tcp.h delete mode 100644 branches/sage/pgs/newsyn.cc delete mode 100644 branches/sage/pgs/osbdb/OSBDB.cc delete mode 100644 branches/sage/pgs/osbdb/OSBDB.h delete mode 100644 branches/sage/pgs/osd/Ager.cc delete mode 100644 branches/sage/pgs/osd/Ager.h delete mode 100644 branches/sage/pgs/osd/BDBMap.h delete mode 100644 branches/sage/pgs/osd/Fake.h delete mode 100644 branches/sage/pgs/osd/FakeStore.cc delete mode 100644 branches/sage/pgs/osd/FakeStore.h delete mode 100644 branches/sage/pgs/osd/FakeStoreBDBCollections.h delete mode 100644 branches/sage/pgs/osd/OBFSStore.cc delete mode 100644 branches/sage/pgs/osd/OBFSStore.h delete mode 100644 branches/sage/pgs/osd/OSD.cc delete mode 100644 branches/sage/pgs/osd/OSD.h delete mode 100644 branches/sage/pgs/osd/OSDMap.h delete mode 100644 branches/sage/pgs/osd/ObjectStore.cc delete mode 100644 branches/sage/pgs/osd/ObjectStore.h delete mode 100644 branches/sage/pgs/osd/PG.cc delete mode 100644 branches/sage/pgs/osd/PG.h delete mode 100644 branches/sage/pgs/osd/RAID4PG.cc delete mode 100644 branches/sage/pgs/osd/RAID4PG.h delete mode 100644 branches/sage/pgs/osd/ReplicatedPG.cc delete mode 100644 branches/sage/pgs/osd/ReplicatedPG.h delete mode 100644 branches/sage/pgs/osd/osd_types.h delete mode 100644 branches/sage/pgs/osd/rush.cc delete mode 100644 branches/sage/pgs/osd/rush.h delete mode 100644 branches/sage/pgs/osd/tp.cc delete mode 100644 branches/sage/pgs/osdc/Blinker.h delete mode 100644 branches/sage/pgs/osdc/Filer.cc delete mode 100644 branches/sage/pgs/osdc/Filer.h delete mode 100644 branches/sage/pgs/osdc/Journaler.cc delete mode 100644 branches/sage/pgs/osdc/Journaler.h delete mode 100644 branches/sage/pgs/osdc/ObjectCacher.cc delete mode 100644 branches/sage/pgs/osdc/ObjectCacher.h delete mode 100644 branches/sage/pgs/osdc/Objecter.cc delete mode 100644 branches/sage/pgs/osdc/Objecter.h delete mode 100755 branches/sage/pgs/script/add_header.pl delete mode 100755 branches/sage/pgs/script/adjusttabs.pl delete mode 100755 branches/sage/pgs/script/check_cache_dumps.pl delete mode 100755 branches/sage/pgs/script/clean_osd_cow.sh delete mode 100755 branches/sage/pgs/script/clean_trace.pl delete mode 100755 branches/sage/pgs/script/comb.pl delete mode 100755 branches/sage/pgs/script/find_auth_pins.pl delete mode 100755 branches/sage/pgs/script/find_bufferleaks.pl delete mode 100755 branches/sage/pgs/script/find_lost_bdev_ops.pl delete mode 100755 branches/sage/pgs/script/find_lost_commit.pl delete mode 100755 branches/sage/pgs/script/find_lost_objecter.pl delete mode 100755 branches/sage/pgs/script/find_pathpins.pl delete mode 100755 branches/sage/pgs/script/find_requests.pl delete mode 100755 branches/sage/pgs/script/find_waiters.pl delete mode 100755 branches/sage/pgs/script/fix_modeline.pl delete mode 100755 branches/sage/pgs/script/grepblock delete mode 100644 branches/sage/pgs/script/merge_trace_rw.pl delete mode 100755 branches/sage/pgs/script/profonly.pl delete mode 100755 branches/sage/pgs/script/runset.pl delete mode 100755 branches/sage/pgs/script/sum.pl delete mode 100644 branches/sage/pgs/test/fakemds.cc delete mode 100644 branches/sage/pgs/test/gprof-helper.c delete mode 100644 branches/sage/pgs/test/makedirs.cc delete mode 100644 branches/sage/pgs/test/mpitest.cc delete mode 100644 branches/sage/pgs/test/mttest.cc delete mode 100644 branches/sage/pgs/test/rushconfig delete mode 100644 branches/sage/pgs/test/rushtest.cc delete mode 100644 branches/sage/pgs/test/rushtest.cc~ delete mode 100644 
branches/sage/pgs/test/testbucket.cc delete mode 100644 branches/sage/pgs/test/testbuffers.cc delete mode 100644 branches/sage/pgs/test/testcrush.cc delete mode 100644 branches/sage/pgs/test/testfilepath.cc delete mode 100644 branches/sage/pgs/test/testmpi.cc delete mode 100644 branches/sage/pgs/test/testnewbuffers.cc delete mode 100644 branches/sage/pgs/test/testos.cc delete mode 100644 branches/sage/pgs/test/testosbdb.cc delete mode 100644 branches/sage/pgs/test/testtree.cc delete mode 100644 branches/sage/pgs/test/testxattr.cc delete mode 100644 branches/sage/pgs/valgrind.supp diff --git a/branches/sage/pgs/COPYING b/branches/sage/pgs/COPYING deleted file mode 100644 index 5ab7695ab8cab..0000000000000 --- a/branches/sage/pgs/COPYING +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. 
Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. 
This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. 
Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. 
- - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. 
- - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. 
If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
-EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - - diff --git a/branches/sage/pgs/Makefile b/branches/sage/pgs/Makefile deleted file mode 100644 index f9d8a4843171a..0000000000000 --- a/branches/sage/pgs/Makefile +++ /dev/null @@ -1,275 +0,0 @@ - -# mpicxx must be on your path to build newsyn. -# on googoo, this means that /usr/local/mpich2-1.0.2/bin must be on your path. -# on issdm, it's /usr/local/mpich2/bin. - -# Hook for extra -I options, etc. -EXTRA_CFLAGS = - -ifeq ($(target),darwin) -# For Darwin -CFLAGS = -g -Wall -I. 
-D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS} -LDINC = ar -rc -else -# For linux -CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -LDINC = ld -i -o -endif - -CC = g++ -LIBS = -lpthread - -ifeq ($(want_bdb),yes) -CFLAGS += -DUSE_OSBDB -OSBDB_LIBS = -ldb_cxx -endif - -#for normal mpich2 machines -MPICC = mpicxx -MPICFLAGS = -DMPICH_IGNORE_CXX_SEEK ${CFLAGS} -MPILIBS = ${LIBS} - -#for LLNL boxes without mpicxx -#MPICC = g++ -#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/include -L/usr/lib/mpi/mpi_gnu/lib -#MPILIBS = ${LIBS} -lelan -lmpi - -EBOFS_OBJS= \ - ebofs/BlockDevice.o\ - ebofs/BufferCache.o\ - ebofs/Ebofs.o\ - ebofs/Allocator.o\ - ebofs/FileJournal.o - -MDS_OBJS= \ - mds/MDS.o\ - mds/journal.o\ - mds/Server.o\ - mds/MDCache.o\ - mds/Locker.o\ - mds/Migrator.o\ - mds/MDBalancer.o\ - mds/CDentry.o\ - mds/CDir.o\ - mds/CInode.o\ - mds/AnchorTable.o\ - mds/AnchorClient.o\ - mds/LogEvent.o\ - mds/IdAllocator.o\ - mds/ClientMap.o\ - mds/MDLog.o - -OSD_OBJS= \ - osd/PG.o\ - osd/ReplicatedPG.o\ - osd/RAID4PG.o\ - osd/Ager.o\ - osd/FakeStore.o\ - osd/OSD.o - -OSDC_OBJS= \ - osdc/Objecter.o\ - osdc/ObjectCacher.o\ - osdc/Filer.o\ - osdc/Journaler.o - -MON_OBJS= \ - mon/Monitor.o\ - mon/Paxos.o\ - mon/PaxosService.o\ - mon/OSDMonitor.o\ - mon/MDSMonitor.o\ - mon/ClientMonitor.o\ - mon/PGMonitor.o\ - mon/Elector.o\ - mon/MonitorStore.o - -COMMON_OBJS= \ - msg/Message.o\ - common/Logger.o\ - common/Clock.o\ - common/Timer.o\ - config.o - -CLIENT_OBJS= \ - client/FileCache.o\ - client/Client.o\ - client/SyntheticClient.o\ - client/Trace.o - - -ifeq ($(want_bdb),yes) -OSBDB_OBJS = \ - osbdb/OSBDB.o - -OSBDB_OBJ = osbdb.o -endif - -TARGETS = cmon cosd cmds csyn newsyn fakesyn mkmonmap cmonctl cfuse fakefuse -NO_FUSE = cmon cosd cmds csyn newsyn fakesyn mkmonmap - - -SRCS=*.cc */*.cc *.h */*.h */*/*.h - -all: depend ${TARGETS} - -nofuse: depend ${NO_FUSE} - -test: depend ${TEST_TARGETS} - -obfs: depend obfstest - - -# real bits -mkmonmap: mkmonmap.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmon: cmon.cc mon.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cosd: cosd.cc osd.o ebofs.o ${OSBDB_OBJ} msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ - -cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -csyn: csyn.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cfuse: cfuse.cc client.o osdc.o client/fuse.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - -activemaster: active/activemaster.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -activeslave: active/activeslave.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -echotestclient: active/echotestclient.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -msgtestclient: active/msgtestclient.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - - - -# misc -gprof-helper.so: test/gprof-helper.c - gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl - - -# fake* -fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o ${OSBDB_OBJ} client/fuse.o msg/FakeMessenger.o common.o - ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -lfuse $^ -o $@ - -fakesyn: fakesyn.cc mon.o mds.o 
client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/FakeMessenger.o common.o - ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ - - -# mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o - ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ - -newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ - - -# ebofs -mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ - -test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ - - -# + obfs (old) -fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o - ${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@ - -tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o - ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o - ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a - - -# hadoop -libhadoopcephfs.so: client/hadoop/CephFSInterface.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} -I/usr/local/java/include -I/usr/local/java/include/linux ${LIBS} $^ -o $@ - -# libceph -libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} - ${LDINC} $^ -o $@ - -bench/mdtest/mdtest.o: bench/mdtest/mdtest.c - mpicc -c $^ -o $@ - -mdtest: bench/mdtest/mdtest.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -mdtest.ceph: bench/mdtest/mdtest.o libceph.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -# OSD test - -testos: test/testos.o ebofs.o osbdb.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^ - -# - -%.so: %.cc - ${CC} -shared -fPIC ${CFLAGS} $< -o $@ - -clean: - rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} - -common.o: ${COMMON_OBJS} - ${LDINC} $@ $^ - -ebofs.o: ${EBOFS_OBJS} - ${LDINC} $@ $^ - -client.o: ${CLIENT_OBJS} - ${LDINC} $@ $^ - -osd.o: ${OSD_OBJS} - ${LDINC} $@ $^ - -osdc.o: ${OSDC_OBJS} - ${LDINC} $@ $^ - -mds.o: ${MDS_OBJS} - ${LDINC} $@ $^ - -mon.o: ${MON_OBJS} - ${LDINC} $@ $^ - -osbdb.o: ${OSBDB_OBJS} - ${LDINC} $@ $^ - -%.o: %.cc - ${CC} ${CFLAGS} -c $< -o $@ - -%.po: %.cc - ${CC} -fPIC ${CFLAGS} -c $< -o $@ - -count: - cat ${SRCS} | wc -l - cat ${SRCS} | grep -c \; - -TAGS: - etags `find . -name "*.[h|cc]"` - -.depend: - touch .depend - -depend: - $(RM) .depend - makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null - -# now add a line to include the dependency list. -include .depend diff --git a/branches/sage/pgs/README b/branches/sage/pgs/README deleted file mode 100644 index aa016817cebf0..0000000000000 --- a/branches/sage/pgs/README +++ /dev/null @@ -1,4 +0,0 @@ -Ceph - a scalable distributed file system ------------------------------------------ - -Please see http://ceph.sourceforge.net/ for current info. diff --git a/branches/sage/pgs/TODO b/branches/sage/pgs/TODO deleted file mode 100644 index 3e760361151b4..0000000000000 --- a/branches/sage/pgs/TODO +++ /dev/null @@ -1,339 +0,0 @@ - - -- change same_inst_since to align with "in" set -- tag MClientRequest with mdsmap v -- push new mdsmap to clients on send_message_client, based on the tag? - - hrm, what about exports and stale caps wonkiness... there's a race with the REAP. hmm. - - -some smallish projects: - -- crush rewrite in C - - generalize any memory management etc. 
to allow use in kernel and userspace -- userspace crush tools - - xml import/export? - - ? - -- pg monitor service - - to support statfs? - - general pg health - - some sort of (throttled) osd status reporting - - dynamic pg creation (eventually!) - -- SimpleMessenger - - clean up/merge Messenger/Dispatcher interfaces - - auto close idle connections - - delivery ack and buffering, and then reconnect - - take a look at RDS? http://oss.oracle.com/projects/rds/ - -- generalize monitor client? - - throttle message resend attempts - -- ENOSPC on client, OSD - - - -code cleanup -- endian portability -- word size - - clean up all encoded structures - -general kernel planning -- soft consistency on (kernel) lookup? -- accurate reconstruction of (syscall) path? - -software raid layer for EBOFS? -- actually, we just need software raid _awareness_ in the allocator, so - that we can write only full stripes, without fear of clobbering things on - failure. then use MD or similar layer provided by kernel. - - -sage doc -- mdsmonitor beacon semantics -- cache expiration, cache invariants - - including dual expire states, transition, vs subtree grouping of expire messages -- recovery states, implicit barrier are rejoin -- journal content - - importmaps and up:resolve -- metablob version semantics - - - -sage mds - -- hmm, should we move ESubtreeMap out of the journal? - that would avoid all the icky weirdness in shutdown, with periodic logging, etc. - -- extend/clean up filepath to allow paths relative to an ino - - fix path_traverse - - fix reconnect/rejoin open file weirdness - -- stray reintegration -- stray purge on shutdown - - need to export stray crap to another mds.. -- verify stray is empty on shutdown - -- dirfrag split/merge - - client readdir for dirfrags -- consistency points/snapshots - - dentry versions vs dirfrags... -- statfs? - -- more testing of failures + thrashing. - - is export prep dir open deadlock properly fixed by forge_replica_dir()? -- failures during recovery stages (resolve, rejoin)... make sure rejoin still works! - -- dirfrag split - - make sure we are freezing _before_ we fetch to complete the dirfrag, else - we break commit()'s preconditions when it fetches an incomplete dir. - -- detect and deal with client failure - - failure during reconnect vs clientmap. although probalby the whole thing needs a larger overhaul... - -- inode.max_size -- inode.allocated_size - -- real chdir (directory "open") - - relative metadata ops - -- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - -- EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) - -- fix rmdir empty exported dirfrag race - - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. - - how to know full dir size (when trimming)? - - put frag size/mtime in fragmap in inode? we will need that anyway for stat on dirs - - will need to make inode discover/import_decode smart about dirfrag auth - - or, only put frag size/mtime in inode when frag is closed. otherwise, soft (journaled) state, possibly on another mds. - - need to move state from replicas to auth. simplelock doesn't currently support that. - - ScatterLock or something? hrm. - -- FIXME how to journal root and stray inode content? - - in particular, i care about dirfragtree.. get it on rejoin? - - and dir sizes, if i add that... also on rejoin? - - - -osdmon -- allow fresh replacement osds. 
add osd_created in osdmap, probably -- monitor needs to monitor some osds... -- monitor pg states, notify on out? -- watch osd utilization; adjust overload in cluster map - -journaler -- fix up for large events (e.g. imports) -- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy. -- should we pad with zeros to avoid splitting individual entries? - - make it a g_conf flag? - - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) -- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes - - -crush -- xml import/export? -- crush tools - - -rados+ebofs -- purge replicated writes from cache. (with exception of partial tail blocks.) - -rados paper todo? -- better experiments - - berkeleydb objectstore? -- flush log only in response to subsequent read or write? -- better behaving recovery -- justify use of splay. - - dynamic replication -- snapshots - -rados snapshots -- integrate revisions into ObjectCacher -- clean up oid.rev vs op.rev in osd+osdc - -- attr.crev is rev we were created in. -- oid.rev=0 is "live". defined for attr.crev <= rev. -- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) - -- write|delete is tagged with op.rev - - if attr.crev < op.rev - - we clone to oid.rev=rev (clone keeps old crev) - - change live attr.crev=rev. - - apply update -- read is tagged with op.rev - - if 0, we read from 0 (if it exists). - - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev. - -- how to get usage feedback to monitor? - -- clean up mds caps release in exporter -- figure out client failure modes -- add connection retry. - - -objecter -- transaction prepare/commit -- read+floor_lockout - -osd/rados -- transaction prepare/commit - - rollback - - rollback logging (to fix slow prepare vs rollback race) -- read+floor_lockout for clean STOGITH-like/fencing semantics after failover. -- efficiently replicate clone() objects -- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? -- consider implications of nvram writeahead logs -- fix heartbeat wrt new replication -- mark residual pgs obsolete ??? -- rdlocks -- optimize remove wrt recovery pushes -- pg_num changes -- report crashed pgs? - -messenger -- fix messenger shutdown.. we shouldn't delete messenger, since the caller may be referencing it, etc. - -simplemessenger -- close idle connections -- buffer sent messages until a receive is acknowledged (handshake!) - - retry, timeout on connection or transmission failure -- exponential backoff on monitor resend attempts (actually, this should go outside the messenger!) - -objectcacher -- ocacher caps transitions vs locks -- test read locks - -reliability -- heartbeat vs ping? -- osdmonitor, filter - -ebofs -- verify proper behavior of conflicting/overlapping reads of clones -- test(fix) sync() -- combine inodes and/or cnodes into same blocks -- allow btree sets instead of maps -- eliminate nodepools -- nonblocking write on missing onodes? -- fix bug in node rotation on insert (and reenable) -- fix NEAR_LAST_FWD (?) -- journaling? in NVRAM? -- metadata in nvram? flash? - - -remaining hard problems -- how to cope with file size changes and read/write sharing - - -crush -- more efficient failure when all/too many osds are down -- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set >) - - -mds -- distributed client management -- anchormgr - - 2pc - - independent journal? 
- - distributed? -- link count management - - also 2pc -- chdir (directory opens!) -- rewrite logstream - - clean up - - be smart about rados ack vs reread - - log locking? root log object - - trimming, rotation - -- efficient stat for single writers -- lstat vs stat -- add FILE_CAP_EXTEND capability bit -- only share osdmap updates with clients holding capabilities -- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) -- finish hard links! - - reclaim danglers from inode file on discover... - - fix rename wrt hard links -- interactive hash/unhash interface -- test hashed readdir -- make logstream.flush align itself to stripes - -- carefully define/document frozen wrt dir_auth vs hashing - - - -client -- fstat -- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. -- test client caps migration w/ mds exports -- some heuristic behavior to consolidate caps to inode auth? - - - - - - - -why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times. -- mds -> 1,2 : qsync -- client1 writes at byte 100 -- client1 -> mds : qsync reply (size=100) -- client1 writes at byte 300 -- client1 -> client2 (outside channel) -- client2 writes at byte 200 -- client2 -> mds : qsync reply (size=200) --> stat results in size 200, even though at no single point in time was the max size 500. --> for correct result, need to _stop_ client writers while gathering metadata. - - -SAGE: - -- string table? - -- hard links - - fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already!) - -- do real permission checks? - - - - - - -ISSUES - - -- discover - - soft: authority selectively repicates, or sets a 'forward' flag in reply - - hard: authority always replicates (eg. discover for export) - - forward flag (see soft) - - error flag (if file not found, etc.) - - [what was i talking about?] make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply - - - -DOCUMENT -- cache, distributed cache structure and invariants -- export process -- hash/unhash process - - -TEST -- hashing - - test hash/unhash operation - - hash+export: encode list of replicated dir inodes so they can be discovered before import is procesed. - - test nauthitems (wrt hashing?) - - -IMPLEMENT - -- smarter balancing - - popularity calculation and management is inconsistent/wrong. - - does it work? - -- dump active config in run output somewhere - - - - - - diff --git a/branches/sage/pgs/cfuse.cc b/branches/sage/pgs/cfuse.cc deleted file mode 100644 index 3540e1b2a14e8..0000000000000 --- a/branches/sage/pgs/cfuse.cc +++ /dev/null @@ -1,84 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // FUSE will chdir("/"); be ready. - g_conf.use_abspaths = true; - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap); - client->init(); - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) - cout << "mounting" << endl; - client->mount(); - - cerr << "starting fuse on pid " << getpid() << endl; - ceph_fuse_main(client, argc, argv); - cerr << "fuse finished on pid " << getpid() << endl; - - client->unmount(); - cout << "unmounted" << endl; - client->shutdown(); - - delete client; - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/pgs/client/Client.cc b/branches/sage/pgs/client/Client.cc deleted file mode 100644 index c07253ee6c763..0000000000000 --- a/branches/sage/pgs/client/Client.cc +++ /dev/null @@ -1,3079 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -// unix-ey fs stuff -#include -#include -#include -#include -#include -#include - -#include - - -#include -using namespace std; - - -// ceph stuff -#include "Client.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" -#include "messages/MClientSession.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" - -#include "osdc/Filer.h" -#include "osdc/Objecter.h" -#include "osdc/ObjectCacher.h" - -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Logger.h" - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami << "." 
<< pthread_self() << " " - -#define tout if (g_conf.client_trace) cout << "trace: " - - -// static logger -LogType client_logtype; -Logger *client_logger = 0; - - - -class C_Client_CloseRelease : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseRelease(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_release(in); - } -}; - -class C_Client_CloseSafe : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseSafe(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_safe(in); - } -}; - - - - - - -// cons/des - -Client::Client(Messenger *m, MonMap *mm) : timer(client_lock) -{ - // which client am i? - whoami = m->get_myname().num(); - monmap = mm; - - mounted = false; - mount_timeout_event = 0; - unmounting = false; - - last_tid = 0; - unsafe_sync_write = 0; - - mdsmap = 0; - - // - root = 0; - - set_cache_size(g_conf.client_cache_size); - - // file handles - free_fh_set.insert(10, 1<<30); - - // set up messengers - messenger = m; - messenger->set_dispatcher(this); - - // osd interfaces - osdmap = new OSDMap(); // initially blank.. see mount() - objecter = new Objecter(messenger, monmap, osdmap); - objecter->set_client_incarnation(0); // client always 0, for now. - objectcacher = new ObjectCacher(objecter, client_lock); - filer = new Filer(objecter); -} - - -Client::~Client() -{ - tear_down_cache(); - - if (objectcacher) { - delete objectcacher; - objectcacher = 0; - } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (messenger) { delete messenger; messenger = 0; } -} - - -void Client::tear_down_cache() -{ - // fh's - for (hash_map::iterator it = fh_map.begin(); - it != fh_map.end(); - it++) { - Fh *fh = it->second; - dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << endl; - put_inode(fh->inode); - delete fh; - } - fh_map.clear(); - - // caps! 
- // *** FIXME *** - - // empty lru - lru.lru_set_max(0); - trim_cache(); - assert(lru.lru_get_size() == 0); - - // close root ino - assert(inode_map.size() <= 1); - if (root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } - - assert(inode_map.empty()); -} - - - -// debug crapola - -void Client::dump_inode(Inode *in, set& did) -{ - dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << endl; - - if (in->dir) { - dout(1) << " dir size " << in->dir->dentries.size() << endl; - //for (hash_map, eqstr>::iterator it = in->dir->dentries.begin(); - for (hash_map::iterator it = in->dir->dentries.begin(); - it != in->dir->dentries.end(); - it++) { - dout(1) << " dn " << it->first << " ref " << it->second->ref << endl; - dump_inode(it->second->inode, did); - } - } -} - -void Client::dump_cache() -{ - set did; - - if (root) dump_inode(root, did); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - if (did.count(it->second)) continue; - - dout(1) << "dump_cache: inode " << it->first - << " ref " << it->second->ref - << " dir " << it->second->dir << endl; - if (it->second->dir) { - dout(1) << " dir size " << it->second->dir->dentries.size() << endl; - } - } - -} - - -void Client::init() { - -} - -void Client::shutdown() { - dout(1) << "shutdown" << endl; - messenger->shutdown(); -} - - - - -// =================== -// metadata cache stuff - -void Client::trim_cache() -{ - unsigned last = 0; - while (lru.lru_get_size() != last) { - last = lru.lru_get_size(); - - if (lru.lru_get_size() <= lru.lru_get_max()) break; - - // trim! - Dentry *dn = (Dentry*)lru.lru_expire(); - if (!dn) break; // done - - //dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl; - unlink(dn); - } - - // hose root? - if (lru.lru_get_size() == 0 && root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } -} - -/** insert_inode - * - * insert + link a single dentry + inode into the metadata cache. - */ -Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) -{ - Dentry *dn = NULL; - if (dir->dentries.count(dname)) - dn = dir->dentries[dname]; - - dout(12) << "insert_inode " << dname << " ino " << st->inode.ino - << " size " << st->inode.size - << " mtime " << st->inode.mtime - << endl; - - if (dn) { - if (dn->inode->inode.ino == st->inode.ino) { - touch_dn(dn); - dout(12) << " had dentry " << dname - << " with correct ino " << dn->inode->inode.ino - << endl; - } else { - dout(12) << " had dentry " << dname - << " with WRONG ino " << dn->inode->inode.ino - << endl; - unlink(dn); - dn = NULL; - } - } - - if (!dn) { - // have inode linked elsewhere? -> unlink and relink! - if (inode_map.count(st->inode.ino)) { - Inode *in = inode_map[st->inode.ino]; - assert(in); - - if (in->dn) { - dout(12) << " had ino " << in->inode.ino - << " not linked or linked at the right position, relinking" - << endl; - dn = relink(dir, dname, in); - } else { - // link - dout(12) << " had ino " << in->inode.ino - << " unlinked, linking" << endl; - dn = link(dir, dname, in); - } - } - } - - if (!dn) { - Inode *in = new Inode(st->inode, objectcacher); - inode_map[st->inode.ino] = in; - dn = link(dir, dname, in); - dout(12) << " new dentry+node with ino " << st->inode.ino << endl; - } else { - // actually update info - dout(12) << " stat inode mask is " << st->inode.mask << endl; - dn->inode->inode = st->inode; - - // ...but don't clobber our mtime, size! 
- if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 && - dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 && - dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // OK, we found it! - assert(dn && dn->inode); - - // or do we have newer size/mtime from writing? - if (dn->inode->file_caps() & CAP_FILE_WR) { - if (dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // symlink? - if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) { - if (!dn->inode->symlink) - dn->inode->symlink = new string; - *(dn->inode->symlink) = st->symlink; - } - - return dn->inode; -} - -/** update_inode_dist - * - * update MDS location cache for a single inode - */ -void Client::update_inode_dist(Inode *in, InodeStat *st) -{ - // auth - in->dir_auth = -1; - if (!st->dirfrag_auth.empty()) { // HACK FIXME ******* FIXME FIXME FIXME FIXME dirfrag_t - in->dir_auth = st->dirfrag_auth.begin()->second; - } - - // replicated - in->dir_replicated = false; - if (!st->dirfrag_rep.empty()) - in->dir_replicated = true; // FIXME - - // dist - if (!st->dirfrag_dist.empty()) { // FIXME - set dist = st->dirfrag_dist.begin()->second; - if (dist.empty() && !in->dir_contacts.empty()) - dout(9) << "lost dist spec for " << in->inode.ino - << " " << dist << endl; - if (!dist.empty() && in->dir_contacts.empty()) - dout(9) << "got dist spec for " << in->inode.ino - << " " << dist << endl; - in->dir_contacts = dist; - } -} - - -/** insert_trace - * - * insert a trace from a MDS reply into the cache. - */ -Inode* Client::insert_trace(MClientReply *reply) -{ - Inode *cur = root; - utime_t now = g_clock.real_now(); - - dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << endl; - - list::const_iterator pdn = reply->get_trace_dn().begin(); - - for (list::const_iterator pin = reply->get_trace_in().begin(); - pin != reply->get_trace_in().end(); - ++pin) { - - if (pin == reply->get_trace_in().begin()) { - // root - dout(10) << "insert_trace root" << endl; - if (!root) { - // create - cur = root = new Inode((*pin)->inode, objectcacher); - inode_map[root->inode.ino] = root; - } - } else { - // not root. - dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << endl; - Dir *dir = cur->open_dir(); - cur = this->insert_inode(dir, *pin, *pdn); - ++pdn; - - // move to top of lru! 
- if (cur->dn) - lru.lru_touch(cur->dn); - } - - // update dist info - update_inode_dist(cur, *pin); - - // set cache ttl - if (g_conf.client_cache_stat_ttl) { - cur->valid_until = now; - cur->valid_until += g_conf.client_cache_stat_ttl; - } - } - - return cur; -} - - - - -Dentry *Client::lookup(filepath& path) -{ - dout(14) << "lookup " << path << endl; - - Inode *cur = root; - if (!cur) return NULL; - - Dentry *dn = 0; - for (unsigned i=0; iinode << " valid_until " << dn->inode->valid_until << endl; - } else { - dout(14) << " dentry " << path[i] << " dne" << endl; - return NULL; - } - cur = dn->inode; - assert(cur); - } else { - return NULL; // not a dir - } - } - - if (dn) { - dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< endl; - } - - return dn; -} - -// ------- - -int Client::choose_target_mds(MClientRequest *req) -{ - int mds = 0; - - // find deepest known prefix - Inode *diri = root; // the deepest known containing dir - Inode *item = 0; // the actual item... if we know it - int missing_dn = -1; // which dn we miss on (if we miss) - - unsigned depth = req->get_filepath().depth(); - for (unsigned i=0; iinode.mode & INODE_MODE_DIR && diri->dir) { - Dir *dir = diri->dir; - - // do we have the next dentry? - if (dir->dentries.count( req->get_filepath()[i] ) == 0) { - missing_dn = i; // no. - break; - } - - dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << endl; - - if (i == depth-1) { // last one! - item = dir->dentries[ req->get_filepath()[i] ]->inode; - break; - } - - // continue.. - diri = dir->dentries[ req->get_filepath()[i] ]->inode; - assert(diri); - } else { - missing_dn = i; - break; - } - } - - // pick mds - if (!diri || g_conf.client_use_random_mds) { - // no root info, pick a random MDS - mds = mdsmap->get_random_in_mds(); - if (mds < 0) mds = 0; - - if (0) { - mds = 0; - dout(0) << "hack: sending all requests to mds" << mds << endl; - } - } else { - if (req->auth_is_best()) { - // pick the actual auth (as best we can) - if (item) { - mds = item->authority(mdsmap); - } else if (diri->dir_hashed && missing_dn >= 0) { - mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), - mdsmap); - } else { - mds = diri->authority(mdsmap); - } - } else { - // balance our traffic! - if (diri->dir_hashed && missing_dn >= 0) - mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), - mdsmap); - else - mds = diri->pick_replica(mdsmap); - } - } - dout(20) << "mds is " << mds << endl; - - return mds; -} - - - -MClientReply *Client::make_request(MClientRequest *req, - int use_mds) // this param is purely for debug hacking -{ - // time the call - utime_t start = g_clock.real_now(); - - bool nojournal = false; - int op = req->get_op(); - if (op == MDS_OP_STAT || - op == MDS_OP_LSTAT || - op == MDS_OP_READDIR || - op == MDS_OP_OPEN) - nojournal = true; - - - // -- request -- - // assign a unique tid - tid_t tid = ++last_tid; - req->set_tid(tid); - - if (!mds_requests.empty()) - req->set_oldest_client_tid(mds_requests.begin()->first); - else - req->set_oldest_client_tid(tid); // this one is the oldest. 
- - // make note - MetaRequest request(req, tid); - mds_requests[tid] = &request; - - // encode payload now, in case we have to resend (in case of mds failure) - req->encode_payload(); - request.request_payload = req->get_payload(); - - // note idempotency - request.idempotent = req->is_idempotent(); - - // hack target mds? - if (use_mds) - request.resend_mds = use_mds; - - // set up wait cond - Cond cond; - request.caller_cond = &cond; - - while (1) { - // choose mds - int mds; - // force use of a particular mds? - if (request.resend_mds >= 0) { - mds = request.resend_mds; - request.resend_mds = -1; - dout(10) << "target resend_mds specified as mds" << mds << endl; - } else { - mds = choose_target_mds(req); - dout(10) << "chose target mds" << mds << " based on hierarchy" << endl; - } - - // open a session? - if (mds_sessions.count(mds) == 0) { - Cond cond; - - if (waiting_for_session.count(mds) == 0) { - dout(10) << "opening session to mds" << mds << endl; - messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_OPEN), - mdsmap->get_inst(mds), MDS_PORT_SERVER); - } - - // wait - waiting_for_session[mds].push_back(&cond); - while (waiting_for_session.count(mds)) { - dout(10) << "waiting for session to mds" << mds << " to open" << endl; - cond.Wait(client_lock); - } - } - - // send request. - send_request(&request, mds); - - // wait for signal - dout(20) << "awaiting kick on " << &cond << endl; - cond.Wait(client_lock); - - // did we get a reply? - if (request.reply) - break; - } - - // got it! - MClientReply *reply = request.reply; - - // kick dispatcher (we've got it!) - assert(request.dispatch_cond); - request.dispatch_cond->Signal(); - dout(20) << "sendrecv kickback on tid " << tid << " " << request.dispatch_cond << endl; - - // clean up. - mds_requests.erase(tid); - - - // -- log times -- - if (client_logger) { - utime_t lat = g_clock.real_now(); - lat -= start; - dout(20) << "lat " << lat << endl; - client_logger->finc("lsum",(double)lat); - client_logger->inc("lnum"); - - if (nojournal) { - client_logger->finc("lrsum",(double)lat); - client_logger->inc("lrnum"); - } else { - client_logger->finc("lwsum",(double)lat); - client_logger->inc("lwnum"); - } - - if (op == MDS_OP_STAT) { - client_logger->finc("lstatsum",(double)lat); - client_logger->inc("lstatnum"); - } - else if (op == MDS_OP_READDIR) { - client_logger->finc("ldirsum",(double)lat); - client_logger->inc("ldirnum"); - } - - } - - return reply; -} - - -void Client::handle_client_session(MClientSession *m) -{ - dout(10) << "handle_client_session " << *m << endl; - int from = m->get_source().num(); - - switch (m->op) { - case MClientSession::OP_OPEN: - assert(mds_sessions.count(from) == 0); - mds_sessions[from] = 0; - break; - - case MClientSession::OP_CLOSE: - mds_sessions.erase(from); - // FIXME: kick requests (hard) so that they are redirected. or fail. 
- break; - - default: - assert(0); - } - - // kick waiting threads - for (list::iterator p = waiting_for_session[from].begin(); - p != waiting_for_session[from].end(); - ++p) - (*p)->Signal(); - waiting_for_session.erase(from); - - delete m; -} - - -void Client::send_request(MetaRequest *request, int mds) -{ - MClientRequest *r = request->request; - if (!r) { - // make a new one - dout(10) << "send_request rebuilding request " << request->tid - << " for mds" << mds << endl; - r = new MClientRequest; - r->copy_payload(request->request_payload); - r->decode_payload(); - r->set_retry_attempt(request->retry_attempt); - } - request->request = 0; - - dout(10) << "send_request " << *r << " to mds" << mds << endl; - messenger->send_message(r, mdsmap->get_inst(mds), MDS_PORT_SERVER); - - request->mds.insert(mds); -} - -void Client::handle_client_request_forward(MClientRequestForward *fwd) -{ - tid_t tid = fwd->get_tid(); - - if (mds_requests.count(tid) == 0) { - dout(10) << "handle_client_request_forward no pending request on tid " << tid << endl; - delete fwd; - return; - } - - MetaRequest *request = mds_requests[tid]; - assert(request); - - // reset retry counter - request->retry_attempt = 0; - - if (request->idempotent && - mds_sessions.count(fwd->get_dest_mds())) { - // dest mds has a session, and request was forwarded for us. - - // note new mds set. - if (request->num_fwd < fwd->get_num_fwd()) { - // there are now exactly two mds's whose failure should trigger a resend - // of this request. - request->mds.clear(); - request->mds.insert(fwd->get_source().num()); - request->mds.insert(fwd->get_dest_mds()); - request->num_fwd = fwd->get_num_fwd(); - dout(10) << "handle_client_request tid " << tid - << " fwd " << fwd->get_num_fwd() - << " to mds" << fwd->get_dest_mds() - << ", mds set now " << request->mds - << endl; - } else { - dout(10) << "handle_client_request tid " << tid - << " previously forwarded to mds" << fwd->get_dest_mds() - << ", mds still " << request->mds - << endl; - } - } else { - // request not forwarded, or dest mds has no session. - // resend. 
- dout(10) << "handle_client_request tid " << tid - << " fwd " << fwd->get_num_fwd() - << " to mds" << fwd->get_dest_mds() - << ", non-idempotent, resending to " << fwd->get_dest_mds() - << endl; - - request->mds.clear(); - request->num_fwd = fwd->get_num_fwd(); - request->resend_mds = fwd->get_dest_mds(); - request->caller_cond->Signal(); - } - - delete fwd; -} - -void Client::handle_client_reply(MClientReply *reply) -{ - tid_t tid = reply->get_tid(); - - if (mds_requests.count(tid) == 0) { - dout(10) << "handle_client_reply no pending request on tid " << tid << endl; - delete reply; - return; - } - MetaRequest *request = mds_requests[tid]; - assert(request); - - // store reply - request->reply = reply; - - // wake up waiter - request->caller_cond->Signal(); - - // wake for kick back - Cond cond; - request->dispatch_cond = &cond; - while (mds_requests.count(tid)) { - dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl; - cond.Wait(client_lock); - } -} - - -// ------------------------ -// incoming messages - -void Client::dispatch(Message *m) -{ - client_lock.Lock(); - - switch (m->get_type()) { - // osd - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - objecter->handle_osd_map((class MOSDMap*)m); - mount_cond.Signal(); - break; - - // mounting and mds sessions - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - break; - case MSG_CLIENT_UNMOUNT: - handle_unmount(m); - break; - case MSG_CLIENT_SESSION: - handle_client_session((MClientSession*)m); - break; - - // requests - case MSG_CLIENT_REQUEST_FORWARD: - handle_client_request_forward((MClientRequestForward*)m); - break; - case MSG_CLIENT_REPLY: - handle_client_reply((MClientReply*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_file_caps((MClientFileCaps*)m); - break; - - - - default: - cout << "dispatch doesn't recognize message type " << m->get_type() << endl; - assert(0); // fail loudly - break; - } - - // unmounting? - if (unmounting) { - dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size() - << "+" << inode_map.size() << endl; - trim_cache(); - if (lru.lru_get_size() == 0 && inode_map.empty()) { - dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << endl; - mount_cond.Signal(); - } else { - dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size() - << "+" << inode_map.size() << endl; - dump_cache(); - } - } - - client_lock.Unlock(); -} - - -void Client::handle_mds_map(MMDSMap* m) -{ - int frommds = -1; - if (m->get_source().is_mds()) - frommds = m->get_source().num(); - - if (mdsmap == 0) { - mdsmap = new MDSMap; - - assert(m->get_source().is_mon()); - whoami = m->get_dest().num(); - dout(1) << "handle_mds_map i am now " << m->get_dest() << endl; - messenger->reset_myname(m->get_dest()); - - mount_cond.Signal(); // mount might be waiting for this. - } - - dout(1) << "handle_mds_map epoch " << m->get_epoch() << endl; - epoch_t was = mdsmap->get_epoch(); - mdsmap->decode(m->get_encoded()); - assert(mdsmap->get_epoch() >= was); - - // send reconnect? - if (frommds >= 0 && - mdsmap->get_state(frommds) == MDSMap::STATE_RECONNECT) { - send_reconnect(frommds); - } - - // kick requests? 
- if (frommds >= 0 && - mdsmap->get_state(frommds) == MDSMap::STATE_ACTIVE) { - kick_requests(frommds); - //failed_mds.erase(from); - } - - delete m; -} - -void Client::send_reconnect(int mds) -{ - dout(10) << "send_reconnect to mds" << mds << endl; - - MClientReconnect *m = new MClientReconnect; - - if (mds_sessions.count(mds)) { - // i have an open session. - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - if (p->second->caps.count(mds)) { - dout(10) << " caps on " << p->first - << " " << cap_string(p->second->caps[mds].caps) - << " wants " << cap_string(p->second->file_caps_wanted()) - << endl; - p->second->caps[mds].seq = 0; // reset seq. - m->add_inode_caps(p->first, // ino - p->second->file_caps_wanted(), // wanted - p->second->caps[mds].caps, // issued - p->second->inode.size, p->second->inode.mtime, p->second->inode.atime); - string path; - p->second->make_path(path); - dout(10) << " path on " << p->first << " is " << path << endl; - m->add_inode_path(p->first, path); - } - if (p->second->stale_caps.count(mds)) { - dout(10) << " clearing stale caps on " << p->first << endl; - p->second->stale_caps.erase(mds); // hrm, is this right? - } - } - - // reset my cap seq number - mds_sessions[mds] = 0; - } else { - dout(10) << " i had no session with this mds"; - m->closed = true; - } - - messenger->send_message(m, mdsmap->get_inst(mds), MDS_PORT_SERVER); -} - - -void Client::kick_requests(int mds) -{ - dout(10) << "kick_requests for mds" << mds << endl; - - for (map::iterator p = mds_requests.begin(); - p != mds_requests.end(); - ++p) - if (p->second->mds.count(mds)) { - p->second->retry_attempt++; // inc retry counter - send_request(p->second, mds); - } -} - - -/**** - * caps - */ - - -class C_Client_ImplementedCaps : public Context { - Client *client; - MClientFileCaps *msg; - Inode *in; -public: - C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : client(c), msg(m), in(i) {} - void finish(int r) { - client->implemented_caps(msg,in); - } -}; - -/** handle_file_caps - * handle caps update from mds. including mds to mds caps transitions. - * do not block. - */ -void Client::handle_file_caps(MClientFileCaps *m) -{ - int mds = m->get_source().num(); - Inode *in = 0; - if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ]; - - m->clear_payload(); // for if/when we send back to MDS - - // note push seq increment - assert(mds_sessions.count(mds)); - mds_sessions[mds]++; - - // reap? - if (m->get_op() == MClientFileCaps::OP_REAP) { - int other = m->get_mds(); - - if (in && in->stale_caps.count(other)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << endl; - - // fresh from new mds? - if (!in->caps.count(mds)) { - if (in->caps.empty()) in->get(); - in->caps[mds].seq = m->get_seq(); - in->caps[mds].caps = m->get_caps(); - } - - assert(in->stale_caps.count(other)); - in->stale_caps.erase(other); - if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in - - // fall-thru! - } else { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << endl; - // delay! - cap_reap_queue[in->ino()][other] = m; - return; - } - } - - assert(in); - - // stale? 
- if (m->get_op() == MClientFileCaps::OP_STALE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << endl; - - // move to stale list - assert(in->caps.count(mds)); - if (in->stale_caps.empty()) in->get(); - in->stale_caps[mds] = in->caps[mds]; - - assert(in->caps.count(mds)); - in->caps.erase(mds); - if (in->caps.empty()) in->put(); - - // delayed reap? - if (cap_reap_queue.count(in->ino()) && - cap_reap_queue[in->ino()].count(mds)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << endl; - - // process delayed reap - handle_file_caps( cap_reap_queue[in->ino()][mds] ); - - cap_reap_queue[in->ino()].erase(mds); - if (cap_reap_queue[in->ino()].empty()) - cap_reap_queue.erase(in->ino()); - } - delete m; - return; - } - - // release? - if (m->get_op() == MClientFileCaps::OP_RELEASE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl; - assert(in->caps.count(mds)); - in->caps.erase(mds); - for (map::iterator p = in->caps.begin(); - p != in->caps.end(); - p++) - dout(20) << " left cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << endl; - for (map::iterator p = in->stale_caps.begin(); - p != in->stale_caps.end(); - p++) - dout(20) << " left stale cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << endl; - - if (in->caps.empty()) { - //dout(0) << "did put_inode" << endl; - put_inode(in); - } else { - //dout(0) << "didn't put_inode" << endl; - } - delete m; - return; - } - - - // don't want? - if (in->file_caps_wanted() == 0) { - dout(5) << "handle_file_caps on ino " << m->get_ino() - << " seq " << m->get_seq() - << " " << cap_string(m->get_caps()) - << ", which we don't want caps for, releasing." << endl; - m->set_caps(0); - m->set_wanted(0); - messenger->send_message(m, m->get_source_inst(), MDS_PORT_LOCKER); - return; - } - - assert(in->caps.count(mds)); - - // update per-mds caps - const int old_caps = in->caps[mds].caps; - const int new_caps = m->get_caps(); - in->caps[mds].caps = new_caps; - in->caps[mds].seq = m->get_seq(); - dout(5) << "handle_file_caps on in " << m->get_ino() - << " mds" << mds << " seq " << m->get_seq() - << " caps now " << cap_string(new_caps) - << " was " << cap_string(old_caps) << endl; - - // did file size decrease? - if ((old_caps & (CAP_FILE_RD|CAP_FILE_WR)) == 0 && - (new_caps & (CAP_FILE_RD|CAP_FILE_WR)) != 0 && - in->inode.size > m->get_inode().size) { - dout(10) << "*** file size decreased from " << in->inode.size << " to " << m->get_inode().size << endl; - - // trim filecache? - if (g_conf.client_oc) - in->fc.truncate(in->inode.size, m->get_inode().size); - - in->inode.size = in->file_wr_size = m->get_inode().size; - } - - // update inode - in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! - - // preserve our (possibly newer) file size, mtime - if (in->file_wr_size > in->inode.size) - m->get_inode().size = in->inode.size = in->file_wr_size; - if (in->file_wr_mtime > in->inode.mtime) - m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime; - - - - if (g_conf.client_oc) { - // caching on, use FileCache. - Context *onimplement = 0; - if (old_caps & ~new_caps) { // this mds is revoking caps - if (in->fc.get_caps() & ~(in->file_caps())) // net revocation - onimplement = new C_Client_ImplementedCaps(this, m, in); - else { - implemented_caps(m, in); // ack now. 
- } - } - in->fc.set_caps(new_caps, onimplement); - } else { - // caching off. - - // wake up waiters? - if (new_caps & CAP_FILE_RD) { - for (list::iterator it = in->waitfor_read.begin(); - it != in->waitfor_read.end(); - it++) { - dout(5) << "signaling read waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_read.clear(); - } - if (new_caps & CAP_FILE_WR) { - for (list::iterator it = in->waitfor_write.begin(); - it != in->waitfor_write.end(); - it++) { - dout(5) << "signaling write waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_write.clear(); - } - if (new_caps & CAP_FILE_LAZYIO) { - for (list::iterator it = in->waitfor_lazy.begin(); - it != in->waitfor_lazy.end(); - it++) { - dout(5) << "signaling lazy waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_lazy.clear(); - } - - // ack? - if (old_caps & ~new_caps) { - if (in->sync_writes) { - // wait for sync writes to finish - dout(5) << "sync writes in progress, will ack on finish" << endl; - in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in)); - } else { - // ok now - implemented_caps(m, in); - } - } else { - // discard - delete m; - } - } -} - -void Client::implemented_caps(MClientFileCaps *m, Inode *in) -{ - dout(5) << "implemented_caps " << cap_string(m->get_caps()) - << ", acking to " << m->get_source() << endl; - - if (in->file_caps() == 0) { - in->file_wr_mtime = utime_t(); - in->file_wr_size = 0; - } - - messenger->send_message(m, m->get_source_inst(), MDS_PORT_LOCKER); -} - - -void Client::release_caps(Inode *in, - int retain) -{ - dout(5) << "releasing caps on ino " << in->inode.ino << dec - << " had " << cap_string(in->file_caps()) - << " retaining " << cap_string(retain) - << " want " << cap_string(in->file_caps_wanted()) - << endl; - - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - //if (it->second.caps & ~retain) { - if (1) { - // release (some of?) these caps - it->second.caps = retain & it->second.caps; - // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_ACK, - in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } - } - - if (in->file_caps() == 0) { - in->file_wr_mtime = utime_t(); - in->file_wr_size = 0; - } -} - -void Client::update_caps_wanted(Inode *in) -{ - dout(5) << "updating caps wanted on ino " << in->inode.ino - << " to " << cap_string(in->file_caps_wanted()) - << endl; - - // FIXME: pick a single mds and let the others off the hook.. 
- for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_ACK, - in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, - mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } -} - - - -// ------------------- -// MOUNT - -void Client::_try_mount() -{ - dout(10) << "_try_mount" << endl; - int mon = monmap->pick_mon(); - dout(2) << "sending client_mount to mon" << mon << endl; - messenger->send_message(new MClientMount(messenger->get_myaddr()), - monmap->get_inst(mon)); - - // schedule timeout - assert(mount_timeout_event == 0); - mount_timeout_event = new C_MountTimeout(this); - timer.add_event_after(g_conf.client_mount_timeout, mount_timeout_event); -} - -void Client::_mount_timeout() -{ - dout(10) << "_mount_timeout" << endl; - mount_timeout_event = 0; - _try_mount(); -} - -int Client::mount() -{ - client_lock.Lock(); - assert(!mounted); // caller is confused? - assert(!mdsmap); - - _try_mount(); - - while (!mdsmap || - !osdmap || - osdmap->get_epoch() == 0) - mount_cond.Wait(client_lock); - - timer.cancel_event(mount_timeout_event); - mount_timeout_event = 0; - - mounted = true; - - dout(2) << "mounted: have osdmap " << osdmap->get_epoch() - << " and mdsmap " << mdsmap->get_epoch() - << endl; - - client_lock.Unlock(); - - /* - dout(3) << "op: // client trace data structs" << endl; - dout(3) << "op: struct stat st;" << endl; - dout(3) << "op: struct utimbuf utim;" << endl; - dout(3) << "op: int readlinkbuf_len = 1000;" << endl; - dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl; - dout(3) << "op: map dir_contents;" << endl; - dout(3) << "op: map open_files;" << endl; - dout(3) << "op: fh_t fh;" << endl; - */ - return 0; -} - - -// UNMOUNT - - -int Client::unmount() -{ - client_lock.Lock(); - - assert(mounted); // caller is confused? - - dout(2) << "unmounting" << endl; - unmounting = true; - - // NOTE: i'm assuming all caches are already flushing (because all files are closed). - assert(fh_map.empty()); - - // empty lru cache - lru.lru_set_max(0); - trim_cache(); - - if (g_conf.client_oc) { - // release any/all caps - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - Inode *in = p->second; - if (!in->caps.empty()) { - in->fc.release_clean(); - if (in->fc.is_dirty()) { - dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << endl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } else { - dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << endl; - release_caps(in); - } - } - } - } - - while (lru.lru_get_size() > 0 || - !inode_map.empty()) { - dout(2) << "cache still has " << lru.lru_get_size() - << "+" << inode_map.size() << " items" - << ", waiting (presumably for safe or for caps to be released?)" - << endl; - dump_cache(); - mount_cond.Wait(client_lock); - } - assert(lru.lru_get_size() == 0); - assert(inode_map.empty()); - - // unsafe writes - if (!g_conf.client_oc) { - while (unsafe_sync_write > 0) { - dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting" - << endl; - mount_cond.Wait(client_lock); - } - } - - // send session closes! 
- for (map::iterator p = mds_sessions.begin(); - p != mds_sessions.end(); - ++p) { - dout(2) << "sending client_session close to mds" << p->first << " seq " << p->second << endl; - messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_CLOSE, - p->second), - mdsmap->get_inst(p->first), MDS_PORT_SERVER); - } - - // send unmount! - int mon = monmap->pick_mon(); - dout(2) << "sending client_unmount to mon" << mon << endl; - messenger->send_message(new MClientUnmount(messenger->get_myinst()), - monmap->get_inst(mon)); - - while (mounted) - mount_cond.Wait(client_lock); - - dout(2) << "unmounted." << endl; - - client_lock.Unlock(); - return 0; -} - -void Client::handle_unmount(Message* m) -{ - dout(1) << "handle_unmount got ack" << endl; - - mounted = false; - - delete mdsmap; - mdsmap = 0; - - mount_cond.Signal(); - - delete m; -} - - - -// namespace ops - -int Client::link(const char *existing, const char *newname) -{ - client_lock.Lock(); - dout(3) << "op: client->link(\"" << existing << "\", \"" << newname << "\");" << endl; - tout << "link" << endl; - tout << existing << endl; - tout << newname << endl; - - - // main path arg is new link name - // sarg is target (existing file) - - - MClientRequest *req = new MClientRequest(MDS_OP_LINK, messenger->get_myinst()); - req->set_path(newname); - req->set_sarg(existing); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - - insert_trace(reply); - delete reply; - dout(10) << "link result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - -int Client::unlink(const char *relpath) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->unlink\(\"" << path << "\");" << endl; - tout << "unlink" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "unlink result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::rename(const char *relfrom, const char *relto) -{ - client_lock.Lock(); - - string absfrom; - mkabspath(relfrom, absfrom); - const char *from = absfrom.c_str(); - string absto; - mkabspath(relto, absto); - const char *to = absto.c_str(); - - dout(3) << "op: client->rename(\"" << from << "\", \"" << to << "\");" << endl; - tout << "rename" << endl; - tout << from << endl; - tout << to << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, messenger->get_myinst()); - req->set_path(from); - req->set_sarg(to); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
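// ----------------------------------------------------------------------
// link(), unlink() and rename() above all share one round trip: build a
// request, stamp it with the caller's uid/gid, send it to the mds, read
// the result out of the reply, refresh the local cache via insert_trace()
// and free the reply.  The sketch below shows just that shape; the Stub*
// types, send_to_mds() and the opcode value are stand-ins for the real
// MClientRequest/MClientReply machinery, not the actual classes.
#include <iostream>
#include <memory>
#include <string>
#include <sys/types.h>
#include <unistd.h>

struct StubRequest {                    // stand-in for MClientRequest
  int op;
  std::string path, sarg;
  uid_t uid;
  gid_t gid;
};
struct StubReply {                      // stand-in for MClientReply
  int result;
};

// stand-in for make_request(): pretend the mds always answers "ok"
static std::unique_ptr<StubReply> send_to_mds(const StubRequest&) {
  return std::unique_ptr<StubReply>(new StubReply{0});
}

// the shared shape of every path-based metadata op in Client.cc
static int path_op(int op, const std::string& path, const std::string& sarg = "") {
  StubRequest req{op, path, sarg, getuid(), getgid()};
  std::unique_ptr<StubReply> reply = send_to_mds(req);  // blocks in the real client
  int res = reply->result;
  // the real client would call insert_trace(reply) and trim_cache() here
  return res;
}

int main() {
  const int OP_RENAME = 1;              // illustrative opcode only
  std::cout << "rename -> " << path_op(OP_RENAME, "/a/old", "/a/new") << std::endl;
  return 0;
}
// ----------------------------------------------------------------------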
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "rename result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -// dirs - -int Client::mkdir(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->mkdir(\"" << path << "\", " << mode << ");" << endl; - tout << "mkdir" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, messenger->get_myinst()); - req->set_path(path); - req->args.mkdir.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "mkdir result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::rmdir(const char *relpath) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->rmdir(\"" << path << "\");" << endl; - tout << "rmdir" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - if (dn->inode->dir && dn->inode->dir->is_empty()) - close_dir(dn->inode->dir); // FIXME: maybe i shoudl proactively hose the whole subtree from cache? - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "rmdir result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -// symlinks - -int Client::symlink(const char *reltarget, const char *rellink) -{ - client_lock.Lock(); - - string abstarget; - mkabspath(reltarget, abstarget); - const char *target = abstarget.c_str(); - string abslink; - mkabspath(rellink, abslink); - const char *link = abslink.c_str(); - - dout(3) << "op: client->symlink(\"" << target << "\", \"" << link << "\");" << endl; - tout << "symlink" << endl; - tout << target << endl; - tout << link << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, messenger->get_myinst()); - req->set_path(link); - req->set_sarg(target); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); //FIXME assuming trace of link, not of target - delete reply; - dout(10) << "symlink result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::readlink(const char *relpath, char *buf, off_t size) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl; - tout << "readlink" << endl; - tout << path << endl; - client_lock.Unlock(); - - // stat first (FIXME, PERF access cache directly) **** - struct stat stbuf; - int r = this->lstat(path, &stbuf); - if (r != 0) return r; - - client_lock.Lock(); - - // pull symlink content from cache - Inode *in = inode_map[stbuf.st_ino]; - assert(in); // i just did a stat - - // copy into buf (at most size bytes) - unsigned res = in->symlink->length(); - if (res > size) res = size; - memcpy(buf, in->symlink->c_str(), res); - - trim_cache(); - client_lock.Unlock(); - return res; // return length in bytes (to mimic the system call) -} - - - -// inode stuff - -int Client::_lstat(const char *path, int mask, Inode **in) -{ - MClientRequest *req = 0; - filepath fpath(path); - - // check whether cache content is fresh enough - int res = 0; - - Dentry *dn = lookup(fpath); - inode_t inode; - utime_t now = g_clock.real_now(); - if (dn && - now <= dn->inode->valid_until && - ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) { - inode = dn->inode->inode; - dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl; - - if (g_conf.client_cache_stat_ttl == 0) - dn->inode->valid_until = utime_t(); // only one stat allowed after each readdir - - *in = dn->inode; - } else { - // FIXME where does FUSE maintain user information - //struct fuse_context *fc = fuse_get_context(); - //req->set_caller_uid(fc->uid); - //req->set_caller_gid(fc->gid); - - req = new MClientRequest(MDS_OP_LSTAT, messenger->get_myinst()); - req->args.stat.mask = mask; - req->set_path(fpath); - - MClientReply *reply = make_request(req); - res = reply->get_result(); - dout(10) << "lstat res is " << res << endl; - if (res == 0) { - //Transfer information from reply to stbuf - inode = reply->get_inode(); - - //Update metadata cache - *in = insert_trace(reply); - } - - delete reply; - - if (res != 0) - *in = 0; // not a success. - } - - return res; -} - - -void Client::fill_stat(inode_t& inode, struct stat *st) -{ - memset(st, 0, sizeof(struct stat)); - st->st_ino = inode.ino; - st->st_mode = inode.mode; - st->st_nlink = inode.nlink; - st->st_uid = inode.uid; - st->st_gid = inode.gid; - st->st_ctime = MAX(inode.ctime, inode.mtime); - st->st_atime = inode.atime; - st->st_mtime = inode.mtime; - st->st_size = inode.size; - st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; -} - -void Client::fill_statlite(inode_t& inode, struct statlite *st) -{ - memset(st, 0, sizeof(struct stat)); - st->st_ino = inode.ino; - st->st_mode = inode.mode; - st->st_nlink = inode.nlink; - st->st_uid = inode.uid; - st->st_gid = inode.gid; -#ifndef DARWIN - // FIXME what's going on here with darwin? - st->st_ctime = MAX(inode.ctime, inode.mtime); - st->st_atime = inode.atime; - st->st_mtime = inode.mtime; -#endif - st->st_size = inode.size; - st->st_blocks = inode.size ? 
((inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; - - /* - S_REQUIREBLKSIZE(st->st_litemask); - if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask); - if (inode.mask & INODE_MASK_SIZE) { - S_REQUIRESIZE(st->st_litemask); - S_REQUIREBLOCKS(st->st_litemask); - } - if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask); - if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask); - */ -} - - -int Client::lstat(const char *relpath, struct stat *stbuf) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl; - tout << "lstat" << endl; - tout << path << endl; - - Inode *in = 0; - - int res = _lstat(path, INODE_MASK_ALL_STAT, &in); - if (res == 0) { - assert(in); - fill_stat(in->inode,stbuf); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << stbuf->st_ino << endl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} - - -int Client::lstatlite(const char *relpath, struct statlite *stl) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << endl; - tout << "lstatlite" << endl; - tout << path << endl; - - // make mask - // FIXME. - int mask = INODE_MASK_BASE | INODE_MASK_AUTH; - if (S_ISVALIDSIZE(stl->st_litemask) || - S_ISVALIDBLOCKS(stl->st_litemask)) mask |= INODE_MASK_SIZE; - if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_FILE; - if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_FILE; - - Inode *in = 0; - int res = _lstat(path, mask, &in); - - if (res == 0) { - fill_statlite(in->inode,stl); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << in->inode.ino << endl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - -int Client::chmod(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->chmod(\"" << path << "\", " << mode << ");" << endl; - tout << "chmod" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, messenger->get_myinst()); - req->set_path(path); - req->args.chmod.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chmod result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::chown(const char *relpath, uid_t uid, gid_t gid) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->chown(\"" << path << "\", " << uid << ", " << gid << ");" << endl; - tout << "chown" << endl; - tout << path << endl; - tout << uid << endl; - tout << gid << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, messenger->get_myinst()); - req->set_path(path); - req->args.chown.uid = uid; - req->args.chown.gid = gid; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
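// ----------------------------------------------------------------------
// fill_stat()/fill_statlite() above derive st_blocks from the byte size
// with a ceiling division on a 4096-byte unit:
//   blocks = size ? (size - 1) / 4096 + 1 : 0
// so 0 bytes -> 0 blocks, 1..4096 bytes -> 1 block, 4097 bytes -> 2, etc.
// (This is the client's own 4 KB accounting; classic POSIX st_blocks is
// expressed in 512-byte units.)
#include <cassert>
#include <cstdint>

static uint64_t blocks_for(uint64_t size, uint64_t blksize = 4096) {
  return size ? (size - 1) / blksize + 1 : 0;
}

int main() {
  assert(blocks_for(0)    == 0);
  assert(blocks_for(1)    == 1);
  assert(blocks_for(4096) == 1);
  assert(blocks_for(4097) == 2);
  return 0;
}
// ----------------------------------------------------------------------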
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chown result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::utime(const char *relpath, struct utimbuf *buf) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: utim.actime = " << buf->actime << "; utim.modtime = " << buf->modtime << ";" << endl; - dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl; - tout << "utime" << endl; - tout << path << endl; - tout << buf->actime << endl; - tout << buf->modtime << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_UTIME, messenger->get_myinst()); - req->set_path(path); - req->args.utime.mtime.tv_sec = buf->modtime; - req->args.utime.mtime.tv_usec = 0; - req->args.utime.atime.tv_sec = buf->actime; - req->args.utime.atime.tv_usec = 0; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "utime result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - -int Client::mknod(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->mknod(\"" << path << "\", " << mode << ");" << endl; - tout << "mknod" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, messenger->get_myinst()); - req->set_path(path); - req->args.mknod.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - - dout(10) << "mknod result is " << res << endl; - - delete reply; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - - -//readdir usually include inode info for each entry except of locked entries - -// -// getdir - -// fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino); - -int Client::getdir(const char *relpath, map& contents) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->getdir(\"" << path << "\", dir_contents);" << endl; - tout << "getdir" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - - if (res == 0) { - - // dir contents to cache! - inodeno_t ino = reply->get_ino(); - Inode *diri = inode_map[ ino ]; - assert(diri); - assert(diri->inode.mode & INODE_MODE_DIR); - - // add . and ..? - string dot("."); - contents[dot] = diri->inode; - if (diri != root) { - string dotdot(".."); - contents[dotdot] = diri->dn->dir->parent_inode->inode; - } - - // the rest? 
- if (!reply->get_dir_in().empty()) { - // only open dir if we're actually adding stuff to it! - Dir *dir = diri->open_dir(); - assert(dir); - utime_t now = g_clock.real_now(); - - list::const_iterator pdn = reply->get_dir_dn().begin(); - for (list::const_iterator pin = reply->get_dir_in().begin(); - pin != reply->get_dir_in().end(); - ++pin, ++pdn) { - // ignore . - if (*pdn == ".") - continue; - - // count entries - res++; - - // put in cache - Inode *in = this->insert_inode(dir, *pin, *pdn); - - if (g_conf.client_cache_stat_ttl) { - in->valid_until = now; - in->valid_until += g_conf.client_cache_stat_ttl; - } - else if (g_conf.client_cache_readdir_ttl) { - in->valid_until = now; - in->valid_until += g_conf.client_cache_readdir_ttl; - } - - // contents to caller too! - contents[*pdn] = in->inode; - } - if (dir->is_empty()) - close_dir(dir); - } - - // FIXME: remove items in cache that weren't in my readdir? - // *** - } - - delete reply; //fix thing above first - - client_lock.Unlock(); - return res; -} - - -/** POSIX stubs **/ - -DIR *Client::opendir(const char *name) -{ - DirResult *d = new DirResult; - d->size = getdir(name, d->contents); - d->p = d->contents.begin(); - d->off = 0; - return (DIR*)d; -} - -int Client::closedir(DIR *dir) -{ - DirResult *d = (DirResult*)dir; - delete d; - return 0; -} - -//struct dirent { -// ino_t d_ino; /* inode number */ -// off_t d_off; /* offset to the next dirent */ -// unsigned short d_reclen; /* length of this record */ -// unsigned char d_type; /* type of file */ -// char d_name[256]; /* filename */ -//}; - -struct dirent *Client::readdir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; -#ifndef __CYGWIN__ -#ifndef DARWIN - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) -#endif // DARWIN -#endif - - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - // move up - ++d->off; - ++d->p; - - return &d->dp.d_dirent; -} - -void Client::rewinddir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - d->p = d->contents.begin(); - d->off = 0; -} - -off_t Client::telldir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - return d->off; -} - -void Client::seekdir(DIR *dirp, off_t offset) -{ - DirResult *d = (DirResult*)dirp; - - d->p = d->contents.begin(); - d->off = 0; - - if (offset >= d->size) offset = d->size-1; - while (offset > 0) { - ++d->p; - ++d->off; - --offset; - } -} - -struct dirent_plus *Client::readdirplus(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; -#ifndef __CYGWIN__ -#ifndef DARWIN - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) 
-#endif // DARWIN -#endif - - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - // plus - if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { - // have it - fill_stat(d->p->second, &d->dp.d_stat); - d->dp.d_stat_err = 0; - } else { - // don't have it, stat it - string path = d->path; - path += "/"; - path += d->p->first; - d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat); - } - - // move up - ++d->off; - ++d->p; - - return &d->dp; -} - -/* -struct dirent_lite *Client::readdirlite(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) - - // plus - if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { - // have it - fill_statlite(d->p->second,d->dp.d_stat); - d->dp.d_stat_err = 0; - } else { - // don't have it, stat it - string path = p->path; - path += "/"; - path += p->first; - d->dp.d_statlite - d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite); - } - - // move up - ++d->off; - ++d->p; - - return &d->dp; -} -*/ - - - - - - -/****** file i/o **********/ - -int Client::open(const char *relpath, int flags, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl; - tout << "open" << endl; - tout << path << endl; - tout << flags << endl; - - // go - MClientRequest *req = new MClientRequest(MDS_OP_OPEN, messenger->get_myinst()); - req->set_path(path); - req->args.open.flags = flags; - req->args.open.mode = mode; - - int cmode = req->get_open_file_mode(); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - - assert(reply); - dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl; - tout << reply->get_result() << endl; - - insert_trace(reply); - int result = reply->get_result(); - - // success? - fh_t fh = 0; - if (result >= 0) { - // yay - Fh *f = new Fh; - f->mode = cmode; - - // inode - f->inode = inode_map[reply->get_ino()]; - assert(f->inode); - f->inode->get(); - - if (cmode & FILE_MODE_R) f->inode->num_open_rd++; - if (cmode & FILE_MODE_W) f->inode->num_open_wr++; - if (cmode & FILE_MODE_LAZY) f->inode->num_open_lazy++; - - // caps included? - int mds = reply->get_source().num(); - - if (f->inode->caps.empty()) {// first caps? 
- dout(7) << " first caps on " << f->inode->inode.ino << endl; - f->inode->get(); - } - - int new_caps = reply->get_file_caps(); - - assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq); - if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) { - int old_caps = f->inode->caps[mds].caps; - - dout(7) << "open got caps " << cap_string(new_caps) - << " (had " << cap_string(old_caps) << ")" - << " for " << f->inode->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << endl; - - f->inode->caps[mds].caps = new_caps; - f->inode->caps[mds].seq = reply->get_file_caps_seq(); - - // we shouldn't ever lose caps at this point. - // actually, we might...? - assert((old_caps & ~f->inode->caps[mds].caps) == 0); - - if (g_conf.client_oc) - f->inode->fc.set_caps(new_caps); - - } else { - dout(7) << "open got SAME caps " << cap_string(new_caps) - << " for " << f->inode->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << endl; - } - - // put in map - result = fh = get_fh(); - assert(fh_map.count(fh) == 0); - fh_map[fh] = f; - - dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl; - } else { - dout(0) << "open failure result " << result << endl; - } - - delete reply; - - trim_cache(); - client_lock.Unlock(); - - return result; -} - - - - - -void Client::close_release(Inode *in) -{ - dout(10) << "close_release on " << in->ino() << endl; - dout(10) << " wr " << in->num_open_wr << " rd " << in->num_open_rd - << " dirty " << in->fc.is_dirty() << " cached " << in->fc.is_cached() << endl; - - if (!in->num_open_rd) - in->fc.release_clean(); - - int retain = 0; - if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_WREXTEND; - if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE; - - release_caps(in, retain); // release caps now. -} - -void Client::close_safe(Inode *in) -{ - dout(10) << "close_safe on " << in->ino() << endl; - put_inode(in); - if (unmounting) - mount_cond.Signal(); -} - -int Client::close(fh_t fh) -{ - client_lock.Lock(); - dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl; - dout(3) << "op: open_files.erase( " << fh << " );" << endl; - tout << "close" << endl; - tout << fh << endl; - - // get Fh, Inode - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - // update inode rd/wr counts - int before = in->file_caps_wanted(); - if (f->mode & FILE_MODE_R) - in->num_open_rd--; - if (f->mode & FILE_MODE_W) - in->num_open_wr--; - int after = in->file_caps_wanted(); - - // does this change what caps we want? - if (before != after && after) - update_caps_wanted(in); - - // hose fh - fh_map.erase(fh); - delete f; - - // release caps right away? - dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << endl; - - if (g_conf.client_oc) { - // caching on. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - in->fc.empty(new C_Client_CloseRelease(this, in)); - } - else if (in->num_open_rd == 0) { - in->fc.release_clean(); - close_release(in); - } - else if (in->num_open_wr == 0) { - in->fc.flush_dirty(new C_Client_CloseRelease(this,in)); - } - - // pin until safe? - if (in->num_open_wr == 0 && !in->fc.all_safe()) { - dout(10) << "pinning ino " << in->ino() << " until safe" << endl; - in->get(); - in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in)); - } - } else { - // caching off. 
- if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(10) << " releasing caps on " << in->ino() << endl; - release_caps(in); // release caps now. - } - } - - put_inode( in ); - int result = 0; - - client_lock.Unlock(); - return result; -} - - - -// ------------ -// read, write - - -off_t Client::lseek(fh_t fh, off_t offset, int whence) -{ - client_lock.Lock(); - dout(3) << "op: client->lseek(" << fh << ", " << offset << ", " << whence << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - switch (whence) { - case SEEK_SET: - f->pos = offset; - break; - - case SEEK_CUR: - f->pos += offset; - break; - - case SEEK_END: - f->pos = in->inode.size + offset; - break; - - default: - assert(0); - } - - off_t pos = f->pos; - client_lock.Unlock(); - - return pos; -} - - - -void Client::lock_fh_pos(Fh *f) -{ - dout(10) << "lock_fh_pos " << f << endl; - - if (f->pos_locked || !f->pos_waiters.empty()) { - Cond cond; - f->pos_waiters.push_back(&cond); - dout(10) << "lock_fh_pos BLOCKING on " << f << endl; - while (f->pos_locked || f->pos_waiters.front() != &cond) - cond.Wait(client_lock); - dout(10) << "lock_fh_pos UNBLOCKING on " << f << endl; - assert(f->pos_waiters.front() == &cond); - f->pos_waiters.pop_front(); - } - - f->pos_locked = true; -} - -void Client::unlock_fh_pos(Fh *f) -{ - dout(10) << "unlock_fh_pos " << f << endl; - f->pos_locked = false; -} - - -// blocking osd interface - -int Client::read(fh_t fh, char *buf, off_t size, off_t offset) -{ - client_lock.Lock(); - - dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << "); // that's " << offset << "~" << size << endl; - tout << "read" << endl; - tout << fh << endl; - tout << size << endl; - tout << offset << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - bool movepos = false; - if (offset < 0) { - lock_fh_pos(f); - offset = f->pos; - movepos = true; - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - // determine whether read range overlaps with file - // ...ONLY if we're doing async io - if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) { - // we're doing buffered i/o. make sure we're inside the file. - // we can trust size info bc we get accurate info when buffering/caching caps are issued. - dout(10) << "file size: " << in->inode.size << endl; - if (offset > 0 && offset >= in->inode.size) { - if (movepos) unlock_fh_pos(f); - client_lock.Unlock(); - return 0; - } - if (offset + size > (off_t)in->inode.size) - size = (off_t)in->inode.size - offset; - - if (size == 0) { - dout(10) << "read is size=0, returning 0" << endl; - if (movepos) unlock_fh_pos(f); - client_lock.Unlock(); - return 0; - } - } else { - // unbuffered, synchronous file i/o. - // or lazy. - // defer to OSDs for file bounds. - } - - bufferlist blist; // data will go here - int r = 0; - int rvalue = 0; - - if (g_conf.client_oc) { - // object cache ON - rvalue = r = in->fc.read(offset, size, blist, client_lock); // may block. - } else { - // object cache OFF -- legacy inconsistent way. - - // do we have read file cap? - while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) { - dout(7) << " don't have read cap, waiting" << endl; - Cond cond; - in->waitfor_read.push_back(&cond); - cond.Wait(client_lock); - } - // lazy cap? 
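// ----------------------------------------------------------------------
// lock_fh_pos() and the cap-wait loops above share one pattern: park the
// caller on a per-object queue of condition variables and Wait() under the
// single client_lock until it is this caller's turn.  Below is a
// self-contained FIFO version of that hand-off; std::mutex and
// std::condition_variable are stand-ins for common/Mutex.h and
// common/Cond.h, and the wake-up on release is spelled out explicitly.
#include <condition_variable>
#include <list>
#include <mutex>

struct PosLock {
  bool locked = false;
  std::list<std::condition_variable*> waiters;     // FIFO of blocked callers

  void acquire(std::unique_lock<std::mutex>& hold) {
    if (locked || !waiters.empty()) {
      std::condition_variable cond;
      waiters.push_back(&cond);
      // sleep until the lock is free *and* we are at the head of the queue
      while (locked || waiters.front() != &cond)
        cond.wait(hold);
      waiters.pop_front();
    }
    locked = true;
  }

  void release() {
    locked = false;
    if (!waiters.empty())
      waiters.front()->notify_one();               // hand off to the next waiter
  }
};

int main() {
  std::mutex client_lock;                          // plays the role of client_lock
  PosLock pl;
  std::unique_lock<std::mutex> hold(client_lock);
  pl.acquire(hold);                                // uncontended: returns at once
  pl.release();
  return 0;
}
// ----------------------------------------------------------------------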
- while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // do sync read - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - Objecter::OSDRead *rd = filer->prepare_read(in->inode, offset, size, &blist); - if (in->hack_balance_reads || - g_conf.client_hack_balance_reads) - rd->balance_reads = true; - r = objecter->readx(rd, onfinish); - assert(r >= 0); - - // wait! - while (!done) - cond.Wait(client_lock); - } - - if (movepos) { - // adjust fd pos - f->pos = offset+blist.length(); - unlock_fh_pos(f); - } - - // copy data into caller's char* buf - blist.copy(0, blist.length(), buf); - - //dout(10) << "i read '" << blist.c_str() << "'" << endl; - dout(10) << "read rvalue " << rvalue << ", r " << r << endl; - - // done! - client_lock.Unlock(); - return rvalue; -} - - - -/* - * hack -- - * until we properly implement synchronous writes wrt buffer cache, - * make sure we delay shutdown until they're all safe on disk! - */ -class C_Client_HackUnsafe : public Context { - Client *cl; -public: - C_Client_HackUnsafe(Client *c) : cl(c) {} - void finish(int) { - cl->hack_sync_write_safe(); - } -}; - -void Client::hack_sync_write_safe() -{ - client_lock.Lock(); - assert(unsafe_sync_write > 0); - unsafe_sync_write--; - if (unsafe_sync_write == 0 && unmounting) { - dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << endl; - mount_cond.Signal(); - } - client_lock.Unlock(); -} - -int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) -{ - client_lock.Lock(); - - //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl; - dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl; - tout << "write" << endl; - tout << fh << endl; - tout << size << endl; - tout << offset << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - // use/adjust fd pos? - if (offset < 0) { - lock_fh_pos(f); - offset = f->pos; - f->pos = offset+size; - unlock_fh_pos(f); - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl; - - // time it. - utime_t start = g_clock.real_now(); - - // copy into fresh buffer (since our write may be resub, async) - bufferptr bp = buffer::copy(buf, size); - bufferlist blist; - blist.push_back( bp ); - - if (g_conf.client_oc) { // buffer cache ON? - assert(objectcacher); - - // write (this may block!) - in->fc.write(offset, size, blist, client_lock); - - } else { - // legacy, inconsistent synchronous write. - dout(7) << "synchronous write" << endl; - - // do we have write file cap? 
- while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) { - dout(7) << " don't have write cap, waiting" << endl; - Cond cond; - in->waitfor_write.push_back(&cond); - cond.Wait(client_lock); - } - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // prepare write - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done); - C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this); - unsafe_sync_write++; - in->sync_writes++; - - dout(20) << " sync write start " << onfinish << endl; - - filer->write(in->inode, offset, size, blist, 0, - onfinish, onsafe - //, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots - ); - - while (!done) { - cond.Wait(client_lock); - dout(20) << " sync write bump " << onfinish << endl; - } - - in->sync_writes--; - if (in->sync_writes == 0 && - !in->waitfor_no_write.empty()) { - for (list::iterator i = in->waitfor_no_write.begin(); - i != in->waitfor_no_write.end(); - i++) - (*i)->finish(0); - in->waitfor_no_write.clear(); - } - - dout(20) << " sync write done " << onfinish << endl; - } - - // time - utime_t lat = g_clock.real_now(); - lat -= start; - if (client_logger) { - client_logger->finc("wrlsum",(double)lat); - client_logger->inc("wrlnum"); - } - - // assume success for now. FIXME. - off_t totalwritten = size; - - // extend file? - if (totalwritten + offset > in->inode.size) { - in->inode.size = in->file_wr_size = totalwritten + offset; - dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl; - } else { - dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << endl; - } - - // mtime - in->file_wr_mtime = in->inode.mtime = g_clock.real_now(); - - // ok! - client_lock.Unlock(); - return totalwritten; -} - - -int Client::truncate(const char *file, off_t length) -{ - client_lock.Lock(); - dout(3) << "op: client->truncate(\"" << file << "\", " << length << ");" << endl; - tout << "truncate" << endl; - tout << file << endl; - tout << length << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); - req->set_path(file); - req->args.truncate.length = length; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(10) << " truncate result is " << res << endl; - - client_lock.Unlock(); - return res; -} - - -int Client::fsync(fh_t fh, bool syncdataonly) -{ - client_lock.Lock(); - dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl; - tout << "fsync" << endl; - tout << fh << endl; - tout << syncdataonly << endl; - - int r = 0; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl; - - // metadata? - if (!syncdataonly) { - dout(0) << "fsync - not syncing metadata yet.. implement me" << endl; - } - - // data? - Cond cond; - bool done = false; - if (!objectcacher->commit_set(in->ino(), - new C_Cond(&cond, &done))) { - // wait for callback - while (!done) cond.Wait(client_lock); - } - - client_lock.Unlock(); - return r; -} - - -// not written yet, but i want to link! 
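// ----------------------------------------------------------------------
// The unsafe_sync_write / C_Client_HackUnsafe machinery above is a small
// barrier: every synchronous write bumps a counter, the "safe" completion
// from the osd decrements it, and unmount() (or a flusher) waits on a
// condition variable until the counter reaches zero.  The stand-alone
// version below models that barrier, with std::thread standing in for the
// osd acks; the names are illustrative, not the real classes.
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

struct UnsafeWriteBarrier {
  std::mutex lock;
  std::condition_variable cond;
  int in_flight = 0;

  void start_write() {
    std::lock_guard<std::mutex> l(lock);
    ++in_flight;
  }
  void write_safe() {                              // called from the ack path
    std::lock_guard<std::mutex> l(lock);
    if (--in_flight == 0)
      cond.notify_all();                           // let unmount/fsync proceed
  }
  void wait_all_safe() {
    std::unique_lock<std::mutex> l(lock);
    cond.wait(l, [this] { return in_flight == 0; });
  }
};

int main() {
  UnsafeWriteBarrier b;
  std::vector<std::thread> acks;
  for (int i = 0; i < 4; i++) {
    b.start_write();
    acks.emplace_back([&b] { b.write_safe(); });   // simulated osd "safe" ack
  }
  b.wait_all_safe();                               // what unmount() waits on
  for (std::thread& t : acks) t.join();
  return 0;
}
// ----------------------------------------------------------------------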
- -int Client::chdir(const char *path) -{ - // fake it for now! - string abs; - mkabspath(path, abs); - dout(3) << "chdir " << path << " -> cwd now " << abs << endl; - cwd = abs; - return 0; -} - -int Client::statfs(const char *path, struct statvfs *stbuf) -{ - bzero (stbuf, sizeof (struct statvfs)); - // FIXME - stbuf->f_bsize = 1024; - stbuf->f_frsize = 1024; - stbuf->f_blocks = 1024 * 1024; - stbuf->f_bfree = 1024 * 1024; - stbuf->f_bavail = 1024 * 1024; - stbuf->f_files = 1024 * 1024; - stbuf->f_ffree = 1024 * 1024; - stbuf->f_favail = 1024 * 1024; - stbuf->f_namemax = 1024; - - return 0; -} - - -int Client::lazyio_propogate(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_propogate(" << fd - << ", " << offset << ", " << count << ")" << endl; - - assert(fh_map.count(fd)); - Fh *f = fh_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - Cond cond; - bool done = false; - in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done)); - - while (!done) - cond.Wait(client_lock); - - } else { - // mmm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - -int Client::lazyio_synchronize(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_synchronize(" << fd - << ", " << offset << ", " << count << ")" << endl; - - assert(fh_map.count(fd)); - Fh *f = fh_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - in->fc.flush_dirty(0); // flush to invalidate. - in->fc.release_clean(); - } else { - // mm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - - -// ========================================= -// layout - - -int Client::describe_layout(int fh, FileLayout *lp) -{ - client_lock.Lock(); - dout(3) << "op: client->describe_layout(" << fh << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - *lp = in->inode.layout; - - client_lock.Unlock(); - return 0; -} - -int Client::get_stripe_unit(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.stripe_unit; -} - -int Client::get_stripe_width(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.stripe_width(); -} - -int Client::get_stripe_period(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.period(); -} - -int Client::enumerate_layout(int fh, list& result, - off_t length, off_t offset) -{ - client_lock.Lock(); - dout(3) << "op: client->enumerate_layout(" << fh << ", " << length << ", " << offset << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - // map to a list of extents - filer->file_to_extents(in->inode, offset, length, result); - - client_lock.Unlock(); - return 0; -} - - -void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (dest.is_mon()) { - // resend to a different monitor. 
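// ----------------------------------------------------------------------
// get_stripe_unit()/get_stripe_width()/get_stripe_period() above expose a
// RAID-0-style layout: the file is cut into stripe_unit-sized blocks that
// are dealt round-robin across a set of objects, and one "period" is the
// amount of data that fills a whole object set.  The helper below is a
// simplified model of that offset -> object mapping, assuming those three
// parameters; it is not the real Filer::file_to_extents() implementation.
#include <cstdint>
#include <iostream>

struct SimpleLayout {
  uint64_t stripe_unit;     // bytes written to one object before moving on
  uint32_t stripe_count;    // how many objects a stripe spans
  uint64_t object_size;     // max bytes stored per object

  uint64_t period() const { return object_size * stripe_count; }

  // which object (numbered from 0) a byte offset lands in
  uint64_t object_for(uint64_t off) const {
    uint64_t block      = off / stripe_unit;           // stripe-unit index
    uint64_t stripe_no  = block / stripe_count;        // row within the layout
    uint64_t stripe_pos = block % stripe_count;        // column within the row
    uint64_t su_per_obj = object_size / stripe_unit;   // rows per object set
    uint64_t object_set = stripe_no / su_per_obj;
    return object_set * stripe_count + stripe_pos;
  }
};

int main() {
  SimpleLayout l{65536, 4, 4 * 1024 * 1024};           // example values only
  std::cout << "period = " << l.period() << "\n"
            << "offset 0      -> object " << l.object_for(0) << "\n"
            << "offset 65536  -> object " << l.object_for(65536) << "\n"
            << "offset 262144 -> object " << l.object_for(262144) << "\n";
  return 0;
}
// ----------------------------------------------------------------------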
- int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << *m << " to " << inst - << ", resending to mon" << mon - << endl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - objecter->ms_handle_failure(m, dest, inst); - } - else if (dest.is_mds()) { - dout(0) << "ms_handle_failure " << *m << " to " << inst << endl; - //failed_mds.insert(dest.num()); - } - else { - // client? - dout(0) << "ms_handle_failure " << *m << " to " << inst << ", dropping" << endl; - delete m; - } -} - diff --git a/branches/sage/pgs/client/Client.h b/branches/sage/pgs/client/Client.h deleted file mode 100644 index c0a44fe473296..0000000000000 --- a/branches/sage/pgs/client/Client.h +++ /dev/null @@ -1,682 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CLIENT_H -#define __CLIENT_H - - -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Message.h" -#include "msg/Dispatcher.h" -#include "msg/Messenger.h" -#include "msg/SerialMessenger.h" - -#include "messages/MClientReply.h" - -#include "include/types.h" -#include "include/lru.h" -#include "include/filepath.h" -#include "include/interval_set.h" - -#include "common/Mutex.h" -#include "common/Timer.h" - -#include "FileCache.h" - - -// stl -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -class MClientSession; -class MClientRequest; -class MClientRequestForward; - -class Filer; -class Objecter; -class ObjectCacher; - -extern class LogType client_logtype; -extern class Logger *client_logger; - - - -// ============================================ -// types for my local metadata cache -/* basic structure: - - - Dentries live in an LRU loop. they get expired based on last access. - see include/lru.h. items can be bumped to "mid" or "top" of list, etc. - - Inode has ref count for each Fh, Dir, or Dentry that points to it. - - when Inode ref goes to 0, it's expired. - - when Dir is empty, it's removed (and it's Inode ref--) - -*/ - -typedef int fh_t; - -class Dir; -class Inode; - -class Dentry : public LRUObject { - public: - string name; // sort of lame - //const char *name; - Dir *dir; - Inode *inode; - int ref; // 1 if there's a dir beneath me. - - void get() { assert(ref == 0); ref++; lru_pin(); } - void put() { assert(ref == 1); ref--; lru_unpin(); } - - Dentry() : dir(0), inode(0), ref(0) { } - - /*Dentry() : name(0), dir(0), inode(0), ref(0) { } - Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { - name = new char[n.length()+1]; - strcpy((char*)name, n.c_str()); - } - ~Dentry() { - delete[] name; - }*/ -}; - -class Dir { - public: - Inode *parent_inode; // my inode - //hash_map, eqstr> dentries; - hash_map dentries; - - Dir(Inode* in) { parent_inode = in; } - - bool is_empty() { return dentries.empty(); } -}; - - -class InodeCap { - public: - int caps; - long seq; - InodeCap() : caps(0), seq(0) {} -}; - - -class Inode { - public: - inode_t inode; // the actual inode - utime_t valid_until; - - // about the dir (if this is one!) 
- int dir_auth; - set dir_contacts; - bool dir_hashed, dir_replicated; - - // per-mds caps - map caps; // mds -> InodeCap - map stale_caps; // mds -> cap .. stale - - utime_t file_wr_mtime; // [writers] time of last write - off_t file_wr_size; // [writers] largest offset we've written to - int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers - - int ref; // ref count. 1 for each dentry, fh that links to me. - Dir *dir; // if i'm a dir. - Dentry *dn; // if i'm linked to a dentry. - string *symlink; // symlink content, if it's a symlink - - // for caching i/o mode - FileCache fc; - - // for sync i/o mode - int sync_reads; // sync reads in progress - int sync_writes; // sync writes in progress - - list waitfor_write; - list waitfor_read; - list waitfor_lazy; - list waitfor_no_read, waitfor_no_write; - - // - bool hack_balance_reads; - // - - void make_path(string& p) { - if (dn) { - if (dn->dir && dn->dir->parent_inode) - dn->dir->parent_inode->make_path(p); - p += "/"; - p += dn->name; - } - } - - void get() { - ref++; - //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl; - } - void put() { - ref--; assert(ref >= 0); - //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl; - } - - Inode(inode_t _inode, ObjectCacher *_oc) : - inode(_inode), - valid_until(0, 0), - dir_auth(-1), dir_hashed(false), dir_replicated(false), - file_wr_mtime(0, 0), file_wr_size(0), - num_open_rd(0), num_open_wr(0), num_open_lazy(0), - ref(0), dir(0), dn(0), symlink(0), - fc(_oc, _inode), - sync_reads(0), sync_writes(0), - hack_balance_reads(false) - { } - ~Inode() { - if (symlink) { delete symlink; symlink = 0; } - } - - inodeno_t ino() { return inode.ino; } - - bool is_dir() { - return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; - } - - int file_caps() { - int c = 0; - for (map::iterator it = caps.begin(); - it != caps.end(); - it++) - c |= it->second.caps; - for (map::iterator it = stale_caps.begin(); - it != stale_caps.end(); - it++) - c |= it->second.caps; - return c; - } - - int file_caps_wanted() { - int w = 0; - if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; - if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; - if (num_open_lazy) w |= CAP_FILE_LAZYIO; - if (fc.is_dirty()) w |= CAP_FILE_WRBUFFER; - if (fc.is_cached()) w |= CAP_FILE_RDCACHE; - return w; - } - - int authority(MDSMap *mdsmap) { - //cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl; - // parent? - if (dn && dn->dir && dn->dir->parent_inode) { - // parent hashed? - if (dn->dir->parent_inode->dir_hashed) { - // hashed - assert(0); - // fixme - //return mdcluster->hash_dentry( dn->dir->parent_inode->ino(), - //dn->name ); - } - - if (dir_auth >= 0) - return dir_auth; - else - return dn->dir->parent_inode->authority(mdsmap); - } - - if (dir_auth >= 0) - return dir_auth; - - assert(0); // !!! - return 0; - } - int dentry_authority(const char *dn, - MDSMap *mdsmap) { - assert(0); - return 0; - //return ->hash_dentry( ino(), - //dn ); - } - int pick_replica(MDSMap *mdsmap) { - // replicas? - if (ino() > 1ULL && dir_contacts.size()) { - //cout << "dir_contacts if " << dir_contacts << endl; - set::iterator it = dir_contacts.begin(); - if (dir_contacts.size() == 1) - return *it; - else { - int r = rand() % dir_contacts.size(); - while (r--) it++; - return *it; - } - } - - if (dir_replicated || ino() == 1) { - //cout << "num_mds is " << mdcluster->get_num_mds() << endl; - return rand() % mdsmap->get_num_mds(); // huh.. pick a random mds! 
- } - else - return authority(mdsmap); - } - - - // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). - Dir *open_dir() { - if (!dir) { - if (dn) dn->get(); // pin dentry - get(); // pin inode - dir = new Dir(this); - } - return dir; - } - -}; - - - - -// file handle for any open file state - -struct Fh { - Inode *inode; - off_t pos; - int mds; // have to talk to mds we opened with (for now) - int mode; // the mode i opened the file with - - bool is_lazy() { return mode & O_LAZY; } - - bool pos_locked; // pos is currently in use - list pos_waiters; // waiters for pos - - Fh() : inode(0), pos(0), mds(0), mode(0), pos_locked(false) {} -}; - - - - - -// ======================================================== -// client interface - -class Client : public Dispatcher { - public: - - /* getdir result */ - struct DirResult { - string path; - map contents; - map::iterator p; - int off; - int size; - struct dirent_plus dp; - struct dirent_lite dl; - DirResult() : p(contents.end()), off(-1), size(0) {} - }; - - - // cluster descriptors - MDSMap *mdsmap; - OSDMap *osdmap; - - SafeTimer timer; - - protected: - Messenger *messenger; - int whoami; - MonMap *monmap; - - // mds sessions - map mds_sessions; // mds -> push seq - map > waiting_for_session; - - void handle_client_session(MClientSession *m); - void send_reconnect(int mds); - - // mds requests - struct MetaRequest { - tid_t tid; - MClientRequest *request; - bufferlist request_payload; // in case i have to retry - - bool idempotent; // is request idempotent? - set mds; // who i am asking - int resend_mds; // someone wants you to (re)send the request here - int num_fwd; // # of times i've been forwarded - int retry_attempt; - - MClientReply *reply; // the reply - - Cond *caller_cond; // who to take up - Cond *dispatch_cond; // who to kick back - - MetaRequest(MClientRequest *req, tid_t t) : - tid(t), request(req), - idempotent(false), resend_mds(-1), num_fwd(0), retry_attempt(0), - reply(0), - caller_cond(0), dispatch_cond(0) { } - }; - tid_t last_tid; - map mds_requests; - set failed_mds; - - MClientReply *make_request(MClientRequest *req, int use_auth=-1); - int choose_target_mds(MClientRequest *req); - void send_request(MetaRequest *request, int mds); - void kick_requests(int mds); - void handle_client_request_forward(MClientRequestForward *reply); - void handle_client_reply(MClientReply *reply); - - bool mounted; - bool unmounting; - Cond mount_cond; - - int unsafe_sync_write; -public: - entity_name_t get_myname() { return messenger->get_myname(); } - void hack_sync_write_safe(); - -protected: - Filer *filer; - ObjectCacher *objectcacher; - Objecter *objecter; // (non-blocking) osd interface - - // cache - hash_map inode_map; - Inode* root; - LRU lru; // lru list of Dentry's in our local metadata cache. - - // cap weirdness - map > cap_reap_queue; // ino -> mds -> msg .. set of (would-be) stale caps to reap - - - // file handles, etc. - string cwd; - interval_set free_fh_set; // unused fh's - hash_map fh_map; - - fh_t get_fh() { - fh_t fh = free_fh_set.start(); - free_fh_set.erase(fh, 1); - return fh; - } - void put_fh(fh_t fh) { - free_fh_set.insert(fh, 1); - } - - void mkabspath(const char *rel, string& abs) { - if (rel[0] == '/') { - abs = rel; - } else { - abs = cwd; - abs += "/"; - abs += rel; - } - } - - - // global client lock - // - protects Client and buffer cache both! - Mutex client_lock; - - - // -- metadata cache stuff - - // decrease inode ref. delete if dangling. 
- void put_inode(Inode *in) { - in->put(); - if (in->ref == 0) { - inode_map.erase(in->inode.ino); - if (in == root) root = 0; - delete in; - } - } - - void close_dir(Dir *dir) { - assert(dir->is_empty()); - - Inode *in = dir->parent_inode; - if (in->dn) in->dn->put(); // unpin dentry - - delete in->dir; - in->dir = 0; - put_inode(in); // unpin inode - } - - int get_cache_size() { return lru.lru_get_size(); } - void set_cache_size(int m) { lru.lru_set_max(m); } - - Dentry* link(Dir *dir, const string& name, Inode *in) { - Dentry *dn = new Dentry; - dn->name = name; - - // link to dir - dn->dir = dir; - //cout << "link dir " << dir->parent_inode->inode.ino << " '" << name << "' -> inode " << in->inode.ino << endl; - dir->dentries[dn->name] = dn; - - // link to inode - dn->inode = in; - in->dn = dn; - in->get(); - - if (in->dir) dn->get(); // dir -> dn pin - - lru.lru_insert_mid(dn); // mid or top? - return dn; - } - - void unlink(Dentry *dn) { - Inode *in = dn->inode; - - // unlink from inode - if (dn->inode->dir) dn->put(); // dir -> dn pin - dn->inode = 0; - in->dn = 0; - put_inode(in); - - // unlink from dir - dn->dir->dentries.erase(dn->name); - if (dn->dir->is_empty()) - close_dir(dn->dir); - dn->dir = 0; - - // delete den - lru.lru_remove(dn); - delete dn; - } - - Dentry *relink(Dir *dir, const string& name, Inode *in) { - Dentry *olddn = in->dn; - Dir *olddir = olddn->dir; // note: might == dir! - - // newdn, attach to inode. don't touch inode ref. - Dentry *newdn = new Dentry; - newdn->name = name; - newdn->inode = in; - newdn->dir = dir; - in->dn = newdn; - - if (in->dir) { // dir -> dn pin - newdn->get(); - olddn->put(); - } - - // unlink old dn from dir - olddir->dentries.erase(olddn->name); - olddn->inode = 0; - olddn->dir = 0; - lru.lru_remove(olddn); - - // link new dn to dir - dir->dentries[name] = newdn; - lru.lru_insert_mid(newdn); - - // olddir now empty? (remember, olddir might == dir) - if (olddir->is_empty()) - close_dir(olddir); - - return newdn; - } - - // move dentry to top of lru - void touch_dn(Dentry *dn) { lru.lru_touch(dn); } - - // trim cache. - void trim_cache(); - void dump_inode(Inode *in, set& did); - void dump_cache(); // debug - - // find dentry based on filepath - Dentry *lookup(filepath& path); - - void fill_stat(inode_t& inode, struct stat *st); - void fill_statlite(inode_t& inode, struct statlite *st); - - - // friends - friend class SyntheticClient; - - public: - Client(Messenger *m, MonMap *mm); - ~Client(); - void tear_down_cache(); - - int get_nodeid() { return whoami; } - - void init(); - void shutdown(); - - // messaging - void dispatch(Message *m); - - void handle_unmount(Message*); - void handle_mds_map(class MMDSMap *m); - - // file caps - void handle_file_caps(class MClientFileCaps *m); - void implemented_caps(class MClientFileCaps *m, Inode *in); - void release_caps(Inode *in, int retain=0); - void update_caps_wanted(Inode *in); - - void close_release(Inode *in); - void close_safe(Inode *in); - - void lock_fh_pos(Fh *f); - void unlock_fh_pos(Fh *f); - - // metadata cache - Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); - void update_inode_dist(Inode *in, InodeStat *st); - Inode* insert_trace(MClientReply *reply); - - // ---------------------- - // fs ops. 
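// ----------------------------------------------------------------------
// The cache bookkeeping above (link/unlink/put_inode/close_dir) is manual
// reference counting: each dentry that points at an inode holds one
// reference, and dropping the last reference erases the inode from the
// ino -> inode map and deletes it.  The miniature version below keeps just
// that invariant; MiniInode/MiniCache are simplified stand-ins, not the
// real Inode/Dentry/Dir types.
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct MiniInode {
  uint64_t ino;
  int ref = 0;
  explicit MiniInode(uint64_t i) : ino(i) {}
};

struct MiniCache {
  std::map<uint64_t, MiniInode*> inode_map;        // ino -> inode
  std::map<std::string, MiniInode*> dentries;      // name -> inode

  void link(const std::string& name, MiniInode* in) {
    dentries[name] = in;
    in->ref++;                                     // the dentry pins the inode
  }
  void unlink(const std::string& name) {
    MiniInode* in = dentries[name];
    dentries.erase(name);
    put_inode(in);
  }
  void put_inode(MiniInode* in) {
    if (--in->ref == 0) {                          // last reference: drop it
      inode_map.erase(in->ino);
      delete in;
    }
  }
};

int main() {
  MiniCache c;
  MiniInode* in = new MiniInode(42);
  c.inode_map[42] = in;
  c.link("file", in);
  c.unlink("file");                                // ref hits 0, inode is freed
  assert(c.inode_map.count(42) == 0);
  return 0;
}
// ----------------------------------------------------------------------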
-private: - void _try_mount(); - void _mount_timeout(); - Context *mount_timeout_event; - - class C_MountTimeout : public Context { - Client *client; - public: - C_MountTimeout(Client *c) : client(c) { } - void finish(int r) { - if (r >= 0) client->_mount_timeout(); - } - }; - -public: - int mount(); - int unmount(); - - // these shoud (more or less) mirror the actual system calls. - int statfs(const char *path, struct statvfs *stbuf); - - // crap - int chdir(const char *s); - const string getcwd() { return cwd; } - - // namespace ops - int getdir(const char *path, list& contents); - int getdir(const char *path, map& contents); - - DIR *opendir(const char *name); - int closedir(DIR *dir); - struct dirent *readdir(DIR *dir); - void rewinddir(DIR *dir); - off_t telldir(DIR *dir); - void seekdir(DIR *dir, off_t offset); - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - - int link(const char *existing, const char *newname); - int unlink(const char *path); - int rename(const char *from, const char *to); - - // dirs - int mkdir(const char *path, mode_t mode); - int rmdir(const char *path); - - // symlinks - int readlink(const char *path, char *buf, off_t size); - int symlink(const char *existing, const char *newname); - - // inode stuff - int _lstat(const char *path, int mask, Inode **in); - int lstat(const char *path, struct stat *stbuf); - int lstatlite(const char *path, struct statlite *buf); - - int chmod(const char *path, mode_t mode); - int chown(const char *path, uid_t uid, gid_t gid); - int utime(const char *path, struct utimbuf *buf); - - // file ops - int mknod(const char *path, mode_t mode); - int open(const char *path, int flags, mode_t mode=0); - int close(fh_t fh); - off_t lseek(fh_t fh, off_t offset, int whence); - int read(fh_t fh, char *buf, off_t size, off_t offset=-1); - int write(fh_t fh, const char *buf, off_t size, off_t offset=-1); - int truncate(const char *file, off_t size); - //int truncate(fh_t fh, long long size); - int fsync(fh_t fh, bool syncdataonly); - - - // hpc lazyio - int lazyio_propogate(int fd, off_t offset, size_t count); - int lazyio_synchronize(int fd, off_t offset, size_t count); - - // expose file layout - int describe_layout(int fd, FileLayout* layout); - int get_stripe_unit(int fd); - int get_stripe_width(int fd); - int get_stripe_period(int fd); - int enumerate_layout(int fd, list& result, - off_t length, off_t offset); - - // failure - void ms_handle_failure(Message*, const entity_inst_t& inst); -}; - -#endif diff --git a/branches/sage/pgs/client/FileCache.cc b/branches/sage/pgs/client/FileCache.cc deleted file mode 100644 index 0c5b6b1c9440a..0000000000000 --- a/branches/sage/pgs/client/FileCache.cc +++ /dev/null @@ -1,264 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "config.h" -#include "include/types.h" - -#include "FileCache.h" -#include "osdc/ObjectCacher.h" - -#include "msg/Messenger.h" - -#undef dout -#define dout(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " -#define derr(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " - - -// flush/release/clean - -void FileCache::flush_dirty(Context *onflush) -{ - if (oc->flush_set(inode.ino, onflush)) { - onflush->finish(0); - delete onflush; - } -} - -off_t FileCache::release_clean() -{ - return oc->release_set(inode.ino); -} - -bool FileCache::is_cached() -{ - return oc->set_is_cached(inode.ino); -} - -bool FileCache::is_dirty() -{ - return oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::empty(Context *onempty) -{ - off_t unclean = release_clean(); - bool clean = oc->flush_set(inode.ino, onempty); - assert(!unclean == clean); - - if (clean) { - onempty->finish(0); - delete onempty; - } -} - - -void FileCache::tear_down() -{ - off_t unclean = release_clean(); - if (unclean) { - dout(0) << "tear_down " << unclean << " unclean bytes, purging" << endl; - oc->purge_set(inode.ino); - } -} - -// truncate - -void FileCache::truncate(off_t olds, off_t news) -{ - dout(5) << "truncate " << olds << " -> " << news << endl; - - // map range to objects - list ls; - oc->filer.file_to_extents(inode, news, olds-news, ls); - oc->truncate_set(inode.ino, ls); -} - -// caps - -class C_FC_CheckCaps : public Context { - FileCache *fc; -public: - C_FC_CheckCaps(FileCache *f) : fc(f) {} - void finish(int r) { - fc->check_caps(); - } -}; - -void FileCache::set_caps(int caps, Context *onimplement) -{ - if (onimplement) { - dout(10) << "set_caps setting onimplement context for " << cap_string(caps) << endl; - assert(latest_caps & ~caps); // we should be losing caps. - caps_callbacks[caps].push_back(onimplement); - } - - latest_caps = caps; - check_caps(); - - // kick waiters? (did we gain caps?) - if (can_read() && !waitfor_read.empty()) - for (set::iterator p = waitfor_read.begin(); - p != waitfor_read.end(); - ++p) - (*p)->Signal(); - if (can_write() && !waitfor_write.empty()) - for (set::iterator p = waitfor_write.begin(); - p != waitfor_write.end(); - ++p) - (*p)->Signal(); - -} - -int FileCache::get_used_caps() -{ - int used = 0; - if (num_reading) used |= CAP_FILE_RD; - if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE; - if (num_writing) used |= CAP_FILE_WR; - if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER; - return used; -} - -void FileCache::check_caps() -{ - // calc used - int used = get_used_caps(); - dout(10) << "check_caps used was " << cap_string(used) << endl; - - // try to implement caps? - // BUG? latest_caps, not least caps i've seen? - if ((latest_caps & CAP_FILE_RDCACHE) == 0 && - (used & CAP_FILE_RDCACHE)) - release_clean(); - if ((latest_caps & CAP_FILE_WRBUFFER) == 0 && - (used & CAP_FILE_WRBUFFER)) - flush_dirty(new C_FC_CheckCaps(this)); - - used = get_used_caps(); - dout(10) << "check_caps used now " << cap_string(used) << endl; - - // check callbacks - map >::iterator p = caps_callbacks.begin(); - while (p != caps_callbacks.end()) { - if (used == 0 || (~(p->first) & used) == 0) { - // implemented. 
- dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << endl; - finish_contexts(p->second); - map >::iterator o = p; - p++; - caps_callbacks.erase(o); - } else { - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " not yet implemented" << endl; - p++; - } - } -} - - - -// read/write - -int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - int r = 0; - - // can i read? - while ((latest_caps & CAP_FILE_RD) == 0) { - dout(10) << "read doesn't have RD cap, blocking" << endl; - Cond c; - waitfor_read.insert(&c); - c.Wait(client_lock); - waitfor_read.erase(&c); - } - - // inc reading counter - num_reading++; - - if (latest_caps & CAP_FILE_RDCACHE) { - // read (and block) - Cond cond; - bool done = false; - int rvalue = 0; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - r = oc->file_read(inode, offset, size, &blist, onfinish); - - if (r == 0) { - // block - while (!done) - cond.Wait(client_lock); - r = rvalue; - } else { - // it was cached. - delete onfinish; - } - } else { - r = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock); - } - - // dec reading counter - num_reading--; - - if (num_reading == 0 && !caps_callbacks.empty()) - check_caps(); - - return r; -} - -void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - // can i write - while ((latest_caps & CAP_FILE_WR) == 0) { - dout(10) << "write doesn't have WR cap, blocking" << endl; - Cond c; - waitfor_write.insert(&c); - c.Wait(client_lock); - waitfor_write.erase(&c); - } - - // inc writing counter - num_writing++; - - if (latest_caps & CAP_FILE_WRBUFFER) { // caps buffered write? - // wait? (this may block!) - oc->wait_for_write(size, client_lock); - - // async, caching, non-blocking. - oc->file_write(inode, offset, size, blist); - } else { - // atomic, synchronous, blocking. - oc->file_atomic_sync_write(inode, offset, size, blist, client_lock); - } - - // dec writing counter - num_writing--; - if (num_writing == 0 && !caps_callbacks.empty()) - check_caps(); -} - -bool FileCache::all_safe() -{ - return !oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::add_safe_waiter(Context *c) -{ - bool safe = oc->commit_set(inode.ino, c); - if (safe) { - c->finish(0); - delete c; - } -} diff --git a/branches/sage/pgs/client/FileCache.h b/branches/sage/pgs/client/FileCache.h deleted file mode 100644 index fc4f715691b43..0000000000000 --- a/branches/sage/pgs/client/FileCache.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
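FileCache::read() and write() above park the caller on a Cond until the needed capability bit arrives, and set_caps() signals those waiters when it does. A minimal modern-C++ sketch of that wait/notify loop, using std::condition_variable in place of the Cond/Mutex wrappers:

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex client_lock;
std::condition_variable cap_cond;
int latest_caps = 0;
const int CAP_FILE_RD = 1;

void reader() {
  std::unique_lock<std::mutex> l(client_lock);
  cap_cond.wait(l, [] { return (latest_caps & CAP_FILE_RD) != 0; });  // block until RD is granted
  std::cout << "got RD cap, reading\n";
}

void grant_caps(int caps) {
  { std::lock_guard<std::mutex> l(client_lock); latest_caps = caps; }
  cap_cond.notify_all();   // kick blocked readers/writers, as set_caps() does
}

int main() {
  std::thread t(reader);
  grant_caps(CAP_FILE_RD);
  t.join();
}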
- * - */ - -#ifndef __FILECACHE_H -#define __FILECACHE_H - -#include -using namespace std; - -#include "common/Cond.h" -#include "mds/Capability.h" - -class ObjectCacher; - -class FileCache { - ObjectCacher *oc; - inode_t inode; - - // caps - int latest_caps; - map > caps_callbacks; - - int num_reading; - int num_writing; - //int num_unsafe; - - // waiters - set waitfor_read; - set waitfor_write; - - bool waitfor_release; - - public: - FileCache(ObjectCacher *_oc, inode_t _inode) : - oc(_oc), - inode(_inode), - latest_caps(0), - num_reading(0), num_writing(0),// num_unsafe(0), - waitfor_release(false) {} - ~FileCache() { - tear_down(); - } - - // waiters/waiting - bool can_read() { return latest_caps & CAP_FILE_RD; } - bool can_write() { return latest_caps & CAP_FILE_WR; } - bool all_safe();// { return num_unsafe == 0; } - - void add_safe_waiter(Context *c); - - void truncate(off_t olds, off_t news); - - // ... - void flush_dirty(Context *onflush=0); - off_t release_clean(); - void empty(Context *onempty=0); - bool is_empty() { return !(is_cached() || is_dirty()); } - bool is_cached(); - bool is_dirty(); - - void tear_down(); - - int get_caps() { return latest_caps; } - int get_used_caps(); - void set_caps(int caps, Context *onimplement=0); - void check_caps(); - - int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - -}; - - -#endif diff --git a/branches/sage/pgs/client/SyntheticClient.cc b/branches/sage/pgs/client/SyntheticClient.cc deleted file mode 100644 index a496480d7328e..0000000000000 --- a/branches/sage/pgs/client/SyntheticClient.cc +++ /dev/null @@ -1,1967 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -using namespace std; - - - -#include "SyntheticClient.h" - -#include "include/filepath.h" -#include "mds/mdstypes.h" -#include "common/Logger.h" - -#include -#include -#include -#include -#include -#include - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << g_clock.now() << " synthetic" << client->get_nodeid() << " " - -// traces -//void trace_include(SyntheticClient *syn, Client *cl, string& prefix); -//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); - - -list syn_modes; -list syn_iargs; -list syn_sargs; - -void parse_syn_options(vector& args) -{ - vector nargs; - - for (unsigned i=0; iclient = client; - thread_id = 0; - - did_readdir = false; - - run_only = -1; - exclude = -1; - - this->modes = syn_modes; - this->iargs = syn_iargs; - this->sargs = syn_sargs; - - run_start = g_clock.now(); -} - - - - -#define DBL 2 - -void *synthetic_client_thread_entry(void *ptr) -{ - SyntheticClient *sc = (SyntheticClient*)ptr; - //int r = - sc->run(); - return 0;//(void*)r; -} - -string SyntheticClient::get_sarg(int seq) -{ - string a; - if (!sargs.empty()) { - a = sargs.front(); - sargs.pop_front(); - } - if (a.length() == 0 || a == "~") { - char s[20]; - sprintf(s,"syn.%d.%d", client->whoami, seq); - a = s; - } - //cout << "a is " << a << endl; - return a; -} - -int SyntheticClient::run() -{ - //run_start = g_clock.now(); - run_until = utime_t(0,0); - dout(5) << "run" << endl; - - for (list::iterator it = modes.begin(); - it != modes.end(); - it++) { - int mode = *it; - dout(3) << "mode " << mode << endl; - - switch (mode) { - case SYNCLIENT_MODE_FOO: - if (run_me()) - foo(); - break; - - case SYNCLIENT_MODE_RANDOMSLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - srand(time(0) + getpid() + client->whoami); - sleep(rand() % iarg1); - } - } - break; - - case SYNCLIENT_MODE_SLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "sleep " << iarg1 << endl; - sleep(iarg1); - } - } - break; - - case SYNCLIENT_MODE_ONLY: - { - run_only = iargs.front(); - iargs.pop_front(); - if (run_only == client->get_nodeid()) - dout(2) << "only " << run_only << endl; - } - break; - case SYNCLIENT_MODE_EXCLUDE: - { - exclude = iargs.front(); - iargs.pop_front(); - if (exclude == client->get_nodeid()) - dout(2) << "not running " << exclude << endl; - } - break; - - - case SYNCLIENT_MODE_UNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1) { - dout(2) << "until " << iarg1 << endl; - utime_t dur(iarg1,0); - run_until = run_start + dur; - } else { - dout(2) << "until " << iarg1 << " (no limit)" << endl; - run_until = utime_t(0,0); - } - } - break; - - case SYNCLIENT_MODE_SLEEPUNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1) { - dout(2) << "sleepuntil " << iarg1 << endl; - utime_t at = g_clock.now() - run_start; - if (at.sec() < iarg1) - sleep(iarg1 - at.sec()); - } - } - break; - - case SYNCLIENT_MODE_RANDOMWALK: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "randomwalk " << iarg1 << endl; - random_walk(iarg1); - } - } - break; - - case SYNCLIENT_MODE_MAKEDIRMESS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirmess " << sarg1 << " " << iarg1 << endl; - make_dir_mess(sarg1.c_str(), iarg1); - } - } - break; - case SYNCLIENT_MODE_MAKEDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = 
iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - case SYNCLIENT_MODE_STATDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - case SYNCLIENT_MODE_READDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - - - case SYNCLIENT_MODE_THRASHLINKS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - int iarg4 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "thrashlinks " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - thrash_links(sarg1.c_str(), iarg1, iarg2, iarg3, iarg4); - } - } - break; - - - - case SYNCLIENT_MODE_MAKEFILES: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles " << num << " " << count << " " << priv << endl; - make_files(num, count, priv, false); - } - } - break; - case SYNCLIENT_MODE_MAKEFILES2: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles2 " << num << " " << count << " " << priv << endl; - make_files(num, count, priv, true); - } - } - break; - case SYNCLIENT_MODE_CREATESHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createshared " << num << endl; - create_shared(num); - } - } - break; - case SYNCLIENT_MODE_OPENSHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "openshared " << num << endl; - open_shared(num, count); - } - } - break; - - case SYNCLIENT_MODE_FULLWALK: - { - string sarg1;// = get_sarg(0); - if (run_me()) { - dout(2) << "fullwalk" << sarg1 << endl; - full_walk(sarg1); - } - } - break; - case SYNCLIENT_MODE_REPEATWALK: - { - string sarg1 = get_sarg(0); - if (run_me()) { - dout(2) << "repeatwalk " << sarg1 << endl; - while (full_walk(sarg1) == 0) ; - } - } - break; - - case SYNCLIENT_MODE_WRITEFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - cout << "WRITING SYN CLIENT" << endl; - if (run_me()) - write_file(sarg1, iarg1, iarg2); - } - break; - case SYNCLIENT_MODE_WRSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) - write_file(sarg1, iarg1, iarg2); - } - break; - case SYNCLIENT_MODE_READSHARED: - { - 
string sarg1 = "shared"; - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) - read_file(sarg1, iarg1, iarg2, true); - } - break; - case SYNCLIENT_MODE_WRITEBATCH: - { - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - - if (run_me()) - write_batch(iarg1, iarg2, iarg3); - } - break; - - case SYNCLIENT_MODE_READFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - cout << "READING SYN CLIENT" << endl; - if (run_me()) - read_file(sarg1, iarg1, iarg2); - } - break; - - case SYNCLIENT_MODE_RDWRRANDOM: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - cout << "RANDOM READ WRITE SYN CLIENT" << endl; - if (run_me()) - read_random(sarg1, iarg1, iarg2); - } - break; - - case SYNCLIENT_MODE_RDWRRANDOM_EX: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - cout << "RANDOM READ WRITE SYN CLIENT" << endl; - if (run_me()) - read_random_ex(sarg1, iarg1, iarg2); - } - break; - case SYNCLIENT_MODE_TRACE: - { - string tfile = get_sarg(0); - sargs.push_front(string("~")); - int iarg1 = iargs.front(); iargs.pop_front(); - string prefix = get_sarg(0); - - if (run_me()) { - dout(2) << "trace " << tfile << " prefix " << prefix << " ... " << iarg1 << " times" << endl; - - Trace t(tfile.c_str()); - - client->mkdir(prefix.c_str(), 0755); - - for (int i=0; i 0 - && i < iarg1-1 - ) { - client_logger->finc("trsum", (double)lat); - client_logger->inc("trnum"); - } - } - } - } - break; - - - case SYNCLIENT_MODE_OPENTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - for (int i=0; iopen("test", rand()%2 ? 
(O_WRONLY|O_CREAT):O_RDONLY); - if (fd > 0) client->close(fd); - } - } - } - break; - - case SYNCLIENT_MODE_OPTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->mknod("test",0777); - struct stat st; - for (int i=0; ilstat("test", &st); - client->chmod("test", 0777); - } - } - } - break; - - case SYNCLIENT_MODE_TRUNCATE: - { - string file = get_sarg(0); - sargs.push_front(file); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) - client->truncate(file.c_str(), iarg1); - } - break; - - default: - assert(0); - } - } - return 0; -} - - -int SyntheticClient::start_thread() -{ - assert(!thread_id); - - pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); - assert(thread_id); - return 0; -} - -int SyntheticClient::join_thread() -{ - assert(thread_id); - void *rv; - pthread_join(thread_id, &rv); - return 0; -} - - -bool roll_die(float p) -{ - float r = (float)(rand() % 100000) / 100000.0; - if (r < p) - return true; - else - return false; -} - -void SyntheticClient::init_op_dist() -{ - op_dist.clear(); - op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat ); - op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime ); - op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod ); - op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown ); - - op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir ); - op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod ); - op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link ); - op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink ); - op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename ); - - op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir ); - op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir ); - op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink ); - - op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd ); - //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read ); - //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write ); - op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate ); - op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync ); - op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close() - op_dist.normalize(); -} - -void SyntheticClient::up() -{ - cwd = cwd.prefixpath(cwd.depth()-1); - dout(DBL) << "cd .. 
-> " << cwd << endl; - clear_dir(); -} - - -int SyntheticClient::play_trace(Trace& t, string& prefix) -{ - dout(4) << "play trace" << endl; - t.start(); - - utime_t start = g_clock.now(); - - const char *p = prefix.c_str(); - - map open_files; - - while (!t.end()) { - - if (time_to_stop()) break; - - // op - const char *op = t.get_string(); - dout(4) << "trace op " << op << endl; - if (strcmp(op, "link") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->link(a,b); - } else if (strcmp(op, "unlink") == 0) { - const char *a = t.get_string(p); - client->unlink(a); - } else if (strcmp(op, "rename") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->rename(a,b); - } else if (strcmp(op, "mkdir") == 0) { - const char *a = t.get_string(p); - int64_t b = t.get_int(); - client->mkdir(a, b); - } else if (strcmp(op, "rmdir") == 0) { - const char *a = t.get_string(p); - client->rmdir(a); - } else if (strcmp(op, "symlink") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->symlink(a,b); - } else if (strcmp(op, "readlink") == 0) { - const char *a = t.get_string(p); - char buf[100]; - client->readlink(a, buf, 100); - } else if (strcmp(op, "lstat") == 0) { - struct stat st; - const char *a = t.get_string(p); - client->lstat(a, &st); - } else if (strcmp(op, "chmod") == 0) { - const char *a = t.get_string(p); - int64_t b = t.get_int(); - client->chmod(a, b); - } else if (strcmp(op, "chown") == 0) { - const char *a = t.get_string(p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - client->chown(a, b, c); - } else if (strcmp(op, "utime") == 0) { - const char *a = t.get_string(p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - struct utimbuf u; - u.actime = b; - u.modtime = c; - client->utime(a, &u); - } else if (strcmp(op, "mknod") == 0) { - const char *a = t.get_string(p); - int64_t b = t.get_int(); - client->mknod(a, b); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(p); - map contents; - client->getdir(a, contents); - } else if (strcmp(op, "open") == 0) { - const char *a = t.get_string(p); - int64_t b = t.get_int(); - int64_t id = t.get_int(); - int64_t fh = client->open(a, b); - open_files[id] = fh; - } else if (strcmp(op, "close") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; - if (fh > 0) client->close(fh); - open_files.erase(id); - } else if (strcmp(op, "truncate") == 0) { - const char *a = t.get_string(p); - int64_t b = t.get_int(); - client->truncate(a,b); - } else if (strcmp(op, "read") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; - int size = t.get_int(); - int off = t.get_int(); - char *buf = new char[size]; - client->read(fh, buf, size, off); - delete[] buf; - } else if (strcmp(op, "lseek") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; - int off = t.get_int(); - int whence = t.get_int(); - client->lseek(fh, off, whence); - } else if (strcmp(op, "write") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; - int size = t.get_int(); - int off = t.get_int(); - char *buf = new char[size]; - memset(buf, 1, size); // let's write 1's! 
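play_trace() above replays one operation per trace record and keeps a map from the fd ids recorded in the trace to the fds handed back at replay time, so later read/write/close records find the right descriptor. A small sketch of that id-remapping dispatch loop; the three ops and the whitespace-separated trace format here are simplified, not the Ceph Trace reader.

#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
  std::istringstream trace("open /a 3\nwrite 3 128\nclose 3\n");
  std::map<long, long> open_files;   // trace fd id -> fd obtained at replay time
  long next_fd = 100;                // stand-in for the fd a real open() would return

  std::string line;
  while (std::getline(trace, line)) {
    std::istringstream t(line);
    std::string op; t >> op;
    if (op == "open") {
      std::string path; long id; t >> path >> id;
      open_files[id] = next_fd++;    // remember the mapping for later records
    } else if (op == "write") {
      long id, len; t >> id >> len;
      std::cout << "write " << len << " bytes to fd " << open_files[id] << "\n";
    } else if (op == "close") {
      long id; t >> id;
      open_files.erase(id);
    }
  }
}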
- client->write(fh, buf, size, off); - delete[] buf; - } else if (strcmp(op, "fsync") == 0) { - assert(0); - } else - assert(0); - } - - // close open files - for (map::iterator fi = open_files.begin(); - fi != open_files.end(); - fi++) { - dout(1) << "leftover close " << fi->second << endl; - if (fi->second > 0) client->close(fi->second); - } - - return 0; -} - - -int SyntheticClient::clean_dir(string& basedir) -{ - // read dir - map contents; - int r = client->getdir(basedir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << basedir << " returns " << r << endl; - return r; - } - - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (it->first == ".") continue; - if (it->first == "..") continue; - string file = basedir + "/" + it->first; - - if (time_to_stop()) break; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << endl; - continue; - } - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - clean_dir(file); - client->rmdir(file.c_str()); - } else { - client->unlink(file.c_str()); - } - } - - return 0; - -} - - -int SyntheticClient::full_walk(string& basedir) -{ - if (time_to_stop()) return -1; - - list dirq; - dirq.push_back(basedir); - - while (!dirq.empty()) { - string dir = dirq.front(); - dirq.pop_front(); - - // read dir - map contents; - int r = client->getdir(dir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << dir << " returns " << r << endl; - continue; - } - - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (it->first == ".") continue; - if (it->first == "..") continue; - string file = dir + "/" + it->first; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << endl; - continue; - } - - // print - char *tm = ctime(&st.st_mtime); - tm[strlen(tm)-1] = 0; - printf("%c%c%c%c%c%c%c%c%c%c %2d %5d %5d %8d %12s %s\n", - S_ISDIR(st.st_mode) ? 'd':'-', - (st.st_mode & 0400) ? 'r':'-', - (st.st_mode & 0200) ? 'w':'-', - (st.st_mode & 0100) ? 'x':'-', - (st.st_mode & 040) ? 'r':'-', - (st.st_mode & 020) ? 'w':'-', - (st.st_mode & 010) ? 'x':'-', - (st.st_mode & 04) ? 'r':'-', - (st.st_mode & 02) ? 'w':'-', - (st.st_mode & 01) ? 'x':'-', - (int)st.st_nlink, - st.st_uid, st.st_gid, - (int)st.st_size, - tm, - file.c_str()); - - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - dirq.push_back(file); - } - } - } - - return 0; -} - -int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth) -{ - if (time_to_stop()) return 0; - - // make sure base dir exists - int r = client->mkdir(basedir, 0755); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << endl; - return -1; - } - - // children - char d[500]; - dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; imknod(d, 0644); - } - - if (depth == 0) return 0; - - for (int i=0; ilstat(basedir, &st); - if (r != 0) { - dout(1) << "can't make base dir? 
" << basedir << endl; - return -1; - } - - // children - char d[500]; - dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; ilstat(d, &st); - } - - if (depth == 0) return 0; - - for (int i=0; i contents; - utime_t s = g_clock.now(); - int r = client->getdir(basedir, contents); - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("readdir", e); - if (r < 0) { - dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << endl; - return -1; - } - - for (int i=0; ilstat(d, &st) < 0) { - dout(2) << "read_dirs failed stat on " << d << ", stopping" << endl; - return -1; - } - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("stat", e); - } - - if (depth > 0) - for (int i=0; iget_nodeid(); - char d[255]; - - if (priv) { - for (int c=0; cmkdir(d, 0755); - } - } else { - // shared - if (whoami == 0) { - for (int c=0; cmkdir(d, 0755); - } - } else { - sleep(5); - } - } - - // files - struct stat st; - for (int c=0; cmknod(d, 0644); - - if (more) { - client->lstat(d, &st); - int fd = client->open(d, O_RDONLY); - client->unlink(d); - client->close(fd); - } - - if (time_to_stop()) return 0; - } - } - - return 0; -} - - -int SyntheticClient::create_shared(int num) -{ - // files - char d[255]; - for (int n=0; nmknod(d, 0644); - } - - return 0; -} - -int SyntheticClient::open_shared(int num, int count) -{ - // files - char d[255]; - for (int c=0; c fds; - for (int n=0; nopen(d,O_RDONLY); - fds.push_back(fd); - } - - while (!fds.empty()) { - int fd = fds.front(); - fds.pop_front(); - client->close(fd); - } - } - - return 0; -} - - -int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes -{ - //uint64_t wrsize = 1024*256; - char *buf = new char[wrsize+100]; // 1 MB - memset(buf, 7, wrsize); - uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)wrsize; - - int fd = client->open(fn.c_str(), O_RDWR|O_CREAT); - dout(5) << "writing to " << fn << " fd " << fd << endl; - if (fd < 0) return fd; - - for (unsigned i=0; iget_nodeid(); - p++; - } - - client->write(fd, buf, wrsize, i*wrsize); - } - - client->close(fd); - delete[] buf; - - return 0; -} - -int SyntheticClient::write_batch(int nfile, int size, int wrsize) -{ - for (int i=0; iopen(fn.c_str(), O_RDONLY); - dout(5) << "reading from " << fn << " fd " << fd << endl; - if (fd < 0) return fd; - - for (unsigned i=0; iread(fd, buf, rdsize, i*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << endl; - break; - } - - // verify fingerprint - int bad = 0; - int64_t *p = (int64_t*)buf; - int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - int64_t wantoff = i*rdsize + (int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad && !ignoreprint) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << endl; - bad++; - } - } - if (bad && !ignoreprint) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl; - } - - client->close(fd); - delete[] buf; - - return 0; -} - -int SyntheticClient::read_random(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes -{ - __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize; - - int fd 
= client->open(fn.c_str(), O_RDWR); - dout(5) << "reading from " << fn << " fd " << fd << endl; - - // cout << "READING FROM " << fn << " fd " << fd << endl; - - // cout << "filename " << fn << " size:" << size << " read size|" << rdsize << "|" << "\ chunks: |" << chunks <<"|" << endl; - - if (fd < 0) return fd; - int offset; - char * buf = NULL; - - for (unsigned i=0; i<2000; i++) { - if (time_to_stop()) break; - - bool read=false; - - time_t seconds; - time( &seconds); - srand(seconds); - - // use rand instead ?? - double x = drand48(); - - //cout << "RANDOM NUMBER RETURN |" << x << "|" << endl; - - if ( x < 0.5) - { - //cout << "DECIDED TO READ " << x << endl; - buf = new char[rdsize]; - memset(buf, 1, rdsize); - read=true; - } - else - { - // cout << "DECIDED TO WRITE " << x << endl; - buf = new char[rdsize+100]; // 1 MB - memset(buf, 7, rdsize); - } - - //double y = drand48() ; - - //cout << "OFFSET is |" << offset << "| chunks |" << chunks<< endl; - - if ( read) - { - offset=(rand())%(chunks+1); - dout(2) << "reading block " << offset << "/" << chunks << endl; - - int r = client->read(fd, buf, rdsize, - offset*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << endl; - } - } - else - { - dout(2) << "writing block " << offset << "/" << chunks << endl; - - // fill buf with a 16 byte fingerprint - // 64 bits : file offset - // 64 bits : client id - // = 128 bits (16 bytes) - - //if (true ) - //{ - //int count = rand()%10; - - //for ( int j=0;jget_nodeid(); - p++; - } - - client->write(fd, buf, rdsize, - offset*rdsize); - //} - //} - } - - // verify fingerprint - if ( read ) - { - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << endl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl; - } - } - - client->close(fd); - delete[] buf; - - return 0; -} - - -//#include -//#include - -int normdist(int min, int max, int stdev) /* specifies input values */; -//main() -//{ - // for ( int i=0; i < 10; i++ ) - // normdist ( 0 , 10, 1 ); - -//} - - -int normdist(int min, int max, int stdev) /* specifies input values */ -{ -/* min: Minimum value; max: Maximum value; stdev: degree of deviation */ - -//int min, max, stdev; { - time_t seconds; - time( &seconds); - srand(seconds); - - int range, iterate, result; -/* declare range, iterate and result as integers, to avoid the need for -floating point math*/ - - result = 0; -/* ensure result is initialized to 0 */ - - range = max -min; -/* calculate range of possible values between the max and min values */ - - iterate = range / stdev; -/* this number of iterations ensures the proper shape of the resulting -curve */ - - stdev += 1; /* compensation for integer vs. 
floating point math */ - for (int c = iterate; c != 0; c--) /* loop through iterations */ - { - // result += (uniform (1, 100) * stdev) / 100; /* calculate and - result += ( (rand()%100 + 1) * stdev) / 100; - // printf("result=%d\n", result ); - } - printf("\n final result=%d\n", result ); - return result + min; /* send final result back */ -} -int SyntheticClient::read_random_ex(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes -{ - __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize; - - int fd = client->open(fn.c_str(), O_RDWR); - dout(5) << "reading from " << fn << " fd " << fd << endl; - - // cout << "READING FROM " << fn << " fd " << fd << endl; - - // cout << "filename " << fn << " size:" << size << " read size|" << rdsize << "|" << "\ chunks: |" << chunks <<"|" << endl; - - if (fd < 0) return fd; - int offset; - char * buf = NULL; - - for (unsigned i=0; i<2000; i++) { - if (time_to_stop()) break; - - bool read=false; - - time_t seconds; - time( &seconds); - srand(seconds); - - // use rand instead ?? - double x = drand48(); - - //cout << "RANDOM NUMBER RETURN |" << x << "|" << endl; - - if ( x < 0.5) - { - //cout << "DECIDED TO READ " << x << endl; - buf = new char[rdsize]; - memset(buf, 1, rdsize); - read=true; - } - else - { - // cout << "DECIDED TO WRITE " << x << endl; - buf = new char[rdsize+100]; // 1 MB - memset(buf, 7, rdsize); - } - - //double y = drand48() ; - - //cout << "OFFSET is |" << offset << "| chunks |" << chunks<< endl; - - if ( read) - { - //offset=(rand())%(chunks+1); - - /* if ( chunks > 10000 ) - offset= normdist( 0 , chunks/1000 , 5 )*1000; - else if ( chunks > 1000 ) - offset= normdist( 0 , chunks/100 , 5 )*100; - else if ( chunks > 100 ) - offset= normdist( 0 , chunks/20 , 5 )*20;*/ - - - dout(2) << "reading block " << offset << "/" << chunks << endl; - - int r = client->read(fd, buf, rdsize, - offset*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << endl; - } - } - else - { - dout(2) << "writing block " << offset << "/" << chunks << endl; - - // fill buf with a 16 byte fingerprint - // 64 bits : file offset - // 64 bits : client id - // = 128 bits (16 bytes) - - //if (true ) - //{ - int count = rand()%10; - - for ( int j=0;jget_nodeid(); - p++; - } - - client->write(fd, buf, rdsize, - offset*rdsize); - } - //} - } - - // verify fingerprint - if ( read ) - { - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << endl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl; - } - } - - client->close(fd); - delete[] buf; - - return 0; -} - - -int SyntheticClient::random_walk(int num_req) -{ - int left = num_req; - - //dout(1) << "random_walk() will do " << left << " ops" << endl; - - init_op_dist(); // set up metadata op distribution - - while (left > 0) { - left--; - - if (time_to_stop()) break; - - // ascend? - if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { - dout(DBL) << "die says up" << endl; - up(); - continue; - } - - // descend? 
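normdist() above approximates a normally distributed value by summing several scaled uniform draws, a central-limit-style trick. A hedged reimplementation using <random> follows; the scaling mirrors the deleted code's intent rather than reproducing its integer arithmetic exactly, and the generator is seeded once instead of reseeding with time() on every call.

#include <iostream>
#include <random>

int normdist(int min, int max, int stdev, std::mt19937 &rng) {
  int range = max - min;
  int iterate = range / stdev;                 // more terms -> tighter bell shape
  std::uniform_int_distribution<int> pct(1, 100);
  int result = 0;
  for (int c = 0; c < iterate; ++c)
    result += pct(rng) * (stdev + 1) / 100;    // each term adds a small uniform step
  return result + min;
}

int main() {
  std::mt19937 rng(std::random_device{}());
  for (int i = 0; i < 5; ++i)
    std::cout << normdist(0, 100, 5, rng) << "\n";
}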
- if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { - string s = get_random_subdir(); - cwd.push_dentry( s ); - dout(DBL) << "cd " << s << " -> " << cwd << endl; - clear_dir(); - continue; - } - - int op = 0; - filepath path; - - if (contents.empty() && roll_die(.3)) { - if (did_readdir) { - dout(DBL) << "empty dir, up" << endl; - up(); - } else - op = MDS_OP_READDIR; - } else { - op = op_dist.sample(); - } - //dout(DBL) << "op is " << op << endl; - - int r = 0; - - // do op - if (op == MDS_OP_UNLINK) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->unlink( get_random_sub() ); // will fail on dirs - } - - if (op == MDS_OP_RENAME) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->rename( get_random_sub(), make_sub("ren") ); - } - } - - if (op == MDS_OP_MKDIR) { - r = client->mkdir( make_sub("mkdir"), 0755); - } - - if (op == MDS_OP_RMDIR) { - if (!subdirs.empty()) - r = client->rmdir( get_random_subdir() ); - else - r = client->rmdir( cwd.c_str() ); // will pbly fail - } - - if (op == MDS_OP_SYMLINK) { - } - - if (op == MDS_OP_CHMOD) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->chmod( get_random_sub(), rand() & 0755 ); - } - - if (op == MDS_OP_CHOWN) { - if (contents.empty()) r = client->chown( cwd.c_str(), rand(), rand() ); - else - r = client->chown( get_random_sub(), rand(), rand() ); - } - - if (op == MDS_OP_LINK) { - } - - if (op == MDS_OP_UTIME) { - struct utimbuf b; - memset(&b, 1, sizeof(b)); - if (contents.empty()) - r = client->utime( cwd.c_str(), &b ); - else - r = client->utime( get_random_sub(), &b ); - } - - if (op == MDS_OP_MKNOD) { - r = client->mknod( make_sub("mknod"), 0644); - } - - if (op == MDS_OP_OPEN) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->open( get_random_sub(), O_RDONLY ); - if (r > 0) { - assert(open_files.count(r) == 0); - open_files.insert(r); - } - } - } - - if (op == MDS_OP_RELEASE) { // actually, close - if (open_files.empty()) - op = MDS_OP_STAT; - else { - int fh = get_random_fh(); - r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - } - - if (op == MDS_OP_STAT) { - struct stat st; - if (contents.empty()) { - if (did_readdir) { - if (roll_die(.1)) { - dout(DBL) << "stat in empty dir, up" << endl; - up(); - } else { - op = MDS_OP_MKNOD; - } - } else - op = MDS_OP_READDIR; - } else - r = client->lstat(get_random_sub(), &st); - } - - if (op == MDS_OP_READDIR) { - clear_dir(); - - map c; - r = client->getdir( cwd.c_str(), c ); - - for (map::iterator it = c.begin(); - it != c.end(); - it++) { - //dout(DBL) << " got " << it->first << endl; - contents[it->first] = it->second; - if (it->second.is_dir()) - subdirs.insert(it->first); - } - - did_readdir = true; - } - - // errors? - if (r < 0) { - // reevaluate cwd. - //while (cwd.depth()) { - //if (client->lookup(cwd)) break; // it's in the cache - - //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << endl; - dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." 
<< endl; - up(); - //} - } - } - - // close files - dout(DBL) << "closing files" << endl; - while (!open_files.empty()) { - int fh = get_random_fh(); - int r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - - dout(DBL) << "done" << endl; - return 0; -} - - - - -void SyntheticClient::make_dir_mess(const char *basedir, int n) -{ - vector dirs; - - dirs.push_back(basedir); - dirs.push_back(basedir); - - client->mkdir(basedir, 0755); - - // motivation: - // P(dir) ~ subdirs_of(dir) + 2 - // from 5-year metadata workload paper in fast'07 - - // create dirs - for (int i=0; i> dir; - - // update dirs - dirs.push_back(parent); - dirs.push_back(dir); - dirs.push_back(dir); - - // do it - client->mkdir(dir.c_str(), 0755); - } - - -} - - - -void SyntheticClient::foo() -{ - if (1) { - // open some files - srand(0); - for (int i=0; i<20; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - char src[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - int fd = client->open(src, O_RDONLY); - } - - return; - } - - if (0) { - // rename fun - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/file.%d", d, e, f); - client->rename(src, dst); - } - return; - } - - if (1) { - // link fun - srand(0); - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); - client->link(src, dst); - } - srand(0); - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); - client->unlink(dst); - } - - - return; - } - - // link fun - client->mknod("one", 0755); - client->mknod("two", 0755); - client->link("one", "three"); - client->mkdir("dir", 0755); - client->link("two", "/dir/twolink"); - client->link("dir/twolink", "four"); - - // unlink fun - client->mknod("a", 0644); - client->unlink("a"); - client->mknod("b", 0644); - client->link("b", "c"); - client->unlink("c"); - client->mkdir("d", 0755); - client->unlink("d"); - client->rmdir("d"); - - // rename fun - client->mknod("p1", 0644); - client->mknod("p2", 0644); - client->rename("p1","p2"); - client->mknod("p3", 0644); - client->rename("p3","p4"); - - // check dest dir ambiguity thing - client->mkdir("dir1", 0755); - client->mkdir("dir2", 0755); - client->rename("p2","dir1/p2"); - client->rename("dir1/p2","dir2/p2"); - client->rename("dir2/p2","/p2"); - - // check primary+remote link merging - client->link("p2","p2.l"); - client->link("p4","p4.l"); - client->rename("p2.l","p2"); - client->rename("p4","p4.l"); - - // check anchor updates - client->mknod("dir1/a", 0644); - client->link("dir1/a", "da1"); - client->link("dir1/a", "da2"); - client->link("da2","da3"); - client->rename("dir1/a","dir2/a"); - client->rename("dir2/a","da2"); - client->rename("da1","da2"); - client->rename("da2","da3"); - - // check directory renames - 
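make_dir_mess() above biases new directories toward parents that already have children, P(parent) ~ subdirs(parent) + 2, by keeping each directory in a candidate vector once per unit of weight and sampling that vector uniformly. A small sketch of the weighting trick; the names are illustrative and no directories are actually created.

#include <iostream>
#include <random>
#include <string>
#include <vector>

int main() {
  std::mt19937 rng(42);
  std::vector<std::string> candidates = {"root", "root"};   // every directory starts with weight 2

  for (int i = 0; i < 10; ++i) {
    std::uniform_int_distribution<std::size_t> pick(0, candidates.size() - 1);
    std::string parent = candidates[pick(rng)];
    std::string dir = parent + "/dir." + std::to_string(i);

    candidates.push_back(parent);   // the parent gained a child: weight += 1
    candidates.push_back(dir);      // the new directory enters with weight 2
    candidates.push_back(dir);
    std::cout << "mkdir " << dir << "\n";
  }
}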
client->mkdir("dir3", 0755); - client->mknod("dir3/asdf", 0644); - client->mkdir("dir4", 0755); - client->mkdir("dir5", 0755); - client->mknod("dir5/asdf", 0644); - client->rename("dir3","dir4"); // ok - client->rename("dir4","dir5"); // fail -} - -int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth, int n) -{ - dout(1) << "thrash_links " << basedir << " " << dirs << " " << files << " " << depth - << " links " << n - << endl; - - if (time_to_stop()) return 0; - - for (int k=0; krename(dst.c_str(), "/tmp") == 0) { - client->rename(src.c_str(), dst.c_str()); - client->rename("/tmp", src.c_str()); - } - continue; - } - - // pick a dest dir - string src = basedir; - { - char t[80]; - for (int d=0; dmknod(src.c_str(), 0755); - client->rename(src.c_str(), dst.c_str()); - break; - case 1: - client->mknod(src.c_str(), 0755); - client->unlink(dst.c_str()); - client->link(src.c_str(), dst.c_str()); - break; - case 2: client->unlink(src.c_str()); break; - case 3: client->unlink(dst.c_str()); break; - //case 4: client->mknod(src.c_str(), 0755); break; - //case 5: client->mknod(dst.c_str(), 0755); break; - } - } - return 0; - - // now link shit up - for (int i=0; ilink(file.c_str(), ln.c_str()); - } - - return 0; -} - - diff --git a/branches/sage/pgs/client/SyntheticClient.h b/branches/sage/pgs/client/SyntheticClient.h deleted file mode 100644 index dc1cf58121d26..0000000000000 --- a/branches/sage/pgs/client/SyntheticClient.h +++ /dev/null @@ -1,231 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
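thrash_links() above "exchanges" two paths by bouncing one of them through a temporary name with three renames. A standalone sketch of that swap using std::filesystem; the temp path is a hypothetical local name rather than the hard-coded "/tmp" the deleted code used, and unlike an atomic renameat2(RENAME_EXCHANGE) on Linux this three-step swap can be observed half-done.

#include <filesystem>
#include <fstream>
#include <iostream>
namespace fs = std::filesystem;

bool exchange_paths(const fs::path &a, const fs::path &b, const fs::path &tmp) {
  std::error_code ec;
  fs::rename(b, tmp, ec);                          // park b out of the way
  if (ec) return false;
  fs::rename(a, b, ec);                            // a takes b's name
  if (ec) { std::error_code ignore; fs::rename(tmp, b, ignore); return false; }  // best-effort rollback
  fs::rename(tmp, a, ec);                          // the parked file takes a's name
  return !ec;
}

int main() {
  std::ofstream("x.txt") << "x";                   // two throwaway files to swap
  std::ofstream("y.txt") << "y";
  std::cout << (exchange_paths("x.txt", "y.txt", "swap.tmp") ? "swapped\n" : "failed\n");
}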
- * - */ - - -#ifndef __SYNTHETICCLIENT_H -#define __SYNTHETICCLIENT_H - -#include - -#include "Client.h" -#include "include/Distribution.h" - -#include "Trace.h" - -#define SYNCLIENT_MODE_RANDOMWALK 1 -#define SYNCLIENT_MODE_FULLWALK 2 -#define SYNCLIENT_MODE_REPEATWALK 3 - -#define SYNCLIENT_MODE_MAKEDIRMESS 7 -#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth -#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth -#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth - -#define SYNCLIENT_MODE_MAKEFILES 11 // num count private -#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private -#define SYNCLIENT_MODE_CREATESHARED 13 // num -#define SYNCLIENT_MODE_OPENSHARED 14 // num count - -#define SYNCLIENT_MODE_WRITEFILE 20 -#define SYNCLIENT_MODE_READFILE 21 -#define SYNCLIENT_MODE_WRITEBATCH 22 -#define SYNCLIENT_MODE_WRSHARED 23 -#define SYNCLIENT_MODE_READSHARED 24 -#define SYNCLIENT_MODE_RDWRRANDOM 25 -#define SYNCLIENT_MODE_RDWRRANDOM_EX 26 - -#define SYNCLIENT_MODE_TRACE 30 - -#define SYNCLIENT_MODE_OPENTEST 40 -#define SYNCLIENT_MODE_OPTEST 41 - -#define SYNCLIENT_MODE_ONLY 50 -#define SYNCLIENT_MODE_EXCLUDE 51 - -#define SYNCLIENT_MODE_UNTIL 52 -#define SYNCLIENT_MODE_SLEEPUNTIL 53 - -#define SYNCLIENT_MODE_RANDOMSLEEP 61 -#define SYNCLIENT_MODE_SLEEP 62 - -#define SYNCLIENT_MODE_TRUNCATE 200 - -#define SYNCLIENT_MODE_FOO 100 -#define SYNCLIENT_MODE_THRASHLINKS 101 - - - -void parse_syn_options(vector& args); - -class SyntheticClient { - Client *client; - - pthread_t thread_id; - - Distribution op_dist; - - void init_op_dist(); - int get_op(); - - - filepath cwd; - map contents; - set subdirs; - bool did_readdir; - set open_files; - - void up(); - - void clear_dir() { - contents.clear(); - subdirs.clear(); - did_readdir = false; - } - - int get_random_fh() { - int r = rand() % open_files.size(); - set::iterator it = open_files.begin(); - while (r--) it++; - return *it; - } - - - filepath n1; - const char *get_random_subdir() { - assert(!subdirs.empty()); - int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2; // non-uniform distn - set::iterator it = subdirs.begin(); - while (r--) it++; - - n1 = cwd; - n1.push_dentry( *it ); - return n1.get_path().c_str(); - } - filepath n2; - const char *get_random_sub() { - assert(!contents.empty()); - int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn - if (cwd.depth() && cwd.last_dentry().length()) - r += cwd.last_dentry().c_str()[0]; // slightly permuted - r %= contents.size(); - - map::iterator it = contents.begin(); - while (r--) it++; - - n2 = cwd; - n2.push_dentry( it->first ); - return n2.get_path().c_str(); - } - - filepath sub; - char sub_s[50]; - const char *make_sub(char *base) { - sprintf(sub_s, "%s.%d", base, rand() % 100); - string f = sub_s; - sub = cwd; - sub.push_dentry(f); - return sub.c_str(); - } - - public: - SyntheticClient(Client *client); - - int start_thread(); - int join_thread(); - - int run(); - - bool exclude_me() { - if (exclude < 0) - return false; - if (exclude == client->get_nodeid()) { - exclude = -1; - return true; - } else { - exclude = -1; - return false; - } - } - bool run_me() { - if (exclude_me()) - return false; - - if (run_only >= 0) { - if (run_only == client->get_nodeid()) { - run_only = -1; - return true; - } - run_only = -1; - return false; - } - return true; - } - - // run() will do one of these things: - list modes; - list sargs; - list iargs; - utime_t run_start; - utime_t run_until; - - int run_only; - int exclude; - - string get_sarg(int 
seq); - - bool time_to_stop() { - utime_t now = g_clock.now(); - if (0) cout << "time_to_stop .. now " << now - << " until " << run_until - << " start " << run_start - << endl; - if (run_until.sec() && now > run_until) - return true; - else - return false; - } - - string compose_path(string& prefix, char *rest) { - return prefix + rest; - } - - int full_walk(string& fromdir); - int random_walk(int n); - - int make_dirs(const char *basedir, int dirs, int files, int depth); - int stat_dirs(const char *basedir, int dirs, int files, int depth); - int read_dirs(const char *basedir, int dirs, int files, int depth); - int make_files(int num, int count, int priv, bool more); - - int create_shared(int num); - int open_shared(int num, int count); - - int write_file(string& fn, int mb, int chunk); - int write_batch(int nfile, int mb, int chunk); - int read_file(string& fn, int mb, int chunk, bool ignoreprint=false); - int read_random(string& fn, int mb, int chunk); - int read_random_ex(string& fn, int mb, int chunk); - - int clean_dir(string& basedir); - - int play_trace(Trace& t, string& prefix); - - void make_dir_mess(const char *basedir, int n); - void foo(); - - int thrash_links(const char *basedir, int dirs, int files, int depth, int n); - -}; - -#endif diff --git a/branches/sage/pgs/client/Trace.cc b/branches/sage/pgs/client/Trace.cc deleted file mode 100644 index 06a6447823036..0000000000000 --- a/branches/sage/pgs/client/Trace.cc +++ /dev/null @@ -1,126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Trace.h" - -#include -#include -#include -#include -using namespace __gnu_cxx; - -#include "common/Mutex.h" - -#include "config.h" - -#include -#include -#include - - -Mutex trace_lock; - -class TokenList { -public: - string filename; - char *data; - int len; - list tokens; - - int ref; - - TokenList() : data(0), ref(0) {} - ~TokenList() { - delete[] data; - } -}; - -map traces; - - -// -Trace::Trace(const char* f) -{ - string filename = f; - - trace_lock.Lock(); - - if (traces.count(filename)) - tl = traces[filename]; - else { - tl = new TokenList; - tl->filename = filename; - - // open file - crope cr; - int fd = open(filename.c_str(), O_RDONLY); - assert(fd > 0); - char buf[100]; - while (1) { - int r = read(fd, buf, 100); - if (r == 0) break; - assert(r > 0); - cr.append(buf, r); - } - close(fd); - - // copy - tl->len = cr.length()+1; - tl->data = new char[tl->len]; - memcpy(tl->data, cr.c_str(), cr.length()); - tl->data[tl->len-1] = '\n'; - - // index! 
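The indexing loop that follows splits the loaded trace buffer in place: each newline is overwritten with a NUL and a pointer to the start of the token is recorded, so no per-token copies are made. A self-contained sketch of that technique on a small local buffer:

#include <cstring>
#include <iostream>
#include <list>

int main() {
  char data[] = "open\n/foo\n3\n";           // pretend this was read from the trace file
  std::list<const char*> tokens;

  std::size_t len = std::strlen(data);
  std::size_t o = 0;
  while (o < len) {
    char *n = data + o;                      // start of this token
    while (data[o] != '\n') o++;             // find the newline...
    data[o] = '\0';                          // ...and terminate the token in place
    if (data + o > n) tokens.push_back(n);   // skip empty lines
    o++;
  }

  for (const char *t : tokens) std::cout << "[" << t << "]\n";
}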
- int o = 0; - while (o < tl->len) { - char *n = tl->data + o; - - // find newline - while (tl->data[o] != '\n') o++; - assert(tl->data[o] == '\n'); - tl->data[o] = 0; - - if (tl->data + o > n) tl->tokens.push_back(n); - o++; - } - - dout(1) << "trace " << filename << " loaded with " << tl->tokens.size() << " tokens" << endl; - traces[filename] = tl; - } - - tl->ref++; - - trace_lock.Unlock(); -} - -Trace::~Trace() -{ - trace_lock.Lock(); - - tl->ref--; - if (tl->ref == 0) { - traces.erase(tl->filename); - delete tl; - } - - trace_lock.Unlock(); -} - - -list& Trace::get_list() -{ - return tl->tokens; -} diff --git a/branches/sage/pgs/client/Trace.h b/branches/sage/pgs/client/Trace.h deleted file mode 100644 index bde9f2830cf5f..0000000000000 --- a/branches/sage/pgs/client/Trace.h +++ /dev/null @@ -1,76 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CLIENT_TRACE_H -#define __CLIENT_TRACE_H - -#include -#include -#include -using namespace std; - -/* - - this class is more like an iterator over a constant tokenlist (which - is protected by a mutex, see Trace.cc) - - */ - -class Trace { - class TokenList *tl; - - public: - Trace(const char* filename); - ~Trace(); - - list& get_list(); - - list::iterator _cur; - list::iterator _end; - - void start() { - _cur = get_list().begin(); - _end = get_list().end(); - ns = 0; - } - - char strings[10][200]; - int ns; - const char *get_string(const char *prefix = 0) { - assert(_cur != _end); - const char *s = *_cur; - _cur++; - if (prefix) { - if (strstr(s, "/prefix") == s || - strstr(s, "/prefix") == s+1) { - strcpy(strings[ns], prefix); - strcpy(strings[ns] + strlen(prefix), - s + strlen("/prefix")); - s = (const char*)strings[ns]; - ns++; - if (ns == 10) ns = 0; - } - } - return s; - } - __int64_t get_int() { - return atoll(get_string()); - } - bool end() { - return _cur == _end; - } -}; - -#endif diff --git a/branches/sage/pgs/client/fuse.cc b/branches/sage/pgs/client/fuse.cc deleted file mode 100644 index 855a3eb4a6766..0000000000000 --- a/branches/sage/pgs/client/fuse.cc +++ /dev/null @@ -1,281 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -/* - FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi - - This program can be distributed under the terms of the GNU GPL. - See the file COPYING. 
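Trace::get_string() above rewrites tokens that begin with "/prefix" so each client replays the trace under its own directory, cycling through a small pool of fixed-size buffers to hold the results. A hedged std::string version of the same substitution, without the fixed buffers:

#include <iostream>
#include <string>

std::string apply_prefix(const std::string &token, const std::string &prefix) {
  const std::string key = "/prefix";
  std::string::size_type pos = token.find(key);
  if (pos == 0 || pos == 1)   // match at (or one past) the start, as the original checks
    return token.substr(0, pos) + prefix + token.substr(pos + key.size());
  return token;
}

int main() {
  std::cout << apply_prefix("/prefix/dir.0/file.1", "/syn.0.0") << "\n";   // -> /syn.0.0/dir.0/file.1
  std::cout << apply_prefix("close", "/syn.0.0") << "\n";                  // unrelated token, unchanged
}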
-*/ - - -// fuse crap -#ifdef linux -/* For pread()/pwrite() */ -#define _XOPEN_SOURCE 500 -#endif - -#define FUSE_USE_VERSION 25 - -#include -#include -#include -#include -#include -#include -#include -#include - - -// ceph stuff -#include "include/types.h" - -#include "Client.h" - -#include "config.h" - -// stl -#include -using namespace std; - - -// globals -Client *client; // the ceph client - - - -// ------ -// fuse hooks - -static int ceph_getattr(const char *path, struct stat *stbuf) -{ - return client->lstat(path, stbuf); -} - -static int ceph_readlink(const char *path, char *buf, size_t size) -{ - int res; - - res = client->readlink(path, buf, size - 1); - if (res < 0) return res; - - buf[res] = '\0'; - return 0; -} - - -static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler) -{ - map contents; - - int res = client->getdir(path, contents); - if (res < 0) return res; - - // return contents to fuse via callback - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // (immutable) inode contents too. - res = filler(h, // fuse's handle - it->first.c_str(), // dentry as char* - it->second.mode & INODE_TYPE_MASK, // mask type bits from mode - it->second.ino); // ino.. 64->32 bit issue here? FIXME - if (res != 0) break; // fuse has had enough - } - return res; -} - -static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) -{ - return client->mknod(path, mode); -} - -static int ceph_mkdir(const char *path, mode_t mode) -{ - return client->mkdir(path, mode); -} - -static int ceph_unlink(const char *path) -{ - return client->unlink(path); -} - -static int ceph_rmdir(const char *path) -{ - return client->rmdir(path); -} - -static int ceph_symlink(const char *from, const char *to) -{ - return client->symlink(from, to); -} - -static int ceph_rename(const char *from, const char *to) -{ - return client->rename(from, to); -} - -static int ceph_link(const char *from, const char *to) -{ - return client->link(from, to); -} - -static int ceph_chmod(const char *path, mode_t mode) -{ - return client->chmod(path, mode); -} - -static int ceph_chown(const char *path, uid_t uid, gid_t gid) -{ - return client->chown(path, uid, gid); -} - -static int ceph_truncate(const char *path, off_t size) -{ - return client->truncate(path, size); -} - -static int ceph_utime(const char *path, struct utimbuf *buf) -{ - return client->utime(path, buf); -} - - -static int ceph_open(const char *path, struct fuse_file_info *fi) -{ - int res; - - res = client->open(path, fi->flags, 0); - if (res < 0) return res; - fi->fh = res; - return 0; // fuse wants 0 onsucess -} - -static int ceph_read(const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->read(fh, buf, size, offset); -} - -static int ceph_write(const char *path, const char *buf, size_t size, - off_t offset, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->write(fh, buf, size, offset); -} - -static int ceph_flush(const char *path, struct fuse_file_info *fi) -{ -//fh_t fh = fi->fh; - //return client->flush(fh); - return 0; -} - - -static int ceph_statfs(const char *path, struct statvfs *stbuf) -{ - return client->statfs(path, stbuf); -} - - - -static int ceph_release(const char *path, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - int r = client->close(fh); // close the file - return r; -} - -static int ceph_fsync(const char *path, int isdatasync, - struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->fsync(fh, 
isdatasync ? true:false); -} - - -static struct fuse_operations ceph_oper = { - getattr: ceph_getattr, - readlink: ceph_readlink, - getdir: ceph_getdir, - mknod: ceph_mknod, - mkdir: ceph_mkdir, - unlink: ceph_unlink, - rmdir: ceph_rmdir, - symlink: ceph_symlink, - rename: ceph_rename, - link: ceph_link, - chmod: ceph_chmod, - chown: ceph_chown, - truncate: ceph_truncate, - utime: ceph_utime, - open: ceph_open, - read: ceph_read, - write: ceph_write, - statfs: ceph_statfs, - flush: ceph_flush, - release: ceph_release, - fsync: ceph_fsync -}; - - -int ceph_fuse_main(Client *c, int argc, char *argv[]) -{ - // init client - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = argv[0]; - - // allow other (all!) users to see my file system - // NOTE: echo user_allow_other >> /etc/fuse.conf - // NB: seems broken on Darwin -#ifndef DARWIN - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; -#endif // DARWIN - - // use inos - newargv[newargc++] = "-o"; - newargv[newargc++] = "use_ino"; - - // large reads, direct_io (no kernel cachine) - //newargv[newargc++] = "-o"; - //newargv[newargc++] = "large_read"; - if (g_conf.fuse_direct_io) { - newargv[newargc++] = "-o"; - newargv[newargc++] = "direct_io"; - } - - // disable stupid fuse unlink hiding thing - newargv[newargc++] = "-o"; - newargv[newargc++] = "hard_remove"; - - // force into foreground - // -> we can watch stdout this way!! - newargv[newargc++] = "-f"; - - // copy rest of cmdline (hopefully, the mount point!) - for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go fuse go - cout << "ok, calling fuse_main" << endl; - int r = fuse_main(newargc, newargv, &ceph_oper); - return r; -} diff --git a/branches/sage/pgs/client/fuse.h b/branches/sage/pgs/client/fuse.h deleted file mode 100644 index dfacbaa4fdd85..0000000000000 --- a/branches/sage/pgs/client/fuse.h +++ /dev/null @@ -1,24 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -/* ceph_fuse_main - * - start up fuse glue, attached to Client* cl. - * - argc, argv should include a mount point, and - * any weird fuse options you want. by default, - * we will put fuse in the foreground so that it - * won't fork and we can see stdout. - */ -int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/branches/sage/pgs/client/hadoop/CephFSInterface.cc b/branches/sage/pgs/client/hadoop/CephFSInterface.cc deleted file mode 100644 index 7aa8c133d370b..0000000000000 --- a/branches/sage/pgs/client/hadoop/CephFSInterface.cc +++ /dev/null @@ -1,789 +0,0 @@ -#include "CephFSInterface.h" - -using namespace std; - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. 
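ceph_fuse_main() above splices a handful of fixed mount options in front of whatever the caller passed before handing the list to fuse_main(). A small sketch of that argv assembly using std::vector instead of a malloc'd array; the option strings mirror the deleted code, but fuse_main itself is not called here.

#include <iostream>
#include <vector>

int main(int argc, char *argv[]) {
  bool direct_io = false;                                       // stand-in for g_conf.fuse_direct_io

  std::vector<const char*> newargv;
  newargv.push_back(argv[0]);
  newargv.push_back("-o"); newargv.push_back("allow_other");    // let other users see the mount
  newargv.push_back("-o"); newargv.push_back("use_ino");        // report real inode numbers
  if (direct_io) { newargv.push_back("-o"); newargv.push_back("direct_io"); }
  newargv.push_back("-o"); newargv.push_back("hard_remove");    // don't hide unlinked-but-open files
  newargv.push_back("-f");                                      // stay in the foreground
  for (int i = 1; i < argc; ++i) newargv.push_back(argv[i]);    // then the mount point, etc.

  for (const char *a : newargv) std::cout << a << " ";
  std::cout << "\n";
  // fuse_main(newargv.size(), const_cast<char**>(newargv.data()), &ops) would follow here
}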
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient - (JNIEnv *, jobject) -{ - - dout(3) << "CephFSInterface: Initializing Ceph client:" << endl; - - // parse args from CEPH_ARGS - vector args; - env_to_vec(args); - parse_config_options(args); - - if (g_conf.clock_tare) g_clock.tare(); - - // be safe - g_conf.use_abspaths = true; - - // load monmap - MonMap monmap; - // int r = monmap.read(".ceph_monmap"); - int r = monmap.read("/cse/grads/eestolan/ceph/trunk/ceph/.ceph_monmap"); - if (r < 0) { - dout(0) << "CephFSInterface: could not find .ceph_monmap" << endl; - assert(0 && "could not find .ceph_monmap"); - // return 0; - } - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client; - client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap); - client->init(); - - // mount - client->mount(); - - jlong clientp = *(jlong*)&client; - return clientp; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile -(JNIEnv * env, jobject obj, jlong clientp, jstring j_local_path, jstring j_ceph_path) { - - dout(10) << "CephFSInterface: In copyFromLocalFile" << endl; - Client* client; - //client = (Client*) clientp; - client = *(Client**)&clientp; - - const char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - - dout(10) << "CephFSInterface: Local source file is "<< c_local_path << " and Ceph destination file is " << c_ceph_path << endl; - struct stat st; - int r = ::stat(c_local_path, &st); - assert (r == 0); - - // open the files - int fh_local = ::open(c_local_path, O_RDONLY); - int fh_ceph = client->open(c_ceph_path, O_WRONLY|O_CREAT|O_TRUNC); - assert (fh_local > -1); - assert (fh_ceph > -1); - dout(10) << "CephFSInterface: local fd is " << fh_local << " and Ceph fd is " << fh_ceph << endl; - - // get the source file size - off_t remaining = st.st_size; - - // copy the file a MB at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - while (remaining > 0) { - off_t got = ::read(fh_local, bp.c_str(), MIN(remaining,chunk)); - assert(got > 0); - remaining -= got; - off_t wrote = client->write(fh_ceph, bp.c_str(), got, -1); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile -(JNIEnv *env, jobject obj, jlong clientp, jstring j_ceph_path, jstring j_local_path) -{ - Client* client; - client = *(Client**)&clientp; - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - const char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - - dout(3) << "CephFSInterface: dout(3): In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << c_local_path << endl; - - cout << "CephFSInterface: cout: In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << c_local_path << endl; - - - // get source file size - struct stat st; - //dout(10) << "Attempting 
lstat with file " << c_ceph_path << ":" << endl; - int r = client->lstat(c_ceph_path, &st); - assert (r == 0); - - dout(10) << "CephFSInterface: Opening Ceph source file for read: " << endl; - int fh_ceph = client->open(c_ceph_path, O_RDONLY); - assert (fh_ceph > -1); - - dout(10) << "CephFSInterface: Opened Ceph file! Opening local destination file: " << endl; - int fh_local = ::open(c_local_path, O_WRONLY|O_CREAT|O_TRUNC, 0644); - assert (fh_local > -1); - - // copy the file a chunk at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - off_t remaining = st.st_size; - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1); - assert(got > 0); - remaining -= got; - off_t wrote = ::write(fh_local, bp.c_str(), got); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - * Returns the current working directory. - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *env, jobject obj, jlong clientp) -{ - dout(10) << "CephFSInterface: In getcwd" << endl; - - Client* client; - client = *(Client**)&clientp; - - return (env->NewStringUTF(client->getcwd().c_str())); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - * - * Changes the working directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In setcwd" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 <= client->chdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - * Removes an empty directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In rmdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->rmdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - * Creates a directory with full permissions. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In mkdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->mkdir(c_path, 0xFF)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - * Unlinks a path. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - dout(10) << "CephFSInterface: In unlink for path " << c_path << ":" << endl; - - // is it a file or a directory? - struct stat stbuf; - int stat_result = client->lstat(c_path, &stbuf); - if (stat_result < 0) {// then the path doesn't even exist - dout(0) << "ceph_unlink: path " << c_path << " does not exist" << endl; - return false; - } - int result; - if (0 != S_ISDIR(stbuf.st_mode)) { // it's a directory - dout(10) << "ceph_unlink: path " << c_path << " is a directory. Calling client->rmdir()" << endl; - result = client->rmdir(c_path); - } - else if (0 != S_ISREG(stbuf.st_mode)) { // it's a file - dout(10) << "ceph_unlink: path " << c_path << " is a file. Calling client->unlink()" << endl; - result = client->unlink(c_path); - } - else { - dout(0) << "ceph_unlink: path " << c_path << " is not a file or a directory. Failing:" << endl; - result = -1; - } - - dout(10) << "In ceph_unlink for path " << c_path << - ": got result " - << result << ". Returning..."<< endl; - - env->ReleaseStringUTFChars(j_path, c_path); - return (0 == result) ? JNI_TRUE : JNI_FALSE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - * Renames a file. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *env, jobject, jlong clientp, jstring j_from, jstring j_to) -{ - dout(10) << "CephFSInterface: In rename" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_from = env->GetStringUTFChars(j_from, 0); - const char* c_to = env->GetStringUTFChars(j_to, 0); - - return (0 <= client->rename(c_from, c_to)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_from, c_from); - env->ReleaseStringUTFChars(j_to, c_to); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - * Returns true if the path exists. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists -(JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - - dout(10) << "CephFSInterface: In exists" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - dout(10) << "Attempting lstat with file " << c_path << ":" ; - int result = client->lstat(c_path, &stbuf); - dout(10) << "result is " << result << endl; - env->ReleaseStringUTFChars(j_path, c_path); - if (result < 0) { - dout(10) << "Returning false (file does not exist)" << endl; - return JNI_FALSE; - } - else { - dout(10) << "Returning true (file exists)" << endl; - return JNI_TRUE; - } -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - * Returns the block size. Size is -1 if the file - * does not exist. 
- * TODO: see if Hadoop wants something more like stripe size - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In getblocksize" << endl; - - Client* client; - //struct stat stbuf; - client = *(Client**)&clientp; - - jint result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - - /* - if (0 > client->lstat(c_path, &stbuf)) - result = -1; - else - result = stbuf.st_blksize; - */ - - // we need to open the file to retrieve the stripe size - dout(10) << "CephFSInterface: getblocksize: opening file" << endl; - int fh = client->open(c_path, O_RDONLY); - if (fh < 0) - return -1; - - result = client->get_stripe_unit(fh); - - int close_result = client->close(fh); - assert (close_result > -1); - - - env->ReleaseStringUTFChars(j_path, c_path); - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - * Returns the file size, or -1 on failure. - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "In getfilesize" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - jlong result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - if (0 > client->lstat(c_path, &stbuf)) result = -1; - else result = stbuf.st_size; - env->ReleaseStringUTFChars(j_path, c_path); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In isfile" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a file... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISREG(stbuf.st_mode)) ? JNI_FALSE : JNI_TRUE; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - * Returns true if the path is a directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "In isdirectory" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a directory... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISDIR(stbuf.st_mode)) ? 
JNI_FALSE : JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - * Returns a Java array of Strings with the directory contents - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) { - - dout(10) << "In getdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - // get the directory listing - map contents; - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->getdir(c_path, contents); - env->ReleaseStringUTFChars(j_path, c_path); - - if (result < 0) return NULL; - - dout(10) << "checking for empty dir" << endl; - jint dir_size = contents.size(); - - // Hadoop freaks out if the listing contains "." or "..". Shrink - // the listing size by two, or by one if the directory is the root. - if(('/' == c_path[0]) && (0 == c_path[1])) - dir_size -= 1; - else - dir_size -= 2; - assert (dir_size >= 0); - - // Create a Java String array of the size of the directory listing - // jstring blankString = env->NewStringUTF(""); - jclass stringClass = env->FindClass("java/lang/String"); - if (NULL == stringClass) { - dout(0) << "ERROR: java String class not found; dying a horrible, painful death" << endl; - assert(0); - } - jobjectArray dirListingStringArray = (jobjectArray) env->NewObjectArray(dir_size, stringClass, NULL); - - // populate the array with the elements of the directory list, - // omitting . and .. - int i = 0; - string dot("."); - string dotdot (".."); - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // is it "."? - if (it->first == dot) continue; - if (it->first == dotdot) continue; - - if (0 == dir_size) - dout(0) << "CephFSInterface: WARNING: adding stuff to an empty array." << endl; - assert (i < dir_size); - env->SetObjectArrayElement(dirListingStringArray, i, - env->NewStringUTF(it->first.c_str())); - ++i; - } - - return dirListingStringArray; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - * Open a file for reading. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) - -{ - dout(10) << "In open_for_read" << endl; - - Client* client; - client = *(Client**)&clientp; - - jint result; - - // open as read-only: flag = O_RDONLY - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_RDONLY); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - * Opens a file for overwriting; creates it if necessary. 
- */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In open_for_overwrite" << endl; - - Client* client; - client = *(Client**)&clientp; - - jint result; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_WRONLY|O_CREAT|O_TRUNC); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - * - * Closes the Ceph client. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *env, jobject obj, jlong clientp) -{ - Client* client; - client = *(Client**)&clientp; - - client->unmount(); - client->shutdown(); - delete client; - - // wait for messenger to finish - rank.wait(); - - return true; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - * Reads into the given byte array from the current position. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - dout(10) << "In read" << endl; - - - // IMPORTANT NOTE: Hadoop read arguments are a bit different from POSIX so we - // have to convert. The read is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. - jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the read - result = client->read((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - * Seeks to the given position. - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - dout(10) << "In CephInputStream::seek_from_start" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephInputStream::ceph_getpos" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - * Closes the file. 
- */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephInputStream::ceph_close" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - dout(10) << "In CephOutputStream::ceph_seek_from_start" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephOutputStream::ceph_getpos" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephOutputStream::ceph_close" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - dout(10) << "In write" << endl; - - // IMPORTANT NOTE: Hadoop write arguments are a bit different from POSIX so we - // have to convert. The write is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. 
- jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the write - result = client->write((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - diff --git a/branches/sage/pgs/client/hadoop/CephFSInterface.h b/branches/sage/pgs/client/hadoop/CephFSInterface.h deleted file mode 100644 index 549925aba6e64..0000000000000 --- a/branches/sage/pgs/client/hadoop/CephFSInterface.h +++ /dev/null @@ -1,239 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem */ - -#include -#include "client/Client.h" -#include "config.h" -#include "client/fuse.h" -#include "msg/SimpleMessenger.h" -#include "common/Timer.h" - -#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#ifdef __cplusplus -extern "C" { -#endif - -#undef org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE -#define org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE 1048576LL -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient -(JNIEnv *, jobject); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *, jobject, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: 
ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists - (JNIEnv *, jobject, jlong, jstring); - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *, jobject, jlong); - -#undef org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE -#define org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE 2048L -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL 
Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* Header for class org_apache_hadoop_fs_ceph_CephOutputStream */ - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/branches/sage/pgs/client/ldceph.cc b/branches/sage/pgs/client/ldceph.cc deleted file mode 100644 index 54f8290216832..0000000000000 --- a/branches/sage/pgs/client/ldceph.cc +++ /dev/null @@ -1,298 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -using namespace std; - -// ceph stuff -#include "config.h" -#include "client/Client.h" -#include "msg/TCPMessenger.h" - -// syscall fun -#include -#include -#include -//#include - -#define _FCNTL_H -#include - -#define CEPH_FD_OFF 50000 - - -/****** startup etc *******/ - -class LdCeph { -public: - // globals - bool started; - char *mount_point; - char *mount_point_parent; - int mount_point_len; - - Client *client; - - filepath fp_mount_point; - filepath cwd; - bool cwd_above_mp, cwd_in_mp; - - const char *get_ceph_path(const char *orig, char *buf) { - if (!started) return 0; - - // relative path? BUG: this won't catch "blah/../../asdf" - if (orig[0] && - orig[0] != '/' && - !(orig[0] == '.' && orig[1] == '.')) { - - if (cwd_in_mp) return orig; // inside mount point, definitely ceph - if (!cwd_above_mp) return 0; // not above mount point, definitely not ceph - - // relative, above mp. 
- filepath o = orig; - filepath p = cwd; - for (unsigned b = 0; b < o.depth(); b++) { - if (o[b] == "..") - p.pop_dentry(); - else - p.add_dentry(o[b]); - } - - // FIXME rewrite - if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) { - if (p.c_str()[mount_point_len] == 0) - return "/"; - if (p.c_str()[mount_point_len] == '/') { - strcpy(buf, p.c_str() + mount_point_len); - return buf; - } - } - return 0; - } else { - // absolute - if (strncmp(orig, mount_point, mount_point_len) == 0) { - if (orig[mount_point_len] == 0) - return "/"; - if (orig[mount_point_len] == '/') - return orig + mount_point_len; - } - return 0; - } - } - - void refresh_cwd() { - char buf[255]; - syscall(SYS_getcwd, buf, 255); - cwd = buf; - - if (strncmp(buf, mount_point, mount_point_len) == 0 && - (buf[mount_point_len] == 0 || - buf[mount_point_len] == '/')) - cwd_in_mp = true; - else { - if (cwd.depth() > fp_mount_point.depth()) - cwd_above_mp = false; - else { - cwd_above_mp = true; - for (unsigned i=0; iget_myaddr() << endl; - - refresh_cwd(); - } - } - ~LdCeph() { - cout << "ldceph fini" << endl; - if (false && client) { - client->unmount(); - client->shutdown(); - delete client; - client = 0; - tcpmessenger_wait(); - tcpmessenger_shutdown(); - } - } - -} ldceph; - - - -/****** original functions ****/ - - - -/****** captured functions ****/ - - -#define MYFD(f) ((fd) > CEPH_FD_OFF && ldceph.started) -#define TO_FD(fd) (fd > 0 ? fd+CEPH_FD_OFF:fd) -#define FROM_FD(fd) (fd - CEPH_FD_OFF) - -extern "C" { - - // open/close - //int open(const char *pathname, int flags) { - int open(const char *pathname, int flags, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return TO_FD(ldceph.client->open(c, flags)); - else - return syscall(SYS_open, pathname, flags, mode); - } - - int creat(const char *pathname, mode_t mode) { - return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode); - } - int close(int fd) { - if (MYFD(fd)) - return ldceph.client->close(FROM_FD(fd)); - else - return syscall(SYS_close, fd); - } - - - // read/write - ssize_t write(int fd, const void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->write(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_write, fd, buf, count); - } - - ssize_t read(int fd, void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->read(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_read, fd, buf, count); - } - - //int fsync(int fd); - //int fdatasync(int fd); - - - // namespace - int rmdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->rmdir(c); - else - return syscall(SYS_rmdir, pathname); - } - int mkdir(const char *pathname, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->mkdir(c, mode); - else - return syscall(SYS_mkdir, pathname, mode); - } - int unlink(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->unlink(c); - else - return syscall(SYS_unlink, pathname); - } - - int stat(const char *pathname, struct stat *st) { - //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) { // stoopid GLIBC - //struct stat *st = (struct stat*)st64; - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->lstat(c, st); // FIXME - else - return syscall(SYS_stat, pathname, st); - } - //int fstat(int filedes, struct stat *buf); - 
//int lstat(const char *file_name, struct stat *buf); - - int chdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) { - int r = ldceph.client->chdir(c); - if (r == 0) { - if (!ldceph.cwd_in_mp) - syscall(SYS_chdir, ldceph.mount_point_parent); - ldceph.cwd_in_mp = true; - ldceph.cwd_above_mp = false; - ldceph.cwd = ldceph.mount_point; - filepath fpc = c; - ldceph.cwd.append(fpc); - } - return r; - } else { - int r = syscall(SYS_chdir, pathname); - if (r) { - ldceph.refresh_cwd(); - } - return r; - } - } - char *getcwd(char *buf, size_t size) { - strncpy(buf, ldceph.cwd.c_str(), size); - return buf; - } - //int fchdir(int fd); - - - - -} diff --git a/branches/sage/pgs/client/msgthread.h b/branches/sage/pgs/client/msgthread.h deleted file mode 100644 index 1e1af025b0d57..0000000000000 --- a/branches/sage/pgs/client/msgthread.h +++ /dev/null @@ -1,26 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "msg/Message.h" - -// send the message, expecting no response. threads other than the -// MPI thread use this function; if the MPI thread uses this function -// it could deadlock: this function could wait for the out queue to be -// emptied, but only the MPI thread can empty it. -void obfsmpi_send(Message *m) - -// send the message to a server and wait for the response. threads -// other than the MPI thread use this function. -Message *obfsmpi_sendrecv(Message *m) diff --git a/branches/sage/pgs/cmds.cc b/branches/sage/pgs/cmds.cc deleted file mode 100644 index 5965dffb55450..0000000000000 --- a/branches/sage/pgs/cmds.cc +++ /dev/null @@ -1,103 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mds/MDS.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // mds specific args - int whoami = -1; - bool standby = false; // by default, i'll start active. 
- for (unsigned i=0; i= 0); - - // start up network - rank.start_rank(); - - // start mds - Messenger *m = rank.register_entity(MSG_ADDR_MDS(whoami)); - assert(m); - - MDS *mds = new MDS(whoami, m, &monmap); - mds->init(standby); - - // wait - rank.wait(); - - // done - delete mds; - - return 0; -} - diff --git a/branches/sage/pgs/cmon.cc b/branches/sage/pgs/cmon.cc deleted file mode 100644 index 2ce646b44f4fc..0000000000000 --- a/branches/sage/pgs/cmon.cc +++ /dev/null @@ -1,129 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mon/Monitor.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // args - int whoami = -1; - char *monmap_fn = ".ceph_monmap"; - for (unsigned i=0; i= 0); - } else { - // i am specific monitor. - - // read monmap - cout << "reading monmap from .ceph_monmap" << endl; - int r = monmap.read(monmap_fn); - assert(r >= 0); - - // bind to a specific port - cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << endl; - g_my_addr = monmap.get_inst(whoami).addr; - rank.start_rank(); - } - - // start monitor - Messenger *m = rank.register_entity(MSG_ADDR_MON(whoami)); - Monitor *mon = new Monitor(whoami, m, &monmap); - mon->init(); - - // wait - cout << "waiting for shutdown ..." << endl; - rank.wait(); - - // done - delete mon; - - return 0; -} - diff --git a/branches/sage/pgs/cmonctl.cc b/branches/sage/pgs/cmonctl.cc deleted file mode 100644 index 34bd80f9a4d8f..0000000000000 --- a/branches/sage/pgs/cmonctl.cc +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "msg/SimpleMessenger.h" -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - - -Messenger *messenger = 0; - -class Admin : public Dispatcher { - void dispatch(Message *m) { - switch (m->get_type()) { - case MSG_MON_COMMAND_ACK: - dout(0) << m->get_source() << " -> '" - << ((MMonCommandAck*)m)->rs << "' (" << ((MMonCommandAck*)m)->r << ")" - << endl; - messenger->shutdown(); - break; - } - } -} dispatcher; - -int main(int argc, char **argv, char *envp[]) { - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - messenger = rank.register_entity(entity_name_t(entity_name_t::TYPE_ADMIN)); - messenger->set_dispatcher(&dispatcher); - - // build command - MMonCommand *m = new MMonCommand(messenger->get_myinst()); - string cmd; - for (unsigned i=0; icmd.push_back(string(args[i])); - } - int mon = monmap.pick_mon(); - - dout(0) << "mon" << mon << " <- '" << cmd << "'" << endl; - - // send it - messenger->send_message(m, monmap.get_inst(mon)); - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/pgs/common/Clock.cc b/branches/sage/pgs/common/Clock.cc deleted file mode 100644 index 8b07f6d9eb15f..0000000000000 --- a/branches/sage/pgs/common/Clock.cc +++ /dev/null @@ -1,20 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "Clock.h" - -// public -Clock g_clock; - diff --git a/branches/sage/pgs/common/Clock.h b/branches/sage/pgs/common/Clock.h deleted file mode 100644 index 1ea7227adebd4..0000000000000 --- a/branches/sage/pgs/common/Clock.h +++ /dev/null @@ -1,104 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CLOCK_H -#define __CLOCK_H - -#include -#include - -#include -#include - -#include "Mutex.h" - -#include "include/utime.h" - - - -// -- clock -- -class Clock { - protected: - //utime_t start_offset; - //utime_t abs_last; - utime_t last; - utime_t zero; - - Mutex lock; - - public: - Clock() { - // set offset - //tare(); - } - - // real time. 
- utime_t real_now() { - utime_t realnow = now(); - realnow += zero; - //gettimeofday(&realnow.timeval(), NULL); - return realnow; - } - - // relative time (from startup) - void tare() { - gettimeofday(&zero.timeval(), NULL); - } - void tare(utime_t z) { - zero = z; - } - utime_t now() { - //lock.Lock(); - utime_t n; - gettimeofday(&n.timeval(), NULL); - n -= zero; - if (n < last) { - //std::cerr << "WARNING: clock jumped backwards from " << last << " to " << n << std::endl; - n = last; // clock jumped backwards! - } else - last = n; - //lock.Unlock(); - return n; - } - utime_t recent_now() { - return last; - } - - void realify(utime_t& t) { - t += zero; - } - - void make_timespec(utime_t& t, struct timespec *ts) { - utime_t real = t; - realify(real); - - memset(ts, 0, sizeof(*ts)); - ts->tv_sec = real.sec(); - ts->tv_nsec = real.nsec(); - } - - - - // absolute time - time_t gettime() { - return real_now().sec(); - } - -}; - -extern Clock g_clock; - -#endif diff --git a/branches/sage/pgs/common/Cond.h b/branches/sage/pgs/common/Cond.h deleted file mode 100644 index 4cb3d721b423f..0000000000000 --- a/branches/sage/pgs/common/Cond.h +++ /dev/null @@ -1,119 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __COND_H -#define __COND_H - -#include - -#include "Mutex.h" -#include "Clock.h" - -#include "include/Context.h" - -#include -#include - -class Cond { - // my bits - pthread_cond_t _c; - - // don't allow copying. 
- void operator=(Cond &C) {} - Cond( const Cond &C ) {} - - public: - Cond() { - int r = pthread_cond_init(&_c,NULL); - assert(r == 0); - } - virtual ~Cond() { - pthread_cond_destroy(&_c); - } - - int Wait(Mutex &mutex) { - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int Wait(Mutex &mutex, char* s) { - //cout << "Wait: " << s << endl; - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int WaitUntil(Mutex &mutex, utime_t when) { - struct timespec ts; - g_clock.make_timespec(when, &ts); - //cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl; - int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); - return r; - } - int WaitInterval(Mutex &mutex, utime_t interval) { - utime_t when = g_clock.now(); - when += interval; - return WaitUntil(mutex, when); - } - - int Signal() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } - int SignalOne() { - int r = pthread_cond_signal(&_c); - return r; - } - int SignalAll() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } -}; - -class C_Cond : public Context { - Cond *cond; - bool *done; - int *rval; -public: - C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - if (rval) *rval = r; - *done = true; - cond->Signal(); - } -}; - -class C_SafeCond : public Context { - Mutex *lock; - Cond *cond; - bool *done; - int *rval; -public: - C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - lock->Lock(); - if (rval) *rval = r; - *done = true; - cond->Signal(); - lock->Unlock(); - } -}; - -#endif diff --git a/branches/sage/pgs/common/DecayCounter.h b/branches/sage/pgs/common/DecayCounter.h deleted file mode 100644 index 37ce7bad07fa5..0000000000000 --- a/branches/sage/pgs/common/DecayCounter.h +++ /dev/null @@ -1,95 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __DECAYCOUNTER_H -#define __DECAYCOUNTER_H - -#include -#include "Clock.h" - -#include "config.h" - -class DecayCounter { - protected: - double val; // value - - double half_life; // in seconds - double k; // k = ln(.5)/half_life - - utime_t last_decay; // time of last decay - - public: - DecayCounter() : val(0) { - set_halflife( g_conf.mds_decay_halflife ); - reset(); - } - /* - DecayCounter(double hl) : val(0) { - set_halflife(hl); - reset(); - } - */ - - void adjust(double a) { - decay(); - val += a; - } - void adjust_down(const DecayCounter& other) { - // assume other has same time stamp as us... 
- val -= other.val; - } - - void set_halflife(double hl) { - half_life = hl; - k = log(.5) / hl; - } - - void take(DecayCounter& other) { - *this = other; - other.reset(); - } - - void reset() { - last_decay.sec_ref() = 0; - last_decay.usec_ref() = 0; - val = 0; - } - - void decay() { - utime_t el = g_clock.recent_now(); - el -= last_decay; - if (el.sec() >= 1) { - val = val * exp((double)el * k); - if (val < .01) val = 0; - last_decay = g_clock.recent_now(); - } - } - - double get() { - decay(); - return val; - } - - double hit(double v = 1.0) { - decay(); - val += v; - return val; - } - -}; - - -#endif diff --git a/branches/sage/pgs/common/LogType.h b/branches/sage/pgs/common/LogType.h deleted file mode 100644 index 9c45448d49590..0000000000000 --- a/branches/sage/pgs/common/LogType.h +++ /dev/null @@ -1,112 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOGTYPE_H -#define __LOGTYPE_H - -#include "include/types.h" - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - -#include "Mutex.h" - - -class LogType { - protected: - hash_map keymap; - vector keys; - set inc_keys; - - int version; - - // HACK to avoid the hash table as often as possible... - // cache recent key name lookups in a small ring buffer - const static int cache_keys = 10; - intptr_t kc_ptr[cache_keys]; - int kc_val[cache_keys]; - int kc_pos; - - friend class Logger; - - public: - LogType() { - version = 1; - - for (int i=0;i= 0) return i; - - i = keys.size(); - keys.push_back(key); - - intptr_t p = (intptr_t)key; - keymap[p] = i; - if (is_inc) inc_keys.insert(i); - - version++; - return i; - } - int add_inc(const char* key) { - return add_key(key, true); - } - int add_set(const char *key) { - return add_key(key, false); - } - - bool have_key(const char* key) { - return lookup_key(key) < 0; - } - - int lookup_key(const char* key) { - intptr_t p = (intptr_t)key; - - if (keymap.count(p)) - return keymap[p]; - - // try kc ringbuffer - int pos = kc_pos-1; - for (int j=0; j - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "LogType.h" -#include "Logger.h" - -#include -#include "Clock.h" - -#include "config.h" - -#include -#include - - -// per-process lock. lame, but this way I protect LogType too! -Mutex logger_lock; - -Logger::Logger(string fn, LogType *type) -{ - logger_lock.Lock(); - { - filename = ""; - if (g_conf.use_abspaths) { - char *cwd = get_current_dir_name(); - filename = cwd; - delete cwd; - filename += "/"; - } - - filename = "log/"; - if (g_conf.log_name) { - filename += g_conf.log_name; - ::mkdir( filename.c_str(), 0755 ); // make sure dir exists - filename += "/"; - } - filename += fn; - //cout << "log " << filename << endl; - interval = g_conf.log_interval; - - if (!g_conf.clock_tare) - start = g_clock.now(); // time 0! otherwise g_clock does it for us. 
- - last_logged = 0; - wrote_header = -1; - open = false; - this->type = type; - wrote_header_last = 0; - - version = 0; - } - logger_lock.Unlock(); - flush(false); -} - -Logger::~Logger() -{ - flush(true); - out.close(); -} - -long Logger::inc(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - flush(); - vals[i] += v; - long r = vals[i]; - logger_lock.Unlock(); - return r; -} - -double Logger::finc(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - flush(); - fvals[i] += v; - double r = fvals[i]; - logger_lock.Unlock(); - return r; -} - -long Logger::set(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - flush(); - long r = vals[i] = v; - logger_lock.Unlock(); - return r; -} - - -double Logger::fset(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - flush(); - double r = fvals[i] = v; - logger_lock.Unlock(); - return r; -} - -long Logger::get(const char* key) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - long r = 0; - if (i >= 0 && (int)vals.size() > i) - r = vals[i]; - logger_lock.Unlock(); - return r; -} - -void Logger::flush(bool force) -{ - if (!g_conf.log) return; - logger_lock.Lock(); - - if (version != type->version) { - while (type->keys.size() > vals.size()) - vals.push_back(0); - while (type->keys.size() > fvals.size()) - fvals.push_back(0); - version = type->version; - } - - if (!open) { - out.open(filename.c_str(), ofstream::out); - open = true; - //cout << "opening log file " << filename << endl; - } - - utime_t fromstart = g_clock.now(); - if (fromstart < start) { - cerr << "logger time jumped backwards from " << start << " to " << fromstart << endl; - assert(0); - start = fromstart; - } - fromstart -= start; - - while (force || - ((fromstart.sec() > last_logged) && - (fromstart.sec() - last_logged >= interval))) { - last_logged += interval; - force = false; - - //cout << "logger " << this << " advancing from " << last_logged << " now " << now << endl; - - if (!open) { - out.open(filename.c_str(), ofstream::out); - open = true; - //cout << "opening log file " << filename << endl; - } - - // header? 
- wrote_header_last++; - if (wrote_header != type->version || - wrote_header_last > 10) { - out << "#" << type->keymap.size(); - for (unsigned i=0; ikeys.size(); i++) - out << "\t" << type->keys[i]; - out << endl; //out << "\t (" << type->keymap.size() << ")" << endl; - wrote_header = type->version; - wrote_header_last = 0; - } - - // write line to log - out << last_logged; - for (unsigned i=0; ikeys.size(); i++) { - if (fvals[i] > 0 && vals[i] == 0) - out << "\t" << fvals[i]; - else - out << "\t" << vals[i]; - } - out << endl; - - // reset the counters - for (unsigned i=0; ikeys.size(); i++) { - if (type->inc_keys.count(i)) { - this->vals[i] = 0; - this->fvals[i] = 0; - } - } - } - - logger_lock.Unlock(); -} - - - - diff --git a/branches/sage/pgs/common/Logger.h b/branches/sage/pgs/common/Logger.h deleted file mode 100644 index e1c2c37ed3006..0000000000000 --- a/branches/sage/pgs/common/Logger.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOGGER_H -#define __LOGGER_H - -#include "include/types.h" -#include "Clock.h" -#include "Mutex.h" - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "LogType.h" - - - - -class Logger { - protected: - //hash_map, eqstr> vals; - //hash_map, eqstr> fvals; - vector vals; - vector fvals; - - //Mutex lock; - LogType *type; - - utime_t start; - int last_logged; - int interval; - int wrote_header; - int wrote_header_last; - - string filename; - - int version; - - ofstream out; - bool open; - - public: - Logger(string fn, LogType *type); - ~Logger(); - - void set_start(const utime_t& a) { start = a; } - utime_t& get_start() { return start; } - - long inc(const char *s, long v = 1); - long set(const char *s, long v); - long get(const char *s); - - double fset(const char *s, double v); - double finc(const char *s, double v); - - void flush(bool force = false); -}; - -#endif diff --git a/branches/sage/pgs/common/Mutex.h b/branches/sage/pgs/common/Mutex.h deleted file mode 100755 index 724c4dbed2a76..0000000000000 --- a/branches/sage/pgs/common/Mutex.h +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MUTEX_H -#define __MUTEX_H - -#include -#include - -class Mutex { -private: - pthread_mutex_t _m; - int nlock; - bool recursive; - - // don't allow copying. 
- void operator=(Mutex &M) {} - Mutex( const Mutex &M ) {} - -public: - Mutex(bool r = true) : nlock(0), recursive(r) { - if (recursive) { - pthread_mutexattr_t attr; - pthread_mutexattr_init(&attr); - pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); - pthread_mutex_init(&_m,&attr); - pthread_mutexattr_destroy(&attr); - } else { - pthread_mutex_init(&_m,NULL); - } - } - virtual ~Mutex() { - assert(nlock == 0); - pthread_mutex_destroy(&_m); - } - - bool is_locked() { - return (nlock > 0); - } - - void Lock() { - int r = pthread_mutex_lock(&_m); - assert(r == 0); - nlock++; - assert(nlock == 1 || recursive); - } - - void Unlock() { - assert(nlock > 0); - --nlock; - int r = pthread_mutex_unlock(&_m); - assert(r == 0); - } - - friend class Cond; - - -public: - class Locker { - Mutex &mutex; - - public: - Locker(Mutex& m) : mutex(m) { - mutex.Lock(); - } - ~Locker() { - mutex.Unlock(); - } - }; -}; - -#endif diff --git a/branches/sage/pgs/common/Semaphore.h b/branches/sage/pgs/common/Semaphore.h deleted file mode 100644 index bc0a9e60d7ffa..0000000000000 --- a/branches/sage/pgs/common/Semaphore.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef _Sem_Posix_ -#define _Sem_Posix_ - -#include - -class Semaphore -{ - Mutex m; - Cond c; - int count; - - public: - - Semaphore() - { - count = 0; - } - - void Put() - { - m.Lock(); - count++; - c.Signal(); - m.Unlock(); - } - - void Get() - { - m.Lock(); - while(count <= 0) { - c.Wait(m); - } - count--; - m.Unlock(); - } -}; - -#endif // !_Mutex_Posix_ diff --git a/branches/sage/pgs/common/Thread.h b/branches/sage/pgs/common/Thread.h deleted file mode 100644 index 2fd81a6217733..0000000000000 --- a/branches/sage/pgs/common/Thread.h +++ /dev/null @@ -1,77 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __THREAD_H -#define __THREAD_H - -#include -#include - -class Thread { - private: - pthread_t thread_id; - - public: - Thread() : thread_id(0) {} - virtual ~Thread() {} - - protected: - virtual void *entry() = 0; - - private: - static void *_entry_func(void *arg) { - return ((Thread*)arg)->entry(); - } - - public: - pthread_t &get_thread_id() { return thread_id; } - bool is_started() { return thread_id != 0; } - bool am_self() { return (pthread_self() == thread_id); } - - int create() { - return pthread_create( &thread_id, NULL, _entry_func, (void*)this ); - } - int join(void **prval = 0) { - if (thread_id == 0) { - cerr << "WARNING: join on thread that was never started" << endl; - //assert(0); - return -EINVAL; // never started. 
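The Thread class being removed in this hunk is the usual static-trampoline idiom for pthread_create: a static member receives the object as void* and forwards to the virtual entry(). A standalone restatement of just that idiom; the Hello worker is a made-up example, and pthread_t is treated as an integral "not started" sentinel, as the original does:

    // thread_sketch.cc -- illustrative pthread trampoline; compile with -pthread.
    // Not the original Ceph Thread class.
    #include <pthread.h>
    #include <cerrno>
    #include <cstdio>

    class Thread {
      pthread_t id = 0;

      // pthread_create wants a plain function pointer, so a static member takes the
      // object back out of the void* argument and forwards to the virtual entry().
      static void* trampoline(void* arg) {
        return static_cast<Thread*>(arg)->entry();
      }

    protected:
      virtual void* entry() = 0;

    public:
      virtual ~Thread() = default;

      int create() { return pthread_create(&id, nullptr, &Thread::trampoline, this); }

      int join(void** ret = nullptr) {
        if (id == 0) return -EINVAL;   // never started
        int r = pthread_join(id, ret);
        id = 0;
        return r;
      }
    };

    struct Hello : Thread {
      void* entry() override {
        std::puts("hello from the worker thread");
        return nullptr;
      }
    };

    int main() {
      Hello h;
      h.create();
      h.join();
    }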
- } - - int status = pthread_join(thread_id, prval); - if (status != 0) { - switch (status) { - case -EINVAL: - cerr << "thread " << thread_id << " join status = EINVAL" << endl; - break; - case -ESRCH: - cerr << "thread " << thread_id << " join status = ESRCH" << endl; - assert(0); - break; - case -EDEADLK: - cerr << "thread " << thread_id << " join status = EDEADLK" << endl; - break; - default: - cerr << "thread " << thread_id << " join status = " << status << endl; - } - assert(0); // none of these should happen. - } - thread_id = 0; - return status; - } - -}; - -#endif diff --git a/branches/sage/pgs/common/ThreadPool.h b/branches/sage/pgs/common/ThreadPool.h deleted file mode 100644 index 5bc1a7fa57b5a..0000000000000 --- a/branches/sage/pgs/common/ThreadPool.h +++ /dev/null @@ -1,139 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef THREADPOOL -#define THREADPOOL - -#include -using namespace std; - - -#include -#include -#include -#include - - -// debug output -#include "config.h" -#define tpdout(x) if (x <= g_conf.debug) cout << myname -#define DBLVL 15 - - -using namespace std; - -#define MAX_THREADS 1000 - -template -class ThreadPool { - - private: - list q; - Mutex q_lock; - Semaphore q_sem; - - int num_ops; - int num_threads; - vector thread; - - U u; - void (*func)(U,T); - void (*prefunc)(U,T); - string myname; - - static void *foo(void *arg) - { - ThreadPool *t = (ThreadPool *)arg; - t->do_ops(arg); - return 0; - } - - void *do_ops(void *nothing) - { - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << endl; - while (1) { - q_sem.Get(); - if (q.empty()) break; - - T op = get_op(); - tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << endl; - func(u, op); - } - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << endl; - return 0; - } - - - T get_op() - { - T op; - q_lock.Lock(); - { - op = q.front(); - q.pop_front(); - num_ops--; - - if (prefunc && op) { - tpdout(DBLVL) << ".prefunc thread "<< pthread_self() << " on " << op << endl; - prefunc(u, op); - } - } - q_lock.Unlock(); - - return op; - } - - public: - - ThreadPool(char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) : - num_ops(0), num_threads(howmany), - thread(num_threads), - u(obj), - func(f), prefunc(pf), - myname(myname) { - tpdout(DBLVL) << ".cons num_threads " << num_threads << endl; - - // start threads - int status; - for(int i = 0; i < howmany; i++) { - status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this); - assert(status == 0); - } - } - - ~ThreadPool() { - // bump sem to make threads exit cleanly - for(int i = 0; i < num_threads; i++) - q_sem.Put(); - - // wait for them to die - for(int i = 0; i < num_threads; i++) { - tpdout(DBLVL) << ".des joining thread " << thread[i] << endl; - void *rval = 0; // we don't actually care - pthread_join(thread[i], &rval); - } - } - - void put_op(T op) { - tpdout(DBLVL) << ".put_op " << op << endl; - q_lock.Lock(); - q.push_back(op); - num_ops++; - q_sem.Put(); - q_lock.Unlock(); - } - -}; -#endif diff --git a/branches/sage/pgs/common/Timer.cc b/branches/sage/pgs/common/Timer.cc deleted file 
mode 100644 index 1ddf5d18e8bbf..0000000000000 --- a/branches/sage/pgs/common/Timer.cc +++ /dev/null @@ -1,334 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - - -#include "Timer.h" -#include "Cond.h" - -#include "config.h" -#include "include/Context.h" - -#undef dout -#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " TIMER " -#define derr(x) if (x <= g_conf.debug) cerr << g_clock.now() << " TIMER " - -#define DBL 10 - -#include -#include -#include - -// single global instance -Timer g_timer; - - - -/**** thread solution *****/ - -bool Timer::get_next_due(utime_t& when) -{ - if (scheduled.empty()) { - dout(10) << "get_next_due - nothing scheduled" << endl; - return false; - } else { - map< utime_t, set >::iterator it = scheduled.begin(); - when = it->first; - dout(10) << "get_next_due - " << when << endl; - return true; - } -} - - -void Timer::timer_entry() -{ - lock.Lock(); - - while (!thread_stop) { - - // now - utime_t now = g_clock.now(); - - // any events due? - utime_t next; - bool next_due = get_next_due(next); - - if (next_due && now >= next) { - // move to pending list - list pending; - - map< utime_t, set >::iterator it = scheduled.begin(); - while (it != scheduled.end()) { - if (it->first > now) break; - - utime_t t = it->first; - dout(DBL) << "queueing event(s) scheduled at " << t << endl; - - for (set::iterator cit = it->second.begin(); - cit != it->second.end(); - cit++) { - pending.push_back(*cit); - event_times.erase(*cit); - num_event--; - } - - map< utime_t, set >::iterator previt = it; - it++; - scheduled.erase(previt); - } - - if (!pending.empty()) { - sleeping = false; - lock.Unlock(); - { - // make sure we're not holding any locks while we do callbacks - // make the callbacks myself. - for (list::iterator cit = pending.begin(); - cit != pending.end(); - cit++) { - dout(DBL) << "start callback " << *cit << endl; - (*cit)->finish(0); - dout(DBL) << "finish callback " << *cit << endl; - delete *cit; - } - pending.clear(); - assert(pending.empty()); - } - lock.Lock(); - } - - } - else { - // sleep - if (next_due) { - dout(DBL) << "sleeping until " << next << endl; - timed_sleep = true; - sleeping = true; - timeout_cond.WaitUntil(lock, next); // wait for waker or time - utime_t now = g_clock.now(); - dout(DBL) << "kicked or timed out at " << now << endl; - } else { - dout(DBL) << "sleeping" << endl; - timed_sleep = false; - sleeping = true; - sleep_cond.Wait(lock); // wait for waker - utime_t now = g_clock.now(); - dout(DBL) << "kicked at " << now << endl; - } - } - } - - lock.Unlock(); -} - - - -/** - * Timer bits - */ - -void Timer::register_timer() -{ - if (timer_thread.is_started()) { - if (sleeping) { - dout(DBL) << "register_timer kicking thread" << endl; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - } else { - dout(DBL) << "register_timer doing nothing; thread is awake" << endl; - // it's probably doing callbacks. 
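The timer loop in this file keeps callbacks in a map ordered by due time, sleeps until the earliest deadline (or indefinitely when nothing is scheduled), and runs expired callbacks with the lock dropped so they can reschedule. A sketch of that loop with std::chrono, std::function, and std::condition_variable standing in for utime_t, Context, and the hand-rolled Cond; TimerSketch and its interface are invented for illustration, not the original API:

    // timer_sketch.cc -- illustrative scheduler loop, not the original Timer.
    #include <chrono>
    #include <condition_variable>
    #include <functional>
    #include <iostream>
    #include <map>
    #include <mutex>
    #include <thread>
    #include <vector>

    using Clock = std::chrono::steady_clock;

    class TimerSketch {
      std::multimap<Clock::time_point, std::function<void()>> scheduled;
      std::mutex lock;
      std::condition_variable cond;
      bool stopping = false;
      std::thread worker{[this] { run(); }};

      void run() {
        std::unique_lock<std::mutex> l(lock);
        while (!stopping) {
          if (scheduled.empty()) {
            cond.wait(l);                              // nothing due: sleep until kicked
            continue;
          }
          auto next = scheduled.begin()->first;
          if (Clock::now() < next) {
            cond.wait_until(l, next);                  // sleep until the earliest deadline
            continue;
          }
          // Collect everything that is due, then run it with the lock dropped,
          // mirroring the "pending list" step in the original loop.
          std::vector<std::function<void()>> pending;
          while (!scheduled.empty() && scheduled.begin()->first <= Clock::now()) {
            pending.push_back(std::move(scheduled.begin()->second));
            scheduled.erase(scheduled.begin());
          }
          l.unlock();
          for (auto& cb : pending) cb();
          l.lock();
        }
      }

    public:
      ~TimerSketch() {
        { std::lock_guard<std::mutex> g(lock); stopping = true; }
        cond.notify_all();
        worker.join();
      }

      void add_event_after(double seconds, std::function<void()> cb) {
        auto when = Clock::now() + std::chrono::duration_cast<Clock::duration>(
                                       std::chrono::duration<double>(seconds));
        { std::lock_guard<std::mutex> g(lock); scheduled.emplace(when, std::move(cb)); }
        cond.notify_all();                             // kick the sleeper so it re-evaluates
      }
    };

    int main() {
      TimerSketch t;
      t.add_event_after(0.1, [] { std::cout << "tick\n"; });
      std::this_thread::sleep_for(std::chrono::milliseconds(200));
    }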
- } - } else { - dout(DBL) << "register_timer starting thread" << endl; - timer_thread.create(); - } -} - -void Timer::cancel_timer() -{ - // clear my callback pointers - if (timer_thread.is_started()) { - dout(10) << "setting thread_stop flag" << endl; - lock.Lock(); - thread_stop = true; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - lock.Unlock(); - - dout(10) << "waiting for thread to finish" << endl; - void *ptr; - timer_thread.join(&ptr); - - dout(10) << "thread finished, exit code " << ptr << endl; - } -} - - -/* - * schedule - */ - - -void Timer::add_event_after(float seconds, - Context *callback) -{ - utime_t when = g_clock.now(); - when.sec_ref() += (int)seconds; - add_event_at(when, callback); -} - -void Timer::add_event_at(utime_t when, - Context *callback) -{ - lock.Lock(); - - dout(DBL) << "add_event " << callback << " at " << when << endl; - - // insert - scheduled[when].insert(callback); - assert(event_times.count(callback) == 0); - event_times[callback] = when; - - num_event++; - - // make sure i wake up on time - register_timer(); - - lock.Unlock(); -} - -bool Timer::cancel_event(Context *callback) -{ - lock.Lock(); - - dout(DBL) << "cancel_event " << callback << endl; - - if (!event_times.count(callback)) { - dout(DBL) << "cancel_event " << callback << " isn't scheduled (probably executing)" << endl; - lock.Unlock(); - return false; // wasn't scheduled. - } - - utime_t tp = event_times[callback]; - event_times.erase(callback); - - assert(scheduled.count(tp)); - assert(scheduled[tp].count(callback)); - scheduled[tp].erase(callback); - if (scheduled[tp].empty()) - scheduled.erase(tp); - - lock.Unlock(); - - // delete the canceled event. - delete callback; - - return true; -} - - -// ------------------------------- - -void SafeTimer::add_event_after(float seconds, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_after wrapping " << c << " with " << w << endl; - scheduled[c] = w; - g_timer.add_event_after(seconds, w); -} - -void SafeTimer::add_event_at(utime_t when, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_at wrapping " << c << " with " << w << endl; - scheduled[c] = w; - g_timer.add_event_at(when, w); -} - -void SafeTimer::EventWrapper::finish(int r) -{ - timer->lock.Lock(); - if (timer->scheduled.count(actual)) { - // still scheduled. execute. - actual->finish(r); - timer->scheduled.erase(actual); - } else { - // i was canceled. - assert(timer->canceled.count(actual)); - } - - // did i get canceled? - // (this can happen even if i just executed above. e.g., i may have canceled myself.) - if (timer->canceled.count(actual)) { - timer->canceled.erase(actual); - timer->cond.Signal(); - } - - // delete the original event - delete actual; - - timer->lock.Unlock(); -} - -void SafeTimer::cancel_event(Context *c) -{ - assert(lock.is_locked()); - assert(scheduled.count(c)); - - if (g_timer.cancel_event(scheduled[c])) { - // hosed wrapper. hose original event too. - delete c; - } else { - // clean up later. 
- canceled[c] = scheduled[c]; - } - scheduled.erase(c); -} - -void SafeTimer::cancel_all() -{ - assert(lock.is_locked()); - - while (!scheduled.empty()) - cancel_event(scheduled.begin()->first); -} - -void SafeTimer::join() -{ - assert(lock.is_locked()); - assert(scheduled.empty()); - - while (!canceled.empty()) { - // wait - dout(-10) << "SafeTimer.join waiting for " << canceled.size() << " to join" << endl; - dout(-10) << canceled << endl; - cond.Wait(lock); - } -} - -SafeTimer::~SafeTimer() -{ - if (!scheduled.empty() && !canceled.empty()) { - derr(0) << "SafeTimer.~SafeTimer " << scheduled.size() << " events scheduled, " - << canceled.size() << " canceled but unflushed" - << endl; - } -} diff --git a/branches/sage/pgs/common/Timer.h b/branches/sage/pgs/common/Timer.h deleted file mode 100644 index 46095d98312e1..0000000000000 --- a/branches/sage/pgs/common/Timer.h +++ /dev/null @@ -1,174 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TIMER_H -#define __TIMER_H - -#include "include/types.h" -#include "include/Context.h" -#include "Clock.h" - -#include "Mutex.h" -#include "Cond.h" -#include "Thread.h" - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -/*** Timer - * schedule callbacks - */ - -//class Messenger; - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Context *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} - - -class Timer { - private: - map< utime_t, set > scheduled; // time -> (context ...) - hash_map< Context*, utime_t > event_times; // event -> time - - bool get_next_due(utime_t &when); - - void register_timer(); // make sure i get a callback - void cancel_timer(); // make sure i get a callback - - bool thread_stop; - Mutex lock; - bool timed_sleep; - bool sleeping; - Cond sleep_cond; - Cond timeout_cond; - - public: - void timer_entry(); // waiter thread (that wakes us up) - - class TimerThread : public Thread { - Timer *t; - public: - void *entry() { - t->timer_entry(); - return 0; - } - TimerThread(Timer *_t) : t(_t) {} - } timer_thread; - - - int num_event; - - - public: - Timer() : - thread_stop(false), - timed_sleep(false), - sleeping(false), - timer_thread(this), - num_event(0) - { - } - ~Timer() { - // stop. - cancel_timer(); - - // scheduled - for (map< utime_t, set >::iterator it = scheduled.begin(); - it != scheduled.end(); - it++) { - for (set::iterator sit = it->second.begin(); - sit != it->second.end(); - sit++) - delete *sit; - } - scheduled.clear(); - } - - void init() { - register_timer(); - } - void shutdown() { - cancel_timer(); - } - - // schedule events - void add_event_after(float seconds, - Context *callback); - void add_event_at(utime_t when, - Context *callback); - bool cancel_event(Context *callback); - - // execute pending events - void execute_pending(); - -}; - - -/* - * SafeTimer is a wrapper around the raw Timer (or rather, g_timer, it's global - * instantiation) that protects event execution with an existing mutex. It - * provides for, among other things, reliable event cancellation on class - * destruction. 
The caller just needs to cancel each event (or cancel_all()), - * and then call join() to ensure any concurrently exectuting events (in other - * threads) get flushed. - */ -class SafeTimer { - Mutex& lock; - Cond cond; - map scheduled; // actual -> wrapper - map canceled; - - class EventWrapper : public Context { - SafeTimer *timer; - Context *actual; - public: - EventWrapper(SafeTimer *st, Context *c) : timer(st), - actual(c) {} - void finish(int r); - }; - -public: - SafeTimer(Mutex& l) : lock(l) { } - ~SafeTimer(); - - void add_event_after(float seconds, Context *c); - void add_event_at(utime_t when, Context *c); - void cancel_event(Context *c); - void cancel_all(); - void join(); - - int get_num_scheduled() { return scheduled.size(); } - int get_num_canceled() { return canceled.size(); } -}; - - -// single global instance -extern Timer g_timer; - - - -#endif diff --git a/branches/sage/pgs/config.cc b/branches/sage/pgs/config.cc deleted file mode 100644 index fdefb01895b09..0000000000000 --- a/branches/sage/pgs/config.cc +++ /dev/null @@ -1,903 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "config.h" -#include "include/types.h" - -//#define MDS_CACHE_SIZE 4*10000 -> <20mb -//#define MDS_CACHE_SIZE 80000 62mb - -#define AVG_PER_INODE_SIZE 450 -#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) - -//#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) -//#define MDS_CACHE_SIZE 1500000 -#define MDS_CACHE_SIZE 150000 - - -// hack hack hack ugly FIXME -#include "common/Mutex.h" -long buffer_total_alloc = 0; -Mutex bufferlock; - -#include "osd/osd_types.h" -Mutex _dout_lock; - -FileLayout g_OSD_FileLayout( 1<<23, 1, 1<<23, pg_t::TYPE_REP, 2 ); // 8M objects, 2x replication -FileLayout g_OSD_MDDirLayout( 1<<23, 1, 1<<23, pg_t::TYPE_REP, 2 ); // 8M objects, 2x replication. (a lie, just object layout policy) -FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, pg_t::TYPE_REP, 2 ); // 1M objects -FileLayout g_OSD_MDAnchorTableLayout( 1<<20, 1, 1<<20, pg_t::TYPE_REP, 2 ); // 1M objects. 
(a lie, just object layout policy) - -#include - -// fake osd failures: osd -> time -std::map g_fake_kill_after; -std::map g_fake_osd_down; -std::map g_fake_osd_out; - -entity_addr_t g_my_addr; - -md_config_t g_debug_after_conf; - -md_config_t g_conf = { - num_mon: 1, - num_mds: 1, - num_osd: 4, - num_client: 1, - - mkfs: false, - - // profiling and debugging - log: true, - log_interval: 1, - log_name: (char*)0, - - log_messages: true, - log_pins: true, - - fake_clock: false, - fakemessenger_serialize: true, - - fake_osdmap_expand: 0, - fake_osdmap_updates: 0, - fake_osd_mttf: 0, - fake_osd_mttr: 0, - - osd_remount_at: 0, - - kill_after: 0, - - tick: 0, - - debug: 0, - debug_mds: 1, - debug_mds_balancer: 1, - debug_mds_log: 1, - debug_buffer: 0, - debug_filer: 0, - debug_objecter: 0, - debug_objectcacher: 0, - debug_client: 0, - debug_osd: 0, - debug_ebofs: 1, - debug_bdev: 1, // block device - debug_ns: 0, - debug_ms: 0, - debug_mon: 0, - debug_paxos: 0, - - debug_after: 0, - - // -- misc -- - use_abspaths: false, // make monitorstore et al use absolute path (to workaround FUSE chdir("/")) - - // --- clock --- - clock_lock: false, - clock_tare: false, - - // --- messenger --- - ms_single_dispatch: false, - ms_requeue_on_sender_fail: false, - - ms_stripe_osds: false, - ms_skip_rank0: false, - ms_overlay_clients: false, - - ms_die_on_failure: false, - - /*tcp_skip_rank0: false, - tcp_overlay_clients: false, // over osds! - tcp_log: false, - tcp_serial_marshall: true, - tcp_serial_out: false, - tcp_multi_out: true, - tcp_multi_dispatch: false, // not fully implemented yet - */ - - // --- mon --- - mon_tick_interval: 5, - mon_osd_down_out_interval: 5, // seconds - mon_lease: 5, // seconds // lease interval - mon_lease_renew_interval: 3, // on leader, to renew the lease - mon_lease_ack_timeout: 10.0, // on leader, if lease isn't acked by all peons - mon_lease_timeout: 10.0, // on peon, if lease isn't extended - mon_accept_timeout: 10.0, // on leader, if paxos update isn't accepted - mon_stop_on_last_unmount: false, - mon_stop_with_last_mds: false, - - // --- client --- - client_cache_size: 300, - client_cache_mid: .5, - client_cache_stat_ttl: 0, // seconds until cached stat results become invalid - client_cache_readdir_ttl: 1, // 1 second only - client_use_random_mds: false, - - client_sync_writes: 0, - - client_oc: true, - client_oc_size: 1024*1024* 5, // MB * n - client_oc_max_dirty: 1024*1024* 5, // MB * n - client_oc_max_sync_write: 128*1024, // writes >= this use wrlock - - client_mount_timeout: 10.0, // retry every N seconds - - client_hack_balance_reads: false, - - client_trace: 0, - fuse_direct_io: 0, - - // --- objecter --- - objecter_buffer_uncommitted: true, - - // --- journaler --- - journaler_allow_split_entries: true, - journaler_safe: false, // wait for COMMIT on journal writes - journaler_write_head_interval: 15, - - // --- mds --- - mds_cache_size: MDS_CACHE_SIZE, - mds_cache_mid: .7, - - mds_decay_halflife: 30, - - mds_beacon_interval: 5, //30.0, - mds_beacon_grace: 15, //60*60.0, - - mds_log: true, - mds_log_max_len: MDS_CACHE_SIZE / 3, - mds_log_max_trimming: 10000, - mds_log_read_inc: 1<<20, - mds_log_pad_entry: 128,//256,//64, - mds_log_flush_on_shutdown: true, - mds_log_subtree_map_interval: 128*1024, // frequency (in bytes) of EImportMap in log - mds_log_eopen_size: 100, // # open inodes per log entry - - mds_bal_replicate_threshold: 2000, - mds_bal_unreplicate_threshold: 0,//500, - mds_bal_hash_rd: 10000, - mds_bal_unhash_rd: 1000, - mds_bal_hash_wr: 10000, - 
mds_bal_unhash_wr: 1000, - mds_bal_interval: 30, // seconds - mds_bal_hash_interval: 5, // seconds - mds_bal_idle_threshold: .1, - mds_bal_max: -1, - mds_bal_max_until: -1, - - mds_bal_mode: 0, - mds_bal_min_start: .2, // if we need less than this, we don't do anything - mds_bal_need_min: .8, // take within this range of what we need - mds_bal_need_max: 1.2, - mds_bal_midchunk: .3, // any sub bigger than this taken in full - mds_bal_minchunk: .001, // never take anything smaller than this - - mds_trim_on_rejoin: true, - mds_commit_on_shutdown: true, - mds_shutdown_check: 0, //30, - - mds_verify_export_dirauth: true, - - mds_local_osd: false, - - mds_thrash_exports: 0, - mds_dump_cache_on_map: false, - mds_dump_cache_after_rejoin: true, - - // --- osd --- - osd_rep: OSD_REP_PRIMARY, - - osd_balance_reads: false, - osd_immediate_read_from_cache: true, // osds to read from the cache immediately? - osd_exclusive_caching: true, // replicas evict replicated writes - osd_load_diff_percent: 20, // load diff for read forwarding - osd_flash_crowd_iat_threshold: 100, - osd_flash_crowd_iat_alpha: 0.125, - - osd_pg_bits: 0, // 0 == let osdmonitor decide - osd_object_layout: OBJECT_LAYOUT_HASHINO, - osd_pg_layout: PG_LAYOUT_CRUSH, - osd_max_rep: 4, - osd_min_raid_width: 4, - osd_max_raid_width: 6, - - osd_maxthreads: 2, // 0 == no threading - osd_max_opq: 10, - osd_mkfs: false, - osd_age: .8, - osd_age_time: 0, - osd_heartbeat_interval: 5, // shut up while i'm debugging - osd_replay_window: 5, - osd_max_pull: 2, - osd_pad_pg_log: false, - - // --- fakestore --- - fakestore_fake_sync: 2, // 2 seconds - fakestore_fsync: false,//true, - fakestore_writesync: false, - fakestore_syncthreads: 4, - fakestore_fake_attrs: false, - fakestore_fake_collections: false, - fakestore_dev: 0, - - // --- ebofs --- - ebofs: 1, - ebofs_cloneable: false, - ebofs_verify: false, - ebofs_commit_ms: 2000, // 0 = no forced commit timeout (for debugging/tracing) - ebofs_idle_commit_ms: 20, // 0 = no idle detection. use this -or- bdev_idle_kick_after_ms - ebofs_oc_size: 10000, // onode cache - ebofs_cc_size: 10000, // cnode cache - ebofs_bc_size: (80 *256), // 4k blocks, *256 for MB - ebofs_bc_max_dirty: (60 *256), // before write() will block - ebofs_max_prefetch: 1000, // 4k blocks - ebofs_realloc: true, - - ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind) - ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation) - - // --- obfs --- - uofs: 0, - uofs_fake_sync: 2, // 2 seconds - uofs_cache_size: 1 << 28, //256MB - uofs_onode_size: (int)1024, - uofs_small_block_size: (int)4096, //4KB - uofs_large_block_size: (int)524288, //512KB - uofs_segment_size: (int)268435456, //256MB - uofs_block_meta_ratio: (int)10, - uofs_sync_write: (int)0, - uofs_nr_hash_buckets: (int)1023, - uofs_flush_interval: (int)5, //seconds - uofs_min_flush_pages: (int)1024, //4096 4k-pages - uofs_delay_allocation: (int)1, //true - - // --- block device --- - bdev_lock: true, - bdev_iothreads: 1, // number of ios to queue with kernel - bdev_idle_kick_after_ms: 0,//100, // ms ** FIXME ** this seems to break things, not sure why yet ** - bdev_el_fw_max_ms: 10000, // restart elevator at least once every 1000 ms - bdev_el_bw_max_ms: 3000, // restart elevator at least once every 300 ms - bdev_el_bidir: true, // bidirectional elevator? 
- bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call - bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps - bdev_fake_mb: 0, - bdev_fake_max_mb: 0, - - // --- fakeclient (mds regression testing) (ancient history) --- - num_fakeclient: 100, - fakeclient_requests: 100, - fakeclient_deterministic: false, - - fakeclient_op_statfs: false, - - // loosely based on Roselli workload paper numbers - fakeclient_op_stat: 610, - fakeclient_op_lstat: false, - fakeclient_op_utime: 0, - fakeclient_op_chmod: 1, - fakeclient_op_chown: 1, - - fakeclient_op_readdir: 2, - fakeclient_op_mknod: 30, - fakeclient_op_link: false, - fakeclient_op_unlink: 20, - fakeclient_op_rename: 0,//40, - - fakeclient_op_mkdir: 10, - fakeclient_op_rmdir: 20, - fakeclient_op_symlink: 20, - - fakeclient_op_openrd: 200, - fakeclient_op_openwr: 0, - fakeclient_op_openwrc: 0, - fakeclient_op_read: false, // osd! - fakeclient_op_write: false, // osd! - fakeclient_op_truncate: false, - fakeclient_op_fsync: false, - fakeclient_op_close: 200 - -#ifdef USE_OSBDB - , - bdbstore: false, - debug_bdbstore: 1, - bdbstore_btree: false, - bdbstore_ffactor: 0, - bdbstore_nelem: 0, - bdbstore_pagesize: 0, - bdbstore_cachesize: 0, - bdbstore_transactional: false -#endif // USE_OSBDB -}; - - -#include -#include - - -void env_to_vec(std::vector& args) -{ - const char *p = getenv("CEPH_ARGS"); - if (!p) return; - - static char buf[1000]; - int len = strlen(p); - memcpy(buf, p, len); - buf[len] = 0; - //cout << "CEPH_ARGS " << buf << endl; - - int l = 0; - for (int i=0; i& args) -{ - for (int i=1; i& args, - int& argc, char **&argv) -{ - argv = (char**)malloc(sizeof(char*) * argc); - argc = 1; - argv[0] = "asdf"; - - for (unsigned i=0; i= '0' && *s <= '9') { - int digit = *s - '0'; - //cout << "digit " << digit << endl; - val *= 10; - val += digit; - numdigits++; - s++; off++; - } - //cout << "val " << val << endl; - - if (numdigits == 0) { - cerr << "no digits at off " << off << endl; - return false; // no digits - } - if (count < 3 && *s != '.') { - cerr << "should period at " << off << endl; - return false; // should have 3 periods - } - if (count == 3 && *s != ':') { - cerr << "expected : at " << off << endl; - return false; // then a colon - } - s++; off++; - - if (count <= 3) - a.ipq[count] = val; - else - a.port = val; - - count++; - if (count == 5) break; - } - - return true; -} - - - -void parse_config_options(std::vector& args) -{ - std::vector nargs; - - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __CONFIG_H -#define __CONFIG_H - -extern class FileLayout g_OSD_FileLayout; -extern class FileLayout g_OSD_MDDirLayout; -extern class FileLayout g_OSD_MDLogLayout; -extern class FileLayout g_OSD_MDAnchorTableLayout; - -#include -#include - -#include "common/Mutex.h" - -extern std::map g_fake_osd_down; -extern std::map g_fake_osd_out; - -#define OSD_REP_PRIMARY 0 -#define OSD_REP_SPLAY 1 -#define OSD_REP_CHAIN 2 - - -#include "msg/msg_types.h" - -extern entity_addr_t g_my_addr; - -struct md_config_t { - int num_mon; - int num_mds; - int num_osd; - int num_client; - - bool mkfs; - - // profiling - bool log; - int log_interval; - char *log_name; - - bool log_messages; - bool log_pins; - - bool fake_clock; - bool fakemessenger_serialize; - - int fake_osdmap_expand; - int fake_osdmap_updates; - int fake_osd_mttf; - int fake_osd_mttr; - - int osd_remount_at; - - int kill_after; - - int tick; - - int debug; - int debug_mds; - int debug_mds_balancer; - int debug_mds_log; - int debug_buffer; - int debug_filer; - int debug_objecter; - int debug_objectcacher; - int debug_client; - int debug_osd; - int debug_ebofs; - int debug_bdev; - int debug_ns; - int debug_ms; - int debug_mon; - int debug_paxos; - - int debug_after; - - // misc - bool use_abspaths; - - // clock - bool clock_lock; - bool clock_tare; - - // messenger - - /*bool tcp_skip_rank0; - bool tcp_overlay_clients; - bool tcp_log; - bool tcp_serial_marshall; - bool tcp_serial_out; - bool tcp_multi_out; - bool tcp_multi_dispatch; - */ - - bool ms_single_dispatch; - bool ms_requeue_on_sender_fail; - - bool ms_stripe_osds; - bool ms_skip_rank0; - bool ms_overlay_clients; - bool ms_die_on_failure; - - // mon - int mon_tick_interval; - int mon_osd_down_out_interval; - float mon_lease; - float mon_lease_renew_interval; - float mon_lease_ack_timeout; - float mon_lease_timeout; - float mon_accept_timeout; - bool mon_stop_on_last_unmount; - bool mon_stop_with_last_mds; - - // client - int client_cache_size; - float client_cache_mid; - int client_cache_stat_ttl; - int client_cache_readdir_ttl; - bool client_use_random_mds; // debug flag - - bool client_sync_writes; - - bool client_oc; - int client_oc_size; - int client_oc_max_dirty; - size_t client_oc_max_sync_write; - - double client_mount_timeout; - - // hack - bool client_hack_balance_reads; - - - /* - bool client_bcache; - int client_bcache_alloc_minsize; - int client_bcache_alloc_maxsize; - int client_bcache_ttl; - off_t client_bcache_size; - int client_bcache_lowater; - int client_bcache_hiwater; - size_t client_bcache_align; - */ - - int client_trace; - int fuse_direct_io; - - // objecter - bool objecter_buffer_uncommitted; - - // journaler - bool journaler_allow_split_entries; - bool journaler_safe; - int journaler_write_head_interval; - - // mds - int mds_cache_size; - float mds_cache_mid; - - float mds_decay_halflife; - - float mds_beacon_interval; - float mds_beacon_grace; - - bool mds_log; - int mds_log_max_len; - int mds_log_max_trimming; - int mds_log_read_inc; - int mds_log_pad_entry; - bool mds_log_flush_on_shutdown; - off_t mds_log_subtree_map_interval; - int mds_log_eopen_size; - - float mds_bal_replicate_threshold; - float mds_bal_unreplicate_threshold; - float mds_bal_hash_rd; - float mds_bal_unhash_rd; - float mds_bal_hash_wr; - float mds_bal_unhash_wr; - int mds_bal_interval; - int mds_bal_hash_interval; - float mds_bal_idle_threshold; - int mds_bal_max; - int mds_bal_max_until; - - int mds_bal_mode; - float mds_bal_min_start; - float mds_bal_need_min; - float 
mds_bal_need_max; - float mds_bal_midchunk; - float mds_bal_minchunk; - - bool mds_trim_on_rejoin; - bool mds_commit_on_shutdown; - int mds_shutdown_check; - - bool mds_verify_export_dirauth; // debug flag - - bool mds_local_osd; - - int mds_thrash_exports; - bool mds_dump_cache_on_map; - bool mds_dump_cache_after_rejoin; - - // osd - int osd_rep; - - bool osd_balance_reads; - bool osd_immediate_read_from_cache; - bool osd_exclusive_caching; - int osd_load_diff_percent; - int osd_flash_crowd_iat_threshold; // flash crowd interarrival time threshold in ms - double osd_flash_crowd_iat_alpha; - - int osd_pg_bits; - int osd_object_layout; - int osd_pg_layout; - int osd_max_rep; - int osd_min_raid_width; - int osd_max_raid_width; - int osd_maxthreads; - int osd_max_opq; - bool osd_mkfs; - float osd_age; - int osd_age_time; - int osd_heartbeat_interval; - int osd_replay_window; - int osd_max_pull; - bool osd_pad_pg_log; - - int fakestore_fake_sync; - bool fakestore_fsync; - bool fakestore_writesync; - int fakestore_syncthreads; // such crap - bool fakestore_fake_attrs; - bool fakestore_fake_collections; - char *fakestore_dev; - - // ebofs - int ebofs; - bool ebofs_cloneable; - bool ebofs_verify; - int ebofs_commit_ms; - int ebofs_idle_commit_ms; - int ebofs_oc_size; - int ebofs_cc_size; - off_t ebofs_bc_size; - off_t ebofs_bc_max_dirty; - unsigned ebofs_max_prefetch; - bool ebofs_realloc; - - bool ebofs_abp_zero; - size_t ebofs_abp_max_alloc; - - int uofs; - int uofs_fake_sync; - int uofs_cache_size; - int uofs_onode_size; - int uofs_small_block_size; - int uofs_large_block_size; - int uofs_segment_size; - int uofs_block_meta_ratio; - int uofs_sync_write; - - int uofs_nr_hash_buckets; - int uofs_flush_interval; - int uofs_min_flush_pages; - int uofs_delay_allocation; - - // block device - bool bdev_lock; - int bdev_iothreads; - int bdev_idle_kick_after_ms; - int bdev_el_fw_max_ms; - int bdev_el_bw_max_ms; - bool bdev_el_bidir; - int bdev_iov_max; - bool bdev_debug_check_io_overlap; - int bdev_fake_mb; - int bdev_fake_max_mb; - - // fake client - int num_fakeclient; - unsigned fakeclient_requests; - bool fakeclient_deterministic; // debug flag - - int fakeclient_op_statfs; - - int fakeclient_op_stat; - int fakeclient_op_lstat; - int fakeclient_op_utime; - int fakeclient_op_chmod; - int fakeclient_op_chown; - - int fakeclient_op_readdir; - int fakeclient_op_mknod; - int fakeclient_op_link; - int fakeclient_op_unlink; - int fakeclient_op_rename; - - int fakeclient_op_mkdir; - int fakeclient_op_rmdir; - int fakeclient_op_symlink; - - int fakeclient_op_openrd; - int fakeclient_op_openwr; - int fakeclient_op_openwrc; - int fakeclient_op_read; - int fakeclient_op_write; - int fakeclient_op_truncate; - int fakeclient_op_fsync; - int fakeclient_op_close; - -#ifdef USE_OSBDB - bool bdbstore; - int debug_bdbstore; - bool bdbstore_btree; - int bdbstore_ffactor; - int bdbstore_nelem; - int bdbstore_pagesize; - int bdbstore_cachesize; - bool bdbstore_transactional; -#endif // USE_OSBDB -}; - -extern md_config_t g_conf; -extern md_config_t g_debug_after_conf; - - -/** - * debug output framework - */ -#define dout(x) if ((x) <= g_conf.debug) std::cout -#define dout2(x) if ((x) <= g_conf.debug) std::cout - -#define pdout(x,p) if ((x) <= (p)) std::cout - -/** - * for cleaner output, bracket each line with - * dbeginl (in the dout macro) and dendl (in place of endl). 
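The dbeginl/dendl manipulators described in the comment above make each debug line atomic: streaming the first one takes a global mutex, streaming the second prints the newline and releases it. A freestanding sketch of that bracketing trick; dbegin, dend, and dout_mutex are names invented for the sketch, not the originals:

    // dout_sketch.cc -- illustrative line-locking stream manipulators.
    #include <iostream>
    #include <mutex>

    static std::mutex dout_mutex;   // one lock shared by every debug line

    struct dbegin_t {};
    struct dend_t {};
    static const dbegin_t dbegin{};
    static const dend_t dend{};

    // Streaming dbegin takes the lock; streaming dend prints the newline and releases it,
    // so everything in between appears as one uninterrupted line even with many threads.
    inline std::ostream& operator<<(std::ostream& out, dbegin_t) {
      dout_mutex.lock();
      return out;
    }
    inline std::ostream& operator<<(std::ostream& out, dend_t) {
      out << std::endl;
      dout_mutex.unlock();
      return out;
    }

    int main() {
      std::cout << dbegin << "pid " << 42 << " says hello" << dend;
    }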
- */ -extern Mutex _dout_lock; -struct _dbeginl_t { - _dbeginl_t(int) {} -}; -struct _dendl_t { - _dendl_t(int) {} -}; -static const _dbeginl_t dbeginl = 0; -static const _dendl_t dendl = 0; - -inline ostream& operator<<(ostream& out, _dbeginl_t) { - _dout_lock.Lock(); - return out; -} -inline ostream& operator<<(ostream& out, _dendl_t) { - out << endl; - _dout_lock.Unlock(); - return out; -} - - -/** - * command line / environment argument parsing - */ -void env_to_vec(std::vector& args); -void argv_to_vec(int argc, char **argv, - std::vector& args); -void vec_to_argv(std::vector& args, - int& argc, char **&argv); - -void parse_config_options(std::vector& args); - -extern bool parse_ip_port(const char *s, entity_addr_t& addr); - - - -#endif diff --git a/branches/sage/pgs/cosd.cc b/branches/sage/pgs/cosd.cc deleted file mode 100644 index 4f3c8ab71a19f..0000000000000 --- a/branches/sage/pgs/cosd.cc +++ /dev/null @@ -1,135 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "osd/OSD.h" -#include "ebofs/Ebofs.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.clock_tare) g_clock.tare(); - - // osd specific args - char *dev = 0; - char dev_default[20]; - int whoami = -1; - for (unsigned i=0; imount(); - int r = store->read(object_t(0,0), 0, sizeof(sb), bl); - if (r < 0) { - cerr << "couldn't read superblock object on " << dev << endl; - exit(0); - } - bl.copy(0, sizeof(sb), (char*)&sb); - store->umount(); - delete store; - whoami = sb.whoami; - - cout << "osd fs says i am osd" << whoami << endl; - } else { - cout << "command line arg says i am osd" << whoami << endl; - } - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start osd - Messenger *m = rank.register_entity(MSG_ADDR_OSD(whoami)); - assert(m); - OSD *osd = new OSD(whoami, m, &monmap, dev); - osd->init(); - - // wait - rank.wait(); - - // done - delete osd; - - return 0; -} - diff --git a/branches/sage/pgs/crush/BinaryTree.h b/branches/sage/pgs/crush/BinaryTree.h deleted file mode 100644 index c9b91e19db970..0000000000000 --- a/branches/sage/pgs/crush/BinaryTree.h +++ /dev/null @@ -1,285 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * 
Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BINARYTREE_H -#define __crush_BINARYTREE_H - -#include -#include -#include -#include -//#include -using namespace std; - -#include "include/buffer.h" - -namespace crush { - - class BinaryTree { - private: - // tree def - int root_node; // 0 for empty tree. - int alloc; - vector node_nested; // all existing nodes in this map - vector node_weight; // and this one - vector node_complete; // only nodes with all possible children - - public: - BinaryTree() : root_node(0), alloc(0) {} - - void _encode(bufferlist& bl) { - bl.append((char*)&root_node, sizeof(root_node)); - bl.append((char*)&alloc, sizeof(alloc)); - ::_encode(node_nested, bl); - ::_encode(node_weight, bl); - ::_encode(node_complete, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(root_node), (char*)&root_node); - off += sizeof(root_node); - bl.copy(off, sizeof(alloc), (char*)&alloc); - off += sizeof(alloc); - ::_decode(node_nested, bl, off); - ::_decode(node_weight, bl, off); - ::_decode(node_complete, bl, off); - } - - // accessors - bool empty() const { return root_node == 0; } - bool exists(int n) const { return n < alloc && node_nested[n]; } - int nested(int n) const { return exists(n) ? node_nested[n]:0; } - float weight(int n) const { return exists(n) ? node_weight[n]:0; } - bool complete(int n) const { return exists(n) ? node_complete[n]:false; } - - int root() const { return root_node; } - - void realloc(int n) { - /* - while (alloc <= n) { - node_nested.push_back(0); - node_weight.push_back(0); - node_complete.push_back(0); - alloc++; - } - */ - if (alloc <= n) { - int add = n - alloc + 1; - node_nested.insert(node_nested.end(), add, 0); - node_weight.insert(node_weight.end(), add, 0); - node_complete.insert(node_complete.end(), add, 0); - alloc = n+1; - } - } - - // tree navigation - bool terminal(int n) const { return n & 1; } // odd nodes are leaves. 
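The node numbering used by this tree packs the structure into a flat integer space: odd ids are leaves, a node's height is its count of trailing zero bits, and an internal node n has children n - 2^(h-1) and n + 2^(h-1). A tiny self-contained check of that arithmetic, written against the same convention but sharing no code with the class:

    // tree_numbering_sketch.cc -- verifies the leaf/child arithmetic, standalone.
    #include <cassert>
    #include <iostream>

    bool terminal(int n) { return n & 1; }          // odd ids are leaves
    int height(int n) {                             // number of trailing zero bits
      int h = 0;
      while ((n & 1) == 0) { ++h; n >>= 1; }
      return h;
    }
    // Only meaningful for internal (even) nodes.
    int left(int n)  { return n - (1 << (height(n) - 1)); }
    int right(int n) { return n + (1 << (height(n) - 1)); }

    int main() {
      // Height-2 root of a 7-node subtree: 4 -> children 2 and 6 -> leaves 1,3,5,7.
      assert(left(4) == 2 && right(4) == 6);
      assert(left(2) == 1 && right(2) == 3);
      assert(left(6) == 5 && right(6) == 7);
      for (int n = 1; n <= 7; ++n)
        std::cout << n << (terminal(n) ? " leaf" : " internal")
                  << " height " << height(n) << "\n";
    }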
- int height(int n) const { - assert(n); - int h = 0; - while ((n & 1) == 0) { - assert(n > 0); - h++; n = n >> 1; - } - return h; - } - int left(int n) const { - int h = height(n); - //cout << "left of " << n << " is " << (n - (1 << h)) << endl; - return n - (1 << (h-1)); - } - int right(int n) const { - int h = height(n); - //cout << "right of " << n << " is " << (n + (1 << h)) << endl; - return n + (1 << (h-1)); - } - bool on_right(int n, int h = -1) const { - if (h < 0) h = height(n); - return n & (1 << (h+1)); - } - bool on_left(int n) const { return !on_right(n); } - int parent(int n) const { - int h = height(n); - if (on_right(n, h)) - return n - (1<0; t--) out << " "; - if (tree.root() == n) - out << "root "; - else { - if (tree.on_left(n)) - out << "left "; - else - out << "right "; - } - out << n << " : nested " << tree.nested(n) << " weight " << tree.weight(n); - if (tree.complete(n)) out << " complete"; - out << endl; - if (!tree.terminal(n)) { - if (tree.exists(tree.left(n))) - print_binary_tree_node(out, tree, tree.left(n), i+2); - if (tree.exists(tree.right(n))) - print_binary_tree_node(out, tree, tree.right(n), i+2); - } - } - - inline ostream& operator<<(ostream& out, const BinaryTree& tree) { - if (tree.empty()) - return out << "tree is empty"; - print_binary_tree_node(out, tree, tree.root(), 0); - return out; - } - -} - -#endif diff --git a/branches/sage/pgs/crush/Bucket.h b/branches/sage/pgs/crush/Bucket.h deleted file mode 100644 index 81a2576697bd7..0000000000000 --- a/branches/sage/pgs/crush/Bucket.h +++ /dev/null @@ -1,632 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __crush_BUCKET_H -#define __crush_BUCKET_H - -#include "BinaryTree.h" -#include "Hash.h" - -#include -#include -#include -#include -using namespace std; - -#include - -#include "include/buffer.h" - -namespace crush { - - - const int CRUSH_BUCKET_UNIFORM = 1; - const int CRUSH_BUCKET_TREE = 2; - const int CRUSH_BUCKET_LIST = 3; - const int CRUSH_BUCKET_STRAW = 4; - - /** abstract bucket **/ - class Bucket { - protected: - int id; - int parent; - int type; - float weight; - - public: - Bucket(int _type, - float _weight) : - id(0), parent(0), - type(_type), - weight(_weight) { } - - Bucket(bufferlist& bl, int& off) { - bl.copy(off, sizeof(id), (char*)&id); - off += sizeof(id); - bl.copy(off, sizeof(parent), (char*)&parent); - off += sizeof(parent); - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - bl.copy(off, sizeof(weight), (char*)&weight); - off += sizeof(weight); - } - - virtual ~Bucket() { } - - virtual const char *get_bucket_type() const = 0; - virtual bool is_uniform() const = 0; - - int get_id() const { return id; } - int get_type() const { return type; } - float get_weight() const { return weight; } - int get_parent() const { return parent; } - virtual int get_size() const = 0; - - void set_id(int i) { id = i; } - void set_parent(int p) { parent = p; } - void set_weight(float w) { weight = w; } - - virtual void get_items(vector& i) const = 0; - virtual float get_item_weight(int item) const = 0; - virtual void add_item(int item, float w, bool back=false) = 0; - virtual void adjust_item_weight(int item, float w) = 0; - virtual void set_item_weight(int item, float w) { - adjust_item_weight(item, w - get_item_weight(item)); - } - - virtual int choose_r(int x, int r, Hash& h) const = 0; - - virtual void _encode(bufferlist& bl) = 0; - }; - - - - - /** uniform bucket **/ - class UniformBucket : public Bucket { - protected: - public: - vector items; - int item_type; - float item_weight; - - // primes - vector primes; - - int get_prime(int j) const { - return primes[ j % primes.size() ]; - } - void make_primes() { - if (items.empty()) return; - - //cout << "make_primes " << get_id() << " " << items.size() << endl; - Hash h(123+get_id()); - primes.clear(); - - // start with odd number > num_items - unsigned x = items.size() + 1; // this is the minimum! 
- x += h(items.size()) % (3*items.size()); // bump it up some - x |= 1; // make it odd - - while (primes.size() < items.size()) { - unsigned j; - for (j=2; j*j<=x; j++) - if (x % j == 0) break; - if (j*j > x) { - primes.push_back(x); - //cout << "prime " << x << endl; - } - x += 2; - } - } - - public: - UniformBucket(int _type, int _item_type) : - Bucket(_type, 0), - item_type(_item_type) { } - UniformBucket(int _type, int _item_type, - float _item_weight, vector& _items) : - Bucket(_type, _item_weight*_items.size()), - item_type(_item_type), - item_weight(_item_weight) { - items = _items; - make_primes(); - } - - UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - bl.copy(off, sizeof(item_type), (char*)&item_type); - off += sizeof(item_type); - bl.copy(off, sizeof(item_weight), (char*)&item_weight); - off += sizeof(item_weight); - ::_decode(items, bl, off); - make_primes(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_UNIFORM; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - bl.append((char*)&item_type, sizeof(item_type)); - bl.append((char*)&item_weight, sizeof(item_weight)); - - ::_encode(items, bl); - } - - const char *get_bucket_type() const { return "uniform"; } - bool is_uniform() const { return true; } - - int get_size() const { return items.size(); } - - // items - void get_items(vector& i) const { - i = items; - } - int get_item_type() const { return item_type; } - float get_item_weight(int item) const { return item_weight; } - - void add_item(int item, float w, bool back=false) { - if (items.empty()) - item_weight = w; - items.push_back(item); - weight += item_weight; - make_primes(); - } - - void adjust_item_weight(int item, float w) { - assert(0); - } - - int choose_r(int x, int r, Hash& hash) const { - //cout << "uniformbucket.choose_r(" << x << ", " << r << ")" << endl; - //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; - - unsigned v = hash(x, get_id());// % get_size(); - unsigned p = get_prime( hash(get_id(), x) ); // choose a prime based on hash(x, get_id(), 2) - unsigned s = (x + v + (r+1)*p) % get_size(); - return items[s]; - } - - }; - - - - - - // list bucket.. 
RUSH_P sorta - - class ListBucket : public Bucket { - protected: - list items; - list item_weight; - list sum_weight; - - public: - ListBucket(int _type) : Bucket(_type, 0) { } - - ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(items, bl, off); - ::_decode(item_weight, bl, off); - ::_decode(sum_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_LIST; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(items, bl); - ::_encode(item_weight, bl); - ::_encode(sum_weight, bl); - } - - const char *get_bucket_type() const { return "list"; } - bool is_uniform() const { return false; } - - int get_size() const { return items.size(); } - - void get_items(vector& i) const { - for (list::const_iterator it = items.begin(); - it != items.end(); - it++) - i.push_back(*it); - } - float get_item_weight(int item) const { - list::const_iterator i = items.begin(); - list::const_iterator w = item_weight.begin(); - while (i != items.end()) { - if (*i == item) return *w; - i++; w++; - } - assert(0); - return 0; - } - - void add_item(int item, float w, bool back=false) { - if (back) { - items.push_back(item); - item_weight.push_back(w); - sum_weight.clear(); - float s = 0.0; - for (list::reverse_iterator i = item_weight.rbegin(); - i != item_weight.rend(); - i++) { - s += *i; - sum_weight.push_front(s); - } - weight += w; - assert(weight == s); - } else { - items.push_front(item); - item_weight.push_front(w); - weight += w; - sum_weight.push_front(weight); - } - } - - void adjust_item_weight(int item, float dw) { - // find it - list::iterator p = items.begin(); - list::iterator pw = item_weight.begin(); - list::iterator ps = sum_weight.begin(); - - while (*p != item) { - *ps += dw; - p++; pw++; ps++; // next! - assert(p != items.end()); - } - - assert(*p == item); - *pw += dw; - *ps += dw; - } - - - int choose_r(int x, int r, Hash& h) const { - //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl; - - list::const_iterator p = items.begin(); - list::const_iterator pw = item_weight.begin(); - list::const_iterator ps = sum_weight.begin(); - - while (p != items.end()) { - const int item = *p; - const float iw = *pw; - const float tw = *ps; - const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0; - //cout << "item " << item << " iw = " << iw << " tw = " << tw << " f = " << f << endl; - if (f < iw) { - //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl; - return item; - } - p++; pw++; ps++; // next! 
- } - assert(0); - return 0; - } - - - }; - - - - - // mixed bucket, based on RUSH_T type binary tree - - class TreeBucket : public Bucket { - protected: - //vector item_weight; - - // public: - BinaryTree tree; - map node_item; // node id -> item - vector node_item_vec; // fast version of above - map item_node; // item -> node id - map item_weight; - - public: - TreeBucket(int _type) : Bucket(_type, 0) { } - - TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - tree._decode(bl, off); - - ::_decode(node_item, bl, off); - ::_decode(node_item_vec, bl, off); - ::_decode(item_node, bl, off); - ::_decode(item_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - tree._encode(bl); - - ::_encode(node_item, bl); - ::_encode(node_item_vec, bl); - ::_encode(item_node, bl); - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "tree"; } - bool is_uniform() const { return false; } - - int get_size() const { return node_item.size(); } - - // items - void get_items(vector& i) const { - for (map::const_iterator it = node_item.begin(); - it != node_item.end(); - it++) - i.push_back(it->second); - } - float get_item_weight(int i) const { - assert(item_weight.count(i)); - return ((map)item_weight)[i]; - } - - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - - unsigned n = tree.add_node(w); - node_item[n] = item; - item_node[item] = n; - - while (node_item_vec.size() <= n) - node_item_vec.push_back(0); - node_item_vec[n] = item; - } - - void adjust_item_weight(int item, float dw) { - // adjust my weight - weight += dw; - item_weight[item] += dw; - - // adjust tree weights - tree.adjust_node_weight(item_node[item], dw); - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl; - int n = tree.root(); - while (!tree.terminal(n)) { - // pick a point in [0,w) - float w = tree.weight(n); - float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0; - - // left or right? - int l = tree.left(n); - if (tree.exists(l) && - f < tree.weight(l)) - n = l; - else - n = tree.right(n); - } - //assert(node_item.count(n)); - //return ((map)node_item)[n]; - return node_item_vec[n]; - } - }; - - - - - - // straw bucket.. new thing! 
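The straw bucket defined next selects an item by scaling a per-(input, item, replica) hash into [0,1), multiplying by the item's precomputed straw length, and keeping the largest product, so longer straws win proportionally more often. A standalone sketch of only that selection rule; std::hash stands in for the CRUSH hash and the straw lengths are hand-picked rather than derived from weights as the original does:

    // straw_select_sketch.cc -- illustrative straw draw, not the CRUSH implementation.
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <map>

    // Deterministic pseudo-random value in [0,1) for an (x, item, r) triple.
    double draw(int x, int item, int r) {
      std::size_t h = std::hash<long long>{}(
          (static_cast<long long>(x) << 40) ^ (static_cast<long long>(item) << 20) ^ r);
      return static_cast<double>(h % 1000000) / 1000000.0;
    }

    // Pick the item whose straw * draw is largest.
    int choose(int x, int r, const std::map<int, double>& straws) {
      int best = -1;
      double best_len = -1.0;
      for (const auto& [item, straw] : straws) {
        double len = straw * draw(x, item, r);
        if (len > best_len) { best_len = len; best = item; }
      }
      return best;
    }

    int main() {
      std::map<int, double> straws = {{0, 1.0}, {1, 1.3}, {2, 2.1}};  // made-up lengths
      std::map<int, int> hits;
      for (int x = 0; x < 10000; ++x) ++hits[choose(x, 0, straws)];
      for (const auto& [item, n] : hits)
        std::cout << "item " << item << " chosen " << n << " times\n";
    }

Running the sketch shows the expected skew toward item 2, the longest straw, which is the property the bucket relies on for weight-proportional placement.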
- - class StrawBucket : public Bucket { - protected: - map item_weight; - map item_straw; - - list _items; - list _straws; - - public: - StrawBucket(int _type) : Bucket(_type, 0) { } - - StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(item_weight, bl, off); - calc_straws(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "straw"; } - bool is_uniform() const { return false; } - - int get_size() const { return item_weight.size(); } - - - // items - void get_items(vector& i) const { - for (map::const_iterator it = item_weight.begin(); - it != item_weight.end(); - it++) - i.push_back(it->first); - } - float get_item_weight(int item) const { - assert(item_weight.count(item)); - return ((map)item_weight)[item]; - } - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - calc_straws(); - } - - void adjust_item_weight(int item, float dw) { - //cout << "adjust " << item << " " << dw << endl; - weight += dw; - item_weight[item] += dw; - calc_straws(); - } - - - /* calculate straw lengths. - this is kind of ugly. not sure if there's a closed form way to calculate this or not! - */ - void calc_straws() { - //cout << get_id() << ": calc_straws ============" << endl; - - item_straw.clear(); - _items.clear(); - _straws.clear(); - - // reverse sort by weight; skip zero weight items - map > reverse; - for (map::iterator p = item_weight.begin(); - p != item_weight.end(); - p++) { - //cout << get_id() << ":" << p->first << " " << p->second << endl; - if (p->second > 0) { - //p->second /= minw; - reverse[p->second].insert(p->first); - } - } - - /* 1:2:7 - item_straw[0] = 1.0; - item_straw[1] = item_straw[0]*sqrt(1.0/.6); - item_straw[2] = item_straw[1]*2.0; - */ - - // work from low to high weights - float straw = 1.0; - float numleft = item_weight.size(); - float wbelow = 0.0; - float lastw = 0.0; - - map >::iterator next = reverse.begin(); - //while (next != reverse.end()) { - while (1) { - //cout << "hi " << next->first << endl; - map >::iterator cur = next; - - // set straw length for this set - for (set::iterator s = cur->second.begin(); - s != cur->second.end(); - s++) { - item_straw[*s] = straw; - //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl; - _items.push_back(*s); - _straws.push_back(straw); - } - - next++; - if (next == reverse.end()) break; - - wbelow += (cur->first-lastw) * numleft; - //cout << "wbelow " << wbelow << endl; - - numleft -= 1.0 * (float)cur->second.size(); - //cout << "numleft now " << numleft << endl; - - float wnext = numleft * (next->first - cur->first); - //cout << "wnext " << wnext << endl; - - float pbelow = wbelow / (wbelow+wnext); - //cout << "pbelow " << pbelow << endl; - - straw *= pow((double)(1.0/pbelow), (double)1.0/numleft); - - lastw = cur->first; - } - //cout << "============" << endl; - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl; - - float high_draw = -1; - int high = 0; - - list::const_iterator pi = _items.begin(); - list::const_iterator ps = _straws.begin(); - while (pi != _items.end()) { - const int item = *pi; - const float rnd = (float)(h(x, item, r) % 1000000) / 1000000.0; - const float 
straw = *ps * rnd; - - if (high_draw < 0 || - straw > high_draw) { - high = *pi; - high_draw = straw; - } - - pi++; - ps++; - } - return high; - } - }; - - - - - - inline Bucket* decode_bucket(bufferlist& bl, int& off) { - char t; - bl.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - - switch (t) { - case CRUSH_BUCKET_UNIFORM: - return new UniformBucket(bl, off); - case CRUSH_BUCKET_LIST: - return new ListBucket(bl, off); - case CRUSH_BUCKET_TREE: - return new TreeBucket(bl, off); - case CRUSH_BUCKET_STRAW: - return new StrawBucket(bl, off); - default: - assert(0); - } - return 0; - } - - - -} - - - - - - - - -#endif diff --git a/branches/sage/pgs/crush/Hash.h b/branches/sage/pgs/crush/Hash.h deleted file mode 100644 index 2f0d9e4db918b..0000000000000 --- a/branches/sage/pgs/crush/Hash.h +++ /dev/null @@ -1,301 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -namespace crush { - - class Hash { - int seed; - - public: - int get_seed() { return seed; } - void set_seed(int s) { seed = s; } - - Hash(int s) { - unsigned int hash = 1315423911; - int x = 231232; - int y = 1232; - hashmix(s, x, hash); - hashmix(y, s, hash); - seed = s; - } - - inline int operator()(int a) { - unsigned int hash = seed ^ a; - int b = a; - int x = 231232; - int y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b) { - unsigned int hash = seed ^ a ^ b; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c) { - unsigned int hash = seed ^ a ^ b ^ c; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d) { - unsigned int hash = seed ^a ^ b ^ c ^ d; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d, int e) { - unsigned int hash = seed ^ a ^ b ^ c ^ d ^ e; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0x7FFFFFFF); - } - }; - -} - - - -#if 0 - - - //return myhash(a) ^ seed; - return myhash(a, seed); - } - int operator()(int a, int b) { - //return myhash( myhash(a) ^ myhash(b) ^ seed ); - return myhash(a, b, seed); - } - int 
operator()(int a, int b, int c) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ seed ); - return myhash(a, b, c, seed); - } - int operator()(int a, int b, int c, int d) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ myhash(d ^ seed) ^ seed ); - return myhash(a, b, c, d, seed); - } - - // ethan's rush hash? - if (0) - return (n ^ 0xdead1234) * (884811920 * 3 + 1); - - if (1) { - - // before - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - - //return hash; - return (hash & 0x7FFFFFFF); - } - - // JS - // a little better than RS - // + jenkin's mixing thing (which sucks on its own but helps tons here) - // best so far - if (1) { - unsigned int hash = 1315423911; - int a = 231232; - int b = 1232; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - - // Robert jenkins' 96 bit mix - // sucks - if (0) { - int c = n; - int a = 12378912; - int b = 2982827; - a=a-b; a=a-c; a=a^(c>>13); - b=b-c; b=b-a; b=b^(a<<8); - c=c-a; c=c-b; c=c^(b>>13); - a=a-b; a=a-c; a=a^(c>>12); - b=b-c; b=b-a; b=b^(a<<16); - c=c-a; c=c-b; c=c^(b>>5); - a=a-b; a=a-c; a=a^(c>>3); - b=b-c; b=b-a; b=b^(a<<10); - c=c-a; c=c-b; c=c^(b>>15); - return c; - } - // robert jenkins 32-bit - // sucks - if (0) { - n += (n << 12); - n ^= (n >> 22); - n += (n << 4); - n ^= (n >> 9); - n += (n << 10); - n ^= (n >> 2); - n += (n << 7); - n ^= (n >> 12); - return n; - } - - // djb2 - if (0) { - unsigned int hash = 5381; - for (int i=0; i<4; i++) { - hash = ((hash << 5) + hash) + ((n&255) ^ 123); - n = n >> 8; - } - return hash; - } - - - // SDBM - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (n&255) + (hash << 6) + (hash << 16) - hash; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // PJW - // horrid - if (0) { - unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); - unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); - unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); - unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); - unsigned int hash = 0; - unsigned int test = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (hash << OneEighth) + (n&255); - - if((test = hash & HighBits) != 0) - { - hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); - } - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes. - if (0) { - unsigned int b = 378551; - unsigned int a = 63689; - unsigned int hash = 0; - - for(unsigned int i=0; i<4; i++) - { - hash = hash * a + (n&0xff); - a = a * b; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // DJB - // worse than rs - if (0) { - unsigned int hash = 5381; - - for(unsigned int i = 0; i < 4; i++) - { - hash = ((hash << 5) + hash) + (n&255); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // AP - // even worse - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((i & 1) == 0) ? 
( (hash << 7) ^ (n&255) ^ (hash >> 3)) : - (~((hash << 11) ^ (n&255) ^ (hash >> 5))); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - -#endif diff --git a/branches/sage/pgs/crush/crush.h b/branches/sage/pgs/crush/crush.h deleted file mode 100644 index 4c03994ba82fe..0000000000000 --- a/branches/sage/pgs/crush/crush.h +++ /dev/null @@ -1,535 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_CRUSH_H -#define __crush_CRUSH_H - -#include -#include -#include -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "Bucket.h" - -#include "include/buffer.h" - - -namespace crush { - - - // *** RULES *** - - class RuleStep { - public: - int cmd; - vector args; - - RuleStep(int c) : cmd(c) {} - RuleStep(int c, int a) : cmd(c) { - args.push_back(a); - } - RuleStep(int c, int a, int b) : cmd(c) { - args.push_back(a); - args.push_back(b); - } - RuleStep(int o, int a, int b, int c) : cmd(o) { - args.push_back(a); - args.push_back(b); - args.push_back(c); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&cmd, sizeof(cmd)); - ::_encode(args, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(cmd), (char*)&cmd); - off += sizeof(cmd); - ::_decode(args, bl, off); - } - }; - - - // Rule operations - const int CRUSH_RULE_TAKE = 0; - const int CRUSH_RULE_CHOOSE = 1; // first n by default - const int CRUSH_RULE_CHOOSE_FIRSTN = 1; - const int CRUSH_RULE_CHOOSE_INDEP = 2; - const int CRUSH_RULE_EMIT = 3; - - class Rule { - public: - vector< RuleStep > steps; - - void _encode(bufferlist& bl) { - int n = steps.size(); - bl.append((char*)&n, sizeof(n)); - for (int i=0; i buckets; - int bucketno; - Hash h; - - hash_map parent_map; // what bucket each leaf/bucket lives in - - public: - map rules; - - //map collisions; - //map bumps; - - void _encode(bufferlist& bl) { - // buckets - int n = buckets.size(); - bl.append((char*)&n, sizeof(n)); - for (map::const_iterator it = buckets.begin(); - it != buckets.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second->_encode(bl); - } - bl.append((char*)&bucketno, sizeof(bucketno)); - - // hash - int s = h.get_seed(); - bl.append((char*)&s, sizeof(s)); - - //::_encode(out, bl); - //::_encode(overload, bl); - - // rules - n = rules.size(); - bl.append((char*)&n, sizeof(n)); - for(map::iterator it = rules.begin(); - it != rules.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - - } - - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i::iterator bp = buckets.begin(); - bp != buckets.end(); - ++bp) { - // index bucket items - vector items; - bp->second->get_items(items); - for (vector::iterator ip = items.begin(); - ip != items.end(); - ++ip) - parent_map[*ip] = bp->first; - } - } - - - - public: - Crush(int seed=123) : bucketno(-1), h(seed) {} - ~Crush() { - // hose buckets - for (map::iterator it = buckets.begin(); - it != buckets.end(); - it++) { - delete it->second; - } - } - - int print(ostream& out, int root, int indent=0) { - for (int i=0; iget_weight() 
<< "\t" << b->get_id() << "\t"; - for (int i=0; iget_bucket_type() << ": "; - - vector items; - b->get_items(items); - - if (buckets.count(items[0])) { - out << endl; - for (unsigned i=0; iset_id(n); - buckets[n] = b; - return n; - } - - void add_item(int parent, int item, float w, bool back=false) { - // add item - assert(!buckets[parent]->is_uniform()); - Bucket *p = buckets[parent]; - - p->add_item(item, w, back); - - // set item's parent - Bucket *n = buckets[item]; - if (n) - n->set_parent(parent); - - // update weights - while (buckets.count(p->get_parent())) { - int child = p->get_id(); - p = buckets[p->get_parent()]; - p->adjust_item_weight(child, w); - } - } - - - /* - this is a hack, fix me! weights should be consistent throughout hierarchy! - - */ - void set_bucket_weight(int item, float w) { - Bucket *b = buckets[item]; - float adj = w - b->get_weight(); - - while (buckets.count(b->get_parent())) { - Bucket *p = buckets[b->get_parent()]; - p->adjust_item_weight(b->get_id(), adj); - b = p; - } - } - - - /* - * choose numrep distinct items of type type - */ - void choose(int x, - int numrep, - int type, - Bucket *inbucket, - vector& outvec, - bool firstn, - set& outset, map& overloadmap, - bool forcefeed=false, - int forcefeedval=-1) { - int off = outvec.size(); - - // for each replica - for (int rep=0; repis_uniform()) { - // uniform bucket; be careful! - if (firstn || numrep >= in->get_size()) { - // uniform bucket is too small; just walk thru elements - r += ftotal; // r' = r + f_total (first n) - } else { - // make sure numrep is not a multple of bucket size - int add = numrep*flocal; // r' = r + n*f_local - if (in->get_size() % numrep == 0) { - add += add/in->get_size(); // shift seq once per pass through the bucket - } - r += add; - } - } else { - // mixed bucket; just make a distinct-ish r sequence - if (firstn) - r += ftotal; // r' = r + f_total - else - r += numrep * flocal; // r' = r + n*f_local - } - - // choose - outv = in->choose_r(x, r, h); - - // did we get the type we want? - int itemtype = 0; // 0 is terminal type - Bucket *newin = 0; // remember bucket we hit - if (in->is_uniform()) { - itemtype = ((UniformBucket*)in)->get_item_type(); - } else { - if (buckets.count(outv)) { // another bucket - newin = buckets[outv]; - itemtype = newin->get_type(); - } - } - if (itemtype == type) { // this is what we want! - // collision? - bool collide = false; - for (int prep=0; prep overloadmap[outv]) - bad = true; - } - - if (collide || bad) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - continue; // try locally a few times! - - if (ftotal >= 10) { - // ok fine, just ignore dup. FIXME. - skip_rep = true; - break; - } - - retry_rep = true; - } - - break; // ok then! - } - - // next - in = newin; - } - - if (retry_rep) continue; // try again - - break; - } - - // skip this rep? (e.g. too many collisions, we give up) - if (skip_rep) continue; - - // output this value - outvec.push_back(outv); - } // for rep - - // double check! - if (0) { - for (unsigned i=1; i& result, - set& outset, map& overloadmap, - int forcefeed=-1) { - //int numresult = 0; - result.clear(); - - // determine hierarchical context for first. - list force_stack; - if (forcefeed >= 0) { - int t = forcefeed; - while (1) { - force_stack.push_front(t); - if (parent_map.count(t) == 0) break; // reached root, presumably. 
- //cout << " " << t << " parent is " << parent_map[t] << endl; - t = parent_map[t]; - } - } - - // working vector - vector w; // working variable - - // go through each statement - for (vector::iterator pc = rule.steps.begin(); - pc != rule.steps.end(); - pc++) { - // move input? - - // do it - switch (pc->cmd) { - case CRUSH_RULE_TAKE: - { - const int arg = pc->args[0]; - //cout << "take " << arg << endl; - - if (!force_stack.empty()) { - int forceval = force_stack.front(); - force_stack.pop_front(); - assert(arg == forceval); - } - - w.clear(); - w.push_back(arg); - } - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - { - const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; - const int numrep = pc->args[0]; - const int type = pc->args[1]; - - //cout << "choose " << numrep << " of type " << type << endl; - - assert(!w.empty()); - - // reset output - vector out; - - // forcefeeding? - bool forcing = false; - int forceval; - if (!force_stack.empty()) { - forceval = force_stack.front(); - force_stack.pop_front(); - //cout << "priming out with " << forceval << endl; - forcing = true; - } - - // do each row independently - for (vector::iterator i = w.begin(); - i != w.end(); - i++) { - assert(buckets.count(*i)); - Bucket *b = buckets[*i]; - choose(x, numrep, type, b, out, firstn, - outset, overloadmap, - forcing, - forceval); - forcing = false; // only once - } // for inrow - - // put back into w - w.swap(out); - out.clear(); - } - break; - - case CRUSH_RULE_EMIT: - { - for (unsigned i=0; i - -#include -#include -using namespace std; - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int n, float f, int buckettype) -{ - Hash h(73232313); - - // crush - Crush c; - - int ndisks = 0; - - // bucket - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(1); - else if (buckettype == 3) - b = new StrawBucket(1); - else if (buckettype == 4) - b = new UniformBucket(0,0); - - for (int i=0; iadd_item(ndisks++,1); - - c.add_bucket(b); - int root = b->get_id(); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 1000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add item - if (buckettype == 2) { - // start over! 
- ndisks = 0; - b = new ListBucket(1); - for (int i=0; i<=n; i++) - b->add_item(ndisks++,1); - c.add_bucket(b); - root = b->get_id(); - - rule.steps.clear(); - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - } - else - b->add_item(ndisks++, 1); - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - //Bucket *b = new MixedBucket(h+1); - Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (0) { - for (int d=0; dadd_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, 
buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -int buckettype = 2; // 0 = mixed, 1 = linear, 2 = straw - -int big_one_skip = 255; -int big_one_size; -Bucket *big_one = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - - int s = wid[h]; - if (big_one_skip > 0) - big_one_skip--; - if (!big_one_skip && !big_one) - s = big_one_size; - - - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks " << disks.size()<< endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1) - b = new ListBucket(h+1); - else if (buckettype == 2) - b = new StrawBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; 
x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - big_one_size = add; - big_one = 0; - - //cout << "making tree" << endl; - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout, root); - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - int olddisks = ndisks; - - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // remove disks - assert(big_one); - c.adjust_item(big_one->get_id(), 0); - } - - int newdisks = ndisks - add; - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j >::iterator i = r.begin(); - i != r.end(); - i++) { - cout << i->first; - for (map::iterator j = i->second.begin(); - j != i->second.end(); - j++) - cout << "\t" << j->first << "\t" << j->second; - cout << endl; - } - */ -} - diff --git a/branches/sage/pgs/crush/test/cluster_movement_rush.cc b/branches/sage/pgs/crush/test/cluster_movement_rush.cc deleted file mode 100644 index 90cc197c24f65..0000000000000 --- a/branches/sage/pgs/crush/test/cluster_movement_rush.cc +++ /dev/null @@ -1,218 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" 
<< h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight(), buckettype == 2); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int failpc) -{ - Hash h(73232313); - - //int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - float trvar = 0.0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - 
float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.out.clear(); - - for (int z=0; z= ndisks) cout << "v[i] " << i << " is " << v[i] << " .. x = " << x << endl; - //assert(v[i] < ndisks); - ocount[v[i]]++; - } - } - utime_t t1b = g_clock.now(); - - // add in numf failed disks - for (int f = 0; f < numf; f++) { - int d = rand() % ndisks; - while (c.out.count(d)) d = rand() % ndisks; - c.out.insert(d); - } - - utime_t t3a = g_clock.now(); - for (int x=xs; x - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - 
adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/pgs/crush/test/depth_variance.cc b/branches/sage/pgs/crush/test/depth_variance.cc deleted file mode 100644 index 7d60ebaae9501..0000000000000 --- a/branches/sage/pgs/crush/test/depth_variance.cc +++ /dev/null @@ -1,185 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - float tavg = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int 
tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - if (1) { - // second pass - for (int x=xs; x= adjoff) { - adjusted++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - } - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/pgs/crush/test/movement.cc b/branches/sage/pgs/crush/test/movement.cc deleted file mode 100644 index 2621f09457fe6..0000000000000 --- a/branches/sage/pgs/crush/test/movement.cc +++ /dev/null @@ -1,223 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); 
- for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - c.failed.insert(0); - } - - int olddisks = ndisks; - - if (1) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) { - if (placement1[x] != placement2[x]) { - for (int j=0; j v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - cout << "\t" << n; - cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d 
> buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - float over = .5; - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - //c.failed.insert(0); - c.overload[0] = over; - } - - int olddisks = ndisks; - - - - if (0) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - vector moved(ndisks); - - //int moved = 0; - for (int d=0; d::iterator it = placement1[d].begin(); - it != placement1[d].end(); - it++) { - placement2[d].erase(*it); - } - } - - float avg = 0; - for (int d=0; d v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - //cout << "\t" << n; - //cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int utilization ) -{ - Hash h(73232313); - - int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float 
tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - // keep adjusting! - for (int bla=0; bla<5; bla++) { - utime_t t2a = g_clock.now(); - - // second pass - for (int x=xs; x= adjoff) { - numover++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - if (!numover) break; - cout << "readjusting" << endl; - } - - utime_t t3a = g_clock.now(); - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - 
disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/pgs/crush/test/sizes.cc b/branches/sage/pgs/crush/test/sizes.cc deleted file mode 100644 index cc5780218210a..0000000000000 --- a/branches/sage/pgs/crush/test/sizes.cc +++ /dev/null @@ -1,131 +0,0 @@ - -#include "include/types.h" -#include "include/Distribution.h" -#include "osd/OSDMap.h" - - -Distribution file_size_distn; //kb - - -list object_queue; -int max_object_size = 1024*1024*100; //kb - -off_t no; - -int get_object() //kb -{ - if (object_queue.empty()) { - int max = file_size_distn.sample(); - no++; - int filesize = max/2 + (rand() % 100) * max/200 + 1; - //cout << "file " << filesize << endl; - while (filesize > max_object_size) { - object_queue.push_back(max_object_size); - filesize -= max_object_size; - } - object_queue.push_back(filesize); - } - int s = object_queue.front(); - object_queue.pop_front(); - //cout << "object " << s << endl; - return s; -} - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i pgs(n); - off_t did = 0; - - no = 0; - while (did < dist) { - off_t s = get_object(); - pgs[rand()%n] += s; - did += s; - } - while (!object_queue.empty()) - pgs[rand()%n] += get_object(); - - numo = no; - //cout << did/n << endl; - - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " 
<< wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, vector& ocount) -{ - vector v(numrep); - //map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i wid; - wid.push_back(10); - wid.push_back(2); - - map< int, list > buckets; - root = make_hierarchy(c, wid, buckets, ndisks); - - // add small bucket - vector disks; - for (int i=0; i<3; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - b->make_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - - - // rule - int numrep = 6; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - int pg_per = 10000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - c.print(cout, root); - - place(c, rule, numpg, numrep, ocount); - - for (int i=0; i - -#include -#include -using namespace std; - - -int numrep = 1; - - -double go(int n, int bucket) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - Bucket *b; - vector items; - if (bucket == 0) b = new UniformBucket(1,0,10,items); - if (bucket == 1) b = new TreeBucket(1); - if (bucket == 2) b = new ListBucket(1); - if (bucket == 3) b = new StrawBucket(1); - - for (int d=0; dadd_item(ndisks++, 1); - - //if (!bucket) ((UniformBucket*)b)->make_primes(h); - - root = c.add_bucket(b); - - // rule - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - - int place = 1000000; - - - vector v(numrep); - set out; - map overload; - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - - for (int n=4; n<=50; n += 4) { - cout << n; - for (int b=0; b<4; b++) { - double el = go(n,b); - cout << "\t" << el; - } - cout << endl; - } -} diff --git a/branches/sage/pgs/crush/test/speed_depth.cc b/branches/sage/pgs/crush/test/speed_depth.cc deleted file mode 100644 index 32275d16d2b31..0000000000000 --- a/branches/sage/pgs/crush/test/speed_depth.cc +++ /dev/null @@ -1,174 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int uniform = 10; -int branching = 10; -int buckettype = 0; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { 
- // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int dep, int per) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - wid.push_back(uniform); - for (int d=1; d v(numrep); - - utime_t start = g_clock.now(); - - set out; - map overload; - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - uniform = branching = 8; - - cout << "// dep\tuniform\tbranch\tndisks" << endl; - - for (int d=2; d<=5; d++) { - cout << d;// << "\t" << branching; - cout << "\t" << uniform; - cout << "\t" << branching; - - int n = 1; - for (int i=0; i - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (linear) - b = new ListBucket(h+1); - else - b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int s) -{ - int dep = 2; - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - //for (int d=0; d v(numrep); - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - branching = 8; - - int d = 2; - numrep = 2; - - for (int s = 64; s <= 32768; s *= 8) { - cout << "t"; - linear = false; - double el = go(s, d); - cout << "\t" << el; - - cout << "\tp"; - linear = true; - el = go(s, d); - cout << "\t" << el; - - cout << endl; - } -} diff --git a/branches/sage/pgs/crush/test/t.cc b/branches/sage/pgs/crush/test/t.cc deleted file mode 100644 index 0785ef47d6c04..0000000000000 --- a/branches/sage/pgs/crush/test/t.cc +++ /dev/null @@ -1,25 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -int main() { - - Bucket *b = new UniformBucket(1, 0); - //b = new TreeBucket(1); -} - diff --git a/branches/sage/pgs/crush/test/testbucket.cc b/branches/sage/pgs/crush/test/testbucket.cc 
deleted file mode 100644 index 065721c2c1967..0000000000000 --- a/branches/sage/pgs/crush/test/testbucket.cc +++ /dev/null @@ -1,61 +0,0 @@ - - -#include "../Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i ocount(ndisks); - - vector v(numrep); - int nplace = 0; - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -#include -using namespace std; - - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i a(n); - vector b(n); - - for (int i=0; i c(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/SyntheticClient.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - parse_syn_options(args); // for SyntheticClient - - // args for fuse - vec_to_argv(args, argc, argv); - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - list clients; - list synclients; - - cout << "mounting and starting " << g_conf.num_client << " syn client(s)" << endl; - for (int i=0; iinit(); - - // start syntheticclient - SyntheticClient *syn = new SyntheticClient(client); - - client->mount(); - - syn->start_thread(); - - clients.push_back(client); - synclients.push_back(syn); - } - - cout << "waiting for client(s) to finish" << endl; - while (!clients.empty()) { - Client *client = clients.front(); - SyntheticClient *syn = synclients.front(); - clients.pop_front(); - synclients.pop_front(); - - // wait - syn->join_thread(); - - // unmount - client->unmount(); - client->shutdown(); - - delete syn; - delete client; - } - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/pgs/doc/Commitdir.txt b/branches/sage/pgs/doc/Commitdir.txt deleted file mode 100644 index 05c727be60ae6..0000000000000 --- a/branches/sage/pgs/doc/Commitdir.txt +++ /dev/null @@ -1,24 +0,0 @@ -OLD - - -How Directory Committing Works: - -Each CDir has: - version - current version of directory - committing_version - which version was sent to stable storage - last_committed_version - last version to be safely stored - -Each Inode has: - parent_dir_version - what dir version i was in when i was dirtied. (*) - - (*) note that if you change an inode, mark_dirty() again, even if it's already dirty! - - -How committing works: - -A call to commit_dir(dir, context) will ensure tha the _current_ version is stored safely on disk before the context is finished. - -When a commit completes, inodes in the directory are checked. If they are dirty and belonged to the _committed_ (or earlier) version, then they are marked clean. If they belong to a newer version, then they are _still dirty_. 
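
Roughly, in code (a made-up sketch with stand-in types and field names; the real bookkeeping lives on CDir/CInode in the MDS cache, not in these structs):

  #include <list>

  struct Inode {
    unsigned parent_dir_version;   // dir version in effect when i was dirtied
    bool dirty;
  };

  struct Dir {
    unsigned version;                 // current version
    unsigned committing_version;      // version sent to stable storage
    unsigned last_committed_version;  // last version safely stored
    std::list<Inode*> items;

    // called when the commit of version 'committed' completes
    void commit_finished(unsigned committed) {
      last_committed_version = committed;
      for (std::list<Inode*>::iterator p = items.begin(); p != items.end(); ++p) {
        // inodes dirtied at or before the committed version are now clean;
        // anything dirtied against a newer version stays dirty.
        if ((*p)->dirty && (*p)->parent_dir_version <= committed)
          (*p)->dirty = false;
      }
    }
  };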
- - - diff --git a/branches/sage/pgs/doc/anchortable.txt b/branches/sage/pgs/doc/anchortable.txt deleted file mode 100644 index d9c0fefc31e08..0000000000000 --- a/branches/sage/pgs/doc/anchortable.txt +++ /dev/null @@ -1,54 +0,0 @@ - -ANCHOR TABLE PROTOCOL - -MDS sends an update PREPARE to the anchortable MDS. The prepare is -identified by the ino and operation type; only one for each type -(create, update, destroy) can be pending at any time. Both parties -may actually be the same local node, but for simplicity we treat that -situation the same. (That is, we act as if they may fail -independently, even if they can't.) - -The anchortable journals the proposed update, and responds with an -AGREE and a version number. This uniquely identifies the request. - -The MDS can then update the filesystem metadata however it sees fit. -When it is finished (and the results journaled), it sends a COMMIT to -the anchortable. The table journals the commit, frees any state from -the transaction, and sends an ACK. The initiating MDS should then -journal the ACK to complete the transaction. - - -ANCHOR TABLE FAILURE - -If the AT fails before journaling the PREPARE and sending the AGREE, -the initiating MDS will simply retry the request. - -If the AT fails after journaling PREPARE but before journaling COMMIT, -it will resend AGREE to the initiating MDS. - -If the AT fails after the COMMIT, the transaction has been closed, and it -takes no action. If it receives a COMMIT for which it has no open -transaction, it will reply with ACK. - - -INITIATING MDS FAILURE - -If the MDS fails before the metadata update has been journaled, no -action is taken, since nothing is known about the previously proposed -transaction. If an AGREE message is received and there is no -corresponding PREPARE or pending-commit state, and ROLLBACK is sent to -the anchor table. - -If the MDS fails after journaling the metadata update but before -journaling the ACK, it resends COMMIT to the anchor table. If it -receives an AGREE after resending the COMMIT, it simply ignores the -AGREE. The anchortable will respond with an ACK, allowing the -initiating MDS to journal the final ACK and close out the transaction -locally. - -On journal replay, each metadata update (EMetaBlob) encountered that -includes an anchor transaction is noted in the AnchorClient by adding -it to the pending_commit list, and each journaled ACK is removed from -that list. Journal replay may enounter ACKs with no prior metadata -update; these are ignored. When recovery finishes, a COMMIT is sent -for all outstanding transactions. diff --git a/branches/sage/pgs/doc/bdb.txt b/branches/sage/pgs/doc/bdb.txt deleted file mode 100644 index 63e647f5bb3cc..0000000000000 --- a/branches/sage/pgs/doc/bdb.txt +++ /dev/null @@ -1,48 +0,0 @@ -OBJECT STORE ON BERKELEY DB ---------------------------- - -OSBDB is an implementation of an object store that uses Berkeley DB as -the underlying storage. It is meant to be an alternative to EBOFS. - -BUILDING --------- - -You will need to have Berkeley DB installed, including the developent -packages. We've tested this with Berkeley DB 4.4.20 on Ubuntu 6.10. - -To compile OSBDB support, you need to pass the argument "want_bdb=yes" -to "make." If you don't specify this, OSBDB and all its associated -support is not included in the executables. - -RUNNING -------- - -To use OSBDB in Ceph, simply pass the --bdbstore flag to programs. You -don't need to create a "device" for OSBDB ahead of time; Berkeley DB -will take care of creating the files. 
You also *cannot* use a raw -device as your store -- it must be regular file. - -OSBDB additionally accepts the following flags: - - --bdbstore-btree Configures OSBDB to use the "Btree" - database type for Berkeley DB. The default - database type is "Hash". - - --bdbstore-hash-ffactor Sets the "fill factor" for the hash - database type. Takes an integer argument. - - --bdbstore-hash-nelem Sets the "nelem" parameter for the hash - database type. Takes an integer argument. - - --bdbstore-hash-pagesize Sets the page size for the hash database - type. Takes an integer argument. - - --bdbstore-cachesize Sets the cache size. Takes an integer - argument, which must be a power of two, and - no less than 20 KiB. - - --bdbstore-transactional Enable (in-memory-only) transactions for - all operations in the OSBDB store. - - --debug-bdbstore Set the debug level. Takes an integer - argument. diff --git a/branches/sage/pgs/doc/caching.txt b/branches/sage/pgs/doc/caching.txt deleted file mode 100644 index 161eaf7428a53..0000000000000 --- a/branches/sage/pgs/doc/caching.txt +++ /dev/null @@ -1,303 +0,0 @@ - -SPANNING TREE PROPERTY - -All metadata that exists in the cache is attached directly or -indirectly to the root inode. That is, if the /usr/bin/vi inode is in -the cache, then /usr/bin, /usr, and / are too, including the inodes, -directory objects, and dentries. - - -AUTHORITY - -The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a nonce (initial 0) to -disambiguate multiple replicas of the same item (see below). - - map replicas; // maps replicating mds# to nonce - -The cached_by set _always_ includes all nodes that cache the -partcuarly object, but may additionally include nodes that used to -cache it but no longer do. In those cases, an expire message should -be in transit. That is, we have two invariants: - - 1) the authority's replica set will always include all actual - replicas, and - - 2) cache expiration notices will be reliably delivered to the - authority. - -The second invariant is particularly important because the presence of -replicas will pin the metadata object in memory on the authority, -preventing it from being trimmed from the cache. Notification of -expiration of the replicas is required to allow previously replicated -objects from eventually being trimmed from the cache as well. - -Each metdata object has a authority bit that indicates whether it is -authoritative or a replica. - - -REPLICA NONCE - -Each replicated object maintains a "nonce" value, issued by the -authority at the time the replica was created. If the authority has -already created a replica for the given MDS, the new replica will be -issues a new (incremented) nonce. This nonce is attached -to cache expirations, and allows the authority to disambiguate -expirations when multiple replicas of the same object are created and -cache expiration is coincident with replication. That is, when an -old replica is expired from the replicating MDS at the same time that -a new replica is issued by the authority and the resulting messages -cross paths, the authority can tell that it was the old replica that -was expired and effectively ignore the expiration message. The -replica is removed from the replicas map only if the nonce matches. - - -SUBTREE PARTITION - -Authority of the file system namespace is partitioned using a -subtree-based partitioning strategy. 
This strategy effectively -separates directory inodes from directory contents, such that the -directory contents are the unit of redelegation. That is, if / is -assigned to mds0 and /usr to mds1, the inode for /usr will be managed -by mds0 (it is part of the / directory), while the contents of /usr -(and everything nested beneath it) will be managed by mds1. - -The description for this partition exists solely in the collective -memory of the MDS cluster and in the individual MDS journals. It is -not described in the regular on-disk metadata structures. This is -related to the fact that authority delegation is a property of the -{\it directory} and not the directory's {\it inode}. - -Subsequently, if an MDS is authoritative for a directory inode and does -not yet have any state associated with the directory in its cache, -then it can assume that it is also authoritative for the directory. - -Directory state consists of a data object that describes any cached -dentries contained in the directory, information about the -relationship between the cached contents and what appears on disk, and -any delegation of authority. That is, each CDir object has a dir_auth -element. Normally dir_auth has a value of AUTH_PARENT, meaning that -the authority for the directory is the same as the directory's inode. -When dir_auth specifies another metadata server, that directory is -point of authority delegation and becomes a {\it subtree root}. A -CDir is a subtree root iff its dir_auth specifies an MDS id (and is not -AUTH_PARENT). - - - A dir is a subtree root iff dir_auth != AUTH_PARENT. - - - If dir_auth = AUTH_PARENT then the inode auth == dir auth, but the - converse may not be true. - -The authority for any metadata object in the cache can be determined -by following the parent pointers toward the root until a subtree root -CDir object is reached, at which point the authority is specified by -its dir_auth. - -Each MDS cache maintains a subtree data structure that describes the -subtree partition for all objects currently in the cache: - - map< CDir*, set > subtrees; - - - A dir will appear in the subtree map (as a key) IFF it is a subtree - root. - -Each subtree root will have an entry in the map. The map value is a -set of all other subtree roots nested beneath that point. Nested -subtree roots effectively bound or prune a subtree. For example, if -we had the following partition: - - mds0 / - mds1 /usr - mds0 /usr/local - mds0 /home - -The subtree map on mds0 would be - - / -> (/usr, /home) - /usr/local -> () - /home -> () - -and on mds1: - - /usr -> (/usr/local) - - -AMBIGUOUS DIR_AUTH - -While metadata for a subtree is being migrated between two MDS nodes, -the dir_auth for the subtree root is allowed to be ambiguous. That -is, it will specify both the old and new MDS ids, indicating that a -migration is in progress. - -If a replicated metadata object is expired from the cache from a -subtree whose authority is ambiguous, the cache expiration is sent to -both potential authorities. This ensures that the message will be -reliably delivered, even if either of those nodes fails. A number of -alternative strategies were considered. Sending the expiration to the -old or new authority and having it forwarded if authority has been -delegated can result in message loss if the forwarding node fails. 
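
(Aside: the subtree-root lookup described above amounts to something like the following sketch, with made-up simplified types; the real code must also handle the ambiguous dir_auth case being discussed here, e.g. by reporting both candidate MDS ids.)

  const int AUTH_PARENT = -1;   // hypothetical stand-in for the real constant

  struct Dir {
    int dir_auth;     // an mds id, or AUTH_PARENT
    Dir *parent;      // containing directory; 0 at the root
  };

  // walk toward the root until a subtree root is found; its dir_auth
  // names the authoritative mds for everything hanging beneath it.
  int get_authority(Dir *dir) {
    while (dir->dir_auth == AUTH_PARENT && dir->parent)
      dir = dir->parent;
    return dir->dir_auth;
  }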
-Pinning ambiguous metadata in cache is computationally expensive for -implementation reasons, and while delaying the transmission of expiration -messages is difficult to implement because the replicating must send -the final expiration messages when the subtree authority is -disambiguated, forcing it to keep certain elements of it cache in -memory. Although duplicated expirations incurs a small communications -overhead, the implementation is much simpler. - - -AUTH PINS - -Most operations that modify metadata must allow some amount of time to -pass in order for the operation to be journaled or for communication -to take place between the object's authority and any replicas. For -this reason it must not only be pinned in the authority's metadata -cache, but also be locked such that the object's authority is not -allowed to change until the operation completes. This is accomplished -using {\it auth pins}, which increment a reference counter on the -object in question, as well as all parent metadata objects up to the -root of the subtree. As long as the pin is in place, it is impossible -for that subtree (or any fragment of it that contains one or more -pins) to be migrated to a different MDS node. Pins can be placed on -both inodes and directories. - -Auth pins can only exist for authoritative metadata, because they are -only created if the object is authoritative, and their presense -prevents the migration of authority. - - -FREEZING - -More specifically, auth pins prevent a subtree from being frozen. -When a subtree is frozen, all updates to metadata are forbidden. This -includes updates to the replicas map that describes which replicas -(and nonces) exist for each object. - -In order for metadata to be migrated between MDS nodes, it must first -be frozen. The root of the subtree is initially marked as {\it -freezing}. This prevents the creation of any new auth pins within the -subtree. After all existing auth pins are removed, the subtree is -then marked as {\it frozen}, at which point all updates are -forbidden. This allows metadata state to be packaged up in a message -and transmitted to the new authority, without worrying about -intervening updates. - -If the directory at the base of a freezing or frozen subtree is not -also a subtree root (that is, it has dir_auth == AUTH_PARENT), the -directory's parent inode is auth pinned. - - - a frozen tree root dir will auth_pin its inode IFF it is auth AND - not a subtree root. - -This prevents a parent directory from being concurrently frozen, and a -range of resulting implementation complications relating metadata -migration. - - -CACHE EXPIRATION FOR FROZEN SUBTREES - -Cache expiration messages that are received for a subtree that is -frozen are temporarily set aside instead of being processed. Only -when the subtree is unfrozen are the expirations either processed (if -the MDS is authoritative) or discarded (if it is not). Because either -the exporting or importing metadata can fail during the migration -process, the MDS cannot tell whether it will be authoritative or not -until the process completes. - -During a migration, the subtree will first be frozen on both the -exporter and importer, and then all other replicas will be informed of -a subtrees ambiguous authority. This ensures that all expirations -during migration will go to both parties, and nothing will be lost in -the event of a failure. 
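
Schematically, the "set aside while frozen" behaviour looks something like the toy sketch below (stub types only, not the real MDCache code):

  #include <list>

  struct MCacheExpire {};   // stand-in for the real expiration message

  struct FrozenSubtree {
    bool frozen;
    bool am_auth;                           // known once the migration resolves
    std::list<MCacheExpire*> delayed;

    FrozenSubtree() : frozen(true), am_auth(false) {}

    void process_expire(MCacheExpire *m) {
      // would drop the replica entries named in the message (elided)
      delete m;
    }

    void handle_expire(MCacheExpire *m) {
      if (frozen) {
        delayed.push_back(m);               // can't tell yet whether we'll be auth
        return;
      }
      process_expire(m);
    }

    void unfreeze(bool ended_up_auth) {
      frozen = false;
      am_auth = ended_up_auth;
      while (!delayed.empty()) {
        MCacheExpire *m = delayed.front();
        delayed.pop_front();
        if (am_auth)
          process_expire(m);                // we kept authority: apply it
        else
          delete m;                         // authority moved away: discard
      }
    }
  };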
- - - - -NORMAL MIGRATION - -The exporter begins by doing some checks in export_dir() to verify -that it is permissible to export the subtree at this time. In -particular, the cluster must not be degraded, the subtree root may not -be freezing or frozen, and the path must be pinned (\ie not conflicted -with a rename). If these conditions are met, the subtree root -directory is temporarily auth pinned, the subtree freeze is initiated, -and the exporter is committed to the subtree migration, barring an -intervening failure of the importer or itself. - -The MExportDiscover serves simply to ensure that the inode for the -base directory being exported is open on the destination node. It is -pinned by the importer to prevent it from being trimmed. This occurs -before the exporter completes the freeze of the subtree to ensure that -the importer is able to replicate the necessary metadata. When the -exporter receives the MDiscoverAck, it allows the freeze to proceed by -removing its temporary auth pin. - -The MExportPrep message then follows to populate the importer with a -spanning tree that includes all dirs, inodes, and dentries necessary -to reach any nested subtrees within the exported region. This -replicates metadata as well, but it is pushed out by the exporter, -avoiding deadlock with the regular discover and replication process. -The importer is responsible for opening the bounding directories from -any third parties authoritative for those subtrees before -acknowledging. This ensures that the importer has correct dir_auth -information about where authority is redelegated for all points nested -beneath the subtree being migrated. While processing the MExportPrep, -the importer freezes the entire subtree region to prevent any new -replication or cache expiration. - -A warning stage occurs only if the base subtree directory is open by -nodes other than the importer and exporter. If it is not, then this -implies that no metadata within or nested beneath the subtree is -replicated by any node other than the importer an exporter. If it is, -then a MExportWarning message informs any bystanders that the -authority for the region is temporarily ambiguous, and lists both the -exporter and importer as authoritative MDS nodes. In particular, -bystanders who are trimming items from their cache must send -MCacheExpire messages to both the old and new authorities. This is -necessary to ensure that the surviving authority reliably receives all -expirations even if the importer or exporter fails. While the subtree -is frozen (on both the importer and exporter), expirations will not be -immediately processed; instead, they will be queued until the region -is unfrozen and it can be determined that the node is or is not -authoritative. - -The exporter walks the subtree hierarchy and packages up an MExport -message containing all metadata and important state (\eg, information -about metadata replicas). At the same time, the expoter's metadata -objects are flagged as non-authoritative. The MExport message sends -the actual subtree metadata to the importer. Upon receipt, the -importer inserts the data into its cache, marks all objects as -authoritative, and logs a copy of all metadata in an EImportStart -journal message. Once that has safely flushed, it replies with an -MExportAck. The exporter can now log an EExport journal entry, which -ultimately specifies that the export was a success. 
In the presence -of failures, it is the existence of the EExport entry only that -disambiguates authority during recovery. - -Once logged, the exporter will send an MExportNotify to any -bystanders, informing them that the authority is no longer ambiguous -and cache expirations should be sent only to the new authority (the -importer). Once these are acknowledged back to the exporter, -implicitly flushing the bystander to exporter message streams of any -stray expiration notices, the exporter unfreezes the subtree, cleans -up its migration-related state, and sends a final MExportFinish to the -importer. Upon receipt, the importer logs an EImportFinish(true) -(noting locally that the export was indeed a success), unfreezes its -subtree, processes any queued cache expierations, and cleans up its -state. - - -PARTIAL FAILURE RECOVERY - - - - -RECOVERY FROM JOURNAL - - - - - - - - - diff --git a/branches/sage/pgs/doc/dentries.txt b/branches/sage/pgs/doc/dentries.txt deleted file mode 100644 index ab14765998b2f..0000000000000 --- a/branches/sage/pgs/doc/dentries.txt +++ /dev/null @@ -1,4 +0,0 @@ - -null dentires only exist - - on auth - - on replica, if they are xlock \ No newline at end of file diff --git a/branches/sage/pgs/doc/exports.txt b/branches/sage/pgs/doc/exports.txt deleted file mode 100644 index 8e0e146bea2fe..0000000000000 --- a/branches/sage/pgs/doc/exports.txt +++ /dev/null @@ -1,72 +0,0 @@ - -NORMAL MIGRATION - -The exporter begins by doing some checks in export_dir() to verify -that it is permissible to export the subtree at this time. In -particular, the cluster must not be degraded, the subtree root may not -be freezing or frozen (\ie already exporting, or nested beneath -something that is exporting), and the path must be pinned (\ie not -conflicted with a rename). If these conditions are met, the subtree -freeze is initiated, and the exporter is committed to the subtree -migration, barring an intervening failure of the importer or itself. - -The MExportDiscover serves simply to ensure that the base directory -being exported is open on the destination node. It is pinned by the -importer to prevent it from being trimmed. This occurs before the -exporter completes the freeze of the subtree to ensure that the -importer is able to replicate the necessary metadata. When the -exporter receives the MDiscoverAck, it allows the freeze to proceed. - -The MExportPrep message then follows to populate a spanning tree that -includes all dirs, inodes, and dentries necessary to reach any nested -exports within the exported region. This replicates metadata as well, -but it is pushed out by the exporter, avoiding deadlock with the -regular discover and replication process. The importer is responsible -for opening the bounding directories from any third parties before -acknowledging. This ensures that the importer has correct dir_auth -information about where authority is delegated for all points nested -within the subtree being migrated. While processing the MExportPrep, -the importer freezes the entire subtree region to prevent any new -replication or cache expiration. - -The warning stage occurs only if the base subtree directory is open by -nodes other than the importer and exporter. If so, then a -MExportWarning message informs any bystanders that the authority for -the region is temporarily ambiguous. In particular, bystanders who -are trimming items from their cache must send MCacheExpire messages to -both the old and new authorities. 
This is necessary to ensure that -the surviving authority reliably receives all expirations even if the -importer or exporter fails. While the subtree is frozen (on both the -importer and exporter), expirations will not be immediately processed; -instead, they will be queued until the region is unfrozen and it can -be determined that the node is or is not authoritative for the region. - -The MExport message sends the actual subtree metadata to the importer. -Upon receipt, the importer inserts the data into its cache, logs a -copy in the EImportStart, and replies with an ExportAck. The exporter -can now log an EExportFinish(true), which ultimately specifies that -the export was a success. In the presence of failures, it is the -existence (and value) of the EExportFinish that disambiguates -authority during recovery. - -Once logged, the exporter will send an MExportNotify to any -bystanders, informing them that the authority is no longer ambiguous -and cache expirations should be sent only to the new authority (the -importer). Once these are acknowledged, implicitly flushing the -bystander to exporter message streams of any stray expiration notices, -the exporter unfreezes the subtree, cleans up its state, and sends a -final MExportFinish to the importer. Upon receipt, the importer logs -an EImportFinish(true), unfreezes its subtree, and cleans up its -state. - - -PARTIAL FAILURE RECOVERY - - - -RECOVERY FROM JOURNAL - - - - - diff --git a/branches/sage/pgs/doc/file_modes.txt b/branches/sage/pgs/doc/file_modes.txt deleted file mode 100644 index d4ceba4034e5f..0000000000000 --- a/branches/sage/pgs/doc/file_modes.txt +++ /dev/null @@ -1,66 +0,0 @@ - -underlying client capabilities: - -- read + cache -- read sync -- write sync -- write + buffer - (...potentially eventually augmented by byte ranges) - -whatever system of modes, tokens, etc. has to satisfy the basic -constraint that no conflicting capabilities are ever in the -hands of clients. - - -questions: -- is there any use to clients writing to a replica? - - reading, yes.. 100,000 open same file.. - - ------- - -simplest approach: -- all readers, writers go through authority -- all open, close traffic at replicas forwarded to auth - -- fh state migrates with exports. - - - --------- - -less simple: -- all writers go through authority - - open, close traffic fw -- readers from any replica - - need token from auth -- weird auth <-> replica <-> client interactions ensue! - - --------- - -even more complex (and totally FLAWED, ignore this!) - -- clients can open a file with any replica (for read or write). -- replica gets a read or write token from the primary - - primary thus knows if it's all read, all write, mixed, or none. -- once replica has a token it can service as many clients (of given type(s)) as it wants. -- on export, tokens are moved too. - - primary give _itself_ a token too! much simpler. - -- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock -- globally, the mode is controlled by the primary, based on the mixture of - read and write tokens issued - - - -- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can - request to read or write from the mds (which might twiddle the mode for performance - reasons.. e.g. 
lots of ppl rdwr but no actual reading) - - - - --------- - - diff --git a/branches/sage/pgs/doc/header.txt b/branches/sage/pgs/doc/header.txt deleted file mode 100644 index bccdb81533b6f..0000000000000 --- a/branches/sage/pgs/doc/header.txt +++ /dev/null @@ -1,13 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ diff --git a/branches/sage/pgs/doc/inos.txt b/branches/sage/pgs/doc/inos.txt deleted file mode 100644 index b5ab1db25ca60..0000000000000 --- a/branches/sage/pgs/doc/inos.txt +++ /dev/null @@ -1,11 +0,0 @@ - -inodeno_t namespace - - relevant both for ino's, and for the (ino) input for Filer and object storage namespace... - -1 - root inode - -100+mds - mds log/journal -200+mds - mds ino, fh allocation tables -300+mds - mds inode files (for non-embedded inodes) - -1000+ - regular files and directories \ No newline at end of file diff --git a/branches/sage/pgs/doc/journal.txt b/branches/sage/pgs/doc/journal.txt deleted file mode 100644 index 22cb4fc9e21b2..0000000000000 --- a/branches/sage/pgs/doc/journal.txt +++ /dev/null @@ -1,124 +0,0 @@ - - -- LogEvent.replay() is idempotent. we won't know whether the update is old or not. - - - - - - - - - - - - - - - -journal is distributed among different nodes. because authority changes over time, it's not immedicatley clear to a recoverying node relaying the journal whether the data is "real" or not (it might be exported later in the journal). - - -possibilities: - - -ONE.. bloat the journal! - -- journal entry includes full trace of dirty data (dentries, inodes) up until import point - - local renames implicit.. cache is reattached on replay - - exports are a list of exported dirs.. which are then dumped - ... - -recovery phase 1 -- each entry includes full trace (inodes + dentries) up until the import point -- cache during recovery is fragmetned/dangling beneath import points -- when export is encountered items are discarded (marked clean) - -recovery phase 2 -- import roots ping store to determine attachment points (if not already known) - - if it was imported during period, attachment point is already known. - - renames affecting imports are logged too -- import roots discovered from other nodes, attached to hierarchy - -then -- maybe resume normal operations -- if recovery is a background process on a takeover mds, "export" everything to that node. - - --> journal contains lots of clean data.. maybe 5+ times bigger as a result! - -possible fixes: - - collect dir traces into journal chunks so they aren't repeated as often - - each chunk summarizes traces in previous chunk - - hopefully next chunk will include many of the same traces - - if not, then the entry will include it - - - - -=== log entry types === -- all inode, dentry, dir items include a dirty flag. 
-- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm - -ImportPath - log change in import path -Import - log import addition (w/ path, dirino) - -InoAlloc - allocate ino -InoRelease - release ino - -Inode - inode info, along with dentry+inode trace up to import point -Unlink - (null) dentry + trace, + flag (whether inode/dir is destroyed) -Link - (new) dentry + inode + trace - - ------------------------------ - -TWO.. -- directories in store contain path at time of commit (relative to import, and root) -- replay without attaching anything to heirarchy -- after replay, directories pinged in store to attach to hierarchy - --> phase 2 too slow! --> and nested dirs may reattach... that won't be apparent from journal. - - put just parent dir+dentry in dir store.. even worse on phase 2! - - -THREE -- - - - - - - - -metadata journal/log - - -event types: - -chown, chmod, utime - InodeUpdate - -mknod, mkdir, symlink - Mknod .. new inode + link - -unlink, rmdir - Unlink - -rename - Link + Unlink (foreign) -or Rename (local) - -link - Link .. link existing inode - - - - -InodeUpdate -DentryLink -DentryUnlink -InodeCreate -InodeDestroy -Mkdir? diff --git a/branches/sage/pgs/doc/lazy_posix.txt b/branches/sage/pgs/doc/lazy_posix.txt deleted file mode 100644 index 1d226cd03d8e4..0000000000000 --- a/branches/sage/pgs/doc/lazy_posix.txt +++ /dev/null @@ -1,53 +0,0 @@ - -http://www.usenix.org/events/fast05/wips/slides/welch.pdf - - - --- STATLITE - statlite(const char *filename, struct statlite *buf); - fstatlite(int fd, struct statlite *buf); - lstatlite(const char *filename, struct statlite *buf); - - * file size, mtime are optionally not guaranteed to be correct - * mask field to specify which fields you need to be correct - - --- READDIR+ - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - * plus returns lstat - * lite returns lstatlite - - --- lazy i/o integrity - - O_LAZY to open(2) - - * relax data coherency - * writes may not be visible until lazyio_propagate, fsync, close - - lazyio_propagate(int fd, off_t offset, size_t count); - * my writes are safe - - lazyio_synchronize(int fd, off_t offset, size_t count); - * i will see everyone else's propagated writes - --- read/write non-serial vectors - - ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - - * like readv/writev, but serial - * - - -int lockg(int fd, int cmd, lgid_t *lgid) - group locks - -int openg(char *path, int mode, fh_t *handle); - portable file handle -int sutoc(fh_t *fh); \ No newline at end of file diff --git a/branches/sage/pgs/doc/mds_locks.txt b/branches/sage/pgs/doc/mds_locks.txt deleted file mode 100644 index f41a89a9b31e5..0000000000000 --- a/branches/sage/pgs/doc/mds_locks.txt +++ /dev/null @@ -1,66 +0,0 @@ - -new names - dentry_read (not path_pins) - dentry_xlock - - inode_read - inode_xlock (not inode_write) - -locks are always tied to active_requests. - -read locks can be placed on any node. -xlocks must be applied at the authority. - -for multi-lock operations (link, unlink, rename), we must acquire xlocks on a remote node. lock requests are associated with a reqid. 
the authoritative node keeps track of which remote xlocks it holds. when forwarded/restarted, it can drop remote locks. - -when restarting, drop all locks. -on remote, drop locks and state, and notify main req node. -recover dist request state on rejoin: - - surviving op initiator will assert read or xlock - - recovering op initiator will restart requests. (from initiator's perspective, ops have either happened or they haven't, depending on whether the event is journaled.) - - recovering or surviving op cohort will determine lock state during rejoin, or get a commit or rollback... - - - - ---- path_pin = read lock on /some/random/path - - blocks a dentry xlock - ---- dnxlock = exclusive lock on /some/random/path - - locking: prevents subsequent path pins. - - locked: prevents dn read - - on auth - --> grab _all_ path pins at onces; hold none while waiting. --> grab xlocks in order. - ---- auth_pin = pin to authority, on *dir, *in - - prevents freezing -> frozen. - - freezing blocks new auth pins, thus blocking other local auth_pins. (hangs up local export.) - - does not block remote auth_pins, because remote side is not auth (or frozen!) until after local subtree is frozen. - --> blocking on auth_pins is dangerous. _never_ block if we are holding other auth_pins on the same node (subtree?). --> grab _all_ auth pins at once; hold none while waiting. - ---- hard/file_wrlock = exlusive lock on inode content - - prevents inode read - - on auth - --> grab locks in order. - - -ORDERING -- namespace(dentries) < inodes -- order dentries on (dirino, dname) -- order inodes on (ino); -- need to order both read and write locks, esp with dentries. so, if we need to lock /usr/bin/foo with read on usr and bin and xwrite on foo, we need to acquire all of those locks using the same ordering. - - on same host, we can be 'nice' and check lockability of all items, then lock all, and drop everything while waiting. (actually, is there any use to this?) - - on mutiple hosts, we need to use full ordering (at least as things separate across host boundaries). and if needed lock set changes (such that the order of already acquired locks changes), we need to drop those locks and start over. - -- how do auth pins fit into all this? - - auth pin on xlocks only. no need on read locks. - - pre-grab all auth pins on a node the first time it is visiting during lock acquisition. - - what if things move? if we find we are missing a needed auth pin when we revisit a host at any point, and the item is not still authpinnable, we back off and restart. (we cannot block.) - - - - if we find we are not authpinnable, drop all locks and wait. 
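
A toy sketch of the ordering + back-off idea above (made-up key type and lock table; the real locks are the dentry/inode lock objects tied to active_requests):

  #include <set>
  #include <string>

  struct LockKey {
    bool is_inode;                    // false = dentry; dentries sort first
    unsigned long long ino;           // dirino for dentries, ino for inodes
    std::string dname;                // empty for inodes

    bool operator<(const LockKey& o) const {
      if (is_inode != o.is_inode) return !is_inode;
      if (ino != o.ino) return ino < o.ino;
      return dname < o.dname;
    }
  };

  std::set<LockKey> held;             // stand-in for the real lock state

  bool try_lock(const LockKey& k) { return held.insert(k).second; }
  void unlock(const LockKey& k)   { held.erase(k); }

  // grab every needed lock in sorted order; if any would conflict, drop
  // everything already taken and let the caller wait and restart.
  bool acquire_all(const std::set<LockKey>& needed) {
    for (std::set<LockKey>::const_iterator p = needed.begin(); p != needed.end(); ++p) {
      if (!try_lock(*p)) {
        for (std::set<LockKey>::const_iterator q = needed.begin(); q != p; ++q)
          unlock(*q);
        return false;
      }
    }
    return true;
  }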
- - diff --git a/branches/sage/pgs/doc/modeline.txt b/branches/sage/pgs/doc/modeline.txt deleted file mode 100644 index 1b3956f4d486b..0000000000000 --- a/branches/sage/pgs/doc/modeline.txt +++ /dev/null @@ -1,2 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab diff --git a/branches/sage/pgs/doc/osd_outline.txt b/branches/sage/pgs/doc/osd_outline.txt deleted file mode 100644 index 2c6f3287aac5f..0000000000000 --- a/branches/sage/pgs/doc/osd_outline.txt +++ /dev/null @@ -1,37 +0,0 @@ - -intro - -osd cluster map - requirements - desireable properties - (c)rush - -failure detection - distributed ping or heartbeat - central filter, notifier - -design - placement seed, class/superset, groups - -normal operation - reads - writes - -recovery - triggers: failed disk, or total cluster reorganization - - notify - peering - pull - push - clean - -writes during recovery - -graceful data loss + recovery? - - - - - - diff --git a/branches/sage/pgs/doc/osd_replication.txt b/branches/sage/pgs/doc/osd_replication.txt deleted file mode 100644 index 907d00e2050a2..0000000000000 --- a/branches/sage/pgs/doc/osd_replication.txt +++ /dev/null @@ -1,226 +0,0 @@ - - -SOME GENERAL REQUIREMENTS - -- cluster expansion: - - any or all of the replicas may move to new OSDs. - -- cluster map may change frequently - - map change should translate into pending replication/migration - state quickly (or better yet, instantly), so that we could push - through a series of (say, botched) maps quickly and be fine, so long - as the final map is correct. - -- ideally, unordered osd<->osd, client<->osd communication - (mds<->mds, client<->mds communication is ordered, but file i/o - would be too slow that way?) - - - - -PRIMARY ONLY PICTURE - -let's completely ignore replication for a while, and see how -complicated the picture needs to be to reliably support cluster expansion. - -typedef __uint64_t version_t; - - -per-Object metadata: -- version #. incremented when an object is modified. - e.g. version_t version; -- on primary, keep list of stray replicas - e.g. map stray_replicas; // osds w/ stray replicas - includes old primary osd(s), until deletion is confirmed. used while rg - is importing. - - -per-RG metadata -- object list. well, a method to fetch it by querying a collection or whatever. -- negative list - e.g. map deleted_objects; - - used to enumerate deleted objects, when in "importing" state. -- a RG "state" (enum/int) - - - - - - -Normal RG state: -- role=primary - clean - i am primary, all is well. no stray copies. i can - discard my negative object list, since my local - object store tells me everything. - - -After a map change: -- new primary - undef - initially; i don't know RG exists. -- old primary - homeless - i was primary, still have unmolested data. new primary is not yet migrating - (presumably it's state=undef.) i need to contact new primary and tell them - this RG exists. - -- new primary - importing - i am migrating data from old primary. keep negative dir entries for deletions. - write locally. proxy reads (force immediately migration). do whole objects - initially (on write, block until i migrate the object). later we can do - sub-object state (where "live" object data is spread across new/old primaries.. -- old primary - exporting - primary is migrating my data. - undef - when it finishes. (i will forget this RG existed.) - - -After a second map change (scenario 1): - as above, if we were clean again. 
- -After a second map change (scenario 2): - we weren't clean yet. -- new primary - undef - initially (until i learn RG exists) -- old primary - importing - i'm still migrating from old old primary -- old old primary - exporting - ... -- old primary -?? importing+exporting - proxy reads as before. continue migrating from old old primary. - - -After a second map change (scenario 3): - we weren't clean yet, and old old primary is also new primary -- new primary (and old old primary) - exporting - change state to importing. be sure to compare object versions, and neg dir - entries (as we always should do, really!). -- old primary - importing - note that the old import source matches new primary, and change - state to exporting, and stop importing. (unlike scenario 2) - --> this approach could mean that a series of fast map changes could - force data to migrate down a "chain" of old primaries to reach the - new one. maybe old primary should go from importing -> exporting, - and pass along old old primary id to new primary such that the - import is a many-to-one thing, instead of one-to-one. version - numbers and neg entries will make it easy to pick out correct versions. - - - -For the importing process on a given RG: - -- metadata for each source - - each source has a state: - 'starting' - don't know anything about source yet. query source! - this probaby induces the source to change from - 'homeless' or something similar to 'exporting'. - 'importing' - i've fetched the source's object list (and neg - object list). i'm busy reading them! these lists - will shrink as the process continues. after i fetch - an object, i will erase it from the source. - (object metadata will include stray copy info - until i confirm that its removed.) - 'finishing' - i've read all my data, and i'm telling the old person - to discard any remaining RG metadata (RG contents - should already be gone) - - unmigrated object list - - migrated but not deleted object list - - stray osd is also listed in per-object MD during this stage - - negative object list - - i can remove these items if i see a newer object version (say, - from a different import source or something). - - i can remove any local objects or ignore imported ones if it is - older than deleted version - -- the lists should be sets or otherwise queryable so that while i'm - importing and a real op comes through I can quickly determine if a - given object_id is pending migration etc or if my local store is to - be trusted. - - - - - -SOME CODE BITS - - -typedef __uint64_t version_t; -class Object { - version_t version; - map stray_replicas; -}; - - -class ReplicaGroup { - int enumerate_objects(list& ls); - - int state; - - // for unstable states, - map deleted_objects; // locally - map exporters; // importing from these guys. -}; - -// primary -#define RG_STATE_CLEAN 1 -#define RG_STATE_IMPORTING 2 // pulling data - -// non-primary -#define RG_STATE_HOMELESS 5 // old primary; new primary not yet - // notified; not yet exporting. -#define RG_STATE_EXPORTING 6 // a newer primary is extracting my - // data. - - -struct RGExporter_t { - int import_state; - - set remaining_objects; // remote object list - set stray_objects; // imported but not deleted. - -}; - - - - - ----- -all crap from here on down - - - - -REPLICAS -- - - - - -OSD STATES -- primary, up to date. -- replica, up to date. - -- primary, proxy to old primary (primaries?) - -- replica, not up to date. - - -REPLICATION STUFF - -Per-RG metadata -- primary - - per-replica state: clean, catching up? 
-- replica - -Per-object metadata -- primary and replica - - version number/mtime - - rg (reverse indexed) -- primary - - replication level and state. - - commited to memory and/or disk, on which replicas (#1, #2, etc.) -- replica - - - - - --> \ No newline at end of file diff --git a/branches/sage/pgs/doc/shared_write_states_nogo.txt b/branches/sage/pgs/doc/shared_write_states_nogo.txt deleted file mode 100644 index f409617d82681..0000000000000 --- a/branches/sage/pgs/doc/shared_write_states_nogo.txt +++ /dev/null @@ -1,39 +0,0 @@ - -// stable states // ------auth----- -----replica----- -#define LOCK_SYNC 0 // R . / . . . WB same ... for stat() -#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync() -#define LOCK_RDONLY 2 // R . / RC R . . same -#define LOCK_MIXED 3 // . . / . R W . same -#define LOCK_WRONLY 4 // . . / . . W WB same - -// transition states -#define LOCK_GSYNCR 8 // R . / RC . . . same -#define LOCK_GSYNCMW 9 // . . / RC . . WB same -#define LOCK_GSYNCMW2 9 // . . / RC . . WB same - -#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . . -#define LOCK_GLOCKMW 7 // . . / RC . . . same - -#define LOCK_GRDONLYM 10 // . . / . R . . same -#define LOCK_GRDONLYM2 10 // --- . . / . R . . -#define LOCK_GRDONLYW 11 // . . / . . . . same -#define LOCK_GRDONLYW2 11 // --- . . / . . . . -#define LOCK_GRDONLYS 12 // R . / RC . . . same -#define LOCK_GRDONLYL 13 // R . / RC . . . --- - -#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . . -#define LOCK_GMIXEDR2 15 // --- . . / . R . . -#define LOCK_GMIXEDW 16 // . . / . . W . same -#define LOCK_GMIXEDW2 16 // --- . . / . . W . -#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . . -#define LOCK_GMIXEDS2 16 // --- . . / . . . . -#define LOCK_GMIXEDL 17 // R . / . . . . --- - -#define LOCK_GWRONLYR 18 // R . / . . . . same -#define LOCK_GWRONLYR2 18 // --- . . / . . . . -#define LOCK_GWRONLYM 19 // . . / . . . . same -#define LOCK_GWRONLYM2 19 // --- . . / . . . . -#define LOCK_GWRONLYS 20 // R . / . . . WB same -#define LOCK_GWRONLYS2 20 // --- . . / . . . . -#define LOCK_GWRONLYL 21 - diff --git a/branches/sage/pgs/doc/shutdown.txt b/branches/sage/pgs/doc/shutdown.txt deleted file mode 100644 index e5ccde3171004..0000000000000 --- a/branches/sage/pgs/doc/shutdown.txt +++ /dev/null @@ -1,13 +0,0 @@ - -- mds0 triggers shutdown by sending a shutdown_start to all nodes. - -- from here on out, all client requests are discarded (unless they are a file close?) - -- each mds checks for outstanding inter-mds transations. e.g imports, discoveries, etc. once they're all done, send a shutdown_ready to mds0 - -- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty) - -- when the cache is empty, send shutdown_done to mds0 and exit. - -- mds0 exits when all mdss have finished. - diff --git a/branches/sage/pgs/ebofs/Allocator.cc b/branches/sage/pgs/ebofs/Allocator.cc deleted file mode 100644 index 70b641cfee14f..0000000000000 --- a/branches/sage/pgs/ebofs/Allocator.cc +++ /dev/null @@ -1,693 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Allocator.h" -#include "Ebofs.h" - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator." - - -void Allocator::dump_freelist() -{ - if (1) { - interval_set free; // validate too - - block_t n = 0; - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = fs->free_tab[b]; - dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << endl; - } else { - tab = fs->limbo_tab; - dout(0) << "dump limbo " << tab->get_num_keys() << endl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl; - assert(cursor.current().value > 0); - - if (b < EBOFS_NUM_FREE_BUCKETS) - n += cursor.current().value; - - if (free.contains( cursor.current().key, cursor.current().value )) - dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << endl; - assert(!free.contains( cursor.current().key, cursor.current().value )); - free.insert( cursor.current().key, cursor.current().value ); - if (cursor.move_right() <= 0) break; - } - } else { - //cout << " empty" << endl; - } - } - - assert(n == fs->free_blocks); - dout(0) << "dump combined freelist is " << free << endl; - - - // alloc_tab - if (fs->alloc_tab->get_num_keys() > 0) { - Table >::Cursor cursor(fs->alloc_tab); - assert(fs->alloc_tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref " - << cursor.current().value.second - << endl; - assert(cursor.current().value.first > 0); - - if (cursor.move_right() <= 0) break; - } - } - } -} - - -int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir) -{ - Table::Cursor cursor(fs->free_tab[bucket]); - bool found = false; - - if ((dir == DIR_ANY || dir == DIR_FWD) && - fs->free_tab[bucket]->find( near, cursor ) >= 0) { - // look to the right - do { - if (cursor.current().value >= num) - found = true; - } while (!found && cursor.move_right() > 0); - } - - if ((dir == DIR_ANY || dir == DIR_BACK) && - !found) { - // look to the left - fs->free_tab[bucket]->find( near, cursor ); - - while (!found && cursor.move_left() >= 0) - if (cursor.current().value >= num) - found = true; - } - - if (found) { - ex.start = cursor.current().key; - ex.length = cursor.current().value; - return 0; - } - - return -1; -} - -int Allocator::allocate(Extent& ex, block_t num, block_t near) -{ - //dump_freelist(); - - int dir = DIR_ANY; // no dir - if (near == NEAR_LAST_FWD) { - near = last_pos; - dir = DIR_FWD; // fwd - } - else if (near == NEAR_LAST) - near = last_pos; - - int bucket; - - while (1) { // try twice, if fwd = true - - // look for contiguous extent - for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) { - if (find(ex, bucket, num, near, dir) >= 0) { - // yay! - - // remove original - fs->free_tab[bucket]->remove( ex.start ); - fs->free_blocks -= ex.length; - - if (ex.length > num) { - if (ex.start < near) { - // to the left - if (ex.start + ex.length - num <= near) { - // by a lot. take right-most portion. - Extent left; - left.start = ex.start; - left.length = ex.length - num; - ex.start += left.length; - ex.length -= left.length; - assert(ex.length == num); - _release_loner(left); - } else { - // take middle part. 
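  // (sketch of what follows: the free extent we found straddles 'near', so
  //  carve the requested 'num' blocks out of its middle starting at 'near',
  //  and hand the unused pieces on either side back to the free list via
  //  _release_loner)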
- Extent left,right; - left.start = ex.start; - left.length = near - ex.start; - ex.start = near; - right.start = ex.start + num; - right.length = ex.length - left.length - num; - ex.length = num; - _release_loner(left); - _release_loner(right); - } - } - else { - // to the right. take left-most part. - Extent right; - right.start = ex.start + num; - right.length = ex.length - num; - ex.length = num; - _release_loner(right); - } - } - - dout(20) << "allocate " << ex << " near " << near << endl; - last_pos = ex.end(); - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return num; - } - } - - if (dir == DIR_BACK || dir == DIR_ANY) break; - dir = DIR_BACK; - } - - // ok, find partial extent instead. - for (block_t trysize = num/2; trysize >= 1; trysize /= 2) { - int bucket = pick_bucket(trysize); - if (find(ex, bucket, trysize, near) >= 0) { - // yay! - assert(ex.length < num); - - fs->free_tab[bucket]->remove(ex.start); - fs->free_blocks -= ex.length; - last_pos = ex.end(); - dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << endl; - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return ex.length; - } - } - - dout(1) << "allocate failed, fs completely full! " << fs->free_blocks << endl; - assert(0); - //dump_freelist(); - return -1; -} - -int Allocator::_release_into_limbo(Extent& ex) -{ - dout(10) << "_release_into_limbo " << ex << endl; - dout(10) << "limbo is " << limbo << endl; - assert(ex.length > 0); - limbo.insert(ex.start, ex.length); - fs->limbo_blocks += ex.length; - return 0; -} - -int Allocator::release(Extent& ex) -{ - if (g_conf.ebofs_cloneable) - return alloc_dec(ex); - - _release_into_limbo(ex); - return 0; -} - -int Allocator::commit_limbo() -{ - dout(20) << "commit_limbo" << endl; - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - fs->limbo_tab->insert(i->first, i->second); - //fs->free_blocks += i->second; - } - limbo.clear(); - //fs->limbo_blocks = 0; - //dump_freelist(); - return 0; -} - -int Allocator::release_limbo() -{ - //dump_freelist(); - if (fs->limbo_tab->get_num_keys() > 0) { - Table::Cursor cursor(fs->limbo_tab); - fs->limbo_tab->find(0, cursor); - while (1) { - Extent ex(cursor.current().key, cursor.current().value); - dout(20) << "release_limbo ex " << ex << endl; - - fs->limbo_blocks -= ex.length; - _release_merge(ex); - - if (cursor.move_right() <= 0) break; - } - } - fs->limbo_tab->clear(); - //dump_freelist(); - return 0; -} - - - -/* -int Allocator::_alloc_loner_inc(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "_alloc_loner_inc " << ex << " " - << (v.second-1) << " -> " << v.second - << endl; - } else { - // insert it, @1 - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << endl; - } - return 0; -} - -int Allocator::_alloc_loner_dec(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - if (cursor.current().value.second == 1) { - dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << endl; - fs->alloc_tab->remove( cursor.current().key ); - } else { - pair& v = cursor.dirty_current_value(); - --v.second; - dout(10) << "_alloc_loner_dec " << ex << " " - << 
(v.second+1) << " -> " << v.second - << endl; - } - } else { - assert(0); - } - return 0; -} -*/ - - -int Allocator::alloc_inc(Extent ex) -{ - dout(10) << "alloc_inc " << ex << endl; - - // empty table? - if (fs->alloc_tab->get_num_keys() == 0) { - // easy. - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << endl; - return 0; - } - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_inc move_left r = " << r << endl; - } - - while (1) { - dout(10) << "alloc_inc loop at " << cursor.current().key - << "~" << cursor.current().value.first - << " ref " << cursor.current().value.second - << endl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // adjacent? - bool adjacent = false; - if (cursor.current().key + cursor.current().value.first == ex.start && - cursor.current().value.second == 1) - adjacent = true; - - // no overlap. - r = cursor.move_right(); - dout(10) << "alloc_inc move_right r = " << r << endl; - - // at end? - if (r <= 0) { - // hmm! - if (adjacent) { - // adjust previous entry - cursor.move_left(); - pair &v = cursor.dirty_current_value(); - v.first += ex.length; // yay! - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << endl; - } else { - // insert at end, finish. - int r = fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << endl; - //dump_freelist(); - } - return 0; - } - } - - if (cursor.current().key > ex.start) { - // gap. - // oooooo - // nnnnn..... - block_t l = MIN(ex.length, cursor.current().key - ex.start); - - fs->alloc_tab->insert(ex.start, pair(l,1)); - dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << endl; - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - else if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // nnnnn - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - block_t l = end - ex.start; - fs->alloc_tab->insert(ex.start, pair(l, 1+ref)); - - dout(10) << "alloc_inc " << ex.start << "~" << l - << " " << ref << " -> " << ref+1 - << " (right split)" << endl; - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ------------- - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - fs->alloc_tab->insert(ex.start, pair(ex.length, 1+ref)); - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (double split finish)" - << endl; - - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. 
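// The alloc_tab walked above keeps disjoint extents, each with one reference count, so
// bumping the count over a sub-range means splitting the covering entry and letting the
// untouched pieces keep their old count. Below is a simplified sketch of one such split,
// using a std::map in place of the on-disk Table; names and the single-entry
// precondition are assumptions made for illustration.
#include <cassert>
#include <map>
#include <utility>

typedef unsigned block_t;
typedef std::map<block_t, std::pair<block_t, int> > AllocMap;   // start -> (length, ref)

// Increment the refcount of [start, start+len), assuming that range lies entirely
// inside one existing entry (the loop above handles gaps and multi-entry overlaps).
inline void inc_within_one(AllocMap& tab, block_t start, block_t len)
{
  AllocMap::iterator p = tab.upper_bound(start);
  assert(p != tab.begin());
  --p;                                          // entry with p->first <= start
  block_t estart = p->first;
  block_t elen   = p->second.first;
  int     ref    = p->second.second;
  assert(estart + elen >= start + len);         // covering-entry precondition

  if (estart < start)
    p->second.first = start - estart;           // keep left remainder at the old ref
  else
    tab.erase(p);
  tab[start] = std::make_pair(len, ref + 1);    // bumped middle piece
  if (estart + elen > start + len)              // right remainder keeps the old ref
    tab[start + len] = std::make_pair(estart + elen - (start + len), ref);
}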
- // oooooo - // nnnnnnnn - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "alloc_inc " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second-1 << " -> " - << cursor.current().value.second - << " (left split)" << endl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - // single split, finish. - // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second++; - - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (left split finish)" - << endl; - - break; - } - } - } - - return 0; -} - - -int Allocator::alloc_dec(Extent ex) -{ - dout(10) << "alloc_dec " << ex << endl; - - assert(fs->alloc_tab->get_num_keys() >= 0); - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - dout(10) << "alloc_dec find r = " << r << endl; - - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_dec move_left r = " << r << endl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // no overlap. - dump_freelist(); - assert(0); - } - } - - while (1) { - dout(10) << "alloc_dec ? " << cursor.current().key - << "~" << cursor.current().value.first - << " " << cursor.current().value.second - << ", ex is " << ex - << endl; - - assert(cursor.current().key <= ex.start); // no gap allowed. - - if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // ----- - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shortened left bit of single" << endl; - - block_t l = end - ex.start; - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(l, ref-1)); - dout(10) << "alloc_dec . " << ex.start << "~" << l - << " " << ref << " -> " << ref-1 - << endl; - } else { - Extent r(ex.start, l); - _release_into_limbo(r); - } - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ooooooooooooo - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shorted left bit of double split" << endl; - - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(ex.length, ref-1)); - dout(10) << "alloc_inc s " << ex - << " " << ref << " -> " << ref-1 - << " reinserted middle bit of double split" - << endl; - } else { - _release_into_limbo(ex); - } - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - dout(10) << "alloc_dec s " << ex.end() << "~" << rl - << " " << ref - << " reinserted right bit of double split" << endl; - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. 
- // oooooo - // nnnnnnnn - if (cursor.current().value.second > 1) { - pair& v = cursor.dirty_current_value(); - v.second--; - dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second - << endl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - Extent r(cursor.current().key, cursor.current().value.first); - _release_into_limbo(r); - - ex.start += cursor.current().value.first; - ex.length -= cursor.current().value.first; - cursor.remove(); - - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - } else { - // single split, finish. - // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - if (ref > 1) { - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second--; - dout(10) << "alloc_inc . " << ex - << " " << ref << " -> " << ref-1 - << endl; - } else { - _release_into_limbo(ex); - cursor.remove(); - } - - dout(10) << "alloc_dec s " << ex.end() << "~" << l - << " " << ref - << " reinserted right bit of single split" << endl; - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - break; - } - } - - - } - - return 0; -} - - -/* - * release extent into freelist - * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents - */ -int Allocator::_release_loner(Extent& ex) -{ - assert(ex.length > 0); - int b = pick_bucket(ex.length); - fs->free_tab[b]->insert(ex.start, ex.length); - fs->free_blocks += ex.length; - return 0; -} - -/* - * release extent into freelist - * look for any adjacent extents and merge with them! - */ -int Allocator::_release_merge(Extent& orig) -{ - dout(15) << "_release_merge " << orig << endl; - assert(orig.length > 0); - - Extent newex = orig; - - // one after us? - for (int b=0; b::Cursor cursor(fs->free_tab[b]); - - if (fs->free_tab[b]->find( newex.start+newex.length, cursor ) - == Table::Cursor::MATCH) { - // add following extent to ours - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // one before us? - for (int b=0; b::Cursor cursor(fs->free_tab[b]); - fs->free_tab[b]->find( newex.start+newex.length, cursor ); - if (cursor.move_left() >= 0 && - (cursor.current().key + cursor.current().value == newex.start)) { - // merge - newex.start = cursor.current().key; - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // ok, insert newex - _release_loner(newex); - return 0; -} diff --git a/branches/sage/pgs/ebofs/Allocator.h b/branches/sage/pgs/ebofs/Allocator.h deleted file mode 100644 index a7d7aebf75d00..0000000000000 --- a/branches/sage/pgs/ebofs/Allocator.h +++ /dev/null @@ -1,86 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_ALLOCATOR_H -#define __EBOFS_ALLOCATOR_H - -#include "types.h" - -#include "include/interval_set.h" - -class Ebofs; - -class Allocator { -public: - const static block_t NEAR_LAST = 0; - const static block_t NEAR_LAST_FWD = 1; - - const static int DIR_ANY = 0; - const static int DIR_FWD = 2; - const static int DIR_BACK = 1; - -protected: - Ebofs *fs; - block_t last_pos; - - - interval_set limbo; - - static int pick_bucket(block_t num) { - int b = 0; - while (num > 1) { - b++; - num = num >> EBOFS_FREE_BUCKET_BITS; - } - if (b >= EBOFS_NUM_FREE_BUCKETS) - b = EBOFS_NUM_FREE_BUCKETS-1; - return b; - } - - int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY); - - void dump_freelist(); - - public: - int _release_into_limbo(Extent& ex); - - int _release_loner(Extent& ex); // release loner extent - int _release_merge(Extent& ex); // release any extent (searches for adjacent) - - //int _alloc_loner_inc(Extent& ex); - //int _alloc_loner_dec(Extent& ex); - - - public: - Allocator(Ebofs *f) : fs(f), last_pos(0) {} - - int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST); - int release(Extent& ex); // alias for alloc_dec - - int alloc_inc(Extent ex); - int alloc_dec(Extent ex); - - - /*int unallocate(Extent& ex) { // skip limbo - return _release_merge(ex); - } - */ - - int commit_limbo(); // limbo -> fs->limbo_tab - int release_limbo(); // fs->limbo_tab -> free_tabs - -}; - -#endif diff --git a/branches/sage/pgs/ebofs/BlockDevice.cc b/branches/sage/pgs/ebofs/BlockDevice.cc deleted file mode 100644 index 6227ac574da67..0000000000000 --- a/branches/sage/pgs/ebofs/BlockDevice.cc +++ /dev/null @@ -1,780 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BlockDevice.h" - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#ifndef __CYGWIN__ -#ifndef DARWIN -#include -#endif -#endif - - -/******************************************* - * biovec - */ - -inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio) -{ - out << "bio("; - if (bio.type == BlockDevice::biovec::IO_READ) out << "rd "; - if (bio.type == BlockDevice::biovec::IO_WRITE) out << "wr "; - out << bio.start << "~" << bio.length; - if (bio.note) out << " " << bio.note; - out << " " << &bio; - out << ")"; - return out; -} - - - -/******************************************* - * ElevatorQueue - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").elevatorq." -#define derr(x) if (x <= g_conf.debug_bdev) cerr << "bdev(" << dev << ").elevatorq." - - -int BlockDevice::ElevatorQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& block_lock) -{ - // queue empty? - assert(!io_map.empty()); - - dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << endl; - - // find our position: i >= pos - map::iterator i; - - int tries = g_conf.bdev_el_bidir + 1; - while (tries > 0) { - if (el_dir_forward) { - i = io_map.lower_bound(el_pos); - if (i != io_map.end()) { - break; // not at end. good. 
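// A sketch of the cursor positioning used by the elevator above: when sweeping forward
// we want the first queued request at or after the head position, when sweeping
// backward the last one at or before it. std::map stands in for io_map and the
// Request type is illustrative.
#include <map>

typedef unsigned block_t;
struct Request { block_t start, length; };
typedef std::map<block_t, Request*> IoMap;      // keyed by starting block

// Returns io_map.end() if there is nothing left in the requested direction.
inline IoMap::iterator next_in_direction(IoMap& io_map, block_t el_pos, bool forward)
{
  if (forward)
    return io_map.lower_bound(el_pos);          // first key >= el_pos
  IoMap::iterator i = io_map.upper_bound(el_pos);  // first key > el_pos
  if (i == io_map.begin())
    return io_map.end();                        // nothing at or below el_pos
  return --i;                                   // last key <= el_pos
}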
- } - } else { - i = io_map.upper_bound(el_pos); - if (i != io_map.begin()) { - i--; // and back down one (to get i <= pos). good. - break; - } - } - - // reverse (or initial startup)? - if (g_conf.bdev_el_bidir || !el_dir_forward) { - // dout(20) << "restart reversing" << endl; - el_dir_forward = !el_dir_forward; - } - - if (el_dir_forward) { - // forward - el_pos = 0; - - if (g_conf.bdev_el_fw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart forward sweep for " << max << endl; - } else { - // dout(20) << "restart fowrard sweep" << endl; - } - } else { - // reverse - el_pos = bdev->get_num_blocks(); - - if (g_conf.bdev_el_bw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart reverse sweep for " << max << endl; - } else { - // dout(20) << "restart reverse sweep" << endl; - } - } - - tries--; - } - - assert(tries > 0); // this shouldn't happen if the queue is non-empty. - - // get some biovecs - int num_bio = 0; - - dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << endl; - - // merge contiguous ops - char type = i->second->type; // read or write - int num_iovs = 0; // count eventual iov's for readv/writev - - start = i->first; - length = 0; - - if (el_dir_forward) - el_pos = start; - else - el_pos = i->first + i->second->length; - - // while (contiguous) - while ((( el_dir_forward && el_pos == i->first) || - (!el_dir_forward && el_pos == i->first + i->second->length)) && - type == i->second->type) { - biovec *bio = i->second; - - // allowed? (not already submitted to kernel?) - if (block_lock.intersects(bio->start, bio->length)) { - // dout(20) << "dequeue_io " << bio->start << "~" << bio->length - // << " intersects block_lock " << block_lock << endl; - break; // stop, or go with what we've got so far - } - - // add to biols - int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? - if (num_iovs + nv >= g_conf.bdev_iov_max) break; // too many! - num_iovs += nv; - - start = MIN(start, bio->start); - length += bio->length; - - if (el_dir_forward) { - //dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << endl; - biols.push_back(bio); // add at back - } else { - // dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << endl; - biols.push_front(bio); // add at front - } - num_bio++; - - // move elevator pointer - bool at_end = false; - map::iterator prev = i; - if (el_dir_forward) { - el_pos += bio->length; // cont. next would start right after us - i++; - if (i == io_map.end()) { - at_end = true; - } - } else { - el_pos -= bio->length; - if (i == io_map.begin()) { - at_end = true; - } else { - i--; - } - } - - // dequeue - io_map.erase(prev); - bio->in_queue = 0; - - if (at_end) break; - } - - return num_bio; -} - - - -/******************************************* - * BarrierQueue - */ -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").barrierq." 
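// A sketch of the barrier-queue idea implemented below: submissions always go to the
// back sub-queue, dispatch only pulls from the front one, and barrier() opens a new
// back sub-queue, so nothing submitted after a barrier is dispatched until everything
// before it has drained. Plain std containers stand in for the ElevatorQueue list.
#include <deque>
#include <vector>

struct Io { unsigned start, length; };

struct BarrierQ {
  std::deque<std::vector<Io> > qs;              // ordered sub-queues, front drains first
  BarrierQ() { qs.resize(1); }                  // always at least one sub-queue

  void submit(const Io& io) { qs.back().push_back(io); }

  void barrier() {
    // only add a new sub-queue if the current back one actually has pending work
    if (!qs.front().empty() || qs.size() > 1)
      qs.push_back(std::vector<Io>());
  }

  bool dequeue(Io& out) {
    while (qs.size() > 1 && qs.front().empty())
      qs.pop_front();                           // front drained: advance past the barrier
    if (qs.front().empty())
      return false;
    out = qs.front().back();
    qs.front().pop_back();
    return true;
  }
};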
- -void BlockDevice::BarrierQueue::barrier() -{ - if (!qls.empty() && qls.front()->empty()) { - assert(qls.size() == 1); - dout(10) << "barrier not adding new queue, front is empty" << endl; - } else { - qls.push_back(new ElevatorQueue(bdev, dev)); - dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has " - << qls.front()->size() << " ios left" << endl; - } -} - -bool BlockDevice::BarrierQueue::bump() -{ - assert(!qls.empty()); - - // is the front queue empty? - if (qls.front()->empty() && - qls.front() != qls.back()) { - delete qls.front(); - qls.pop_front(); - dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << endl; - return true; - } - - return false; -} - -int BlockDevice::BarrierQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) -{ - assert(!qls.empty()); - int n = qls.front()->dequeue_io(biols, start, length, locked); - bump(); // in case we emptied the front queue - return n; -} - - - - -/******************************************* - * BlockDevice - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ")." - - - -block_t BlockDevice::get_num_blocks() -{ - if (!num_blocks) { - assert(fd > 0); - -#ifdef BLKGETSIZE64 - // ioctl block device? - ioctl(fd, BLKGETSIZE64, &num_blocks); -#endif - - if (!num_blocks) { - // hmm, try stat! - struct stat st; - fstat(fd, &st); - num_blocks = st.st_size; - } - - num_blocks /= (uint64_t)EBOFS_BLOCK_SIZE; - - if (g_conf.bdev_fake_mb) { - num_blocks = g_conf.bdev_fake_mb * 256; - dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << endl; - } - if (g_conf.bdev_fake_max_mb && - num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) { - dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << endl; - num_blocks = g_conf.bdev_fake_max_mb * 256; - } - - } - return num_blocks; -} - - - -/** io thread - * each worker thread dequeues ios from the root_queue and submits them to the kernel. - */ -void* BlockDevice::io_thread_entry() -{ - lock.Lock(); - - int whoami = io_threads_started++; - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << endl; - - // get my own fd (and file position pointer) - int fd = open_fd(); - assert(fd > 0); - - while (!io_stop) { - bool do_sleep = false; - - // queue empty? - if (root_queue.empty()) { - // sleep - do_sleep = true; - } else { - dout(20) << "io_thread" << whoami << " going" << endl; - - block_t start, length; - list biols; - int n = root_queue.dequeue_io(biols, start, length, io_block_lock); - - if (n == 0) { - // failed to dequeue a do-able op, sleep for now - dout(20) << "io_thread" << whoami << " couldn't dequeue doable op, sleeping" << endl; - assert(io_threads_running > 1); // there must be someone else, if we couldn't dequeue something doable. - do_sleep = true; - } - else { - // lock blocks - assert(start == biols.front()->start); - io_block_lock.insert(start, length); - - // drop lock to do the io - lock.Unlock(); - do_io(fd, biols); - lock.Lock(); - - // unlock blocks - io_block_lock.erase(start, length); - - // someone might have blocked on our block_lock? 
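// A sketch of the size probe used by get_num_blocks() above: ask the kernel for the
// byte size of a block device via the BLKGETSIZE64 ioctl (Linux) and fall back to
// fstat() for a regular file, then convert bytes to blocks. BLOCK_SIZE is an assumed
// stand-in for EBOFS_BLOCK_SIZE.
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <linux/fs.h>                           // BLKGETSIZE64

static const uint64_t BLOCK_SIZE = 4096;

inline uint64_t device_blocks(int fd)
{
  uint64_t bytes = 0;
  if (ioctl(fd, BLKGETSIZE64, &bytes) < 0 || bytes == 0) {
    struct stat st;
    if (fstat(fd, &st) == 0)
      bytes = (uint64_t)st.st_size;             // regular file (or 0 on error)
  }
  return bytes / BLOCK_SIZE;
}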
- if (io_threads_running < g_conf.bdev_iothreads && - (int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - } - } - - if (do_sleep) { - do_sleep = false; - - // sleep - io_threads_running--; - dout(20) << "io_thread" << whoami << " sleeping, " << io_threads_running << " threads now running," - << " queue has " << root_queue.size() << endl; - - if (g_conf.bdev_idle_kick_after_ms > 0 && - io_threads_running == 0 && - idle_kicker) { - // first wait for signal | timeout - io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000)); - - // should we still be sleeping? (did we get woken up, or did timer expire? - if (root_queue.empty() && io_threads_running == 0) { - idle_kicker->kick(); // kick - io_wakeup.Wait(lock); // and wait - } - } else { - // normal, just wait. - io_wakeup.Wait(lock); - } - - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(20) << "io_thread" << whoami << " woke up, " << io_threads_running << " threads now running" << endl; - } - } - - // clean up - ::close(fd); - io_threads_running--; - - lock.Unlock(); - - dout(10) << "io_thread" << whoami << " finish" << endl; - return 0; -} - - - -/** do_io - * do a single io operation - * (lock is NOT held, but we own the *biovec) - */ -void BlockDevice::do_io(int fd, list& biols) -{ - int r; - assert(!biols.empty()); - - // get full range, type, bl - bufferlist bl; - bl.claim(biols.front()->bl); - block_t start = biols.front()->start; - block_t length = biols.front()->length; - char type = biols.front()->type; - - list::iterator p = biols.begin(); - int numbio = 1; - for (p++; p != biols.end(); p++) { - length += (*p)->length; - bl.claim_append((*p)->bl); - numbio++; - } - - // do it - dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length - << " " << numbio << " bits" << endl; - if (type == biovec::IO_WRITE) { - r = _write(fd, start, length, bl); - } else if (type == biovec::IO_READ) { - r = _read(fd, start, length, bl); - } else assert(0); - dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << endl; - - // set rval - for (p = biols.begin(); p != biols.end(); p++) - (*p)->rval = r; - - if (1) { - // put in completion queue - complete_lock.Lock(); - complete_queue.splice( complete_queue.end(), biols ); - complete_queue_len += numbio; - complete_wakeup.Signal(); - complete_lock.Unlock(); - } else { - // be slow and finish synchronously - for (p = biols.begin(); p != biols.end(); p++) - finish_io(*p); - } -} - - -/** finish_io - * - * finish an io by signaling the cond or performing a callback. - * called by completion thread, unless that's disabled above. - */ -void BlockDevice::finish_io(biovec *bio) -{ - bio->done = true; - if (bio->cond) { - lock.Lock(); // hmm? 
- bio->cond->Signal(); - lock.Unlock(); - } - else if (bio->cb) { - bio->cb->finish((ioh_t)bio, bio->rval); - delete bio->cb; - delete bio; - } -} - -/*** completion_thread - * handle Cond signals or callbacks for completed ios - */ -void* BlockDevice::complete_thread_entry() -{ - complete_lock.Lock(); - dout(10) << "complete_thread start" << endl; - - while (!io_stop) { - - while (!complete_queue.empty()) { - list ls; - ls.swap(complete_queue); - dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << endl; - complete_queue_len = 0; - - complete_lock.Unlock(); - - // finish - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - biovec *bio = *p; - dout(20) << "complete_thread finishing " << *bio << endl; - finish_io(bio); - } - - complete_lock.Lock(); - } - if (io_stop) break; - - /* - if (io_threads_running == 0 && idle_kicker) { - complete_lock.Unlock(); - idle_kicker->kick(); - complete_lock.Lock(); - if (!complete_queue.empty() || io_stop) - continue; - } - */ - - dout(25) << "complete_thread sleeping" << endl; - complete_wakeup.Wait(complete_lock); - } - - dout(10) << "complete_thread finish" << endl; - complete_lock.Unlock(); - return 0; -} - - - - -// io queue - -void BlockDevice::_submit_io(biovec *b) -{ - // NOTE: lock must be held - dout(15) << "_submit_io " << *b << endl; - - // wake up io_thread(s)? - if ((int)root_queue.size() == io_threads_running) - io_wakeup.SignalOne(); - else if ((int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // queue - root_queue.submit_io(b); - - /* - // [DEBUG] check for overlapping ios - // BUG: this doesn't detect all overlaps w/ the next queue thing. - if (g_conf.bdev_debug_check_io_overlap) { - // BUG: this doesn't catch everything! eg 1~10000000 will be missed.... - multimap::iterator p = io_queue.lower_bound(b->start); - if ((p != io_queue.end() && - p->first < b->start+b->length) || - (p != io_queue.begin() && - (p--, p->second->start + p->second->length > b->start))) { - dout(1) << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << endl; - cerr << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << endl; - } - } - */ - -} - -int BlockDevice::_cancel_io(biovec *bio) -{ - // NOTE: lock must be held - - if (bio->in_queue == 0) { - dout(15) << "_cancel_io " << *bio << " FAILED" << endl; - return -1; - } else { - dout(15) << "_cancel_io " << *bio << endl; - bio->in_queue->cancel_io(bio); - if (root_queue.bump()) - io_wakeup.SignalAll(); // something happened! 
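// A modernized sketch of the completion-thread pattern used above: finished requests
// are appended to a queue under a lock, and the completion thread swaps out the whole
// queue while holding the lock but runs the callbacks with the lock dropped, so
// callbacks can re-enter the device without deadlocking. std:: primitives stand in
// for the Mutex/Cond/Thread wrappers used in this tree.
#include <condition_variable>
#include <functional>
#include <list>
#include <mutex>

class CompletionQueue {
  std::mutex mtx;
  std::condition_variable cond;
  std::list<std::function<void()> > pending;
  bool stop = false;

public:
  void post(std::function<void()> fn) {         // called by io threads
    std::lock_guard<std::mutex> l(mtx);
    pending.push_back(std::move(fn));
    cond.notify_one();
  }
  void shutdown() {
    std::lock_guard<std::mutex> l(mtx);
    stop = true;
    cond.notify_all();
  }
  void run() {                                  // body of the completion thread
    std::unique_lock<std::mutex> l(mtx);
    while (!stop) {
      if (pending.empty()) { cond.wait(l); continue; }
      std::list<std::function<void()> > ls;
      ls.swap(pending);                         // grab everything queued so far
      l.unlock();
      for (auto& fn : ls) fn();                 // finish ios without holding the lock
      l.lock();
    }
  }
};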
- return 0; - } -} - - - -// low level io - -int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_read " << bno << "~" << num << endl; - - assert(fd > 0); - - off_t offset = bno * EBOFS_BLOCK_SIZE; - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - size_t len = num*EBOFS_BLOCK_SIZE; - assert(bl.length() >= len); - - struct iovec iov[ bl.buffers().size() ]; - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int got = ::readv(fd, iov, n); - assert(got <= (int)len); - - return 0; -} - -int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_write " << bno << "~" << num << endl; - - assert(fd > 0); - - off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; - assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - // write buffers - size_t len = num*EBOFS_BLOCK_SIZE; - - struct iovec iov[ bl.buffers().size() ]; - - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - assert((((intptr_t)iov[n].iov_base) & ((intptr_t)4095ULL)) == 0); - assert((iov[n].iov_len & 4095) == 0); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int r = ::writev(fd, iov, n); - - if (r < 0) { - dout(1) << "couldn't write bno " << bno << " num " << num - << " (" << len << " bytes) in " << n << " iovs, r=" << r - << " errno " << errno << " " << strerror(errno) << endl; - dout(1) << "bl is " << bl << endl; - assert(0); - } else { - assert(r == (int)len); - } - - return 0; -} - - - -// open/close - -int BlockDevice::open_fd() -{ -#ifdef DARWIN - int fd = ::open(dev.c_str(), O_RDWR|O_SYNC, 0); - ::fcntl(fd, F_NOCACHE); - return fd; -#else - return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0); -#endif -} - -int BlockDevice::open(kicker *idle) -{ - assert(fd == 0); - - // open? - fd = open_fd(); - if (fd < 0) { - dout(1) << "open failed, r = " << fd << " " << strerror(errno) << endl; - fd = 0; - return -1; - } - - // lock - if (g_conf.bdev_lock) { - int r = ::flock(fd, LOCK_EX|LOCK_NB); - if (r < 0) { - derr(1) << "open " << dev << " failed to get LOCK_EX" << endl; - assert(0); - return -1; - } - } - - // figure size - uint64_t bsize = get_num_blocks(); - - dout(2) << "open " << bsize << " bytes, " << num_blocks << " blocks" << endl; - - // start thread - io_threads_started = 0; - io_threads.clear(); - for (int i=0; icreate(); - } - complete_thread.create(); - - // idle kicker? 
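// A sketch of the vectored write done by _write() above: seek to the block offset and
// hand the kernel one iovec per buffer. With O_DIRECT (as opened by open_fd() above)
// both the buffer addresses and lengths must be block aligned, which is what the
// alignment asserts in the original enforce. Error handling is reduced to a bool and
// BLK is an assumed stand-in for EBOFS_BLOCK_SIZE.
#include <cassert>
#include <stdint.h>
#include <sys/uio.h>
#include <unistd.h>
#include <utility>
#include <vector>

static const size_t BLK = 4096;

inline bool write_blocks(int fd, uint64_t bno,
                         const std::vector<std::pair<void*, size_t> >& bufs)
{
  off_t offset = (off_t)bno * (off_t)BLK;
  if (lseek(fd, offset, SEEK_SET) != offset)
    return false;

  std::vector<iovec> iov(bufs.size());
  size_t total = 0;
  for (size_t i = 0; i < bufs.size(); i++) {
    assert(((uintptr_t)bufs[i].first % BLK) == 0);   // O_DIRECT: aligned address
    assert((bufs[i].second % BLK) == 0);             // O_DIRECT: aligned length
    iov[i].iov_base = bufs[i].first;
    iov[i].iov_len  = bufs[i].second;
    total += bufs[i].second;
  }
  return ::writev(fd, iov.data(), (int)iov.size()) == (ssize_t)total;
}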
- idle_kicker = idle; - - return fd; -} - - -int BlockDevice::close() -{ - assert(fd>0); - - idle_kicker = 0; - - // shut down io thread - dout(10) << "close stopping io+complete threads" << endl; - lock.Lock(); - complete_lock.Lock(); - io_stop = true; - io_wakeup.SignalAll(); - complete_wakeup.SignalAll(); - complete_lock.Unlock(); - lock.Unlock(); - - - for (int i=0; ijoin(); - delete io_threads[i]; - } - io_threads.clear(); - - complete_thread.join(); - - io_stop = false; // in case we start again - - dout(2) << "close " << endl; - - if (g_conf.bdev_lock) - ::flock(fd, LOCK_UN); - - ::close(fd); - fd = 0; - - return 0; -} - -int BlockDevice::cancel_io(ioh_t ioh) -{ - biovec *pbio = (biovec*)ioh; - - lock.Lock(); - int r = _cancel_io(pbio); - lock.Unlock(); - - // FIXME? - if (r == 0 && pbio->cb) { - //pbio->cb->finish(ioh, 0); - delete pbio->cb; - delete pbio; - } - - return r; -} - diff --git a/branches/sage/pgs/ebofs/BlockDevice.h b/branches/sage/pgs/ebofs/BlockDevice.h deleted file mode 100644 index 3f44699b1673b..0000000000000 --- a/branches/sage/pgs/ebofs/BlockDevice.h +++ /dev/null @@ -1,339 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_BLOCKDEVICE_H -#define __EBOFS_BLOCKDEVICE_H - -#include "include/buffer.h" -#include "include/interval_set.h" -#include "include/Context.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "types.h" - - -typedef void *ioh_t; // opaque handle to an io request. (in actuality, a biovec*) - - -class BlockDevice { - public: - // callback type for io completion notification - class callback { - public: - virtual ~callback() {} - virtual void finish(ioh_t ioh, int rval) = 0; - }; - - // kicker for idle notification - class kicker { - public: - virtual ~kicker() {} - virtual void kick() = 0; - }; - - - /********************************************************/ - - class Queue; - - // io item - // two variants: one with Cond*, one with callback*. 
- class biovec { - public: - static const char IO_WRITE = 1; - static const char IO_READ = 2; - - char type; - block_t start, length; - bufferlist bl; - callback *cb; - Cond *cond; - int rval; - char *note; - bool done; - - Queue *in_queue; - - biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {} - biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {} - }; - friend ostream& operator<<(ostream& out, biovec &bio); - - - /********************************************************/ - - /* - * Queue -- abstract IO queue interface - */ - class Queue { - public: - virtual ~Queue() {} - virtual void submit_io(biovec *b) = 0; - virtual void cancel_io(biovec *b) = 0; - virtual int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) = 0; - virtual int size() = 0; - virtual bool empty() { return size() == 0; } - }; - - /* - * ElevatorQueue - simple elevator scheduler queue - */ - class ElevatorQueue : public Queue { - BlockDevice *bdev; - const char *dev; - map io_map; - bool el_dir_forward; - block_t el_pos; - utime_t el_stop; - - public: - ElevatorQueue(BlockDevice *bd, const char *d) : - bdev(bd), dev(d), - el_dir_forward(false), - el_pos(0) {} - void submit_io(biovec *b) { - b->in_queue = this; - assert(io_map.count(b->start) == 0); - io_map[b->start] = b; - } - void cancel_io(biovec *b) { - assert(b->in_queue == this); - assert(io_map.count(b->start) && - io_map[b->start] == b); - io_map.erase(b->start); - b->in_queue = 0; - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - int size() { - return io_map.size(); - } - }; - - /* - * BarrierQueue - lets you specify io "barriers" - * barrier() - force completion of all prior IOs before - * future ios are started. - * bump() - must be called after cancel_io to properly - * detect empty subqueue. - */ - class BarrierQueue : public Queue { - BlockDevice *bdev; - const char *dev; - list qls; - public: - BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) { - barrier(); - } - ~BarrierQueue() { - for (list::iterator p = qls.begin(); - p != qls.end(); - ++p) - delete *p; - qls.clear(); - } - int size() { - // this isn't perfectly accurate. - if (!qls.empty()) - return qls.front()->size(); - return 0; - } - void submit_io(biovec *b) { - assert(!qls.empty()); - qls.back()->submit_io(b); - } - void cancel_io(biovec *b) { - assert(0); // shouldn't happen. - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - void barrier(); - bool bump(); - }; - - - private: - string dev; // my device file - int fd; - block_t num_blocks; - - Mutex lock; - - /** the root io queue. - * i current assumeit's a barrier queue,but this can be changed - * with some minor rearchitecting. - */ - BarrierQueue root_queue; - - kicker *idle_kicker; // not used.. - - /* io_block_lock - block ranges current dispatched to kernel - * once a bio is dispatched, it cannot be canceled, so an overlapping - * io and be submitted. the overlapping io cannot be dispatched - * to the kernel, however, until the original io finishes, or else - * there will be a race condition. 
- */ - interval_set io_block_lock; // blocks currently dispatched to kernel - - // io threads - Cond io_wakeup; - bool io_stop; - int io_threads_started, io_threads_running; - - void *io_thread_entry(); - - class IOThread : public Thread { - BlockDevice *dev; - public: - IOThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->io_thread_entry(); } - } ; - - vector io_threads; - - // private io interface - int open_fd(); // get an fd (for a thread) - - void _submit_io(biovec *b); - int _cancel_io(biovec *bio); - void do_io(int fd, list& biols); // called by an io thread - - // low level io - int _read(int fd, block_t bno, unsigned num, bufferlist& bl); - int _write(int fd, unsigned bno, unsigned num, bufferlist& bl); - - - // completion callback queue - Mutex complete_lock; - Cond complete_wakeup; - list complete_queue; - int complete_queue_len; - - void finish_io(biovec *bio); - - // complete thread - void *complete_thread_entry(); - class CompleteThread : public Thread { - BlockDevice *dev; - public: - CompleteThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } complete_thread; - - - public: - BlockDevice(const char *d) : - dev(d), fd(0), num_blocks(0), - root_queue(this, dev.c_str()), - idle_kicker(0), - io_stop(false), io_threads_started(0), io_threads_running(0), - complete_queue_len(0), - complete_thread(this) { } - ~BlockDevice() { - if (fd > 0) close(); - } - - // get size in blocks - block_t get_num_blocks(); - const char *get_device_name() const { return dev.c_str(); } - - // open/close - int open(kicker *idle = 0); - int close(); - - // state stuff - bool is_idle() { - lock.Lock(); - bool idle = (io_threads_running == 0) && root_queue.empty(); - lock.Unlock(); - return idle; - } - void barrier() { - lock.Lock(); - root_queue.barrier(); - lock.Unlock(); - } - - // ** blocking interface ** - - // read - int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return read(bno, num, bl, n); - } - int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_READ, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! - while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // write - int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return write(bno, num, bl, n); - } - int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! 
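// A sketch of the io_block_lock idea described above: before an io is handed to the
// kernel its block range is recorded, and any later request overlapping an in-flight
// range must stay queued, since a submitted io can no longer be cancelled. A std::map
// of disjoint ranges stands in for interval_set<block_t>.
#include <map>

typedef unsigned block_t;
typedef std::map<block_t, block_t> RangeSet;    // start -> length, ranges kept disjoint

inline bool overlaps(const RangeSet& locked, block_t start, block_t len)
{
  RangeSet::const_iterator p = locked.lower_bound(start);
  if (p != locked.end() && p->first < start + len)
    return true;                                // a locked range starts inside us
  if (p != locked.begin()) {
    --p;                                        // locked range starting before us
    if (p->first + p->second > start)
      return true;
  }
  return false;
}

// Dispatch path (held under the queue lock): if !overlaps(io_block_lock, start, len),
// insert the range and submit; otherwise leave the request queued until the
// conflicting io completes and releases its range.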
- while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // ** non-blocking interface ** - ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - int cancel_io(ioh_t ioh); - -}; - - - - -#endif diff --git a/branches/sage/pgs/ebofs/BufferCache.cc b/branches/sage/pgs/ebofs/BufferCache.cc deleted file mode 100644 index a83ce5cb480fd..0000000000000 --- a/branches/sage/pgs/ebofs/BufferCache.cc +++ /dev/null @@ -1,1148 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BufferCache.h" -#include "Onode.h" - - -/*********** BufferHead **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bh." - - - - - - -/************ ObjectCache **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc." - - - -void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl) -{ - list waiters; - - dout(10) << "rx_finish " << start << "~" << length << endl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "rx_finish ?" << *bh << endl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - if (bh->end() > start+length) break; // past - - assert(p->first >= start); - assert(bh->end() <= start+length); - - dout(10) << "rx_finish !" 
<< *bh << endl; - - if (bh->rx_ioh == ioh) - bh->rx_ioh = 0; - - if (bh->is_rx()) { - assert(bh->get_version() == 0); - assert(bh->end() <= start+length); - assert(bh->start() >= start); - dout(10) << "rx_finish rx -> clean on " << *bh << endl; - bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE); - bc->mark_clean(bh); - } - else if (bh->is_partial()) { - dout(10) << "rx_finish partial -> tx on " << *bh << endl; - - if (1) { - // double-check what block i am - vector exv; - on->map_extents(bh->start(), 1, exv); - assert(exv.size() == 1); - block_t cur_block = exv[0].start; - assert(cur_block == bh->partial_tx_to); - } - - // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves) - bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); - - // apply partial to myself - assert(bh->data.length() == 0); - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.push_back( bp ); - bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl); - bh->apply_partial(); - - // write "normally" - bc->mark_dirty(bh); - bc->bh_write(on, bh, bh->partial_tx_to);//cur_block); - - // clean up a bit - bh->partial_tx_to = 0; - bh->partial_tx_epoch = 0; - bh->partial.clear(); - } - else { - dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << endl; - assert(bh->is_dirty() || // was overwritten - bh->is_tx() || // was overwritten and queued - bh->is_clean()); // was overwritten, queued, _and_ flushed to disk - } - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - - finish_contexts(waiters); -} - - -void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - dout(10) << "tx_finish " << start << "~" << length << " v" << version << endl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(30) << "tx_finish ?bh " << *bh << endl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - - if (bh->tx_ioh == ioh) - bh->tx_ioh = 0; - - if (!bh->is_tx()) { - dout(10) << "tx_finish bh not marked tx, skipping" << endl; - continue; - } - assert(bh->is_tx()); - - if (version == bh->version) { - dout(10) << "tx_finish tx -> clean on " << *bh << endl; - assert(bh->end() <= start+length); - bh->set_last_flushed(version); - bc->mark_clean(bh); - } else { - dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version - << " on " << *bh << endl; - assert(bh->version > version); - } - } -} - - - -/* - * return any bh's that are (partially) in this range that are TX. - */ -int ObjectCache::find_tx(block_t start, block_t len, - list& tx) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - /* don't care about overlap, we want things _fully_ in start~len. - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - */ - - while (left > 0) { - assert(cur+left == start+len); - - // at end? 
- if (p == data.end()) - break; - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->end() <= start+len && - e->is_tx()) - tx.push_back(e); - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - left -= (next-cur); - cur = next; - continue; - } - else - assert(0); - } - - return 0; -} - - -int ObjectCache::try_map_read(block_t start, block_t len) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - int num_missing = 0; - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - on->map_extents(cur, - left, // no prefetch here! - exv); - - num_missing += exv.size(); - left = 0; - cur = start+len; - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - dout(20) << "try_map_read hit " << *e << endl; - } - else if (e->is_rx()) { - dout(20) << "try_map_read rx " << *e << endl; - num_missing++; - } - else if (e->is_partial()) { - dout(-20) << "try_map_read partial " << *e << endl; - num_missing++; - } - else { - dout(0) << "try_map_read got unexpected " << *e << endl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - MIN(next-cur, left), // no prefetch - exv); - - dout(-20) << "try_map_read gap of " << p->first-cur << " blocks, " - << exv.size() << " extents" << endl; - num_missing += exv.size(); - left -= (p->first - cur); - cur = p->first; - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return num_missing; -} - - - - - -/* - * map a range of blocks into buffer_heads. - * - create missing buffer_heads as necessary. - * - fragment along disk extent boundaries - */ -int ObjectCache::map_read(block_t start, block_t len, - map& hits, - map& missing, - map& rx, - map& partial) { - - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - //on->map_extents(cur, left, exv); // we might consider some prefetch here. - on->map_extents(cur, - //MIN(left + g_conf.ebofs_max_prefetch, // prefetch - //on->object_blocks-cur), - left, // no prefetch - exv); - for (unsigned i=0; i 0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << endl; - cur += MIN(left,exv[i].length); - left -= MIN(left,exv[i].length); - } - assert(left == 0); - assert(cur == start+len); - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! 
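// The lookup idiom repeated in try_map_read/map_read/map_write above, as a standalone
// sketch: position an iterator on the first cached extent that could overlap the range
// starting at cur, which means lower_bound() plus one step back when the previous
// entry spills into the range. Buf and BufMap are simplified stand-ins for the
// BufferHead map.
#include <map>

typedef unsigned block_t;
struct Buf { block_t start, length; block_t end() const { return start + length; } };
typedef std::map<block_t, Buf*> BufMap;         // keyed by Buf::start

inline BufMap::iterator first_overlapping(BufMap& data, block_t cur)
{
  BufMap::iterator p = data.lower_bound(cur);   // first entry starting at/after cur
  if (p != data.begin() && (p == data.end() || p->first > cur)) {
    --p;                                        // previous entry might overlap
    if (p->second->end() <= cur)
      ++p;                                      // it doesn't; step back up
  }
  return p;                                     // may be data.end(): pure miss/gap
}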
- dout(20) << "map_read hit " << *e << endl; - bc->touch(e); - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << endl; - } - else if (e->is_partial()) { - partial[cur] = e; - dout(20) << "map_read partial " << *e << endl; - } - else { - dout(0) << "map_read ??? got unexpected " << *e << endl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch - // on->object_blocks-cur)), - MIN(next-cur, left), // no prefetch - exv); - - for (unsigned i=0; i0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << endl; - } - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - - -/* - * map a range of pages on an object's buffer cache. - * - * - break up bufferheads that don't fall completely within the range - * - cancel rx ops we obsolete. - * - resubmit rx ops if we split bufferheads - * - * - leave potentially obsoleted tx ops alone (for now) - * - don't worry about disk extent boundaries (yet) - */ -int ObjectCache::map_write(block_t start, block_t len, - interval_set& alloc, - map& hits, - version_t super_epoch) -{ - map::iterator p = data.lower_bound(start); - - dout(10) << "map_write " << *on << " " << start << "~" << len << " ... alloc " << alloc << endl; - // p->first >= start - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - //dump(); - - while (left > 0) { - // max for this bh (bc of (re)alloc on disk) - block_t max = left; - bool newalloc = false; - - // based on alloc/no-alloc boundary ... - if (alloc.contains(cur, left)) { - if (alloc.contains(cur)) { - block_t ends = alloc.end_after(cur); - max = MIN(left, ends-cur); - newalloc = true; - } else { - if (alloc.starts_after(cur)) { - block_t st = alloc.start_after(cur); - max = MIN(left, st-cur); - } - } - } - - // based on disk extent boundary ... - vector exv; - on->map_extents(cur, max, exv); - if (exv.size() > 1) - max = exv[0].length; - - if (newalloc) { - dout(10) << "map_write " << cur << "~" << max << " is new alloc on disk" << endl; - } else { - dout(10) << "map_write " << cur << "~" << max << " keeps old alloc on disk" << endl; - } - - // at end? 
- if (p == data.end()) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( max ); - bc->add_bh(n); - hits[cur] = n; - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << endl; - - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << endl; - - if (p->first < cur) { - if (cur+max >= p->first+p->second->length()) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, cur); // just split it - } - p++; - assert(p->second == bh); - } else { - // we want middle bit (two splices) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_read(on, right); // reread right - bh = middle; - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_write(on, bh); // redo left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_write(on, right); // redo right - bh = middle; - } else { - BufferHead *middle = bc->split(bh, cur); - p++; - assert(p->second == middle); - bc->split(middle, cur+max); - bh = middle; - } - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_read(on, right); // re-rx the right bit - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_write(on, right); // re-tx the right bit - } else { - bc->split(bh, cur+max); // just split - } - } - } - - // try to cancel tx? - if (bh->is_tx() && !newalloc) bc->bh_cancel_write(bh, super_epoch); - - // put in our map - hits[cur] = bh; - - // keep going. - block_t lenfromcur = bh->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - block_t next = p->first; - block_t glen = MIN(next-cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << endl; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( glen ); - bc->add_bh(n); - hits[cur] = n; - - cur += glen; - left -= glen; - continue; // more? - } - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - -/* don't need this. -int ObjectCache::scan_versions(block_t start, block_t len, - version_t& low, version_t& high) -{ - map::iterator p = data.lower_bound(start); - // p->first >= start - - if (p != data.begin() && p->first > start) { - p--; // might overlap? - if (p->first + p->second->length() <= start) - p++; // doesn't overlap. - } - if (p->first >= start+len) - return -1; // to the right. no hits. - - // start - low = high = p->second->get_version(); - - for (p++; p != data.end(); p++) { - // past? 
- if (p->first >= start+len) break; - - const version_t v = p->second->get_version(); - if (low > v) low = v; - if (high < v) high = v; - } - - return 0; -} -*/ - -void ObjectCache::touch_bottom(block_t bstart, block_t blast) -{ - for (map::iterator p = data.lower_bound(bstart); - p != data.end(); - ++p) { - BufferHead *bh = p->second; - - // don't trim unless it's entirely in our range - if (bh->start() < bstart) continue; - if (bh->end() > blast) break; - - dout(12) << "moving " << *bh << " to bottom of lru" << endl; - bc->touch_bottom(bh); // move to bottom of lru list - } -} - - -void ObjectCache::truncate(block_t blocks, version_t super_epoch) -{ - dout(7) << "truncate " << object_id - << " " << blocks << " blocks" - << endl; - - while (!data.empty()) { - block_t bhoff = data.rbegin()->first; - BufferHead *bh = data.rbegin()->second; - - if (bh->end() <= blocks) break; - - bool uncom = on->uncommitted.contains(bh->start(), bh->length()); - dout(10) << "truncate " << *bh << " uncom " << uncom - << " of " << on->uncommitted - << endl; - - if (bhoff < blocks) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && uncom && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, blocks); // just split it - } - // no worries about partials up here, they're always 1 block (and thus never split) - } else { - // whole thing - // cancel any pending/queued io, if possible. - if (bh->is_rx()) - bc->bh_cancel_read(bh); - if (bh->is_tx() && uncom) - bc->bh_cancel_write(bh, super_epoch); - if (bh->shadow_of) { - dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << endl; - // shadow - bh->shadow_of->remove_shadow(bh); - if (bh->is_partial()) - bc->cancel_shadow_partial(bh->rx_from.start, bh); - } else { - // normal - if (bh->is_partial() && uncom) - bc->bh_cancel_partial_write(bh); - } - } - - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - finish_contexts(p->second, -1); - } - - bc->remove_bh(bh); - delete bh; - } -} - - -void ObjectCache::clone_to(Onode *other) -{ - ObjectCache *ton = 0; - - for (map::iterator p = data.begin(); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "clone_to ? " << *bh << endl; - if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) { - // dup dirty or tx bh's - if (!ton) - ton = other->get_oc(bc); - BufferHead *nbh = new BufferHead(ton); - nbh->set_start( bh->start() ); - nbh->set_length( bh->length() ); - nbh->data = bh->data; // just copy refs to underlying buffers. - bc->add_bh(nbh); - - if (bh->is_partial()) { - dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << endl; - nbh->partial = bh->partial; - bc->mark_partial(nbh); - // register as shadow_partial - bc->add_shadow_partial(bh->rx_from.start, nbh); - } else { - // clean buffer will shadow - bh->add_shadow(nbh); - bc->mark_clean(nbh); - } - - dout(10) << "clone_to dup " << *bh << " -> " << *nbh << endl; - } - } -} - - - -/************** BufferCache ***************/ - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bc." 
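// A sketch of the truncate() walk above: cached extents are removed from the top of
// the offset-keyed map downward, and an extent straddling the new end is shortened so
// only its tail is dropped. A std::map of start -> length stands in for the BufferHead
// map; the split/cancel and io bookkeeping is omitted.
#include <map>

typedef unsigned block_t;

inline void truncate_cache(std::map<block_t, block_t>& data, block_t new_blocks)
{
  while (!data.empty()) {
    std::map<block_t, block_t>::reverse_iterator r = data.rbegin();
    block_t start = r->first, len = r->second;
    if (start + len <= new_blocks)
      break;                                    // everything else is below the cut
    if (start < new_blocks) {
      data[start] = new_blocks - start;         // keep the head, drop the tail
      break;
    }
    data.erase(start);                          // entirely past the cut: drop it
  }
}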
- - - -BufferHead *BufferCache::split(BufferHead *orig, block_t after) -{ - dout(20) << "split " << *orig << " at " << after << endl; - - // split off right - BufferHead *right = new BufferHead(orig->get_oc()); - right->set_version(orig->get_version()); - right->epoch_modified = orig->epoch_modified; - right->last_flushed = orig->last_flushed; - right->set_state(orig->get_state()); - - block_t newleftlen = after - orig->start(); - right->set_start( after ); - right->set_length( orig->length() - newleftlen ); - - // shorten left - stat_sub(orig); - orig->set_length( newleftlen ); - stat_add(orig); - - // add right - add_bh(right); - - // adjust rx_from - if (orig->is_rx()) { - right->rx_from = orig->rx_from; - orig->rx_from.length = newleftlen; - right->rx_from.length -= newleftlen; - right->rx_from.start += newleftlen; - } - - // dup shadows - for (set::iterator p = orig->shadows.begin(); - p != orig->shadows.end(); - ++p) - right->add_shadow(*p); - - // split buffers too - bufferlist bl; - bl.claim(orig->data); - if (bl.length()) { - assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE); - right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE); - orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE); - } - - // move read waiters - if (!orig->waitfor_read.empty()) { - map >::iterator o, p = orig->waitfor_read.end(); - p--; - while (p != orig->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at block " << p->first << " to right bh" << endl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - orig->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *orig << endl; - dout(20) << "split right is " << *right << endl; - return right; -} - - -void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from) -{ - dout(10) << "bh_read " << *on << " on " << *bh << endl; - - if (bh->is_missing()) { - mark_rx(bh); - } else { - assert(bh->is_partial()); - } - - // get extent. there should be only one! - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (from) { // force behavior, used for reading partials - dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << endl; - ex.start = from; - ex.length = 1; - } - - // this should be empty!! - assert(bh->rx_ioh == 0); - - dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << endl; - - C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - ex.start); - - //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers! 
- fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl, - fin); - bh->rx_from = ex; - on->oc->get(); - -} - -bool BufferCache::bh_cancel_read(BufferHead *bh) -{ - if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) { - dout(10) << "bh_cancel_read on " << *bh << endl; - bh->rx_ioh = 0; - mark_missing(bh); - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe) -{ - dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << endl; - assert(bh->get_version() > 0); - - assert(bh->is_dirty()); - mark_tx(bh); - - // get extents - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (shouldbe) - assert(ex.length == 1 && ex.start == shouldbe); - - dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << endl; - - //assert(bh->tx_ioh == 0); - - assert(bh->get_last_flushed() < bh->get_version()); - - bh->tx_block = ex.start; - bh->tx_ioh = dev.write(ex.start, ex.length, bh->data, - new C_OC_TxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - bh->get_version(), - bh->epoch_modified), - "bh_write"); - - on->oc->get(); - inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); - - /* - // assert: no partials on the same block - // hose any partial on the same block - if (bh->partial_write.count(ex.start)) { - dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << endl; - dec_unflushed( bh->partial_write[ex.start].epoch ); - bh->partial_write.erase(ex.start); - } - */ -} - - -bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch) -{ - if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) { - dout(10) << "bh_cancel_write on " << *bh << endl; - bh->tx_ioh = 0; - mark_dirty(bh); - - assert(bh->epoch_modified == cur_epoch); - assert(bh->epoch_modified > 0); - dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch! - - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::tx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - ebofs_lock.Lock(); - - // finish oc - if (oc->put() == 0) { - delete oc; - } else - oc->tx_finish(ioh, start, length, version, epoch); - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch); - - ebofs_lock.Unlock(); -} - -void BufferCache::rx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - block_t diskstart, - bufferlist& bl) -{ - ebofs_lock.Lock(); - dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length - << ", at device block " << diskstart << endl; - - // oc - if (oc->put() == 0) - delete oc; - else - oc->rx_finish(ioh, start, length, bl); - - // finish any partials? 
- // note: these are partials that were re-written after a commit, - // or for whom the OC was destroyed (eg truncated after a commit) - map >::iterator sp = partial_write.lower_bound(diskstart); - while (sp != partial_write.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - map writes; - writes.swap( sp->second ); - - map >::iterator t = sp; - sp++; - partial_write.erase(t); - - for (map::iterator p = writes.begin(); - p != writes.end(); - p++) { - dout(10) << "rx_finish partial from " << pblock << " -> " << p->first - << " for epoch " << p->second.epoch - //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")" - << endl; - // this had better be a past epoch - //assert(p->epoch == epoch_modified - 1); // ?? - - // make the combined block - bufferlist combined; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - combined.push_back( bp ); - combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl); - BufferHead::apply_partial( combined, p->second.partial ); - - // write it! - dev.write( pblock, 1, combined, - new C_OC_PartialTxFinish( this, p->second.epoch ), - "finish_partials"); - } - } - - // shadow partials? - { - list waiters; - map >::iterator sp = shadow_partials.lower_bound(diskstart); - while (sp != shadow_partials.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - set ls; - ls.swap( sp->second ); - - map >::iterator t = sp; - sp++; - shadow_partials.erase(t); - - for (set::iterator p = ls.begin(); - p != ls.end(); - ++p) { - BufferHead *bh = *p; - dout(10) << "rx_finish applying shadow_partial for " << pblock - << " to " << *bh << endl; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.clear(); - bh->data.push_back( bp ); - bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, - (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, - bl); - bh->apply_partial(); - bh->set_state(BufferHead::STATE_CLEAN); - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - } - - // kick waiters - finish_contexts(waiters); - } - - // done. 
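/*
 * [editor's note -- illustrative sketch, not original code]
 * rx_finish() above overlays queued "partial" fragments onto the block that
 * just arrived from disk before writing it back, and applies the same overlay
 * to shadow-partial buffer heads.  The stand-alone version below shows that
 * overlay with std::string fragments in place of bufferlists; BLOCK_SIZE
 * stands in for EBOFS_BLOCK_SIZE.
 */
#include <algorithm>
#include <cassert>
#include <map>
#include <string>
#include <vector>

static const unsigned BLOCK_SIZE = 4096;

void overlay_partial(std::vector<char>& block,
                     const std::map<unsigned, std::string>& partial) {
  assert(block.size() == BLOCK_SIZE);
  for (std::map<unsigned, std::string>::const_iterator i = partial.begin();
       i != partial.end(); ++i) {
    assert(i->first + i->second.size() <= BLOCK_SIZE);   // fragment fits in the block
    std::copy(i->second.begin(), i->second.end(), block.begin() + i->first);
  }
}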
- ebofs_lock.Unlock(); -} - -void BufferCache::partial_tx_finish(version_t epoch) -{ - ebofs_lock.Lock(); - - dout(10) << "partial_tx_finish in epoch " << epoch << endl; - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch); - - ebofs_lock.Unlock(); -} - - - - -void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh) -{ - assert(bh->get_version() > 0); - - assert(bh->is_partial()); - assert(bh->length() == 1); - - // get the block no - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - block_t b = exv[0].start; - assert(exv[0].length == 1); - bh->partial_tx_to = exv[0].start; - bh->partial_tx_epoch = bh->epoch_modified; - - dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << endl; - - - // copy map state, queue for this block - assert(bh->rx_from.length == 1); - queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch ); -} - -void BufferCache::bh_cancel_partial_write(BufferHead *bh) -{ - assert(bh->is_partial()); - assert(bh->length() == 1); - - cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); -} - - -void BufferCache::queue_partial(block_t from, block_t to, - map& partial, version_t epoch) -{ - dout(10) << "queue_partial " << from << " -> " << to - << " in epoch " << epoch - << endl; - - if (partial_write[from].count(to)) { - // this should be in the same epoch. - assert( partial_write[from][to].epoch == epoch); - assert(0); // actually.. no! - } else { - inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); - } - - partial_write[from][to].partial = partial; - partial_write[from][to].epoch = epoch; -} - -void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch) -{ - assert(partial_write.count(from)); - assert(partial_write[from].count(to)); - assert(partial_write[from][to].epoch == epoch); - - dout(10) << "cancel_partial " << from << " -> " << to - << " (was epoch " << partial_write[from][to].epoch << ")" - << endl; - - partial_write[from].erase(to); - if (partial_write[from].empty()) - partial_write.erase(from); - - dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); -} - - -void BufferCache::add_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "add_shadow_partial from " << from << " " << *bh << endl; - shadow_partials[from].insert(bh); -} - -void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "cancel_shadow_partial from " << from << " " << *bh << endl; - shadow_partials[from].erase(bh); -} diff --git a/branches/sage/pgs/ebofs/BufferCache.h b/branches/sage/pgs/ebofs/BufferCache.h deleted file mode 100644 index 6e5277b13c1ec..0000000000000 --- a/branches/sage/pgs/ebofs/BufferCache.h +++ /dev/null @@ -1,710 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BUFFERCACHE_H -#define __EBOFS_BUFFERCACHE_H - -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Clock.h" - -#include "types.h" -#include "BlockDevice.h" - -#include "include/interval_set.h" - -class ObjectCache; -class BufferCache; -class Onode; - -class BufferHead : public LRUObject { - public: - /* - * - buffer_heads should always break across disk extent boundaries - * - partial buffer_heads are always 1 block. - */ - const static int STATE_MISSING = 0; // missing; data is on disk, but not loaded. - const static int STATE_CLEAN = 1; // Rw clean - const static int STATE_DIRTY = 2; // RW dirty - const static int STATE_TX = 3; // Rw flushing to disk - const static int STATE_RX = 4; // w reading from disk - const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block. - - public: - ObjectCache *oc; - - bufferlist data; - - ioh_t rx_ioh; // - Extent rx_from; - ioh_t tx_ioh; // - block_t tx_block; - block_t partial_tx_to; - version_t partial_tx_epoch; - - map partial; // partial dirty content overlayed onto incoming data - - map< block_t, list > waitfor_read; - - set shadows; // shadow bh's that clone()ed me. - BufferHead* shadow_of; - - private: - int ref; - int state; - - public: - version_t epoch_modified; - - version_t version; // current version in cache - version_t last_flushed; // last version flushed to disk - - Extent object_loc; // block position _in_object_ - - utime_t dirty_stamp; - - bool want_to_expire; // wants to be at bottom of lru - - public: - BufferHead(ObjectCache *o) : - oc(o), //cancellable_ioh(0), tx_epoch(0), - rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), - shadow_of(0), - ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0), - want_to_expire(false) - {} - ~BufferHead() { - unpin_shadows(); - } - - ObjectCache *get_oc() { return oc; } - - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - - block_t start() { return object_loc.start; } - void set_start(block_t s) { object_loc.start = s; } - block_t length() { return object_loc.length; } - void set_length(block_t l) { object_loc.length = l; } - block_t end() { return start() + length(); } - block_t last() { return end()-1; } - - version_t get_version() { return version; } - void set_version(version_t v) { version = v; } - version_t get_last_flushed() { return last_flushed; } - void set_last_flushed(version_t v) { - if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << endl; - assert(v > last_flushed); - last_flushed = v; - } - - utime_t get_dirty_stamp() { return dirty_stamp; } - void set_dirty_stamp(utime_t t) { dirty_stamp = t; } - - void set_state(int s) { - if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put(); - - if ((state == STATE_TX && s != STATE_TX) || - (state == STATE_PARTIAL && s != STATE_PARTIAL)) - unpin_shadows(); - - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - bool is_partial() { return state == STATE_PARTIAL; } - - //bool is_partial_writes() { return 
!partial_write.empty(); } - //void finish_partials(); - //void cancel_partials(); - //void queue_partial_write(block_t b); - - void add_shadow(BufferHead *dup) { - shadows.insert(dup); - dup->shadow_of = this; - dup->get(); - } - void remove_shadow(BufferHead *dup) { - shadows.erase(dup); - dup->shadow_of = 0; - dup->put(); - } - void unpin_shadows() { - for (set::iterator p = shadows.begin(); - p != shadows.end(); - ++p) { - //cout << "unpin shadow " << *p << endl; - (*p)->shadow_of = 0; - (*p)->put(); - } - shadows.clear(); - } - - void copy_partial_substr(off_t start, off_t end, bufferlist& bl) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - assert(i != partial.end()); - assert(i->first <= start); - - // first - unsigned bhoff = MAX(start, i->first) - i->first; - unsigned bhlen = MIN(end-start, i->second.length()); - bl.substr_of( i->second, bhoff, bhlen ); - - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - if (pos >= end) break; - assert(pos == i->first); - - pos = i->first + i->second.length(); - - if (pos <= end) { // this whole frag - bl.append( i->second ); - } else { // partial end - unsigned bhlen = end-start-bl.length(); - bufferlist frag; - frag.substr_of( i->second, 0, bhlen ); - bl.claim_append(frag); - break; // done. - } - } - - assert(pos >= end); - assert(bl.length() == (unsigned)(end-start)); - } - - bool have_partial_range(off_t start, off_t end) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - if (i == partial.end()) return false; - - // have start? - if (i->first > start) return false; - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - assert(pos <= i->first); - if (pos < i->first) return false; - assert(pos == i->first); - pos = i->first + i->second.length(); - if (pos >= end) break; // gone far enough - } - - if (pos >= end) return true; - return false; - } - - bool partial_is_complete(off_t size) { - return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) ); - //(off_t)(start()*EBOFS_BLOCK_SIZE), - //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) ); - } - void apply_partial() { - apply_partial(data, partial); - partial.clear(); - } - static void apply_partial(bufferlist& bl, map& pm) { - assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE); - //assert(partial_is_complete()); - //cout << "apply_partial" << endl; - for (map::iterator i = pm.begin(); - i != pm.end(); - i++) { - int pos = i->first; - //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << endl; - bl.copy_in(pos, i->second.length(), i->second); - } - pm.clear(); - } - void add_partial(off_t off, bufferlist& p) { - unsigned len = p.length(); - assert(len <= (unsigned)EBOFS_BLOCK_SIZE); - //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE)); - //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE)); - assert(off >= 0); - assert(off + len <= EBOFS_BLOCK_SIZE); - - // trim any existing that overlaps - map::iterator i = partial.begin(); - while (i != partial.end()) { - // is [off,off+len)... - // past i? - if (off >= i->first + i->second.length()) { - i++; - continue; - } - // before i? - if (i->first >= off+len) break; - - // does [off,off+len)... - // overlap all of i? 
- if (off <= i->first && off+len >= i->first + i->second.length()) { - // erase it and move on. - partial.erase(i++); - continue; - } - // overlap tail of i? - if (off > i->first && off < i->first + i->second.length()) { - // shorten i. - bufferlist o; - o.claim( i->second ); - unsigned taillen = off - i->first; - i->second.substr_of(o, 0, taillen); - i++; - continue; - } - // overlap head of i? - if (off < i->first && off+len < i->first + i->second.length()) { - // move i (make new tail). - off_t tailoff = off+len; - unsigned trim = tailoff - i->first; - partial[tailoff].substr_of(i->second, trim, i->second.length()-trim); - partial.erase(i++); // should now be at tailoff - i++; - continue; - } - // split i? - if (off > i->first && off+len < i->first + i->second.length()) { - bufferlist o; - o.claim( i->second ); - // shorten head - unsigned headlen = off - i->first; - i->second.substr_of(o, 0, headlen); - // new tail - unsigned tailoff = off+len - i->first; - unsigned taillen = o.length() - len - headlen; - partial[off+len].substr_of(o, tailoff, taillen); - break; - } - assert(0); - } - - // insert - partial[off] = p; - } - - -}; - -inline ostream& operator<<(ostream& out, BufferHead& bh) -{ - out << "bufferhead(" << bh.start() << "~" << bh.length(); - out << " v" << bh.get_version() << "/" << bh.get_last_flushed(); - if (bh.is_missing()) out << " missing"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_rx()) out << " rx"; - if (bh.is_tx()) out << " tx"; - if (bh.is_partial()) out << " partial"; - //out << " " << bh.data.length(); - out << " " << &bh; - out << ")"; - return out; -} - - -class ObjectCache { - public: - object_t object_id; - Onode *on; - BufferCache *bc; - - private: - map data; - int ref; - - public: - version_t write_count; - - - public: - ObjectCache(object_t o, Onode *_on, BufferCache *b) : - object_id(o), on(_on), bc(b), ref(0), - write_count(0) { } - ~ObjectCache() { - assert(data.empty()); - assert(ref == 0); - } - - int get() { - ++ref; - //cout << "oc.get " << object_id << " " << ref << endl; - return ref; - } - int put() { - assert(ref > 0); - --ref; - //cout << "oc.put " << object_id << " " << ref << endl; - return ref; - } - - object_t get_object_id() { return object_id; } - - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - int find_tx(block_t start, block_t len, - list& tx); - - int map_read(block_t start, block_t len, - map& hits, // hits - map& missing, // read these from disk - map& rx, // wait for these to finish reading from disk - map& partial); // (maybe) wait for these to read from disk - int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing. 
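/*
 * [editor's note -- illustrative sketch, not original code]
 * map_read() above sorts the buffer heads overlapping a range into four
 * buckets, which is presumably what the read path keys off: 'hits' can be
 * copied out immediately, 'missing' needs a bh_read() issued, and
 * 'rx'/'partial' are already in flight and only need a waiter registered.
 * A stand-alone classification over a simple state enum (stand-in types,
 * not the ebofs ones):
 */
#include <map>
#include <stdint.h>
typedef uint64_t block_t;
enum State { MISSING, CLEAN, DIRTY, TX, RX, PARTIAL };
struct Head { block_t start, length; State state; };

void classify(const std::map<block_t, Head>& overlapping,
              std::map<block_t, const Head*>& hits,
              std::map<block_t, const Head*>& missing,
              std::map<block_t, const Head*>& rx,
              std::map<block_t, const Head*>& partial) {
  for (std::map<block_t, Head>::const_iterator p = overlapping.begin();
       p != overlapping.end(); ++p) {
    const Head *h = &p->second;
    if (h->state == RX)            rx[h->start] = h;       // wait: read in flight
    else if (h->state == PARTIAL)  partial[h->start] = h;  // wait: partial read in flight
    else if (h->state == MISSING)  missing[h->start] = h;  // caller must start a read
    else                           hits[h->start] = h;     // clean/dirty/tx: data is in memory
  }
}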
- - - int map_write(block_t start, block_t len, - interval_set& alloc, - map& hits, - version_t super_epoch); // can write to these. - void touch_bottom(block_t bstart, block_t blast); - - BufferHead *split(BufferHead *bh, block_t off); - - /*int scan_versions(block_t start, block_t len, - version_t& low, version_t& high); - */ - - void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl); - void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch); - - void truncate(block_t blocks, version_t super_epoch); - // void tear_down(); - - void clone_to(Onode *other); - - void dump() { - for (map::iterator i = data.begin(); - i != data.end(); - i++) - cout << "dump: " << i->first << ": " << *i->second << endl; - } - -}; - - - -class BufferCache { - public: - Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock - BlockDevice &dev; - - set dirty_bh; - - LRU lru_dirty, lru_rest; - - private: - Cond stat_cond; - Cond flush_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_partial; - off_t stat_missing; - -#define EBOFS_BC_FLUSH_BHWRITE 0 -#define EBOFS_BC_FLUSH_PARTIAL 1 - - map epoch_unflushed[2]; - - /* partial writes - incomplete blocks that can't be written until - * their prior content is read and overlayed with the new data. - * - * we put partial block management here because objects may be deleted - * before the read completes, but the write may have been committed in a - * prior epoch. - * - * we map: src block -> dest block -> PartialWrite - * - * really, at most there will only ever be two of these, for current+previous epochs. - */ - class PartialWrite { - public: - map partial; // partial dirty content overlayed onto incoming data - version_t epoch; - }; - - map > partial_write; // queued writes w/ partial content - map > shadow_partials; - - public: - BufferCache(BlockDevice& d, Mutex& el) : - ebofs_lock(el), dev(d), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) - {} - - - off_t get_size() { - return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial; - } - off_t get_trimmable() { - return stat_clean; - } - - - // bh's in cache - void add_bh(BufferHead *bh) { - bh->get_oc()->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_mid(bh); - dirty_bh.insert(bh); - } else - lru_rest.lru_insert_mid(bh); - stat_add(bh); - } - void touch(BufferHead *bh) { - if (bh->is_dirty()) { - lru_dirty.lru_touch(bh); - } else - lru_rest.lru_touch(bh); - } - void touch_bottom(BufferHead *bh) { - if (bh->is_dirty()) { - bh->want_to_expire = true; - lru_dirty.lru_bottouch(bh); - } else - lru_rest.lru_bottouch(bh); - } - void remove_bh(BufferHead *bh) { - bh->get_oc()->remove_bh(bh); - stat_sub(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else - lru_rest.lru_remove(bh); - } - - // stats - void stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: 
stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - off_t get_stat_partial() { return stat_partial; } - - - map &get_unflushed(int what) { - return epoch_unflushed[what]; - } - - int get_unflushed(int what, version_t epoch) { - return epoch_unflushed[what][epoch]; - } - void inc_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]++; - //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl; - } - void dec_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]--; - //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl; - if (epoch_unflushed[what][epoch] == 0) - flush_cond.Signal(); - } - - void waitfor_stat() { - stat_waiter++; - stat_cond.Wait(ebofs_lock); - stat_waiter--; - } - void waitfor_flush() { - flush_cond.Wait(ebofs_lock); - } - - - // bh state - void set_state(BufferHead *bh, int s) { - // move between lru lists? - if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - if (bh->want_to_expire) - lru_rest.lru_insert_bot(bh); - else - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - stat_sub(bh); - bh->set_state(s); - stat_add(bh); - } - - void copy_state(BufferHead *bh1, BufferHead *bh2) { - set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; - void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }; - void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - set_state(bh, BufferHead::STATE_DIRTY); - bh->set_dirty_stamp(g_clock.now()); - }; - - - // io - void bh_read(Onode *on, BufferHead *bh, block_t from=0); - void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0); - - bool bh_cancel_read(BufferHead *bh); - bool bh_cancel_write(BufferHead *bh, version_t cur_epoch); - - void bh_queue_partial_write(Onode *on, BufferHead *bh); - void bh_cancel_partial_write(BufferHead *bh); - - void queue_partial(block_t from, block_t to, map& partial, version_t epoch); - void cancel_partial(block_t from, block_t to, version_t epoch); - - void add_shadow_partial(block_t from, BufferHead *bh); - void cancel_shadow_partial(block_t from, BufferHead *bh); - - void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl); - void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e); - void partial_tx_finish(version_t epoch); - - friend class C_E_FlushPartial; - - // bh fun - BufferHead *split(BufferHead *orig, block_t after); -}; - - -class C_OC_RxFinish : public BlockDevice::callback { - Mutex &lock; - 
ObjectCache *oc; - block_t start, length; - block_t diskstart; -public: - bufferlist bl; - C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds) : - lock(m), oc(o), start(s), length(l), diskstart(ds) {} - void finish(ioh_t ioh, int r) { - oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl); - } -}; - -class C_OC_TxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - version_t version; - version_t epoch; - public: - C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : - lock(m), oc(o), start(s), length(l), version(v), epoch(e) {} - void finish(ioh_t ioh, int r) { - oc->bc->tx_finish(oc, ioh, start, length, version, epoch); - } -}; - -class C_OC_PartialTxFinish : public BlockDevice::callback { - BufferCache *bc; - version_t epoch; -public: - C_OC_PartialTxFinish(BufferCache *b, version_t e) : - bc(b), epoch(e) {} - void finish(ioh_t ioh, int r) { - bc->partial_tx_finish(epoch); - } -}; - - -#endif diff --git a/branches/sage/pgs/ebofs/Cnode.h b/branches/sage/pgs/ebofs/Cnode.h deleted file mode 100644 index 8415978893fb5..0000000000000 --- a/branches/sage/pgs/ebofs/Cnode.h +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_CNODE_H -#define __EBOFS_CNODE_H - -#include "Onode.h" - -/* - * collection node - * - * holds attribute metadata for collections. - * colletion membership is stored in b+tree tables, independent of tte cnode. 
- */ - -class Cnode : public LRUObject -{ - private: - int ref; - bool dirty; - - public: - coll_t coll_id; - Extent cnode_loc; - - map attr; - - public: - Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) { - cnode_loc.length = 0; - } - ~Cnode() { - } - - block_t get_cnode_id() { return cnode_loc.start; } - int get_cnode_len() { return cnode_loc.length; } - - void get() { - if (ref == 0) lru_pin(); - ref++; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - } - int get_ref_count() { return ref; } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - - - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - - // - //???void clear(); - - -}; - -inline ostream& operator<<(ostream& out, Cnode& cn) -{ - out << "cnode(" << hex << cn.coll_id << dec; - if (cn.is_dirty()) out << " dirty"; - //out << " " << &cn; - out << ")"; - return out; -} - -#endif diff --git a/branches/sage/pgs/ebofs/Ebofs.cc b/branches/sage/pgs/ebofs/Ebofs.cc deleted file mode 100644 index 0e3b1df8ff381..0000000000000 --- a/branches/sage/pgs/ebofs/Ebofs.cc +++ /dev/null @@ -1,3458 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Ebofs.h" - -#include "FileJournal.h" - -#include - -#ifndef DARWIN -#include -#else -#include -#include -#endif // DARWIN - -// ******************* - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")." -#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")." 
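/*
 * [editor's note -- illustrative sketch, not original code]
 * get_attr_bytes() above sizes each attribute as: the key bytes plus a
 * trailing NUL, plus sizeof(int) for a length prefix, plus the value bytes --
 * the same layout the encode_cnode()/encode_onode() routines later in this
 * file write into the inode block.  A stand-alone version of that accounting,
 * using std::string values in place of bufferptrs:
 */
#include <map>
#include <string>
#include <cstddef>

size_t attr_encoded_bytes(const std::map<std::string, std::string>& attr) {
  size_t s = 0;
  for (std::map<std::string, std::string>::const_iterator i = attr.begin();
       i != attr.end(); ++i) {
    s += i->first.length() + 1;     // key + '\0'
    s += sizeof(int);               // value length prefix
    s += i->second.length();        // value bytes
  }
  return s;
}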
- - -char *nice_blocks(block_t b) -{ - static char s[20]; - float sz = b*4.0; - if (sz > (10 << 20)) - sprintf(s,"%.1f GB", sz / (1024.0*1024.0)); - else if (sz > (10 << 10)) - sprintf(s,"%.1f MB", sz / (1024.0)); - else - sprintf(s,"%llu KB", b*4ULL); - return s; -} - -int Ebofs::mount() -{ - ebofs_lock.Lock(); - assert(!mounted); - - // open dev - int r = dev.open(&idle_kicker); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - - // read super - bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - dev.read(0, 1, bp1); - dev.read(1, 1, bp2); - - struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); - struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); - dout(3) << "mount super @0 epoch " << sb1->epoch << endl; - dout(3) << "mount super @1 epoch " << sb2->epoch << endl; - - // pick newest super - struct ebofs_super *sb = 0; - if (sb1->epoch > sb2->epoch) - sb = sb1; - else - sb = sb2; - super_epoch = sb->epoch; - dout(3) << "mount epoch " << super_epoch << endl; - assert(super_epoch == sb->epoch); - - super_fsid = sb->fsid; - - free_blocks = sb->free_blocks; - limbo_blocks = sb->limbo_blocks; - - // init node pools - dout(3) << "mount nodepool" << endl; - nodepool.init( &sb->nodepool ); - nodepool.read_usemap( dev, super_epoch ); - nodepool.read_clean_nodes( dev ); - - // open tables - dout(3) << "mount opening tables" << endl; - object_tab = new Table( nodepool, sb->object_tab ); - for (int i=0; i( nodepool, sb->free_tab[i] ); - limbo_tab = new Table( nodepool, sb->limbo_tab ); - alloc_tab = new Table >( nodepool, sb->alloc_tab ); - - collection_tab = new Table( nodepool, sb->collection_tab ); - co_tab = new Table( nodepool, sb->co_tab ); - - allocator.release_limbo(); - - - // open journal? - if (journalfn) { - journal = new FileJournal(this, journalfn); - if (journal->open() < 0) { - dout(-3) << "mount journal " << journalfn << " open failed" << endl; - delete journal; - journal = 0; - } else { - dout(-3) << "mount journal " << journalfn << " opened, replaying" << endl; - - while (1) { - bufferlist bl; - epoch_t e; - if (!journal->read_entry(bl, e)) { - dout(-3) << "mount replay: end of journal, done." 
<< endl; - break; - } - - if (e < super_epoch) { - dout(-3) << "mount replay: skipping old entry in epoch " << e << " < " << super_epoch << endl; - continue; - } - if (e == super_epoch+1) { - super_epoch++; - dout(-3) << "mount replay: jumped to next epoch " << super_epoch << endl; - } - assert(e == super_epoch); - - dout(-3) << "mount replay: applying transaction in epoch " << e << endl; - Transaction t; - int off = 0; - t._decode(bl, off); - _apply_transaction(t); - } - } - } - - dout(3) << "mount starting commit+finisher threads" << endl; - commit_thread.create(); - finisher_thread.create(); - - dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - mounted = true; - - - ebofs_lock.Unlock(); - return 0; -} - - -int Ebofs::mkfs() -{ - ebofs_lock.Lock(); - assert(!mounted); - - int r = dev.open(); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - block_t num_blocks = dev.get_num_blocks(); - - // make a super-random fsid - srand(time(0) ^ getpid()); - super_fsid = (lrand48() << 32) ^ mrand48(); - - free_blocks = 0; - limbo_blocks = 0; - - // create first noderegion - Extent nr; - nr.start = 2; - nr.length = 20+ (num_blocks / 1000); - if (nr.length < 10) nr.length = 10; - nodepool.add_region(nr); - dout(10) << "mkfs: first node region at " << nr << endl; - - // allocate two usemaps - block_t usemap_len = nodepool.get_usemap_len(); - nodepool.usemap_even.start = nr.end(); - nodepool.usemap_even.length = usemap_len; - nodepool.usemap_odd.start = nodepool.usemap_even.end(); - nodepool.usemap_odd.length = usemap_len; - dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << endl; - dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << endl; - - // init tables - struct ebofs_table empty; - empty.num_keys = 0; - empty.root = -1; - empty.depth = 0; - - object_tab = new Table( nodepool, empty ); - collection_tab = new Table( nodepool, empty ); - - for (int i=0; i( nodepool, empty ); - limbo_tab = new Table( nodepool, empty ); - alloc_tab = new Table >( nodepool, empty ); - - co_tab = new Table( nodepool, empty ); - - // add free space - Extent left; - left.start = nodepool.usemap_odd.end(); - left.length = num_blocks - left.start; - dout(10) << "mkfs: free data blocks at " << left << endl; - allocator._release_into_limbo( left ); - if (g_conf.ebofs_cloneable) { - allocator.alloc_inc(nr); - allocator.alloc_inc(nodepool.usemap_even); - allocator.alloc_inc(nodepool.usemap_odd); - } - allocator.commit_limbo(); // -> limbo_tab - allocator.release_limbo(); // -> free_tab - - // write nodes, super, 2x - dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << endl; - - nodepool.commit_start( dev, 0 ); - nodepool.commit_wait(); - bufferptr superbp0; - prepare_super(0, superbp0); - write_super(0, superbp0); - - nodepool.commit_start( dev, 1 ); - nodepool.commit_wait(); - bufferptr superbp1; - prepare_super(1, superbp1); - write_super(1, superbp1); - - // free memory - dout(10) << "mkfs: cleaning up" << endl; - close_tables(); - - dev.close(); - - - // create journal? 
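/*
 * [editor's note -- illustrative sketch, not original code; the
 * "// create journal?" step continues immediately after this note]
 * mkfs above seeds both superblock slots (blocks 0 and 1), and mount() earlier
 * picks whichever of the two carries the higher epoch.  Presumably each commit
 * then overwrites only one slot, so a crash mid-write still leaves the other
 * slot's older-but-consistent super intact; that alternation is an assumption
 * here, since the surviving text does not show write_super().  Sketch of the
 * selection mount() performs:
 */
struct SuperStub { unsigned epoch; };

// Return the slot (0 or 1) holding the newest superblock.
int pick_super(const SuperStub& sb0, const SuperStub& sb1) {
  return (sb1.epoch > sb0.epoch) ? 1 : 0;
}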
- if (journalfn) { - journal = new FileJournal(this, journalfn); - if (journal->create() < 0) { - dout(3) << "mount journal " << journalfn << " created failed" << endl; - delete journal; - } else { - dout(3) << "mount journal " << journalfn << " created" << endl; - } - } - - dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - ebofs_lock.Unlock(); - return 0; -} - -void Ebofs::close_tables() -{ - // close tables - delete object_tab; - for (int i=0; i::iterator i = onode_map.begin(); - i != onode_map.end(); - i++) { - dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << endl; - } - - // free memory - dout(5) << "umount cleaning up" << endl; - close_tables(); - dev.close(); - readonly = unmounting = mounted = false; - - dout(1) << "umount done on " << dev.get_device_name() << endl; - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::prepare_super(version_t epoch, bufferptr& bp) -{ - struct ebofs_super sb; - - dout(10) << "prepare_super v" << epoch << endl; - - // fill in super - memset(&sb, 0, sizeof(sb)); - sb.s_magic = EBOFS_MAGIC; - sb.fsid = super_fsid; - sb.epoch = epoch; - sb.num_blocks = dev.get_num_blocks(); - - sb.free_blocks = free_blocks; - sb.limbo_blocks = limbo_blocks; - - - // tables - sb.object_tab.num_keys = object_tab->get_num_keys(); - sb.object_tab.root = object_tab->get_root(); - sb.object_tab.depth = object_tab->get_depth(); - - for (int i=0; iget_num_keys(); - sb.free_tab[i].root = free_tab[i]->get_root(); - sb.free_tab[i].depth = free_tab[i]->get_depth(); - } - sb.limbo_tab.num_keys = limbo_tab->get_num_keys(); - sb.limbo_tab.root = limbo_tab->get_root(); - sb.limbo_tab.depth = limbo_tab->get_depth(); - - sb.alloc_tab.num_keys = alloc_tab->get_num_keys(); - sb.alloc_tab.root = alloc_tab->get_root(); - sb.alloc_tab.depth = alloc_tab->get_depth(); - - sb.collection_tab.num_keys = collection_tab->get_num_keys(); - sb.collection_tab.root = collection_tab->get_root(); - sb.collection_tab.depth = collection_tab->get_depth(); - - sb.co_tab.num_keys = co_tab->get_num_keys(); - sb.co_tab.root = co_tab->get_root(); - sb.co_tab.depth = co_tab->get_depth(); - - // pools - sb.nodepool.num_regions = nodepool.region_loc.size(); - for (unsigned i=0; i 0) { - // periodically check for idle block device - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " - << g_conf.ebofs_idle_commit_ms << " ms if idle" << endl; - long left = g_conf.ebofs_commit_ms; - while (left > 0) { - long next = MIN(left, g_conf.ebofs_idle_commit_ms); - if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, next*1000)) != ETIMEDOUT) - break; // we got kicked - if (dev.is_idle()) { - dout(20) << "commit_thread bdev is idle, early commit" << endl; - break; // dev is idle - } - left -= next; - dout(20) << "commit_thread " << left << " ms left" << endl; - - // hack hack - //if (!left) g_conf.debug_ebofs = 10; - // /hack hack - } - } else { - // normal wait+timeout - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl; - commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000)); - } - - } else { - // DEBUG.. 
wait until kicked - dout(10) << "commit_thread no commit_ms, waiting until kicked" << endl; - commit_cond.Wait(ebofs_lock); - } - - if (unmounting) { - dout(10) << "commit_thread unmounting: final commit pass" << endl; - assert(readonly); - unmounting = false; - mounted = false; - dirty = true; - } - - if (!dirty && !limbo_blocks) { - dout(10) << "commit_thread not dirty" << endl; - } - else { - super_epoch++; - dirty = false; - - dout(10) << "commit_thread commit start, new epoch " << super_epoch << endl; - dout(2) << "commit_thread data: " - << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, " - << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks() - << "%) free in " << get_free_extents() - << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() - << "%) limbo in " << get_limbo_extents() - << endl; - dout(2) << "commit_thread nodes: " - << 100*nodepool.num_used()/nodepool.num_total() << "% used, " - << nodepool.num_free() << " (" << 100*nodepool.num_free()/nodepool.num_total() << "%) free, " - << nodepool.num_limbo() << " (" << 100*nodepool.num_limbo()/nodepool.num_total() << "%) limbo, " - << nodepool.num_total() << " total." << endl; - dout(2) << "commit_thread bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - - if (journal) journal->commit_epoch_start(); - - // (async) write onodes+condes (do this first; it currently involves inode reallocation) - commit_inodes_start(); - - allocator.commit_limbo(); // limbo -> limbo_tab - - // (async) write btree nodes - nodepool.commit_start( dev, super_epoch ); - - // blockdev barrier (prioritize our writes!) - dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << endl; - dev.barrier(); - - // prepare super (before any changes get made!) - bufferptr superbp; - prepare_super(super_epoch, superbp); - - // wait for it all to flush (drops global lock) - commit_bc_wait(super_epoch-1); - dout(30) << "commit_thread bc flushed" << endl; - commit_inodes_wait(); - dout(30) << "commit_thread inodes flushed" << endl; - nodepool.commit_wait(); - dout(30) << "commit_thread btree nodes flushed" << endl; - - // ok, now (synchronously) write the prior super! - dout(10) << "commit_thread commit flushed, writing super for prior epoch" << endl; - ebofs_lock.Unlock(); - write_super(super_epoch, superbp); - ebofs_lock.Lock(); - - dout(10) << "commit_thread wrote super" << endl; - - // free limbo space now - // (since we're done allocating things, - // AND we've flushed all previous epoch data) - allocator.release_limbo(); // limbo_tab -> free_tabs - - // do we need more node space? - if (nodepool.num_free() < nodepool.num_total() / 3) { - dout(2) << "commit_thread running low on node space, allocating more." << endl; - alloc_more_node_space(); - } - - // signal journal - if (journal) journal->commit_epoch_finish(); - - // kick waiters - dout(10) << "commit_thread queueing commit + kicking sync waiters" << endl; - - queue_finishers(commit_waiters[super_epoch-1]); - commit_waiters.erase(super_epoch-1); - - sync_cond.Signal(); - - dout(10) << "commit_thread commit finish" << endl; - } - - // trim bc? 
- trim_bc(); - trim_inodes(); - - } - - dout(10) << "commit_thread finish" << endl; - commit_thread_started = false; - ebofs_lock.Unlock(); - return 0; -} - - -void Ebofs::alloc_more_node_space() -{ - dout(1) << "alloc_more_node_space free " << nodepool.num_free() << "/" << nodepool.num_total() << endl; - - if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) { - int want = nodepool.num_total(); - - Extent ex; - allocator.allocate(ex, want, 2); - dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << endl; - - Extent even, odd; - unsigned ulen = nodepool.get_usemap_len(nodepool.num_total() + ex.length); - allocator.allocate(even, ulen, 2); - allocator.allocate(odd, ulen, 2); - dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << endl; - - if (even.length == ulen && odd.length == ulen) { - dout(1) << "alloc_more_node_space got " << ex << ", new usemaps at even " << even << " odd " << odd << endl; - allocator.release(nodepool.usemap_even); - allocator.release(nodepool.usemap_odd); - nodepool.add_region(ex); - nodepool.usemap_even = even; - nodepool.usemap_odd = odd; - } else { - dout (1) << "alloc_more_node_space failed to get space for new usemaps" << endl; - allocator.release(ex); - allocator.release(even); - allocator.release(odd); - //assert(0); - } - } else { - dout(1) << "alloc_more_node_space already have max node regions!" << endl; - assert(0); - } -} - - -void *Ebofs::finisher_thread_entry() -{ - finisher_lock.Lock(); - dout(10) << "finisher_thread start" << endl; - - while (!finisher_stop) { - while (!finisher_queue.empty()) { - list ls; - ls.swap(finisher_queue); - - finisher_lock.Unlock(); - - //ebofs_lock.Lock(); // um.. why lock this? -sage - finish_contexts(ls, 0); - //ebofs_lock.Unlock(); - - finisher_lock.Lock(); - } - if (finisher_stop) break; - - dout(30) << "finisher_thread sleeping" << endl; - finisher_cond.Wait(finisher_lock); - } - - dout(10) << "finisher_thread start" << endl; - finisher_lock.Unlock(); - return 0; -} - - -// *** onodes *** - -Onode* Ebofs::new_onode(object_t oid) -{ - Onode* on = new Onode(oid); - - assert(onode_map.count(oid) == 0); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - assert(object_tab->lookup(oid) < 0); - object_tab->insert( oid, on->onode_loc ); // even tho i'm not placed yet - - on->get(); - on->onode_loc.start = 0; - on->onode_loc.length = 0; - - dirty_onode(on); - - dout(7) << "new_onode " << *on << endl; - return on; -} - - -Onode* Ebofs::get_onode(object_t oid) -{ - while (1) { - // in cache? - if (have_onode(oid)) { - // yay - Onode *on = onode_map[oid]; - on->get(); - //cout << "get_onode " << *on << endl; - return on; - } - - // on disk? - Extent onode_loc; - if (object_tab->lookup(oid, onode_loc) < 0) { - dout(10) << "onode lookup failed on " << oid << endl; - // object dne. - return 0; - } - - // already loading? - if (waitfor_onode.count(oid)) { - // yep, just wait. - Cond c; - waitfor_onode[oid].push_back(&c); - dout(10) << "get_onode " << oid << " already loading, waiting" << endl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_onode reading " << oid << " from " << onode_loc << endl; - - assert(waitfor_onode.count(oid) == 0); - waitfor_onode[oid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) ); - - ebofs_lock.Unlock(); - dev.read( onode_loc.start, onode_loc.length, bl ); - ebofs_lock.Lock(); - - // add onode - Onode *on = new Onode(oid); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - // parse data block - struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); - if (eo->object_id != oid) { - cerr << " wrong oid in onode block: " << eo->object_id << " != " << oid << endl; - cerr << " onode_loc is " << eo->onode_loc << endl; - cerr << " object_size " << eo->object_size << endl; - cerr << " object_blocks " << eo->object_blocks << endl; - cerr << " " << eo->num_collections << " coll + " - << eo->num_attr << " attr + " - << eo->num_extents << " extents" << endl; - assert(eo->object_id == oid); - } - on->readonly = eo->readonly; - on->onode_loc = eo->onode_loc; - on->object_size = eo->object_size; - on->object_blocks = eo->object_blocks; - - // parse - char *p = bl.c_str() + sizeof(*eo); - - // parse collection list - for (int i=0; inum_collections; i++) { - coll_t c = *((coll_t*)p); - p += sizeof(c); - on->collections.insert(c); - } - - // parse attributes - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - on->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_onode " << *on << " attr " << key << " len " << len << endl; - } - - // parse extents - on->extent_map.clear(); - block_t n = 0; - for (int i=0; inum_extents; i++) { - Extent ex = *((Extent*)p); - on->extent_map[n] = ex; - dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << endl; - n += ex.length; - p += sizeof(Extent); - } - assert(n == on->object_blocks); - - // wake up other waiters - for (list::iterator i = waitfor_onode[oid].begin(); - i != waitfor_onode[oid].end(); - i++) - (*i)->Signal(); - waitfor_onode.erase(oid); // remove Cond list - - on->get(); - //cout << "get_onode " << *on << " (loaded)" << endl; - return on; - } -} - - -class C_E_InodeFlush : public BlockDevice::callback { - Ebofs *ebofs; -public: - C_E_InodeFlush(Ebofs *e) : ebofs(e) {} - void finish(ioh_t ioh, int r) { - ebofs->flush_inode_finish(); - } -}; - - -void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) -{ - // onode - struct ebofs_onode eo; - eo.readonly = on->readonly; - eo.onode_loc = on->onode_loc; - eo.object_id = on->object_id; - eo.object_size = on->object_size; - eo.object_blocks = on->object_blocks; - eo.num_collections = on->collections.size(); - eo.num_attr = on->attr.size(); - eo.num_extents = on->extent_map.size(); - bl.copy_in(off, sizeof(eo), (char*)&eo); - off += sizeof(eo); - - // collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - bl.copy_in(off, sizeof(*i), (char*)&(*i)); - off += sizeof(*i); - } - - // attr - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int l = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&l); - off += sizeof(int); - bl.copy_in(off, l, i->second.c_str()); - off += l; - dout(15) << "write_onode " << *on << " attr " << i->first << " len " << l << endl; - } - - // extents - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) { - bl.copy_in(off, sizeof(Extent), (char*)&(i->second)); - off += sizeof(Extent); - dout(15) << "write_onode " << *on << " ex " << i->first << ": " << i->second 
<< endl; - } -} - -void Ebofs::write_onode(Onode *on) -{ - // buffer - unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate onode - if (1) { - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - block_t first = 0; - if (on->extent_map.size()) - first = on->extent_map.begin()->second.start; - - allocator.allocate(on->onode_loc, blocks, first); - object_tab->remove( on->object_id ); - object_tab->insert( on->object_id, on->onode_loc ); - //object_tab->verify(); - } - - dout(10) << "write_onode " << *on << " to " << on->onode_loc << endl; - - unsigned off = 0; - encode_onode(on, bl, off); - assert(off == bytes); - - // write - dev.write( on->onode_loc.start, on->onode_loc.length, bl, - new C_E_InodeFlush(this), "write_onode" ); -} - -void Ebofs::remove_onode(Onode *on) -{ - dout(8) << "remove_onode " << *on << endl; - - assert(on->get_ref_count() >= 1); // caller - - // tear down buffer cache - if (on->oc) { - on->oc->truncate(0, super_epoch); // this will kick readers along the way. - on->close_oc(); - } - - // remove from onode map, mark dangling/deleted - onode_map.erase(on->object_id); - onode_lru.lru_remove(on); - on->deleted = true; - on->dangling = true; - - // remove from object table - //dout(0) << "remove_onode on " << *on << endl; - object_tab->remove(on->object_id); - - // free onode space - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - // free data space - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) - allocator.release(i->second); - on->extent_map.clear(); - - // remove from collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - co_tab->remove(coll_object_t(*i,on->object_id)); - } - on->collections.clear(); - - // dirty -> clean? - if (on->is_dirty()) { - on->mark_clean(); // this unpins *on - dirty_onodes.erase(on); - } - - if (on->get_ref_count() > 1) cout << "remove_onode **** will survive " << *on << endl; - put_onode(on); - - dirty = true; -} - -void Ebofs::put_onode(Onode *on) -{ - on->put(); - //cout << "put_onode " << *on << endl; - - if (on->get_ref_count() == 0 && on->dangling) { - //cout << " *** hosing on " << *on << endl; - delete on; - } -} - -void Ebofs::dirty_onode(Onode *on) -{ - if (!on->is_dirty()) { - on->mark_dirty(); - dirty_onodes.insert(on); - } - dirty = true; -} - -void Ebofs::trim_inodes(int max) -{ - unsigned omax = onode_lru.lru_get_max(); - unsigned cmax = cnode_lru.lru_get_max(); - if (max >= 0) omax = cmax = max; - dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; - - // onodes - while (onode_lru.lru_get_size() > omax) { - // expire an item - Onode *on = (Onode*)onode_lru.lru_expire(); - if (on == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing onode " << *on << endl; - onode_map.erase(on->object_id); - on->dangling = true; - - if (on->get_ref_count() == 0) { - assert(on->oc == 0); // an open oc pins the onode! - delete on; - } else { - dout(-20) << "trim_inodes still active: " << *on << endl; - assert(0); // huh? 
- } - } - - - // cnodes - while (cnode_lru.lru_get_size() > cmax) { - // expire an item - Cnode *cn = (Cnode*)cnode_lru.lru_expire(); - if (cn == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing cnode " << *cn << endl; - cnode_map.erase(cn->coll_id); - - delete cn; - } - - dout(10) << "trim_inodes finish " - << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; -} - - - -// *** cnodes **** - -Cnode* Ebofs::new_cnode(coll_t cid) -{ - Cnode* cn = new Cnode(cid); - - assert(cnode_map.count(cid) == 0); - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - assert(collection_tab->lookup(cid) < 0); - collection_tab->insert( cid, cn->cnode_loc ); // even tho i'm not placed yet - - cn->get(); - cn->cnode_loc.start = 0; - cn->cnode_loc.length = 0; - - dirty_cnode(cn); - - return cn; -} - -Cnode* Ebofs::get_cnode(coll_t cid) -{ - while (1) { - // in cache? - if (cnode_map.count(cid)) { - // yay - Cnode *cn = cnode_map[cid]; - cn->get(); - return cn; - } - - // on disk? - Extent cnode_loc; - if (collection_tab->lookup(cid, cnode_loc) < 0) { - // object dne. - return 0; - } - - // already loading? - if (waitfor_cnode.count(cid)) { - // yep, just wait. - Cond c; - waitfor_cnode[cid].push_back(&c); - dout(10) << "get_cnode " << cid << " already loading, waiting" << endl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << endl; - - assert(waitfor_cnode.count(cid) == 0); - waitfor_cnode[cid].clear(); // this should be empty initially. - - // read it! - bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl ); - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) ); - - ebofs_lock.Unlock(); - dev.read( cnode_loc.start, cnode_loc.length, bl ); - ebofs_lock.Lock(); - - // parse data block - Cnode *cn = new Cnode(cid); - - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str(); - cn->cnode_loc = ec->cnode_loc; - - // parse attributes - char *p = bl.c_str() + sizeof(*ec); - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - cn->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << endl; - } - - // wake up other waiters - for (list::iterator i = waitfor_cnode[cid].begin(); - i != waitfor_cnode[cid].end(); - i++) - (*i)->Signal(); - waitfor_cnode.erase(cid); // remove Cond list - - cn->get(); - return cn; - } -} - -void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off) -{ - // cnode - struct ebofs_cnode ec; - ec.cnode_loc = cn->cnode_loc; - ec.coll_id = cn->coll_id; - ec.num_attr = cn->attr.size(); - bl.copy_in(off, sizeof(ec), (char*)&ec); - off += sizeof(ec); - - // attr - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int len = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&len); - off += sizeof(int); - bl.copy_in(off, len, i->second.c_str()); - off += len; - - dout(15) << "write_cnode " << *cn << " attr " << i->first << " len " << len << endl; - } -} - -void Ebofs::write_cnode(Cnode *cn) -{ - // allocate buffer - unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - //bufferpool.alloc( 
EBOFS_BLOCK_SIZE*blocks, bl ); - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate cnode! - if (1) { - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD); - collection_tab->remove( cn->coll_id ); - collection_tab->insert( cn->coll_id, cn->cnode_loc ); - } - - dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << endl; - - unsigned off = 0; - encode_cnode(cn, bl, off); - assert(off == bytes); - - // write - dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, - new C_E_InodeFlush(this), "write_cnode" ); -} - -void Ebofs::remove_cnode(Cnode *cn) -{ - dout(10) << "remove_cnode " << *cn << endl; - - // remove from table - collection_tab->remove(cn->coll_id); - - // free cnode space - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - // remove from dirty list? - if (cn->is_dirty()) - dirty_cnodes.erase(cn); - - // remove from map and lru - cnode_map.erase(cn->coll_id); - cnode_lru.lru_remove(cn); - - // count down refs - cn->mark_clean(); - cn->put(); - assert(cn->get_ref_count() == 0); - - // hose. - delete cn; - - dirty = true; -} - -void Ebofs::put_cnode(Cnode *cn) -{ - cn->put(); -} - -void Ebofs::dirty_cnode(Cnode *cn) -{ - if (!cn->is_dirty()) { - cn->mark_dirty(); - dirty_cnodes.insert(cn); - } - dirty = true; -} - - - - - -void Ebofs::flush_inode_finish() -{ - ebofs_lock.Lock(); - { - inodes_flushing--; - if (inodes_flushing < 1000) - dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << endl; - if (inodes_flushing == 0) - inode_commit_cond.Signal(); - } - ebofs_lock.Unlock(); -} - -void Ebofs::commit_inodes_start() -{ - dout(10) << "commit_inodes_start" << endl; - - assert(inodes_flushing == 0); - - // onodes - for (set::iterator i = dirty_onodes.begin(); - i != dirty_onodes.end(); - i++) { - Onode *on = *i; - inodes_flushing++; - write_onode(on); - on->mark_clean(); - on->uncommitted.clear(); // commit allocated blocks - on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. 
- } - dirty_onodes.clear(); - - // cnodes - for (set::iterator i = dirty_cnodes.begin(); - i != dirty_cnodes.end(); - i++) { - Cnode *cn = *i; - inodes_flushing++; - write_cnode(cn); - cn->mark_clean(); - } - dirty_cnodes.clear(); - - dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << endl; -} - -void Ebofs::commit_inodes_wait() -{ - // caller must hold ebofs_lock - while (inodes_flushing > 0) { - dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << endl; - inode_commit_cond.Wait(ebofs_lock); - } - dout(10) << "commit_inodes_wait all flushed" << endl; -} - - - - - - - -// *** buffer cache *** - -void Ebofs::trim_buffer_cache() -{ - ebofs_lock.Lock(); - trim_bc(0); - ebofs_lock.Unlock(); -} - -void Ebofs::trim_bc(off_t max) -{ - if (max < 0) - max = g_conf.ebofs_bc_size; - dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; - - while (bc.get_size() > max && - bc.get_trimmable()) { - BufferHead *bh = (BufferHead*) bc.lru_rest.lru_expire(); - if (!bh) break; - - dout(25) << "trim_bc trimming " << *bh << endl; - assert(bh->is_clean()); - - ObjectCache *oc = bh->oc; - bc.remove_bh(bh); - delete bh; - - if (oc->is_empty()) { - Onode *on = oc->on; - dout(10) << "trim_bc closing oc on " << *on << endl; - on->close_oc(); - } - } - - dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; -} - - -void Ebofs::kick_idle() -{ - dout(10) << "kick_idle" << endl; - commit_cond.Signal(); - - /* - ebofs_lock.Lock(); - if (mounted && !unmounting && dirty) { - dout(0) << "kick_idle dirty, doing commit" << endl; - commit_cond.Signal(); - } else { - dout(0) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << endl; - } - ebofs_lock.Unlock(); - */ -} - -void Ebofs::sync(Context *onsafe) -{ - ebofs_lock.Lock(); - if (onsafe) { - dirty = true; - - while (1) { - if (journal) { - // journal empty transaction - Transaction t; - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - commit_waiters[super_epoch].push_back(onsafe); - break; - } - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync() -{ - ebofs_lock.Lock(); - if (!dirty) { - dout(7) << "sync in " << super_epoch << ", not dirty" << endl; - } else { - epoch_t start = super_epoch; - dout(7) << "sync start in " << start << endl; - while (super_epoch == start) { - dout(7) << "sync kicking commit in " << super_epoch << endl; - dirty = true; - commit_cond.Signal(); - sync_cond.Wait(ebofs_lock); - } - dout(10) << "sync finish in " << super_epoch << endl; - } - ebofs_lock.Unlock(); -} - - - -void Ebofs::commit_bc_wait(version_t epoch) -{ - dout(10) << "commit_bc_wait on epoch " << epoch << endl; - - while (bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 || - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0) { - //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch << endl; - dout(10) << "commit_bc_wait epoch " << epoch - << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << ", unflushed partial " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << endl; - bc.waitfor_flush(); - } - - bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch); - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch); - - dout(10) << "commit_bc_wait all flushed for epoch " << epoch - << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << " " << 
bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << endl; -} - - - -int Ebofs::statfs(struct statfs *buf) -{ - dout(7) << "statfs" << endl; - - buf->f_type = EBOFS_MAGIC; /* type of filesystem */ - buf->f_bsize = 4096; /* optimal transfer block size */ - buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */ - buf->f_bfree = get_free_blocks() - + get_limbo_blocks(); /* free blocks in fs */ - buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */ - buf->f_files = nodepool.num_total(); /* total file nodes in file system */ - buf->f_ffree = nodepool.num_free(); /* free file nodes in fs */ - //buf->f_fsid = 0; /* file system id */ -#ifndef DARWIN - buf->f_namelen = 8; /* maximum length of filenames */ -#endif // DARWIN - - return 0; -} - - - - -/* - * allocate a write to blocks on disk. - * - take care to not overwrite any "safe" data blocks. - * - allocate/map new extents on disk as necessary - */ -void Ebofs::alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& old_blast) -{ - // first decide what pages to (re)allocate - alloc.insert(start, len); // start with whole range - - // figure out what bits are already uncommitted - interval_set already_uncom; - already_uncom.intersection_of(alloc, on->uncommitted); - - // subtract those off, so we're left with the committed bits (that must be reallocated). - alloc.subtract(already_uncom); - - dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << endl; - - // release it (into limbo) - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - // get old region - vector old; - on->map_extents(i->first, i->second, old); - for (unsigned o=0; ofirst == start) { - old_bfirst = old[0].start; - dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << endl; - } - if (i->first+i->second == start+len) { - old_blast = old[old.size()-1].last(); - dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << endl; - } - } - } - - // reallocate uncommitted too? - // ( --> yes. we can always make better allocation decisions later, with more information. ) - if (g_conf.ebofs_realloc) { - list tx; - - ObjectCache *oc = on->get_oc(&bc); - oc->find_tx(start, len, tx); - - for (list::reverse_iterator p = tx.rbegin(); - p != tx.rend(); - p++) { - BufferHead *bh = *p; - - // cancelable/moveable? - if (alloc.contains(bh->start(), bh->length())) { - dout(10) << "alloc_write " << *bh << " already in " << alloc << endl; - continue; - } - - vector old; - on->map_extents(bh->start(), bh->length(), old); - assert(old.size() == 1); - - if (bh->start() >= start && bh->end() <= start+len) { - assert(bh->epoch_modified == super_epoch); - if (bc.bh_cancel_write(bh, super_epoch)) { - if (bh->length() == 1) - dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << endl; - // no, this isn't compatible with clone() and extent reference counting. 
- //allocator.unallocate(old[0]); // release (into free) - allocator.release(old[0]); - alloc.insert(bh->start(), bh->length()); - } else { - if (bh->length() == 1) - dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << endl; - allocator.release(old[0]); // release (into limbo) - alloc.insert(bh->start(), bh->length()); - } - } else { - if (bh->length() == 1) - dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within " - << start << "~" << len - << " bh " << *bh << endl; - } - } - - dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << endl; - } - - if (alloc.empty()) return; // no need to dirty the onode below! - - - // merge alloc into onode uncommitted map - //dout(10) << " union of " << on->uncommitted << " and " << alloc << endl; - interval_set old = on->uncommitted; - on->uncommitted.union_of(alloc); - - dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << endl; - - if (0) { - // verify - interval_set ta; - ta.intersection_of(on->uncommitted, alloc); - cout << " ta " << ta << endl; - assert(alloc == ta); - - interval_set tb; - tb.intersection_of(on->uncommitted, old); - cout << " tb " << tb << endl; - assert(old == tb); - } - - dirty_onode(on); - - // allocate the space - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << endl; - - // allocate new space - block_t left = i->second; - block_t cur = i->first; - while (left > 0) { - Extent ex; - allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD); - dout(10) << "alloc_write got " << ex << " for object offset " << cur << endl; - on->set_extent(cur, ex); // map object to new region - left -= ex.length; - cur += ex.length; - } - } -} - - - - -void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) -{ - ObjectCache *oc = on->get_oc(&bc); - - // map into blocks - off_t opos = off; // byte pos in object - size_t zleft = 0; // zeros left to write - size_t left = len; // bytes left - - block_t bstart = off / EBOFS_BLOCK_SIZE; - - if (off > on->object_size) { - zleft = off - on->object_size; - opos = on->object_size; - bstart = on->object_size / EBOFS_BLOCK_SIZE; - } - if (off+(off_t)len > on->object_size) { - dout(10) << "apply_write extending size on " << *on << ": " << on->object_size - << " -> " << off+len << endl; - on->object_size = off+len; - dirty_onode(on); - } - if (bl.length() == 0) { - zleft += len; - left = 0; - } - if (zleft) - dout(10) << "apply_write zeroing first " << zleft << " bytes of " << *on << endl; - - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - // allocate write on disk. - interval_set alloc; - block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read) - block_t old_blast = 0; - alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast); - dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << endl; - - if (fake_writes) { - on->uncommitted.clear(); // worst case! 
- return; - } - - // map b range onto buffer_heads - map hits; - oc->map_write(bstart, blen, alloc, hits, super_epoch); - - // get current versions - //version_t lowv, highv; - //oc->scan_versions(bstart, blen, lowv, highv); - //highv++; - version_t highv = ++oc->write_count; - - // copy from bl into buffer cache - unsigned blpos = 0; // byte pos in input buffer - - // write data into buffers - for (map::iterator i = hits.begin(); - i != hits.end(); - i++) { - BufferHead *bh = i->second; - bh->set_version(highv); - bh->epoch_modified = super_epoch; - - // old write in progress? - if (bh->is_tx()) { // copy the buffer to avoid munging up in-flight write - dout(10) << "apply_write tx pending, copying buffer on " << *bh << endl; - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - } - - // need to split off partial? (partials can only be ONE block) - if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) { - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) { - BufferHead *right = bc.split(bh, bh->start()+1); - hits[right->start()] = right; - dout(10) << "apply_write split off left block for partial write; rest is " << *right << endl; - } - if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) && - ((off_t)len+off < on->object_size)) { - BufferHead *right = bc.split(bh, bh->last()); - hits[right->start()] = right; - dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << endl; - } - } - - // partial at head or tail? - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing... - (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) { - // locate ourselves in bh - unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE; - assert(off_in_bh >= 0); - unsigned len_in_bh = MIN( (off_t)(zleft+left), - (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos ); - - if (bh->is_partial() || bh->is_rx() || bh->is_missing()) { - assert(bh->is_partial() || bh->is_rx() || bh->is_missing()); - assert(bh->length() == 1); - - // add frag to partial - dout(10) << "apply_write writing into partial " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << endl; - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->add_partial(off_in_bh, zb); - zleft -= z; - opos += z; - } - - bufferlist sb; - sb.substr_of(bl, blpos, len_in_bh-z); // substr in existing buffer - bufferlist cp; - cp.append(sb.c_str(), len_in_bh-z); // copy the partial bit! - bh->add_partial(off_in_bh, cp); - left -= len_in_bh-z; - blpos += len_in_bh-z; - opos += len_in_bh-z; - - if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) { - dout(10) << "apply_write completed partial " << *bh << endl; - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers! 
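// The loop above must decide, per buffer head, whether the write covers a
// block completely or only partially (the head and tail of the range), and
// where inside the buffer the bytes land (off_in_bh, len_in_bh).  A
// stripped-down sketch of that head/tail test for a single block; the helper
// names are illustrative only.
#include <algorithm>
#include <cstdint>

static const uint64_t BLK = 4096;   // stands in for EBOFS_BLOCK_SIZE

// Does the byte write [off, off+len) only partially cover block b?
bool is_partial_block(uint64_t off, uint64_t len, uint64_t b) {
  uint64_t blk_begin = b * BLK;
  uint64_t blk_end   = blk_begin + BLK;
  uint64_t begin = std::max(off, blk_begin);
  uint64_t end   = std::min(off + len, blk_end);
  return !(begin == blk_begin && end == blk_end);
}

// Offset and length of the write inside block b, mirroring off_in_bh/len_in_bh.
void locate_in_block(uint64_t off, uint64_t len, uint64_t b,
                     uint64_t& off_in_blk, uint64_t& len_in_blk) {
  uint64_t blk_begin = b * BLK;
  uint64_t begin = std::max(off, blk_begin);
  uint64_t end   = std::min(off + len, blk_begin + BLK);
  off_in_blk = begin - blk_begin;
  len_in_blk = end - begin;
}

// A write of 6000 bytes at offset 1000 partially covers block 0
// (off_in_blk 1000, len 3096) and block 1 (off_in_blk 0, len 2904).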
- bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.zero(); - bh->apply_partial(); - bc.mark_dirty(bh); - bc.bh_write(on, bh); - } - else if (bh->is_rx()) { - dout(10) << "apply_write rx -> partial " << *bh << endl; - assert(bh->length() == 1); - bc.mark_partial(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_missing()) { - dout(10) << "apply_write missing -> partial " << *bh << endl; - assert(bh->length() == 1); - bc.mark_partial(bh); - - // take care to read from _old_ disk block locations! - if (bh->start() == bstart) - bc.bh_read(on, bh, old_bfirst); - else if (bh->start() == blast) - bc.bh_read(on, bh, old_blast); - else assert(0); - - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_partial()) { - dout(10) << "apply_write already partial, no need to submit rx on " << *bh << endl; - if (bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - - - } else { - assert(bh->is_clean() || bh->is_dirty() || bh->is_tx()); - - // just write into the bh! - dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << endl; - - // copy data into new buffers first (copy on write!) - // FIXME: only do the modified pages? this might be a big bh! - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(off_in_bh, z, zb); - zleft -= z; - opos += z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub); - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh-z; - - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - continue; - } - - // ok, we're talking full block(s) now (modulo last block of the object) - assert(opos % EBOFS_BLOCK_SIZE == 0); - assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) || - opos+(off_t)(zleft+left) == on->object_size); - - // alloc new buffers. - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - // copy! - unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left); - assert(len_in_bh <= zleft+left); - - dout(10) << "apply_write writing into " << *bh << ":" - << " len_in_bh " << len_in_bh - << endl; - - unsigned z = MIN(len_in_bh, zleft); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(0, z, zb); - zleft -= z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(z, len_in_bh-z, sub); - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh; - - // old partial? 
- if (bh->is_partial() && - bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - - // mark dirty - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - - assert(zleft == 0); - assert(left == 0); - assert(opos == off+(off_t)len); - //assert(blpos == bl.length()); -} - - - - -// *** file i/o *** - -bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool) -{ - dout(10) << "attempt_read " << *on << " " << off << "~" << len << endl; - ObjectCache *oc = on->get_oc(&bc); - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - oc->map_read(bstart, blen, hits, missing, rx, partials); - - // missing buffers? - if (!missing.empty()) { - for (map::iterator i = missing.begin(); - i != missing.end(); - i++) { - dout(10) << "attempt_read missing buffer " << *(i->second) << endl; - bc.bh_read(on, i->second); - } - BufferHead *wait_on = missing.begin()->second; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool)); - return false; - } - - // are partials sufficient? - bool partials_ok = true; - for (map::iterator i = partials.begin(); - i != partials.end(); - i++) { - BufferHead *bh = i->second; - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( off, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (!i->second->have_partial_range(start-bhstart, end-bhend)) { - if (partials_ok) { - // wait on this one - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(10) << "attempt_read insufficient partial buffer " << *(i->second) << " c " << c << endl; - i->second->waitfor_read[i->second->start()].push_back(c); - } - partials_ok = false; - } - } - if (!partials_ok) return false; - - // wait on rx? - if (!rx.empty()) { - BufferHead *wait_on = rx.begin()->second; - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(20) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << endl; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(c); - return false; - } - - // yay, we have it all! - // concurrently walk thru hits, partials. - map::iterator h = hits.begin(); - map::iterator p = partials.begin(); - - bl.clear(); - off_t pos = off; - block_t curblock = bstart; - while (curblock <= blast) { - BufferHead *bh = 0; - if (h->first == curblock) { - bh = h->second; - h++; - } else if (p->first == curblock) { - bh = p->second; - p++; - } else assert(0); - - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( pos, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (bh->is_partial()) { - // copy from a partial block. yuck! - bufferlist frag; - bh->copy_partial_substr( start-bhstart, end-bhstart, frag ); - bl.claim_append( frag ); - pos += frag.length(); - } else { - // copy from a full block. 
- if (bhstart == start && bhend == end) { - bl.append( bh->data ); - pos += bh->data.length(); - } else { - bufferlist frag; - dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << endl; - frag.substr_of(bh->data, start-bhstart, end-start); - pos += frag.length(); - bl.claim_append( frag ); - } - } - - curblock = bh->end(); - /* this assert is more trouble than it's worth - assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos || // should be aligned with next block - end != bhend || // or we ended midway through bh - (bh->last() == blast && end == bhend)); // ended last block ** FIXME WRONG??? - */ - } - - assert(bl.length() == len); - return true; -} - - -/* - * is_cached -- query whether a object extent is in our cache - * return value of -1 if onode isn't loaded. otherwise, the number - * of extents that need to be read (i.e. # of seeks) - */ -int Ebofs::is_cached(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - int r = _is_cached(oid, off, len); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_is_cached(object_t oid, off_t off, size_t len) -{ - if (!have_onode(oid)) { - dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... onode " << endl; - return -1; // object dne? - } - Onode *on = get_onode(oid); - - if (!on->have_oc()) { - // nothing is cached. return # of extents in file. - dout(10) << "_is_cached have onode but no object cache, returning extent count" << endl; - return on->extent_map.size(); - } - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - - int num_missing = on->get_oc(&bc)->try_map_read(bstart, blen); - dout(7) << "_is_cached try_map_read reports " << num_missing << " missing extents" << endl; - return num_missing; - - // FIXME: actually, we should calculate if these extents are contiguous. - // and not using map_read, probably... - /* hrmpf - block_t dpos = 0; - block_t opos = bstart; - while (opos < blen) { - if (hits.begin()->first == opos) { - } else { - block_t d; - if (missing.begin()->first == opos) d = missing.begin()->second. - - } - */ -} - -void Ebofs::trim_from_cache(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - _trim_from_cache(oid, off, len); - ebofs_lock.Unlock(); -} - -void Ebofs::_trim_from_cache(object_t oid, off_t off, size_t len) -{ - // be careful not to load it if we don't have it - if (!have_onode(oid)) { - dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << endl; - return; - } - - // ok, we have it, get a pointer. - Onode *on = get_onode(oid); - - if (!on->have_oc()) - return; // nothing is cached. - - // map to blocks - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - - ObjectCache *oc = on->get_oc(&bc); - oc->touch_bottom(bstart, blast); - - return; -} - - -int Ebofs::read(object_t oid, - off_t off, size_t len, - bufferlist& bl) -{ - ebofs_lock.Lock(); - int r = _read(oid, off, len, bl); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) -{ - dout(7) << "_read " << oid << " " << off << "~" << len << endl; - - Onode *on = get_onode(oid); - if (!on) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << endl; - return -ENOENT; // object dne? - } - - // read data into bl. block as necessary. 
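// The _read() path here clamps the request against the object size: a zero
// len means "read to end of object", and reads never extend past object_size.
// The clamp in isolation, with standard types only (illustrative helper name):
#include <algorithm>
#include <cstdint>

// Returns how many bytes will actually be read, or 0 if off is at/past EOF.
uint64_t clamp_read(uint64_t object_size, uint64_t off, uint64_t len) {
  if (off >= object_size)
    return 0;                                   // _read() returns an error here
  uint64_t try_len = len ? len : object_size;   // len == 0: read to EOF
  return std::min(off + try_len, object_size) - off;
}

// clamp_read(10000, 9000, 4096) == 1000; clamp_read(10000, 0, 0) == 10000.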
- Cond cond; - - int r = 0; - while (1) { - // check size bound - if (off >= on->object_size) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << endl; - r = -ESPIPE; // FIXME better errno? - break; - } - - size_t try_len = len ? len:on->object_size; - size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off; - - bool done; - if (attempt_read(on, off, will_read, bl, &cond, &done)) - break; // yay - - // wait - while (!done) - cond.Wait(ebofs_lock); - - if (on->deleted) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << endl; - r = -ENOENT; - break; - } - } - - put_onode(on); - - trim_bc(); - - if (r < 0) return r; // return error, - dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << endl; - return bl.length(); // or bytes read. -} - - -bool Ebofs::_write_will_block() -{ - return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty); -} - -bool Ebofs::write_will_block() -{ - ebofs_lock.Lock(); - bool b = _write_will_block(); - ebofs_lock.Unlock(); - return b; -} - - -unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(7) << "apply_transaction start (" << t.ops.size() << " ops)" << endl; - - unsigned r = _apply_transaction(t); - - // journal, wait for commit - if (r != 0 && onsafe) { - delete onsafe; // kill callback, but still journal below (in case transaction had side effects) - onsafe = 0; - } - while (1) { - if (journal) { - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - - ebofs_lock.Unlock(); - return r; -} - -unsigned Ebofs::_apply_transaction(Transaction& t) -{ - // do ops - unsigned r = 0; // bit fields indicate which ops failed. 
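// _apply_transaction() walks the op list and records which ops failed in a
// bit field, shifting `bit` left after each op.  Note the ops below combine
// failures with `r &= bit`; with r initialised to 0 that expression can never
// set a bit, so this standalone sketch uses `|=`, which is what the comment
// "bit fields indicate which ops failed" describes.  The Op/run_op names are
// illustrative, not the ebofs Transaction API.
#include <vector>

enum class Op { Write, Remove, SetAttr };

static bool run_op(Op op) {           // pretend executor: Remove fails
  return op != Op::Remove;
}

unsigned apply_ops(const std::vector<Op>& ops) {
  unsigned r = 0;                     // bit i set => op i failed
  unsigned bit = 1;
  for (Op op : ops) {
    if (!run_op(op))
      r |= bit;                       // record this op's failure
    bit <<= 1;                        // next op gets the next bit
  }
  return r;
}

// apply_ops({Op::Write, Op::Remove, Op::SetAttr}) returns 0x2: only op #1 failed.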
- int bit = 1; - for (list::iterator p = t.ops.begin(); - p != t.ops.end(); - p++) { - switch (*p) { - case Transaction::OP_READ: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); - if (_read(oid, offset, len, *pbl) < 0) { - dout(7) << "apply_transaction fail on _read" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_STAT: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - struct stat *st = t.psts.front(); t.psts.pop_front(); - if (_stat(oid, st) < 0) { - dout(7) << "apply_transaction fail on _stat" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); - if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) { - dout(7) << "apply_transaction fail on _getattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); - if (_getattrs(oid, *pset) < 0) { - dout(7) << "apply_transaction fail on _getattrs" << endl; - r &= bit; - } - } - break; - - - case Transaction::OP_WRITE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist bl = t.bls.front(); t.bls.pop_front(); - if (_write(oid, offset, len, bl) < 0) { - dout(7) << "apply_transaction fail on _write" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - _trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t len = t.offsets.front(); t.offsets.pop_front(); - if (_truncate(oid, len) < 0) { - dout(7) << "apply_transaction fail on _truncate" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_remove(oid) < 0) { - dout(7) << "apply_transaction fail on _remove" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) { - dout(7) << "apply_transaction fail on _setattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); - if (_setattrs(oid, *pattrset) < 0) { - dout(7) << "apply_transaction fail on _setattrs" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - if (_rmattr(oid, attrname) < 0) { - dout(7) << "apply_transaction fail on _rmattr" << endl; - r &= bit; - } - } - break; - - case 
Transaction::OP_CLONE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - object_t noid = t.oids.front(); t.oids.pop_front(); - if (_clone(oid, noid) < 0) { - dout(7) << "apply_transaction fail on _clone" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - if (_create_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _create_collection" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - if (_destroy_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _destroy_collection" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_collection_add(cid, oid) < 0) { - //dout(7) << "apply_transaction fail on _collection_add" << endl; - //r &= bit; - } - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_collection_remove(cid, oid) < 0) { - dout(7) << "apply_transaction fail on _collection_remove" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) { - //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) { - dout(7) << "apply_transaction fail on _collection_setattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - if (_collection_rmattr(cid, attrname) < 0) { - dout(7) << "apply_transaction fail on _collection_rmattr" << endl; - r &= bit; - } - } - break; - - default: - cerr << "bad op " << *p << endl; - assert(0); - } - - bit = bit << 1; - } - - dout(7) << "_apply_transaction finish (r = " << r << ")" << endl; - return r; -} - - - -int Ebofs::_write(object_t oid, off_t offset, size_t length, const bufferlist& bl) -{ - dout(7) << "_write " << oid << " " << offset << "~" << length << endl; - assert(bl.length() == length); - - // too much unflushed dirty data? (if so, block!) - if (_write_will_block()) { - dout(10) << "_write blocking " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - - while (_write_will_block()) - bc.waitfor_stat(); // waits on ebofs_lock - - dout(10) << "_write unblocked " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - } - - // out of space? 
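// _write() above throttles writers: while the buffer cache holds more dirty
// plus in-flight data than a configured limit, the caller blocks until the
// flusher makes progress.  A minimal sketch of that back-pressure, with
// illustrative names in place of the ebofs BufferCache statistics.
#include <condition_variable>
#include <cstdint>
#include <mutex>

class DirtyThrottle {
  std::mutex lock;
  std::condition_variable changed;
  uint64_t dirty = 0, tx = 0;
  const uint64_t max_dirty;

public:
  explicit DirtyThrottle(uint64_t max) : max_dirty(max) {}

  bool would_block() {                         // like _write_will_block()
    std::lock_guard<std::mutex> l(lock);
    return dirty + tx > max_dirty;
  }
  void wait_for_room() {                       // the blocking loop in _write()
    std::unique_lock<std::mutex> l(lock);
    while (dirty + tx > max_dirty)
      changed.wait(l);                         // like bc.waitfor_stat()
  }
  void add_dirty(uint64_t n) { std::lock_guard<std::mutex> l(lock); dirty += n; }
  void start_tx(uint64_t n)  { std::lock_guard<std::mutex> l(lock); dirty -= n; tx += n; }
  void finish_tx(uint64_t n) {                 // a write completed: make room
    std::lock_guard<std::mutex> l(lock);
    tx -= n;
    changed.notify_all();
  }
};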
- unsigned max = (length+offset) / EBOFS_BLOCK_SIZE + 10; // very conservative; assumes we have to rewrite - max += dirty_onodes.size() + dirty_cnodes.size(); - if (max >= free_blocks) { - dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << max << endl; - return -ENOSPC; - } - - // get|create inode - Onode *on = get_onode(oid); - if (!on) on = new_onode(oid); // new inode! - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - dirty_onode(on); // dirty onode! - - // apply write to buffer cache - if (length > 0) - apply_write(on, offset, length, bl); - - // done. - put_onode(on); - trim_bc(); - - return length; -} - - -int Ebofs::write(object_t oid, - off_t off, size_t len, - const bufferlist& bl, Context *onsafe) -{ - ebofs_lock.Lock(); - assert(len > 0); - - // go - int r = _write(oid, off, len, bl); - - // commit waiter - if (r > 0) { - assert((size_t)r == len); - while (1) { - if (journal) { - Transaction t; - t.write(oid, off, len, bl); - bufferlist tbl; - t._encode(tbl); - if (journal->submit_entry(tbl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::_remove(object_t oid) -{ - dout(7) << "_remove " << oid << endl; - - // get inode - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ok remove it! - remove_onode(on); - - return 0; -} - - -int Ebofs::remove(object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - // do it - int r = _remove(oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.remove(oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_truncate(object_t oid, off_t size) -{ - dout(7) << "_truncate " << oid << " size " << size << endl; - - Onode *on = get_onode(oid); - if (!on) - return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - int r = 0; - if (size > on->object_size) { - r = -EINVAL; // whatever - } - else if (size < on->object_size) { - // change size - on->object_size = size; - dirty_onode(on); - - // free blocks - block_t nblocks = 0; - if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE; - if (on->object_blocks > nblocks) { - vector extra; - on->truncate_extents(nblocks, extra); - for (unsigned i=0; ioc) { - on->oc->truncate(on->object_blocks, super_epoch); - if (on->oc->is_empty()) - on->close_oc(); - } - - // update uncommitted - interval_set uncom; - if (nblocks > 0) { - interval_set left; - left.insert(0, nblocks); - uncom.intersection_of(left, on->uncommitted); - } - dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << endl; - on->uncommitted = uncom; - - } - else { - assert(size == on->object_size); - } - - put_onode(on); - return r; -} - - -int Ebofs::truncate(object_t oid, off_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _truncate(oid, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.truncate(oid, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - - -int Ebofs::clone(object_t 
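// write(), remove(), truncate() and the other public calls above all finish
// with the same completion pattern: encode the operation, hand it to the
// journal if one is configured and it accepts the entry, and otherwise park
// the callback on the waiters list for the current commit epoch.  A compact
// sketch of just that decision; the Journal type here is an illustrative
// stand-in, not the ebofs FileJournal interface.
#include <cstdint>
#include <functional>
#include <list>
#include <map>
#include <vector>

using Callback = std::function<void()>;

struct Journal {
  bool accepting = true;
  // Returns true if the entry (and its callback) were queued for the journal.
  bool submit_entry(const std::vector<char>& /*entry*/, Callback /*onsafe*/) {
    return accepting;               // a full or stopped journal would say no
  }
};

void complete_op(Journal* journal,
                 uint64_t super_epoch,
                 std::map<uint64_t, std::list<Callback>>& commit_waiters,
                 const std::vector<char>& encoded,
                 Callback onsafe) {
  if (!onsafe) return;
  if (journal && journal->submit_entry(encoded, onsafe))
    return;                                        // journal calls it when safe
  commit_waiters[super_epoch].push_back(onsafe);   // else wait for next commit
}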
from, object_t to, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _clone(from, to); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.clone(from, to); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_clone(object_t from, object_t to) -{ - dout(7) << "_clone " << from << " -> " << to << endl; - - if (!g_conf.ebofs_cloneable) - return -1; // no! - - Onode *fon = get_onode(from); - if (!fon) return -ENOENT; - Onode *ton = get_onode(to); - if (ton) { - put_onode(fon); - put_onode(ton); - return -EEXIST; - } - ton = new_onode(to); - assert(ton); - - // copy easy bits - ton->readonly = true; - ton->object_size = fon->object_size; - ton->object_blocks = fon->object_blocks; - ton->attr = fon->attr; - - // collections - for (set::iterator p = fon->collections.begin(); - p != fon->collections.end(); - p++) - _collection_add(*p, to); - - // extents - ton->extent_map = fon->extent_map; - for (map::iterator p = ton->extent_map.begin(); - p != ton->extent_map.end(); - ++p) { - allocator.alloc_inc(p->second); - } - - // clear uncommitted - fon->uncommitted.clear(); - - // muck with ObjectCache - if (fon->oc) - fon->oc->clone_to( ton ); - - // ok! - put_onode(ton); - put_onode(fon); - return 0; -} - - - - -/* - * pick object revision with rev < specified rev. - * (oid.rev is a noninclusive upper bound.) - * - */ -int Ebofs::pick_object_revision_lt(object_t& oid) -{ - assert(oid.rev > 0); // this is only useful for non-zero oid.rev - - int r = -EEXIST; // return code - ebofs_lock.Lock(); - { - object_t orig = oid; - object_t live = oid; - live.rev = 0; - - if (object_tab->get_num_keys() > 0) { - Table::Cursor cursor(object_tab); - - object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev - if (cursor.move_left() > 0) { - bool firstpass = true; - while (1) { - object_t t = cursor.current().key; - if (t.ino != oid.ino || - t.bno != oid.bno) // passed to previous object - break; - if (oid.rev < t.rev) { // rev < desired. possible match. - r = 0; - oid = t; - break; - } - if (firstpass && oid.rev >= t.rev) { // there is no old rev < desired. try live. - r = 0; - oid = live; - break; - } - if (cursor.move_left() <= 0) break; - firstpass = false; - } - } - } - - dout(8) << "find_object_revision " << orig << " -> " << oid - << " r=" << r << endl; - } - ebofs_lock.Unlock(); - return r; -} - - - - -bool Ebofs::exists(object_t oid) -{ - ebofs_lock.Lock(); - dout(8) << "exists " << oid << endl; - bool e = (object_tab->lookup(oid) == 0); - ebofs_lock.Unlock(); - return e; -} - -int Ebofs::stat(object_t oid, struct stat *st) -{ - ebofs_lock.Lock(); - int r = _stat(oid,st); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_stat(object_t oid, struct stat *st) -{ - dout(7) << "_stat " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ?? 
- st->st_size = on->object_size; - - put_onode(on); - return 0; -} - - -int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size) -{ - dout(8) << "setattr " << oid << " '" << name << "' len " << size << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr[n] = buffer::copy((char*)value, size); - dirty_onode(on); - put_onode(on); - - dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << endl; - - return 0; -} - -int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattr(oid, name, value, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.setattr(oid, name, value, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_setattrs(object_t oid, map& attrset) -{ - dout(8) << "setattrs " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - on->attr = attrset; - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::setattrs(object_t oid, map& attrset, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattrs(oid, attrset); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.setattrs(oid, attrset); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - int r = _getattr(oid, name, value, size); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size) -{ - dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - string n(name); - int r = 0; - if (on->attr.count(n) == 0) { - dout(10) << "_getattr " << oid << " '" << name << "' dne" << endl; - r = -1; - } else { - r = MIN( on->attr[n].length(), size ); - dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << endl; - memcpy(value, on->attr[n].c_str(), r ); - } - put_onode(on); - return r; -} - -int Ebofs::getattrs(object_t oid, map &aset) -{ - ebofs_lock.Lock(); - int r = _getattrs(oid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattrs(object_t oid, map &aset) -{ - dout(8) << "_getattrs " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - aset = on->attr; - put_onode(on); - return 0; -} - - - -int Ebofs::_rmattr(object_t oid, const char *name) -{ - dout(8) << "_rmattr " << oid << " '" << name << "'" << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr.erase(n); - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _rmattr(oid, name); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - 
Transaction t; - t.rmattr(oid, name); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::listattr(object_t oid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(8) << "listattr " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_onode(on); - ebofs_lock.Unlock(); - return 0; -} - - - -/***************** collections ******************/ - -int Ebofs::list_collections(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_collections " << endl; - - Table::Cursor cursor(collection_tab); - - int num = 0; - if (collection_tab->find(0, cursor) >= 0) { - while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - -int Ebofs::_create_collection(coll_t cid) -{ - dout(9) << "_create_collection " << hex << cid << dec << endl; - - if (_collection_exists(cid)) - return -EEXIST; - - Cnode *cn = new_cnode(cid); - put_cnode(cn); - - return 0; -} - -int Ebofs::create_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _create_collection(cid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.create_collection(cid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_destroy_collection(coll_t cid) -{ - dout(9) << "_destroy_collection " << hex << cid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Cnode *cn = get_cnode(cid); - assert(cn); - - // hose mappings - list objects; - collection_list(cid, objects); - for (list::iterator i = objects.begin(); - i != objects.end(); - i++) { - co_tab->remove(coll_object_t(cid,*i)); - - Onode *on = get_onode(*i); - if (on) { - on->collections.erase(cid); - dirty_onode(on); - put_onode(on); - } - } - - remove_cnode(cn); - return 0; -} - -int Ebofs::destroy_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _destroy_collection(cid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.remove_collection(cid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -bool Ebofs::collection_exists(coll_t cid) -{ - ebofs_lock.Lock(); - dout(10) << "collection_exists " << hex << cid << dec << endl; - bool r = _collection_exists(cid); - ebofs_lock.Unlock(); - return r; -} -bool Ebofs::_collection_exists(coll_t cid) -{ - return (collection_tab->lookup(cid) == 0); -} - -int Ebofs::_collection_add(coll_t cid, object_t oid) -{ - dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid) == 0) { - on->collections.insert(cid); - dirty_onode(on); - 
co_tab->insert(coll_object_t(cid,oid), true); - } else { - r = -ENOENT; // FIXME? already in collection. - } - - put_onode(on); - return r; -} - -int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_add(cid, oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_add(cid, oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::_collection_remove(coll_t cid, object_t oid) -{ - dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid)) { - on->collections.erase(cid); - dirty_onode(on); - co_tab->remove(coll_object_t(cid,oid)); - } else { - r = -ENOENT; // FIXME? - } - - put_onode(on); - return r; -} - -int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_remove(cid, oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_remove(cid, oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_list(coll_t cid, list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "collection_list " << hex << cid << dec << endl; - - if (!_collection_exists(cid)) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - Table::Cursor cursor(co_tab); - - int num = 0; - if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) { - while (1) { - const coll_t c = cursor.current().key.first; - const object_t o = cursor.current().key.second; - if (c != cid) break; // end! 
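// Collection membership above is kept both ways: the onode records which
// collections it belongs to, and co_tab maps (collection, object) pairs so a
// whole collection can be listed with one ordered range scan that stops when
// the collection id changes.  A std::set sketch of that index; coll_t and
// object_t are reduced to plain integers here for illustration.
#include <cstdint>
#include <set>
#include <utility>
#include <vector>

using coll_t = uint32_t;
using object_t = uint64_t;

class CollectionIndex {
  std::set<std::pair<coll_t, object_t>> co;   // ordered by (cid, oid)

public:
  void add(coll_t c, object_t o)    { co.insert({c, o}); }
  void remove(coll_t c, object_t o) { co.erase({c, o}); }

  // List every object in collection c: seek to (c, 0) and walk until the
  // collection id changes, like the cursor loop in collection_list().
  std::vector<object_t> list(coll_t c) const {
    std::vector<object_t> out;
    for (auto it = co.lower_bound({c, 0}); it != co.end() && it->first == c; ++it)
      out.push_back(it->second);
    return out;
  }
};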
- dout(10) << "collection_list " << hex << cid << " includes " << o << dec << endl; - ls.push_back(o); - num++; - if (cursor.move_right() < 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size) -{ - dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr[n] = buffer::copy((char*)value, size); - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl; - - int r = _collection_setattr(cid, name, value, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_setattr(cid, name, value, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' maxlen " << size << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - string n(name); - int r; - if (cn->attr.count(n) == 0) { - r = -1; - } else { - r = MIN( cn->attr[n].length(), size ); - memcpy(value, cn->attr[n].c_str(), r); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_collection_rmattr(coll_t cid, const char *name) -{ - dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr.erase(n); - - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_rmattr(cid, name); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_rmattr(cid, name); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_listattr(coll_t cid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(10) << "collection_listattr " << hex << cid << dec << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::_export_freelist(bufferlist& bl) -{ - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - } else { - tab = limbo_tab; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - Extent ex(cursor.current().key, cursor.current().value); - dout(10) << "_export_freelist " << ex << endl; - bl.append((char*)&ex, 
sizeof(ex)); - if (cursor.move_right() <= 0) break; - } - } - } -} - -void Ebofs::_import_freelist(bufferlist& bl) -{ - // clear - for (int b=0; bclear(); - limbo_tab->clear(); - - // import! - int num = bl.length() / sizeof(Extent); - Extent *p = (Extent*)bl.c_str(); - for (int i=0; i *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << endl; - } else { - tab = limbo_tab; - dout(30) << "dump limbo " << tab->get_num_keys() << endl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - block_t l = cursor.current().value; - tfree += l; - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.free_extent_dist[b]++; - st.free_extent_dist_sum[b] += cursor.current().value; - st.num_free_extent++; - - if (cursor.move_right() <= 0) break; - } - } - } - st.avg_free_extent = tfree / st.num_free_extent; -*/ - - // used extents is harder. :( - st.num_extent = 0; - st.avg_extent = 0; - st.extent_dist.clear(); - st.extent_dist_sum.clear(); - st.avg_extent_per_object = 0; - st.avg_extent_jump = 0; - - Table::Cursor cursor(object_tab); - object_tab->find(object_t(), cursor); - int nobj = 0; - int njump = 0; - while (object_tab->get_num_keys() > 0) { - Onode *on = get_onode(cursor.current().key); - assert(on); - - nobj++; - st.avg_extent_per_object += on->extent_map.size(); - - for (map::iterator p = on->extent_map.begin(); - p != on->extent_map.end(); - p++) { - block_t l = p->second.length; - - st.num_extent++; - st.avg_extent += l; - if (p->first > 0) { - njump++; - st.avg_extent_jump += l; - } - - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.extent_dist[b]++; - st.extent_dist_sum[b] += p->second.length; - } - put_onode(on); - if (cursor.move_right() <= 0) break; - } - if (njump) st.avg_extent_jump /= njump; - if (nobj) st.avg_extent_per_object /= (float)nobj; - if (st.num_extent) st.avg_extent /= st.num_extent; - - ebofs_lock.Unlock(); -} diff --git a/branches/sage/pgs/ebofs/Ebofs.h b/branches/sage/pgs/ebofs/Ebofs.h deleted file mode 100644 index 4d95a71f77e4e..0000000000000 --- a/branches/sage/pgs/ebofs/Ebofs.h +++ /dev/null @@ -1,360 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "include/buffer.h" - -#include "types.h" -#include "Onode.h" -#include "Cnode.h" -#include "BlockDevice.h" -#include "nodes.h" -#include "Allocator.h" -#include "Table.h" -#include "Journal.h" - -#include "common/Mutex.h" -#include "common/Cond.h" - -#include "osd/ObjectStore.h" - -//typedef pair object_coll_t; -typedef pair coll_object_t; - - -class Ebofs : public ObjectStore { -protected: - Mutex ebofs_lock; // a beautiful global lock - - // ** debuggy ** - bool fake_writes; - - // ** super ** -public: - BlockDevice dev; -protected: - bool mounted, unmounting, dirty; - bool readonly; - version_t super_epoch; - bool commit_thread_started, mid_commit; - Cond commit_cond; // to wake up the commit thread - Cond sync_cond; - uint64_t super_fsid; - - map > commit_waiters; - - void prepare_super(version_t epoch, bufferptr& bp); - void write_super(version_t epoch, bufferptr& bp); - int commit_thread_entry(); - - class CommitThread : public Thread { - Ebofs *ebofs; - public: - CommitThread(Ebofs *e) : ebofs(e) {} - void *entry() { - ebofs->commit_thread_entry(); - return 0; - } - } commit_thread; - -public: - uint64_t get_fsid() { return super_fsid; } - epoch_t get_super_epoch() { return super_epoch; } -protected: - - - // ** journal ** - char *journalfn; - Journal *journal; - - // ** allocator ** - block_t free_blocks, limbo_blocks; - Allocator allocator; - friend class Allocator; - - block_t get_free_blocks() { return free_blocks; } - block_t get_limbo_blocks() { return limbo_blocks; } - block_t get_free_extents() { - int n = 0; - for (int i=0; iget_num_keys(); - return n; - } - block_t get_limbo_extents() { return limbo_tab->get_num_keys(); } - - - // ** tables and sets ** - // nodes - NodePool nodepool; // for all tables... - - // tables - Table *object_tab; - Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; - Table *limbo_tab; - Table > *alloc_tab; - - // collections - Table *collection_tab; - Table *co_tab; - - void close_tables(); - - - // ** onodes ** - hash_map onode_map; // onode cache - LRU onode_lru; - set dirty_onodes; - map > waitfor_onode; - - Onode* new_onode(object_t oid); // make new onode. ref++. - bool have_onode(object_t oid) { - return onode_map.count(oid); - } - Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. - void remove_onode(Onode *on); - void put_onode(Onode* o); // put it back down. ref--. 
- void dirty_onode(Onode* o); - void encode_onode(Onode *on, bufferlist& bl, unsigned& off); - void write_onode(Onode *on); - - // ** cnodes ** - hash_map cnode_map; - LRU cnode_lru; - set dirty_cnodes; - map > waitfor_cnode; - - Cnode* new_cnode(coll_t cid); - Cnode* get_cnode(coll_t cid); - void remove_cnode(Cnode *cn); - void put_cnode(Cnode *cn); - void dirty_cnode(Cnode *cn); - void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off); - void write_cnode(Cnode *cn); - - // ** onodes+cnodes = inodes ** - int inodes_flushing; - Cond inode_commit_cond; - - void flush_inode_finish(); - void commit_inodes_start(); - void commit_inodes_wait(); - friend class C_E_InodeFlush; - - void trim_inodes(int max = -1); - - // ** buffer cache ** - BufferCache bc; - pthread_t flushd_thread_id; - - version_t trigger_commit(); - void commit_bc_wait(version_t epoch); - void trim_bc(off_t max = -1); - - public: - void kick_idle(); - void sync(); - void sync(Context *onsafe); - void trim_buffer_cache(); - - class IdleKicker : public BlockDevice::kicker { - Ebofs *ebo; - public: - IdleKicker(Ebofs *t) : ebo(t) {} - void kick() { ebo->kick_idle(); } - } idle_kicker; - - - protected: - //void zero(Onode *on, size_t len, off_t off, off_t write_thru); - void alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& old_blast); - void apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl); - bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool); - - // ** finisher ** - // async write notification to users - Mutex finisher_lock; - Cond finisher_cond; - bool finisher_stop; - list finisher_queue; - -public: - void queue_finisher(Context *c) { - finisher_lock.Lock(); - finisher_queue.push_back(c); - finisher_cond.Signal(); - finisher_lock.Unlock(); - } - void queue_finishers(list& ls) { - finisher_lock.Lock(); - finisher_queue.splice(finisher_queue.end(), ls); - finisher_cond.Signal(); - finisher_lock.Unlock(); - } -protected: - - void *finisher_thread_entry(); - class FinisherThread : public Thread { - Ebofs *ebofs; - public: - FinisherThread(Ebofs *e) : ebofs(e) {} - void* entry() { return (void*)ebofs->finisher_thread_entry(); } - } finisher_thread; - - - void alloc_more_node_space(); - - void do_csetattrs(map > > &cmods); - void do_setattrs(Onode *on, map > &setattrs); - - - public: - Ebofs(char *devfn, char *jfn=0) : - fake_writes(false), - dev(devfn), - mounted(false), unmounting(false), dirty(false), readonly(false), - super_epoch(0), commit_thread_started(false), mid_commit(false), - commit_thread(this), - journalfn(jfn), journal(0), - free_blocks(0), limbo_blocks(0), - allocator(this), - nodepool(ebofs_lock), - object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0), - onode_lru(g_conf.ebofs_oc_size), - cnode_lru(g_conf.ebofs_cc_size), - inodes_flushing(0), - bc(dev, ebofs_lock), - idle_kicker(this), - finisher_stop(false), finisher_thread(this) { - for (int i=0; i& attrset, Context *onsafe=0); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map &aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - int listattr(object_t oid, vector& attrs); - - // collections - int list_collections(list& ls); - bool collection_exists(coll_t c); - - int create_collection(coll_t c, Context *onsafe); - int destroy_collection(coll_t c, Context *onsafe); - int collection_add(coll_t c, object_t o, Context *onsafe); - int 
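// The finisher members above follow a common pattern: completion callbacks
// are never run from the I/O or commit path; they are pushed onto a queue and
// a dedicated thread drains it.  A self-contained sketch of such a finisher
// using std::thread; the class name and shutdown details are illustrative.
#include <condition_variable>
#include <functional>
#include <list>
#include <mutex>
#include <thread>

class Finisher {
  std::mutex lock;
  std::condition_variable cond;
  std::list<std::function<void()>> queue;
  bool stopping = false;
  std::thread worker;

  void run() {
    std::unique_lock<std::mutex> l(lock);
    while (!stopping || !queue.empty()) {
      if (queue.empty()) {
        cond.wait(l);
        continue;
      }
      std::list<std::function<void()>> batch;
      batch.swap(queue);            // take the whole queue in one go
      l.unlock();
      for (auto& fn : batch) fn();  // run callbacks without holding the lock
      l.lock();
    }
  }

public:
  Finisher() : worker([this] { run(); }) {}
  ~Finisher() {
    { std::lock_guard<std::mutex> l(lock); stopping = true; }
    cond.notify_all();
    worker.join();
  }
  void queue_callback(std::function<void()> fn) {   // like queue_finisher()
    { std::lock_guard<std::mutex> l(lock); queue.push_back(std::move(fn)); }
    cond.notify_all();
  }
};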
collection_remove(coll_t c, object_t o, Context *onsafe); - - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t oid, const char *name, const void *value, size_t size, Context *onsafe); - int collection_getattr(coll_t oid, const char *name, void *value, size_t size); - int collection_rmattr(coll_t cid, const char *name, Context *onsafe); - int collection_listattr(coll_t oid, vector& attrs); - - // maps - int map_lookup(object_t o, bufferlist& key, bufferlist& val); - int map_insert(object_t o, bufferlist& key, bufferlist& val); - int map_remove(object_t o, bufferlist& key); - int map_list(object_t o, list& keys); - int map_list(object_t o, map& vals); - int map_list(object_t o, - bufferlist& start, bufferlist& end, - map& vals); - - // crap - void _fake_writes(bool b) { fake_writes = b; } - void _get_frag_stat(FragmentationStat& st); - - void _import_freelist(bufferlist& bl); - void _export_freelist(bufferlist& bl); - - -private: - // private interface -- use if caller already holds lock - unsigned _apply_transaction(Transaction& t); - - int _read(object_t oid, off_t off, size_t len, bufferlist& bl); - int _is_cached(object_t oid, off_t off, size_t len); - int _stat(object_t oid, struct stat *st); - int _getattr(object_t oid, const char *name, void *value, size_t size); - int _getattrs(object_t oid, map &aset); - - bool _write_will_block(); - int _write(object_t oid, off_t off, size_t len, const bufferlist& bl); - void _trim_from_cache(object_t oid, off_t off, size_t len); - int _truncate(object_t oid, off_t size); - int _truncate_front(object_t oid, off_t size); - int _remove(object_t oid); - int _clone(object_t from, object_t to); - int _setattr(object_t oid, const char *name, const void *value, size_t size); - int _setattrs(object_t oid, map& attrset); - int _rmattr(object_t oid, const char *name); - bool _collection_exists(coll_t c); - int _create_collection(coll_t c); - int _destroy_collection(coll_t c); - int _collection_add(coll_t c, object_t o); - int _collection_remove(coll_t c, object_t o); - int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size); - int _collection_rmattr(coll_t cid, const char *name); - - -}; diff --git a/branches/sage/pgs/ebofs/FileJournal.cc b/branches/sage/pgs/ebofs/FileJournal.cc deleted file mode 100644 index 40a73a442182d..0000000000000 --- a/branches/sage/pgs/ebofs/FileJournal.cc +++ /dev/null @@ -1,456 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
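The private `_`-prefixed calls above are marked "use if caller already holds lock": each public ObjectStore entry point is a thin wrapper that takes ebofs_lock and then calls the underscore variant, so internal code (e.g. applying a transaction) can compose several operations under one lock. A minimal sketch of that wrapper pattern, with hypothetical names and std::mutex standing in for the Mutex class:

  #include <mutex>

  class Store {
    std::mutex lock;                  // stands in for ebofs_lock
    int _remove(int oid) {            // caller must already hold `lock`
      (void)oid;
      // ... work against in-memory structures ...
      return 0;
    }
  public:
    int remove(int oid) {
      std::lock_guard<std::mutex> l(lock);
      return _remove(oid);            // _ops can be chained while locked
    }
  };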
- * - */ - -#include "FileJournal.h" -#include "Ebofs.h" - -#include -#include -#include -#include - - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << ebofs->dev.get_device_name() << ").journal " -#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << ebofs->dev.get_device_name() << ").journal " - - -int FileJournal::create() -{ - dout(1) << "create " << fn << endl; - - // open/create - fd = ::open(fn.c_str(), O_RDWR|O_SYNC); - if (fd < 0) { - dout(1) << "create failed " << errno << " " << strerror(errno) << endl; - return -errno; - } - assert(fd > 0); - - //::ftruncate(fd, 0); - //::fchmod(fd, 0644); - - // get size - struct stat st; - ::fstat(fd, &st); - dout(1) << "open " << fn << " " << st.st_size << " bytes" << endl; - - // write empty header - memset(&header, 0, sizeof(header)); - header.clear(); - header.fsid = ebofs->get_fsid(); - header.max_size = st.st_size; - write_header(); - - // writeable. - read_pos = 0; - write_pos = queue_pos = sizeof(header); - - ::close(fd); - - return 0; -} - -int FileJournal::open() -{ - //dout(1) << "open " << fn << endl; - - // open and file - assert(fd == 0); - fd = ::open(fn.c_str(), O_RDWR|O_SYNC); - if (fd < 0) { - dout(1) << "open failed " << errno << " " << strerror(errno) << endl; - return -errno; - } - assert(fd > 0); - - // assume writeable, unless... - read_pos = 0; - write_pos = queue_pos = sizeof(header); - - // read header? - read_header(); - if (header.num > 0 && header.fsid == ebofs->get_fsid()) { - // valid header, pick an offset - for (int i=0; iget_super_epoch()) { - dout(2) << "using read_pos header pointer " - << header.epoch[i] << " at " << header.offset[i] - << endl; - read_pos = header.offset[i]; - write_pos = queue_pos = 0; - break; - } - else if (header.epoch[i] < ebofs->get_super_epoch()) { - dout(2) << "super_epoch is " << ebofs->get_super_epoch() - << ", skipping old " << header.epoch[i] << " at " << header.offset[i] - << endl; - } - else if (header.epoch[i] > ebofs->get_super_epoch()) { - dout(2) << "super_epoch is " << ebofs->get_super_epoch() - << ", but wtf, journal is later " << header.epoch[i] << " at " << header.offset[i] - << endl; - break; - } - } - } - - start_writer(); - - return 0; -} - -void FileJournal::close() -{ - dout(1) << "close " << fn << endl; - - // stop writer thread - stop_writer(); - - // close - assert(writeq.empty()); - assert(commitq.empty()); - assert(fd > 0); - ::close(fd); - fd = 0; -} - -void FileJournal::start_writer() -{ - write_stop = false; - write_thread.create(); -} - -void FileJournal::stop_writer() -{ - write_lock.Lock(); - { - write_stop = true; - write_cond.Signal(); - } - write_lock.Unlock(); - write_thread.join(); -} - - -void FileJournal::print_header() -{ - for (int i=0; i::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. - ::write(fd, (char*)(*it).c_str(), (*it).length() ); - } - - ::write(fd, &h, sizeof(h)); - - // move position pointer - write_pos += 2*sizeof(entry_header_t) + bl.length(); - - if (oncommit) { - if (1) { - // queue callback - ebofs->queue_finisher(oncommit); - } else { - // callback now - oncommit->finish(0); - delete oncommit; - } - } - } - } - - write_lock.Unlock(); - dout(10) << "write_thread_entry finish" << endl; -} - -bool FileJournal::submit_entry(bufferlist& e, Context *oncommit) -{ - assert(queue_pos != 0); // bad create(), or journal didn't replay to completion. 
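The write path above advances write_pos by 2*sizeof(entry_header_t) + bl.length(), i.e. each journal entry carries an entry header on both sides of its payload. A small sketch of that size bookkeeping; the header fields shown are placeholders, not the real entry_header_t layout.

  #include <sys/types.h>
  #include <cstddef>
  #include <cstdint>

  // Placeholder for ebofs' entry_header_t; only its role as a fixed-size
  // frame matters here, not its actual fields.
  struct entry_header_t {
    uint64_t epoch;
    uint64_t len;
  };

  // On-disk footprint of one journal entry: leading header + payload +
  // trailing header, matching the "2*sizeof(entry_header_t) + bl.length()"
  // step used to advance write_pos / queue_pos.
  inline off_t entry_size(std::size_t payload_len) {
    return off_t(2 * sizeof(entry_header_t) + payload_len);
  }

  // e.g. when an entry is accepted:  queue_pos += entry_size(e.length());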
- - // ** lock ** - Mutex::Locker locker(write_lock); - - // wrap? full? - off_t size = 2*sizeof(entry_header_t) + e.length(); - - if (full) return false; // already marked full. - - if (header.wrap) { - // we're wrapped. don't overwrite ourselves. - if (queue_pos + size >= header.offset[0]) { - dout(10) << "submit_entry JOURNAL FULL (and wrapped), " << queue_pos << "+" << size - << " >= " << header.offset[0] - << endl; - full = true; - print_header(); - return false; - } - } else { - // we haven't wrapped. - if (queue_pos + size >= header.max_size) { - // is there room if we wrap? - if ((off_t)sizeof(header_t) + size < header.offset[0]) { - // yes! - dout(10) << "submit_entry wrapped from " << queue_pos << " to " << sizeof(header_t) << endl; - header.wrap = queue_pos; - queue_pos = sizeof(header_t); - header.push(ebofs->get_super_epoch(), queue_pos); - } else { - // no room. - dout(10) << "submit_entry JOURNAL FULL (and can't wrap), " << queue_pos << "+" << size - << " >= " << header.max_size - << endl; - full = true; - return false; - } - } - } - - dout(10) << "submit_entry " << queue_pos << " : " << e.length() - << " epoch " << ebofs->get_super_epoch() - << " " << oncommit << endl; - - // dump on queue - writeq.push_back(pair(ebofs->get_super_epoch(), e)); - commitq.push_back(oncommit); - - queue_pos += size; - - // kick writer thread - write_cond.Signal(); - - return true; -} - - -void FileJournal::commit_epoch_start() -{ - dout(10) << "commit_epoch_start on " << ebofs->get_super_epoch()-1 - << " -- new epoch " << ebofs->get_super_epoch() - << endl; - - Mutex::Locker locker(write_lock); - - // was full -> empty -> now usable? - if (full) { - if (header.num != 0) { - dout(1) << " journal FULL, ignoring this epoch" << endl; - return; - } - - dout(1) << " clearing FULL flag, journal now usable" << endl; - full = false; - } - - // note epoch boundary - header.push(ebofs->get_super_epoch(), queue_pos); // note: these entries may not yet be written. - //write_header(); // no need to write it now, though... -} - -void FileJournal::commit_epoch_finish() -{ - dout(10) << "commit_epoch_finish committed " << ebofs->get_super_epoch()-1 << endl; - - write_lock.Lock(); - { - if (full) { - // full journal damage control. - dout(15) << " journal was FULL, contents now committed, clearing header. journal still not usable until next epoch." << endl; - header.clear(); - write_pos = queue_pos = sizeof(header_t); - } else { - // update header -- trim/discard old (committed) epochs - while (header.epoch[0] < ebofs->get_super_epoch()) - header.pop(); - } - write_header(); - - // discard any unwritten items in previous epoch, and do callbacks - epoch_t epoch = ebofs->get_super_epoch(); - list callbacks; - while (!writeq.empty() && writeq.front().first < epoch) { - dout(15) << " dropping unwritten and committed " - << write_pos << " : " << writeq.front().second.length() - << " epoch " << writeq.front().first - << endl; - // finisher? - Context *oncommit = commitq.front(); - if (oncommit) callbacks.push_back(oncommit); - - write_pos += 2*sizeof(entry_header_t) + writeq.front().second.length(); - - // discard. 
- writeq.pop_front(); - commitq.pop_front(); - } - - // queue the finishers - ebofs->queue_finishers(callbacks); - } - write_lock.Unlock(); - -} - - -void FileJournal::make_writeable() -{ - if (read_pos) - write_pos = queue_pos = read_pos; - else - write_pos = queue_pos = sizeof(header_t); - read_pos = 0; -} - - -bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch) -{ - if (!read_pos) { - dout(1) << "read_entry -- not readable" << endl; - make_writeable(); - return false; - } - - if (read_pos == header.wrap) { - // find wrap point - for (int i=1; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_FILEJOURNAL_H -#define __EBOFS_FILEJOURNAL_H - - -#include "Journal.h" -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Thread.h" - -class FileJournal : public Journal { -public: - /** log header - * we allow 3 pointers: - * top/initial, - * one for an epoch boundary, - * and one for a wrap in the ring buffer/journal file. - * the epoch boundary one is useful only for speedier recovery in certain cases - * (i.e. when ebofs committed, but the journal didn't rollover ... very small window!) - */ - struct header_t { - uint64_t fsid; - int num; - off_t wrap; - off_t max_size; - epoch_t epoch[3]; - off_t offset[3]; - - header_t() : fsid(0), num(0), wrap(0), max_size(0) {} - - void clear() { - num = 0; - wrap = 0; - } - void pop() { - if (num >= 2 && offset[0] > offset[1]) - wrap = 0; // we're eliminating a wrap - num--; - for (int i=0; i > writeq; // currently journaling - list commitq; // currently journaling - - // write thread - Mutex write_lock; - Cond write_cond; - bool write_stop; - - void print_header(); - void read_header(); - void write_header(); - void start_writer(); - void stop_writer(); - void write_thread_entry(); - - void make_writeable(); - - class Writer : public Thread { - FileJournal *journal; - public: - Writer(FileJournal *fj) : journal(fj) {} - void *entry() { - journal->write_thread_entry(); - return 0; - } - } write_thread; - - public: - FileJournal(Ebofs *e, char *f) : - Journal(e), fn(f), - full(false), - write_pos(0), queue_pos(0), read_pos(0), - fd(0), - write_stop(false), write_thread(this) { } - ~FileJournal() {} - - int create(); - int open(); - void close(); - - // writes - bool submit_entry(bufferlist& e, Context *oncommit); // submit an item - void commit_epoch_start(); // mark epoch boundary - void commit_epoch_finish(); // mark prior epoch as committed (we can expire) - - bool read_entry(bufferlist& bl, epoch_t& e); - - // reads -}; - -#endif diff --git a/branches/sage/pgs/ebofs/Journal.h b/branches/sage/pgs/ebofs/Journal.h deleted file mode 100644 index fb1983c22eafc..0000000000000 --- a/branches/sage/pgs/ebofs/Journal.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
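FileJournal::submit_entry() above treats the journal file as a ring buffer: when queue_pos would run past max_size it wraps back to just after the on-disk header, but only if the entry still fits in front of the oldest live pointer (header.offset[0]); otherwise the journal is marked full. A condensed sketch of that decision under a simplified header; the field names here are illustrative, not the real header_t.

  #include <sys/types.h>

  // offset0 is the start of the oldest un-expired epoch (header.offset[0]);
  // hdr_len stands for sizeof(header_t).
  struct jstate {
    off_t max_size;   // size of the journal file
    off_t offset0;    // oldest entry we must not overwrite
    off_t wrap;       // 0 = not wrapped, else the position where we wrapped
  };

  // Returns the new queue position, or -1 if the entry cannot be accepted
  // (journal "full"), mirroring the checks in submit_entry().
  inline off_t try_queue(jstate &j, off_t queue_pos, off_t size, off_t hdr_len) {
    if (j.wrap) {
      if (queue_pos + size >= j.offset0)
        return -1;                        // wrapped and about to catch our own tail
      return queue_pos + size;
    }
    if (queue_pos + size < j.max_size)
      return queue_pos + size;            // fits where we are
    if (hdr_len + size < j.offset0) {     // room at the front of the file?
      j.wrap = queue_pos;                 // remember where we wrapped
      return hdr_len + size;              // restart just after the header
    }                                     // (the real code also pushes a new
                                          //  header pointer at the wrap point)
    return -1;                            // no room either way: full
  }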
- * - */ - - -#ifndef __EBOFS_JOURNAL_H -#define __EBOFS_JOURNAL_H - -class Ebofs; - -#include "include/buffer.h" -#include "include/Context.h" - -class Journal { -protected: - Ebofs *ebofs; - -public: - Journal(Ebofs *e) : ebofs(e) { } - virtual ~Journal() { } - - virtual int create() = 0; - virtual int open() = 0; - virtual void close() = 0; - - // writes - virtual bool submit_entry(bufferlist& e, Context *oncommit) = 0;// submit an item - virtual void commit_epoch_start() = 0; // mark epoch boundary - virtual void commit_epoch_finish() = 0; // mark prior epoch as committed (we can expire) - virtual bool read_entry(bufferlist& bl, epoch_t &e) = 0; - - // reads/recovery - -}; - -#endif diff --git a/branches/sage/pgs/ebofs/Onode.h b/branches/sage/pgs/ebofs/Onode.h deleted file mode 100644 index 356796063b06f..0000000000000 --- a/branches/sage/pgs/ebofs/Onode.h +++ /dev/null @@ -1,391 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_ONODE_H -#define __EBOFS_ONODE_H - -#include "include/lru.h" - -#include "types.h" -#include "BufferCache.h" - -#include "include/interval_set.h" - - -/* - * object node (like an inode) - * - * holds object metadata, including - * size - * allocation (extent list) - * attributes - * - */ - -class Onode : public LRUObject { -private: - int ref; - -public: - object_t object_id; - version_t version; // incremented on each modify. - - // data - bool readonly; - Extent onode_loc; - off_t object_size; - unsigned object_blocks; - - // onode - set collections; - map attr; - //vector extents; - map extent_map; - - interval_set uncommitted; - - ObjectCache *oc; - - bool dirty; - bool dangling; // not in onode_map - bool deleted; // deleted - - list commit_waiters; - - public: - Onode(object_t oid) : ref(0), object_id(oid), version(0), - readonly(false), - object_size(0), object_blocks(0), oc(0), - dirty(false), dangling(false), deleted(false) { - onode_loc.length = 0; - } - ~Onode() { - if (oc) delete oc; - } - - block_t get_onode_id() { return onode_loc.start; } - int get_onode_len() { return onode_loc.length; } - - int get_ref_count() { return ref; } - void get() { - if (ref == 0) lru_pin(); - ref++; - //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << endl; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << endl; - } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - bool is_deleted() { return deleted; } - bool is_dangling() { return dangling; } - - - bool have_oc() { - return oc != 0; - } - ObjectCache *get_oc(BufferCache *bc) { - if (!oc) { - oc = new ObjectCache(object_id, this, bc); - oc->get(); - get(); - } - return oc; - } - void close_oc() { - if (oc) { - //cout << "close_oc on " << object_id << endl; - assert(oc->is_empty()); - if (oc->put() == 0){ - //cout << "************************* hosing oc" << endl; - delete oc; - } - oc = 0; - put(); - } - } - - - // allocation - void verify_extents() { - if (0) { // do crazy stupid sanity checking - 
block_t count = 0; - interval_set is; - - set s; - cout << "verifying" << endl; - - for (map::iterator p = extent_map.begin(); - p != extent_map.end(); - p++) { - cout << " " << p->first << ": " << p->second << endl; - assert(count == p->first); - count += p->second.length; - for (unsigned j=0;jsecond.length;j++) { - assert(s.count(p->second.start+j) == 0); - s.insert(p->second.start+j); - } - } - - assert(s.size() == count); - assert(count == object_blocks); - } - } - void set_extent(block_t offset, Extent ex) { - //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << endl; - assert(offset <= object_blocks); - verify_extents(); - - // at the end? - if (offset == object_blocks) { - //cout << " appending " << ex << endl; - if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) { - //cout << "appending " << ex << " to " << extent_map.rbegin()->second << endl; - extent_map.rbegin()->second.length += ex.length; - } else - extent_map[object_blocks] = ex; - object_blocks += ex.length; - return; - } - - // removing any extent bits we overwrite - if (!extent_map.empty()) { - // preceeding extent? - map::iterator p = extent_map.lower_bound(offset); - if (p != extent_map.begin()) { - p--; - if (p->first + p->second.length > offset) { - //cout << " preceeding was " << p->second << endl; - if (p->first + p->second.length > offset+ex.length) { - // cutting chunk out of middle, add last bit - Extent &n = extent_map[offset+ex.length] = p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - //cout << " tail frag is " << n << endl; - } - p->second.length = offset - p->first; // cut tail off preceeding extent - //cout << " preceeding now " << p->second << endl; - } - p++; - } - - // overlapping extents - while (p != extent_map.end() && - p->first < offset + ex.length) { - map::iterator next = p; - next++; - - // completely subsumed? - if (p->first + p->second.length <= offset+ex.length) { - //cout << " erasing " << p->second << endl; - extent_map.erase(p); - p = next; - continue; - } - - // spans new extent, cut off head - Extent &n = extent_map[ offset+ex.length ] = p->second; - //cout << " cut head off " << p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - extent_map.erase(p); - //cout << ", now " << n << endl; - break; - } - } - - extent_map[ offset ] = ex; - - // extend object? - if (offset + ex.length > object_blocks) - object_blocks = offset+ex.length; - - verify_extents(); - } - - - /* map_extents(start, len, ls) - * map teh given page range into extents on disk. - */ - int map_extents(block_t start, block_t len, vector& ls) { - //cout << "map_extents " << start << " " << len << endl; - verify_extents(); - - //assert(start+len <= object_blocks); - - map::iterator p = extent_map.lower_bound(start); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > start && p->first)) { - p--; - if (p->second.length > start - p->first) { - Extent ex; - ex.start = p->second.start + (start - p->first); - ex.length = MIN(len, p->second.length - (start - p->first)); - ls.push_back(ex); - - //cout << " got (tail of?) " << p->second << " : " << ex << endl; - - start += ex.length; - len -= ex.length; - } - p++; - } - - while (len > 0 && - p != extent_map.end()) { - assert(p->first == start); - Extent ex = p->second; - ex.length = MIN(len, ex.length); - ls.push_back(ex); - //cout << " got (head of?) 
" << p->second << " : " << ex << endl; - start += ex.length; - len -= ex.length; - p++; - } - - return 0; - } - - int truncate_extents(block_t len, vector& extra) { - verify_extents(); - - map::iterator p = extent_map.lower_bound(len); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > len && p->first)) { - p--; - if (p->second.length > len - p->first) { - Extent ex; - ex.start = p->second.start + (len - p->first); - ex.length = p->second.length - (len - p->first); - extra.push_back(ex); - - p->second.length = len - p->first; - assert(p->second.length > 0); - - //cout << " got (tail of?) " << p->second << " : " << ex << endl; - } - p++; - } - - while (p != extent_map.end()) { - assert(p->first >= len); - extra.push_back(p->second); - map::iterator n = p; - n++; - extent_map.erase(p); - p = n; - } - - object_blocks = len; - verify_extents(); - return 0; - } - - int truncate_front_extents(block_t len, vector& extra) { - verify_extents(); - - while (len > 0) { - Extent& ex = extent_map.begin()->second; // look, this is a reference! - if (ex.length > len) { - // partial first extent - Extent frontbit( ex.start, len ); - extra.push_back(frontbit); - ex.length -= len; - ex.start += len; - break; - } - - // pull off entire first extent. - assert(ex.length <= len); - len -= ex.length; - extra.push_back(ex); - extent_map.erase(extent_map.begin()); - } - - object_blocks -= len; - verify_extents(); - return 0; - } - - - - /* map_alloc_regions(start, len, map) - * map range into regions that need to be (re)allocated on disk - * because they overlap "safe" (or unallocated) parts of the object - */ - /* - void map_alloc_regions(block_t start, block_t len, - interval_set& alloc) { - interval_set already_uncom; - - alloc.insert(start, len); // start with whole range - already_uncom.intersection_of(alloc, uncommitted); - alloc.subtract(already_uncom); // take out the bits that aren't yet committed - } - */ - - - - // pack/unpack - int get_collection_bytes() { - return sizeof(coll_t) * collections.size(); - } - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - int get_extent_bytes() { - return sizeof(Extent) * extent_map.size(); - } - -}; - - -inline ostream& operator<<(ostream& out, Onode& on) -{ - out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size; - out << " ref=" << on.get_ref_count(); - if (on.is_dirty()) out << " dirty"; - if (on.is_dangling()) out << " dangling"; - if (on.is_deleted()) out << " deleted"; - out << " uncom=" << on.uncommitted; - // out << " " << &on; - out << ")"; - return out; -} - - - -#endif diff --git a/branches/sage/pgs/ebofs/Table.h b/branches/sage/pgs/ebofs/Table.h deleted file mode 100644 index a3a084a46315a..0000000000000 --- a/branches/sage/pgs/ebofs/Table.h +++ /dev/null @@ -1,899 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_TABLE_H -#define __EBOFS_TABLE_H - -#include "types.h" -#include "nodes.h" - -/** table **/ - -#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")." - - -template -class Table { - private: - NodePool &pool; - - nodeid_t root; - int nkeys; - int depth; - - public: - Table(NodePool &p, - struct ebofs_table& bts) : - pool(p), - root(bts.root), nkeys(bts.num_keys), depth(bts.depth) { - dbtout << "cons" << endl; - } - - nodeid_t get_root() { return root; } - int get_num_keys() { return nkeys; } - int get_depth() { return depth; } - - - /* - */ - class _IndexItem { // i just need a struct size for below - K k; - nodeid_t n; - }; - class IndexItem { - public: - K key; - nodeid_t node; - static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem)); - static const int MIN = MAX/2; - }; - class _LeafItem { // i just need a struct size for below - K k; - V v; - }; - class LeafItem { - public: - K key; - V value; - static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem)); - static const int MIN = MAX/2; - }; - - class Nodeptr { - public: - Node *node; - - Nodeptr() : node(0) {} - Nodeptr(Node *n) : node(n) {} - Nodeptr& operator=(Node *n) { - node = n; - return *this; - } - - LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } - IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } - K key(int i) { - if (node->is_index()) - return index_item(i).key; - else - return leaf_item(i).key; - } - - bool is_leaf() { return node->is_leaf(); } - bool is_index() { return node->is_index(); } - void set_type(int t) { node->set_type(t); } - - int max_items() const { - if (node->is_leaf()) - return LeafItem::MAX; - else - return IndexItem::MAX; - } - int min_items() const { return max_items() / 2; } - - nodeid_t get_id() { return node->get_id(); } - - int size() { return node->size(); } - void set_size(int s) { node->set_size(s); } - - void remove_at_pos(int p) { - if (node->is_index()) { - for (int i=p; ip; i--) - leaf_item(i) = leaf_item(i-1); - leaf_item(p).key = key; - leaf_item(p).value = value; - set_size(size() + 1); - } - void insert_at_index_pos(int p, K key, nodeid_t node) { - assert(is_index()); - for (int i=size(); i>p; i--) - index_item(i) = index_item(i-1); - index_item(p).key = key; - index_item(p).node = node; - set_size(size() + 1); - } - - void append_item(LeafItem& i) { - leaf_item(size()) = i; - set_size(size() + 1); - } - void append_item(IndexItem& i) { - index_item(size()) = i; - set_size(size() + 1); - } - - void split(Nodeptr& right) { - if (node->is_index()) { - for (int i=min_items(); iis_index()) - for (int i=0; i open; // open nodes - vector pos; // position within the node - //Nodeptr open[20]; - //int pos[20]; - int level; - - Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {} - - public: - - const LeafItem& current() { - assert(open[level].is_leaf()); - return open[level].leaf_item(pos[level]); - } - V& dirty_current_value() { - assert(open[level].is_leaf()); - dirty(); - return open[level].leaf_item(pos[level]).value; - } - - // ** read-only bits ** - int move_left() { - if (table->depth == 0) return OOB; - - // work up around branch - int l; - for (l = level; l >= 0; l--) - if (pos[l] > 0) break; - if (l < 0) - return OOB; // we are the first item in the btree - - // move left one - pos[l]--; - - // work back down right side - for (; lpool.get_node( open[l].index_item(pos[l]).node ); - pos[l+1] = open[l+1].size() - 1; - } - return 1; - } - int move_right() 
{ - if (table->depth == 0) return OOB; - - // work up branch - int l; - for (l=level; l>=0; l--) - if (pos[l] < open[l].size() - 1) break; - if (l < 0) { - /* we are at last item in btree. */ - if (pos[level] < open[level].size()) { - pos[level]++; /* move into add position! */ - return 0; - } - return -1; - } - - /* move right one */ - assert( pos[l] < open[l].size() ); - pos[l]++; - - /* work back down */ - for (; lpool.get_node( open[l].index_item(pos[l]).node ); - pos[l+1] = 0; // furthest left - } - return 1; - } - - // ** modifications ** - void dirty() { - for (int l=level; l>=0; l--) { - if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too) - - table->pool.dirty_node(open[l].node); - if (l > 0) - open[l-1].index_item( pos[l-1] ).node = open[l].get_id(); - else - table->root = open[0].get_id(); - } - } - private: - void repair_parents() { - // did i make a change at the start of a node? - if (pos[level] == 0) { - K key = open[level].key(0); // new key parents should have - for (int j=level-1; j>=0; j--) { - if (open[j].index_item(pos[j]).key == key) - break; /* it's the same key, we can stop fixing */ - open[j].index_item(pos[j]).key = key; - if (pos[j] > 0) break; /* last in position 0.. */ - } - } - } - - public: - void remove() { - dirty(); - - // remove from node - open[level].remove_at_pos( pos[level] ); - repair_parents(); - - // was it a key? - if (level == table->depth-1) - table->nkeys--; - } - - void insert(K key, V value) { - dirty(); - - // insert - open[level].insert_at_leaf_pos(pos[level], key, value); - repair_parents(); - - // was it a key? - if (level == table->depth-1) - table->nkeys++; - } - - int rotate_left() { - if (level == 0) return -1; // i am root - if (pos[level-1] == 0) return -1; // nothing to left - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr left = table->pool.get_node( parent.index_item(pos[level-1] - 1).node ); - if (left.size() == left.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!left.node->is_dirty()) { - table->pool.dirty_node(left.node); - parent.index_item(pos[level-1]-1).node = left.get_id(); - } - - dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << endl; - - /* add */ - if (here.node->is_leaf()) - left.append_item(here.leaf_item(0)); - else - left.append_item(here.index_item(0)); - - /* remove */ - here.remove_at_pos(0); - - /* fix parent index for me */ - parent.index_item( pos[level-1] ).key = here.key(0); - // we never have to update past immediate parent, since we're not at pos 0 - - /* adjust cursor */ - if (pos[level] > 0) - pos[level]--; - //else - //assert(1); /* if we were positioned here, we're equal */ - /* if it was 0, then the shifted item == our key, and we can stay here safely. */ - return 0; - } - int rotate_right() { - if (level == 0) return -1; // i am root - if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr right = table->pool.get_node( parent.index_item( pos[level-1] + 1 ).node ); - if (right.size() == right.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!right.node->is_dirty()) { - table->pool.dirty_node(right.node); - parent.index_item( pos[level-1]+1 ).node = right.get_id(); - } - - if (pos[level] == here.size()) { - /* let's just move the cursor over! 
*/ - //if (sizeof(K) == 8) - dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << endl; - open[level] = right; - pos[level] = 0; - pos[level-1]++; - return 0; - } - - //if (sizeof(K) == 8) - dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from " - << here.get_id() << " to " << right.get_id() << endl; - - /* add */ - if (here.is_index()) - right.insert_at_index_pos(0, - here.index_item( here.size()-1 ).key, - here.index_item( here.size()-1 ).node); - else - right.insert_at_leaf_pos(0, - here.leaf_item( here.size()-1 ).key, - here.leaf_item( here.size()-1 ).value); - - /* remove */ - here.set_size(here.size() - 1); - - /* fix parent index for right */ - parent.index_item( pos[level-1] + 1 ).key = right.key(0); - - return 0; - } - }; - - - public: - bool almost_full() { - if (2*(depth+1) > pool.num_free()) // worst case, plus some. - return true; - return false; - } - - int find(K key, Cursor& cursor) { - dbtout << "find " << key << endl; - - if (depth == 0) - return Cursor::OOB; - - // init - cursor.level = 0; - - // start at root - Nodeptr curnode( pool.get_node(root) ); - cursor.open[0] = curnode; - - if (curnode.size() == 0) return -1; // empty! - - // find leaf - for (cursor.level = 0; cursor.level < depth-1; cursor.level++) { - /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */ - int left = 0; /* i >= left */ - int right = curnode.size()-1; /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.index_item(i).key < key) { - left = i + 1; - } else if (i && curnode.index_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - if (i && curnode.index_item(i).key > key) i--; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; - - /* get child node */ - curnode = pool.get_node( cursor.open[cursor.level].index_item(i).node ); - cursor.open[cursor.level+1] = curnode; - } - - /* search leaf */ - /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */ - int left = 0; /* i >= left */ - int right = curnode.size(); /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.leaf_item(i).key < key) { - left = i + 1; - } else if (i && curnode.leaf_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j= key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */ - - if (curnode.size() >= i+1) { - if (curnode.leaf_item(i).key == key) { - return Cursor::MATCH; /* it's the actual key */ - } else { - return Cursor::INSERT; /* it's an insertion point */ - } - } - return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */ - } - - int lookup(K key) { - dbtout << "lookup" << endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) - return 0; - return -1; - } - - int lookup(K key, V& value) { - dbtout << "lookup" << endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) { - value = cursor.current().value; - return 0; - } - return -1; - } - - int insert(K key, V value) { - dbtout << "insert " << key << " -> " << value << endl; - if (almost_full()) return -1; - - // empty? 
- if (nkeys == 0) { - if (root == -1) { - // create a root node (leaf!) - assert(depth == 0); - Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) ); - root = newroot.get_id(); - depth++; - } - assert(depth == 1); - assert(root >= 0); - } - - // start at/near key - Cursor cursor(this); - find(key, cursor); - - // insert loop - nodeid_t nodevalue = 0; - while (1) { - - /* room in this node? */ - if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) { - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. - else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 1"); - return 0; - } - - /* this node is full. */ - assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() ); - - /* can we rotate? */ - if (false) // NO! there's a bug in here somewhere, don't to it. - if (cursor.level > 0) { - if ((cursor.pos[cursor.level-1] > 0 - && cursor.rotate_left() >= 0) || - (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size() - && cursor.rotate_right() >= 0)) { - - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. - else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 2"); - return 0; - } - } - - /** split node **/ - - if (cursor.level == depth-1) { - dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << endl; - } else { - dbtout << "splitting index " << cursor.open[cursor.level].get_id() << endl; - } - - cursor.dirty(); - - // split - Nodeptr leftnode = cursor.open[cursor.level]; - Nodeptr newnode( pool.new_node(leftnode.node->get_type()) ); - leftnode.split( newnode ); - - /* insert our item */ - if (cursor.pos[cursor.level] > leftnode.size()) { - // not with cursor, since this node isn't added yet! - if (newnode.is_leaf()) { - newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(), - key, value ); - nkeys++; - } else { - newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(), - key, nodevalue ); - } - } else { - // with cursor (if leaf) - if (leftnode.is_leaf()) - cursor.insert( key, value ); - else - leftnode.insert_at_index_pos( cursor.pos[cursor.level], - key, nodevalue ); - } - - /* are we at the root? */ - if (cursor.level == 0) { - /* split root. */ - dbtout << "that split was the root " << root << endl; - Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) ); - - /* new root node */ - newroot.set_size(2); - newroot.index_item(0).key = leftnode.key(0); - newroot.index_item(0).node = root; - newroot.index_item(1).key = newnode.key(0); - newroot.index_item(1).node = newnode.get_id(); - - /* heighten tree */ - depth++; - root = newroot.get_id(); - verify("insert 3"); - return 0; - } - - /* now insert newindex in level-1 */ - nodevalue = newnode.get_id(); - key = newnode.key(0); - cursor.level--; - cursor.pos[cursor.level]++; // ...to the right of leftnode! - } - } - - - int remove(K key) { - dbtout << "remove " << key << endl; - - if (almost_full()) { - cout << "table almost full, failing" << endl; - assert(0); - return -1; - } - - Cursor cursor(this); - if (find(key, cursor) <= 0) { - cerr << "remove " << key << " 0x" << hex << key << dec << " .. 
dne" << endl; - g_conf.debug_ebofs = 33; - g_conf.ebofs_verify = true; - verify("remove dne"); - assert(0); - return -1; // key dne - } - - - while (1) { - cursor.remove(); - - // balance + adjust - - if (cursor.level == 0) { - // useless root index? - if (cursor.open[0].size() == 1 && - depth > 1) { - depth--; - root = cursor.open[0].index_item(0).node; - pool.release( cursor.open[0].node ); - } - - // note: root can be small, but not empty - else if (nkeys == 0) { - assert(cursor.open[cursor.level].size() == 0); - assert(depth == 1); - root = -1; - depth = 0; - if (cursor.open[0].node) - pool.release(cursor.open[0].node); - } - verify("remove 1"); - return 0; - } - - if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) { - verify("remove 2"); - return 0; - } - - // borrow from siblings? - Nodeptr left; - Nodeptr right; - - // left? - if (cursor.pos[cursor.level-1] > 0) { - int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; - left = pool.get_node( left_loc ); - - if (left.size() > left.min_items()) { - /* move cursor left, shift right */ - cursor.pos[cursor.level] = 0; - cursor.open[cursor.level] = left; - cursor.pos[cursor.level-1]--; - cursor.rotate_right(); - verify("remove 3"); - return 0; - } - - /* combine to left */ - right = cursor.open[cursor.level]; - } - else { - assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); - int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; - right = pool.get_node( right_loc ); - - if (right.size() > right.min_items()) { - /* move cursor right, shift an item left */ - cursor.pos[cursor.level] = 1; - cursor.open[cursor.level] = right; - cursor.pos[cursor.level-1]++; - cursor.rotate_left(); - verify("remove 4"); - return 0; - } - - /* combine to left */ - left = cursor.open[cursor.level]; - cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */ - } - - // note: cursor now points to _right_ node. - - /* combine (towards left) - * (this makes it so our next delete will be in the index - * interior, which is less scary.) - */ - dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << endl; - - left.merge(right); - - // dirty left + right - cursor.dirty(); // right - if (!left.node->is_dirty()) { - pool.dirty_node(left.node); - cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id(); - } - - pool.release(right.node); - - cursor.level--; // now point to the link to the obsolete (right-side) sib */ - } - - } - - void clear(Cursor& cursor, int node_loc, int level) { - dbtout << "clear" << endl; - - Nodeptr node = pool.get_node( node_loc ); - cursor.open[level] = node; - - // hose children? 
- if (level < depth-1) { - for (int i=0; i max) - max = node.key(i); - - if (level < depth-1) { - // index - cursor.pos[level] = i; - err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on ); - } else { - // leaf - count++; - last = node.key(i); - } - } - - if (level) { - // verify that parent's keys are appropriate - if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) { - dbtout << ":: key in index node " << cursor.open[level-1].get_id() - << " != min in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key - << ", min is " << min << ")" << dec << endl; - err++; - } - if (cursor.pos[level-1] < cursor.open[level-1].size()-1) { - if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) { - dbtout << ":: next key in index node " << cursor.open[level-1].get_id() - << " < max in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key - << ", max is " << max << ")" << dec << endl; - err++; - } - } - } - - //return err; - - // print it - char s[1000]; - strcpy(s," "); - s[level+1] = 0; - if (1) { - if (root == node_loc) { - dbtout << s << "root " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } else if (level == depth-1) { - dbtout << s << "leaf " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } else { - dbtout << s << "indx " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } - - if (0) { - for (int i=0; i " << node.leaf_item(i).value << dec << endl; - } - } - } - } - - return err; - } - - void verify(const char *on) { - if (!g_conf.ebofs_verify) - return; - - if (root == -1 && depth == 0) { - return; // empty! - } - - int count = 0; - Cursor cursor(this); - K last; - - int before = g_conf.debug_ebofs; - g_conf.debug_ebofs = 0; - - int err = verify_sub(cursor, root, 0, count, last, on); - if (count != nkeys) { - cerr << "** count " << count << " != nkeys " << nkeys << endl; - err++; - } - - g_conf.debug_ebofs = before; - - // ok? - if (err) { - cerr << "verify failure, called by '" << on << "'" << endl; - g_conf.debug_ebofs = 30; - // do it again, so we definitely get the dump. - int count = 0; - Cursor cursor(this); - K last; - verify_sub(cursor, root, 0, count, last, on); - assert(err == 0); - } - } - -}; - - -#endif diff --git a/branches/sage/pgs/ebofs/mkfs.ebofs.cc b/branches/sage/pgs/ebofs/mkfs.ebofs.cc deleted file mode 100644 index 1b432dd12da66..0000000000000 --- a/branches/sage/pgs/ebofs/mkfs.ebofs.cc +++ /dev/null @@ -1,300 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
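Table::find() above uses two boundary conventions: on an index node it errs to the left (pick the child whose first key is <= the target), on a leaf it errs to the right (land on the match or the insertion point). A simplified restatement with the standard binary searches, using int keys for brevity; this is not a line-for-line transcription of the original loop.

  #include <algorithm>
  #include <cassert>
  #include <vector>

  // Index node: the slot whose key is <= target ("err to the left"),
  // i.e. the child that covers the target.  Assumes a non-empty node.
  inline int index_slot(const std::vector<int> &keys, int target) {
    assert(!keys.empty());
    int i = int(std::lower_bound(keys.begin(), keys.end(), target) - keys.begin());
    if (i == (int)keys.size() || (i > 0 && keys[i] > target))
      --i;                  // mirrors the trailing "if (i && key(i) > key) i--"
    return i;
  }

  // Leaf node: first slot whose key is >= target ("err to the right"),
  // so a miss directly yields the insertion point.
  inline int leaf_slot(const std::vector<int> &keys, int target) {
    return int(std::lower_bound(keys.begin(), keys.end(), target) - keys.begin());
  }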
- * - */ - - - -#include -#include "ebofs/Ebofs.h" - - -int main(int argc, char **argv) -{ - // args - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - if (args.size() < 1) { - cerr << "usage: mkfs.ebofs [options] " << endl; - return -1; - } - char *filename = args[0]; - - // mkfs - Ebofs mfs(filename); - int r = mfs.mkfs(); - if (r < 0) exit(r); - - if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola - // test-o-rama! - Ebofs fs(filename); - fs.mount(); - - /* - if (1) { - // partial write tests - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - bufferlist small; - small.append(crap, 10); - bufferlist med; - med.append(crap, 1000); - bufferlist big; - big.append(crap, 1024*1024); - - cout << "0" << endl; - fs.write(10, 0, 1024*1024, big, (Context*)0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "1" << endl; - fs.write(10, 10, 10, small, 0); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "2" << endl; - fs.write(10, 10, 10, small, 0); - //fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "3" << endl; - fs.write(10, 1, 1000, med, 0); - fs.write(10, 10000, 10, small, 0); - fs.truncate(10, 100, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "4" << endl; - fs.remove(10); - fs.sync(); - fs.write(10, 10, 10, small, 0); - fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.truncate(10, 100, 0); - fs.write(10, 10, 10, small, 0); - fs.trim_buffer_cache(); - - - - } - - if (0) { // onode write+read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bl.append(crap, 10); - - fs.write(10, 10, 0, bl, (Context*)0); - fs.umount(); - - Ebofs fs2(filename); - fs2.mount(); - fs2.read(10, 10, 0, bl); - fs2.umount(); - - return 0; - } - - - if (0) { // small write + read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - object_t oid = 10; - int n = 10000; - int l = 128; - bl.append(crap, l); - - - char *p = bl.c_str(); - off_t o = 0; - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_NODES_H -#define __EBOFS_NODES_H - -/** nodes, node regions **/ - -#include "types.h" -#include "BlockDevice.h" - - -/* - - disk wire memory - - free free -> free can alloc - free used -> dirty can modify - - free used used -> tx - free used free -> limbo - - used used -> clean - used free -> limbo - - - // meaningless - used free free -> free can alloc - used free used __DNE__ - - -*/ - -#undef debofs -#define debofs(x) if (x < g_conf.debug_ebofs) cout << "ebofs.nodepool." - - -class Node { - public: - // bit fields - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_TX = 3; - - static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); - - static const int TYPE_INDEX = 1; - static const int TYPE_LEAF = 2; - - protected: - nodeid_t id; - int state; // use bit fields above! 
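Each btree node above owns one EBOFS_NODE_BYTES buffer (a single 4 KB block), with ITEM_LEN left for items after the small fixed header, and Table.h then sizes a node as MAX = ITEM_LEN / sizeof(item), MIN = MAX/2. The arithmetic below only restates that capacity computation; the 8- and 16-byte item sizes and the 4-byte int are illustrative assumptions, not the exact ebofs struct sizes.

  #include <cstdio>

  static const int EBOFS_BLOCK_SIZE = 4096;
  static const int EBOFS_NODE_BYTES = EBOFS_BLOCK_SIZE;              // one block per node
  static const int ITEM_LEN = EBOFS_NODE_BYTES - 3 * (int)sizeof(int); // as in Node::ITEM_LEN

  int main() {
    // e.g. a leaf item of an 8-byte key plus a 16-byte value (illustrative)
    const int leaf_item = 8 + 16;
    const int leaf_max  = ITEM_LEN / leaf_item;   // LeafItem::MAX in Table.h
    const int leaf_min  = leaf_max / 2;           // LeafItem::MIN
    std::printf("item area %d bytes -> %d items max, %d min per leaf\n",
                ITEM_LEN, leaf_max, leaf_min);
    return 0;
  }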
- - bufferptr bptr; - bufferptr shadow_bptr; - - // in disk buffer - int *type; - int *nrecs; - - public: - Node(nodeid_t i, bufferptr& b, int s) : id(i), state(s), bptr(b) { - nrecs = (int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - } - - - // id - nodeid_t get_id() const { return id; } - void set_id(nodeid_t n) { id = n; } - - // buffer - bufferptr& get_buffer() { return bptr; } - - char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); } - - // size - int size() { return *nrecs; } - void set_size(int s) { *nrecs = s; } - - // type - int& get_type() { return *type; } - void set_type(int t) { *type = t; } - bool is_index() { return *type == TYPE_INDEX; } - bool is_leaf() { return *type == TYPE_LEAF; } - - - // state - bool is_dirty() { return state == STATE_DIRTY; } - bool is_tx() { return state == STATE_TX; } - bool is_clean() { return state == STATE_CLEAN; } - - void set_state(int s) { state = s; } - - void make_shadow() { - assert(is_tx()); - - shadow_bptr = bptr; - - // new buffer - bptr = buffer::create_page_aligned(EBOFS_NODE_BYTES); - nrecs = (int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - - // copy contents! - memcpy(bptr.c_str(), shadow_bptr.c_str(), EBOFS_NODE_BYTES); - } - -}; - - - - - -class NodePool { - protected: - map node_map; // open node map - - public: - vector region_loc; // region locations - Extent usemap_even; - Extent usemap_odd; - - protected: - // on-disk block states - int num_nodes; - set free; - set dirty; - set tx; - set clean; // aka used - set limbo; - - Mutex &ebofs_lock; - Cond commit_cond; - int flushing; - - static int make_nodeid(int region, int offset) { - return (region << 24) | offset; - } - static int nodeid_region(nodeid_t nid) { - return nid >> 24; - } - static int nodeid_offset(nodeid_t nid) { - return nid & ((1 << 24) - 1); - } - - - public: - NodePool(Mutex &el) : - num_nodes(0), - ebofs_lock(el), - flushing(0) {} - ~NodePool() { - // nodes - release_all(); - } - - int num_free() { return free.size(); } - int num_dirty() { return dirty.size(); } - int num_limbo() { return limbo.size(); } - int num_tx() { return tx.size(); } - int num_clean() { return clean.size(); } - int num_total() { return num_nodes; } - int num_used() { return num_clean() + num_dirty() + num_tx(); } - - int get_usemap_len(int n=0) { - if (n == 0) n = num_nodes; - return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1; - } - - int num_regions() { return region_loc.size(); } - - // the caller had better adjust usemap locations... 
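make_nodeid()/nodeid_region()/nodeid_offset() above pack a node's location into one id as (region << 24) | offset, so the low 24 bits address the node within its region. A tiny worked example using those same helpers:

  #include <cassert>

  typedef int nodeid_t;

  inline nodeid_t make_nodeid(int region, int offset) { return (region << 24) | offset; }
  inline int nodeid_region(nodeid_t nid) { return nid >> 24; }
  inline int nodeid_offset(nodeid_t nid) { return nid & ((1 << 24) - 1); }

  int main() {
    nodeid_t nid = make_nodeid(2, 0x1234);     // node 0x1234 within region 2
    assert(nodeid_region(nid) == 2);
    assert(nodeid_offset(nid) == 0x1234);
    return 0;
  }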
- void add_region(Extent ex) { - int region = region_loc.size(); - assert(ex.length <= (1 << 24)); - region_loc.push_back(ex); - for (unsigned o = 0; o < ex.length; o++) { - free.insert( make_nodeid(region, o) ); - } - num_nodes += ex.length; - } - - int init(struct ebofs_nodepool *np) { - // regions - assert(region_loc.empty()); - num_nodes = 0; - for (int i=0; inum_regions; i++) { - debofs(3) << "init region " << i << " at " << np->region_loc[i] << endl; - region_loc.push_back( np->region_loc[i] ); - num_nodes += np->region_loc[i].length; - } - - // usemap - usemap_even = np->node_usemap_even; - usemap_odd = np->node_usemap_odd; - debofs(3) << "init even map at " << usemap_even << endl; - debofs(3) << "init odd map at " << usemap_odd << endl; - - return 0; - } - - void close() { - release_all(); - - region_loc.clear(); - free.clear(); - dirty.clear(); - tx.clear(); - clean.clear(); - limbo.clear(); - flushing = 0; - node_map.clear(); - } - - - // *** blocking i/o routines *** - - int read_usemap(BlockDevice& dev, version_t epoch) { - // read map - Extent loc; - if (epoch & 1) - loc = usemap_odd; - else - loc = usemap_even; - - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); - dev.read(loc.start, loc.length, bp); - - // parse - unsigned region = 0; // current region - unsigned roff = 0; // offset in region - for (unsigned byte = 0; byte> 1; // move one bit right. - roff++; - if (roff == region_loc[region].length) { - // next region! - roff = 0; - region++; - break; - } - } - if (region == region_loc.size()) break; - } - return 0; - } - - int read_clean_nodes(BlockDevice& dev) { - /* - this relies on the clean set begin defined so that we know which nodes - to read. so it only really works when called from mount()! - */ - for (unsigned r=0; rflushed_usemap(); - } - }; - - void flushed_usemap() { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - int write_usemap(BlockDevice& dev, version_t version) { - // alloc - Extent loc; - if (version & 1) - loc = usemap_odd; - else - loc = usemap_even; - - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); - - // fill in - unsigned region = 0; // current region - unsigned roff = 0; // offset in region - for (unsigned byte = 0; byte> 1; - if (roff == region_loc[region].length) { - // next region! 
- roff = 0; - region++; - break; - } - } - - *(unsigned char*)(bp.c_str() + byte) = x; - if (region == region_loc.size()) break; - } - - - // write - bufferlist bl; - bl.append(bp); - dev.write(loc.start, loc.length, bl, - new C_NP_FlushUsemap(this), "usemap"); - return 0; - } - - - - // *** node commit *** - private: - - class C_NP_FlushNode : public BlockDevice::callback { - NodePool *pool; - nodeid_t nid; - public: - C_NP_FlushNode(NodePool *p, nodeid_t n) : - pool(p), nid(n) {} - void finish(ioh_t ioh, int r) { - pool->flushed_node(nid); - } - }; - - void flushed_node(nodeid_t nid) { - ebofs_lock.Lock(); - - // mark nid clean|limbo - if (tx.count(nid)) { // tx -> clean - tx.erase(nid); - clean.insert(nid); - - // make node itself clean - node_map[nid]->set_state(Node::STATE_CLEAN); - } - else { // already limbo (was dirtied, or released) - assert(limbo.count(nid)); - } - - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - void commit_start(BlockDevice& dev, version_t version) { - dout(20) << "ebofs.nodepool.commit_start start" << endl; - - assert(flushing == 0); - /*if (0) - for (unsigned i=0; i tx (write to disk) - assert(tx.empty()); - set didb; - for (set::iterator i = dirty.begin(); - i != dirty.end(); - i++) { - Node *n = get_node(*i); - assert(n); - assert(n->is_dirty()); - n->set_state(Node::STATE_TX); - - unsigned region = nodeid_region(*i); - block_t off = nodeid_offset(*i); - block_t b = region_loc[region].start + off; - - if (0) { // sanity check debug FIXME - assert(didb.count(b) == 0); - didb.insert(b); - } - - bufferlist bl; - bl.append(n->get_buffer()); - dev.write(b, EBOFS_NODE_BLOCKS, - bl, - new C_NP_FlushNode(this, *i), "node"); - flushing++; - - tx.insert(*i); - } - dirty.clear(); - - // limbo -> free - for (set::iterator i = limbo.begin(); - i != limbo.end(); - i++) { - free.insert(*i); - } - limbo.clear(); - - dout(20) << "ebofs.nodepool.commit_start finish" << endl; - } - - void commit_wait() { - while (flushing > 0) - commit_cond.Wait(ebofs_lock); - dout(20) << "ebofs.nodepool.commit_wait finish" << endl; - } - - - - - - - - - - // *** nodes *** - // opened node - Node* get_node(nodeid_t nid) { - //dbtout << "pool.get " << nid << endl; - assert(node_map.count(nid)); - return node_map[nid]; - } - - // unopened node - /* not implemented yet!! - Node* open_node(nodeid_t nid) { - Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid); - dbtout << "pool.open_node " << n->get_id() << endl; - node_map[n->get_id()] = n; - return n; - } - */ - - // allocate id/block on disk. always free -> dirty. 
- nodeid_t alloc_id() { - // pick node id - assert(!free.empty()); - nodeid_t nid = *(free.begin()); - free.erase(nid); - dirty.insert(nid); - return nid; - } - - // new node - Node* new_node(int type) { - nodeid_t nid = alloc_id(); - debofs(15) << "ebofs.nodepool.new_node " << nid << endl; - - // alloc node - bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES); - Node *n = new Node(nid, bp, Node::STATE_DIRTY); - n->set_type(type); - n->set_size(0); - - assert(node_map.count(nid) == 0); - node_map[nid] = n; - return n; - } - - void release(Node *n) { - const nodeid_t nid = n->get_id(); - debofs(15) << "ebofs.nodepool.release on " << nid << endl; - node_map.erase(nid); - - if (n->is_dirty()) { - assert(dirty.count(nid)); - dirty.erase(nid); - free.insert(nid); - } else if (n->is_clean()) { - assert(clean.count(nid)); - clean.erase(nid); - limbo.insert(nid); - } else if (n->is_tx()) { - assert(tx.count(nid)); // i guess htis happens? -sage - tx.erase(nid); - limbo.insert(nid); - } - - delete n; - } - - void release_all() { - while (!node_map.empty()) { - map::iterator i = node_map.begin(); - debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << endl; - release( i->second ); - } - assert(node_map.empty()); - } - - void dirty_node(Node *n) { - // get new node id? - nodeid_t oldid = n->get_id(); - nodeid_t newid = alloc_id(); - debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << endl; - - // release old block - if (n->is_clean()) { - assert(clean.count(oldid)); - clean.erase(oldid); - } else { - assert(n->is_tx()); - assert(tx.count(oldid)); - tx.erase(oldid); - - // move/copy current -> shadow buffer as necessary - n->make_shadow(); - } - limbo.insert(oldid); - node_map.erase(oldid); - - n->set_state(Node::STATE_DIRTY); - - // move to new one! - n->set_id(newid); - node_map[newid] = n; - } - - - -}; - -#endif diff --git a/branches/sage/pgs/ebofs/test.ebofs.cc b/branches/sage/pgs/ebofs/test.ebofs.cc deleted file mode 100644 index 345f49b7a68ca..0000000000000 --- a/branches/sage/pgs/ebofs/test.ebofs.cc +++ /dev/null @@ -1,226 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
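alloc_id() and dirty_node() above make node updates copy-on-write: dirtying a node allocates a fresh id, retires the old one to limbo (reusable only after the commit completes), and re-registers the node under the new id. A stripped-down sketch of that relocation; the Pool/N types are simplifications, not the real NodePool/Node.

  #include <cassert>
  #include <map>
  #include <set>

  struct N { int id; bool dirty = false; };

  struct Pool {
    std::set<int> free_ids;           // ids that may be allocated now
    std::set<int> limbo;              // ids released only after the next commit
    std::map<int, N*> node_map;       // open nodes, keyed by current id

    int alloc_id() {                  // free -> (about to be) dirty
      assert(!free_ids.empty());
      int nid = *free_ids.begin();
      free_ids.erase(nid);
      return nid;
    }
    void dirty_node(N *n) {
      if (n->dirty) return;           // already relocated this epoch
      int oldid = n->id, newid = alloc_id();
      limbo.insert(oldid);            // old block stays valid until commit
      node_map.erase(oldid);
      n->id = newid;
      n->dirty = true;
      node_map[newid] = n;            // same in-memory node, new on-disk home
    }
  };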
- * - */ - - - -#include -#include "ebofs/Ebofs.h" - -bool stop = false; - - -int nt = 0; -class Tester : public Thread { - Ebofs &fs; - int t; - - char b[1024*1024]; - -public: - Tester(Ebofs &e) : fs(e), t(nt) { nt++; } - void *entry() { - - while (!stop) { - object_t oid; - oid.ino = (rand() % 10) + 0x10000000; - coll_t cid = rand() % 50; - off_t off = rand() % 10000;//0;//rand() % 1000000; - off_t len = 1+rand() % 100000; - char *a = "one"; - if (rand() % 2) a = "two"; - int l = 3;//rand() % 10; - - switch (rand() % 10) { - case 0: - { - oid.rev = rand() % 10; - cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << endl; - bufferlist bl; - fs.read(oid, off, len, bl); - int l = MIN(len,bl.length()); - if (l) { - cout << t << " got " << l << endl; - bl.copy(0, l, b); - char *p = b; - while (l--) { - assert(*p == 0 || - *p == (char)(off ^ oid.ino)); - off++; - p++; - } - } - } - break; - - case 1: - { - cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << endl; - for (int j=0;j args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 3) return -1; - char *filename = args[0]; - int seconds = atoi(args[1]); - int threads = atoi(args[2]); - if (!threads) threads = 1; - - cout << "dev " << filename << " .. " << threads << " threads .. " << seconds << " seconds" << endl; - - Ebofs fs(filename); - if (fs.mount() < 0) return -1; - - - // explicit tests - if (0) { - // verify that clone() plays nice with partial writes - object_t oid(1,1); - bufferptr bp(10000); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - fs.write(oid, 0, 10000, bl, 0); - - fs.sync(); - fs.trim_buffer_cache(); - - // induce a partial write - bufferlist bl2; - bl2.substr_of(bl, 0, 100); - fs.write(oid, 100, 100, bl2, 0); - - // clone it - object_t oid2; - oid2 = oid; - oid2.rev = 1; - fs.clone(oid, oid2, 0); - - // ... - if (0) { - // make sure partial still behaves after orig is removed... - fs.remove(oid, 0); - - // or i read for oid2... - bufferlist rbl; - fs.read(oid2, 0, 200, rbl); - } - if (1) { - // make sure things behave if we remove the clone - fs.remove(oid2,0); - } - } - // /explicit tests - - list ls; - for (int i=0; icreate(); - ls.push_back(t); - } - - utime_t now = g_clock.now(); - utime_t dur(seconds,0); - utime_t end = now + dur; - cout << "stop at " << end << endl; - while (now < end) { - sleep(1); - now = g_clock.now(); - cout << now << endl; - } - - cout << "stopping" << endl; - stop = true; - - while (!ls.empty()) { - Tester *t = ls.front(); - ls.pop_front(); - t->join(); - delete t; - } - - fs.umount(); - return 0; -} - diff --git a/branches/sage/pgs/ebofs/types.h b/branches/sage/pgs/ebofs/types.h deleted file mode 100644 index 1fa209a3deeb9..0000000000000 --- a/branches/sage/pgs/ebofs/types.h +++ /dev/null @@ -1,170 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
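The random tester above makes reads self-checking by writing byte (char)(off ^ oid.ino) at every offset, so a later read needs no bookkeeping to validate content (the real check also accepts 0, for holes). A tiny sketch of that fill/verify convention, with an assumed object number and offset:

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // The byte at offset `off` of object `ino` is always (char)(off ^ ino).
  inline char expected_byte(uint64_t ino, uint64_t off) {
    return char(off ^ ino);
  }

  int main() {
    const uint64_t ino  = 0x10000003;           // an object number, as the tester picks
    const uint64_t base = 5000;                 // an arbitrary write offset
    std::vector<char> buf(128);
    for (uint64_t i = 0; i < buf.size(); ++i)   // what a write fills in
      buf[i] = expected_byte(ino, base + i);
    for (uint64_t i = 0; i < buf.size(); ++i)   // what a read can then verify
      assert(buf[i] == expected_byte(ino, base + i));
    return 0;
  }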
- * - */ - - -#ifndef __EBOFS_TYPES_H -#define __EBOFS_TYPES_H - -#include -#include "include/buffer.h" -#include "include/Context.h" -#include "common/Cond.h" - -#include -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - - -#include "include/object.h" - - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? (a):(b)) -#endif - - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(unsigned long long __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; -} -*/ - - -// disk -typedef uint64_t block_t; // disk location/sector/block - -static const int EBOFS_BLOCK_SIZE = 4096; -static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096 - -class Extent { - public: - block_t start, length; - - Extent() : start(0), length(0) {} - Extent(block_t s, block_t l) : start(s), length(l) {} - - block_t last() const { return start + length - 1; } - block_t end() const { return start + length; } -}; - -inline ostream& operator<<(ostream& out, Extent& ex) -{ - return out << ex.start << "~" << ex.length; -} - - -// tree/set nodes -typedef int nodeid_t; - -static const int EBOFS_NODE_BLOCKS = 1; -static const int EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; -static const int EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! - -struct ebofs_nodepool { - Extent node_usemap_even; // for even sb versions - Extent node_usemap_odd; // for odd sb versions - - int num_regions; - Extent region_loc[EBOFS_MAX_NODE_REGIONS]; -}; - - -// objects - -typedef uint64_t coll_t; - -struct ebofs_onode { - Extent onode_loc; /* this is actually the block we live in */ - - object_t object_id; /* for kicks */ - off_t object_size; /* file size in bytes. should this be 64-bit? */ - unsigned object_blocks; - bool readonly; - - int num_collections; - int num_attr; // num attr in onode - int num_extents; /* number of extents used. if 0, data is in the onode */ -}; - -struct ebofs_cnode { - Extent cnode_loc; /* this is actually the block we live in */ - coll_t coll_id; - int num_attr; // num attr in cnode -}; - - -// table -struct ebofs_table { - nodeid_t root; /* root node of btree */ - int num_keys; - int depth; -}; - - -// super -typedef uint64_t version_t; - -static const unsigned EBOFS_MAGIC = 0x000EB0F5; - -static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */ -static const int EBOFS_FREE_BUCKET_BITS = 2; - - -struct ebofs_super { - uint64_t s_magic; - uint64_t fsid; - - epoch_t epoch; // version of this superblock. 
- - uint64_t num_blocks; /* # blocks in filesystem */ - - // some basic stats, for kicks - uint64_t free_blocks; /* unused blocks */ - uint64_t limbo_blocks; /* limbo blocks */ - //unsigned num_objects; - //unsigned num_fragmented; - - struct ebofs_nodepool nodepool; - - // tables - struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; - struct ebofs_table limbo_tab; - struct ebofs_table alloc_tab; - struct ebofs_table object_tab; // object directory - struct ebofs_table collection_tab; // collection directory - struct ebofs_table co_tab; -}; - - -#endif diff --git a/branches/sage/pgs/fakefuse.cc b/branches/sage/pgs/fakefuse.cc deleted file mode 100644 index 66e5d550c1543..0000000000000 --- a/branches/sage/pgs/fakefuse.cc +++ /dev/null @@ -1,157 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/Monitor.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "common/Timer.h" - -#include "msg/FakeMessenger.h" - - - - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; -class C_Test2 : public Context { -public: - void finish(int r) { - cout << "C_Test2->finish(" << r << ")" << endl; - g_timer.add_event_after(2, new C_Test); - } -}; - - - -int main(int argc, char **argv) { - cerr << "fakefuse starting" << endl; - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // start messenger thread - fakemessenger_startthread(); - - //g_timer.add_event_after(5.0, new C_Test2); - //g_timer.add_event_after(10.0, new C_Test); - - vector nargs; - for (unsigned i=0; iinit(); - } - for (int i=0; iinit(); - } - - for (int i=0; iinit(); - } - - - // create client - Client *client[NUMCLIENT]; - for (int i=0; iinit(); - - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) - cout << "starting fuse on pid " << getpid() << endl; - client[i]->mount(); - - char *oldcwd = get_current_dir_name(); // note previous wd - ceph_fuse_main(client[i], argc, argv); - ::chdir(oldcwd); // return to previous wd - - client[i]->unmount(); - cout << "fuse finished on pid " << getpid() << endl; - client[i]->shutdown(); - } - - - - // wait for it to finish - cout << "DONE -----" << endl; - fakemessenger_wait(); // blocks until messenger stops - - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" - -#include "client/SyntheticClient.h" - -#include "msg/FakeMessenger.h" - -#include "common/Timer.h" - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - - -int main(int argc, char **argv) -{ - cerr << "fakesyn start" << endl; - - //cerr << "inode_t " << sizeof(inode_t) << endl; - - vector args; - argv_to_vec(argc, argv, args); - - // stop on our own (by default) - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - parse_config_options(args); - - int start = 0; - - parse_syn_options(args); - - vector nargs; - - for (unsigned i=0; imon_inst[i] = entity_inst_t(MSG_ADDR_MON(i), a); // hack ; see FakeMessenger.cc - } - - char hostname[100]; - gethostname(hostname,100); - //int pid = getpid(); - - // create mon - Monitor *mon[g_conf.num_mon]; - for (int i=0; iinit(); - } - for (int i=0; iinit(); - if (g_conf.mds_local_osd) - mdsosd[i]->init(); - } - - for (int i=0; iinit(); - } - - - // create client(s) - for (int i=0; iinit(); - - // use my argc, argv (make sure you pass a mount point!) - //cout << "mounting" << endl; - client[i]->mount(); - - //cout << "starting synthetic client " << endl; - syn[i] = new SyntheticClient(client[i]); - - syn[i]->start_thread(); - } - - - for (int i=0; ijoin_thread(); - delete syn[i]; - - client[i]->unmount(); - //cout << "unmounted" << endl; - client[i]->shutdown(); - } - - - // wait for it to finish - fakemessenger_wait(); - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CONTEXT_H -#define __CONTEXT_H - -#include "config.h" - -#include -#include -#include - -#include - - -/* - * Context - abstract callback class - */ -class Context { - public: - virtual ~Context() {} // we want a virtual destructor!!! - virtual void finish(int r) = 0; -}; - - -/* - * finish and destroy a list of Contexts - */ -inline void finish_contexts(std::list& finished, - int result = 0) -{ - using std::cout; - using std::endl; - - list ls; - if (finished.empty()) return; - - ls.swap(finished); // swap out of place to avoid weird loops - - dout(10) << ls.size() << " contexts to finish with " << result << endl; - for (std::list::iterator it = ls.begin(); - it != ls.end(); - it++) { - Context *c = *it; - dout(10) << "---- " << c << endl; - c->finish(result); - delete c; - } -} - -class C_NoopContext : public Context { -public: - void finish(int r) { } -}; - - -/* - * C_Contexts - set of Contexts - */ -class C_Contexts : public Context { - std::list clist; - -public: - void add(Context* c) { - clist.push_back(c); - } - void take(std::list& ls) { - clist.splice(clist.end(), ls); - } - void finish(int r) { - finish_contexts(clist, r); - } -}; - - -/* - * C_Gather - * - * BUG: does not report errors. 
- */ -class C_Gather : public Context { -public: - bool sub_finish(int r) { - //cout << "C_Gather sub_finish " << this << endl; - assert(waitfor.count(r)); - waitfor.erase(r); - if (!waitfor.empty()) - return false; // more subs left - - // last one - onfinish->finish(0); - delete onfinish; - onfinish = 0; - return true; - } - - class C_GatherSub : public Context { - C_Gather *gather; - int num; - public: - C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {} - void finish(int r) { - if (gather->sub_finish(num)) - delete gather; // last one! - } - }; - -private: - Context *onfinish; - std::set waitfor; - int num; - -public: - C_Gather(Context *f=0) : onfinish(f), num(0) { - //cout << "C_Gather new " << this << endl; - } - ~C_Gather() { - //cout << "C_Gather delete " << this << endl; - assert(!onfinish); - } - - void set_finisher(Context *c) { - assert(!onfinish); - onfinish = c; - } - Context *new_sub() { - num++; - waitfor.insert(num); - return new C_GatherSub(this, num); - } - - bool empty() { return num == 0; } - int get_num() { return num; } - - void finish(int r) { - assert(0); // nobody should ever call me. - } - -}; - -#endif diff --git a/branches/sage/pgs/include/Distribution.h b/branches/sage/pgs/include/Distribution.h deleted file mode 100644 index efc0795a72fcb..0000000000000 --- a/branches/sage/pgs/include/Distribution.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __DISTRIBUTION_H -#define __DISTRIBUTION_H - -#include -#include -using namespace std; - -class Distribution { - vector p; - vector v; - - public: - //Distribution() { - //} - - unsigned get_width() { - return p.size(); - } - - void clear() { - p.clear(); - v.clear(); - } - void add(int val, float pr) { - p.push_back(pr); - v.push_back(val); - } - - void random() { - float sum = 0.0; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BLOBHASH_H -#define __BLOBHASH_H - -/* -- this is to make some of the STL types work with 64 bit values, string hash keys, etc. -- added when i was using an old STL.. maybe try taking these out and see if things - compile now? 
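
The C_Gather class above implements a fan-in callback: new_sub() hands out one sub-context per pending operation, and the finisher runs only after the last sub-context completes (sub results are discarded, per the BUG note). A minimal standalone sketch of the same pattern, assuming nothing beyond the standard library; the Gather name and the use of std::function in place of Context are illustrative, not part of the original:

// gather_sketch.cc: simplified fan-in callback, modeled on the C_Gather idea above.
// Illustrative only; sub results are dropped, as in the original.
#include <functional>
#include <iostream>
#include <set>
#include <utility>

class Gather {
  std::function<void(int)> onfinish;
  std::set<int> waiting;
  int next_id = 0;
public:
  explicit Gather(std::function<void(int)> f) : onfinish(std::move(f)) {}

  // hand out one sub-callback; the finisher runs once all of them have fired
  std::function<void(int)> new_sub() {
    int id = ++next_id;
    waiting.insert(id);
    return [this, id](int r) {
      (void)r;                 // like C_Gather, individual results are ignored
      waiting.erase(id);
      if (waiting.empty())
        onfinish(0);           // last one in triggers the finisher
    };
  }
};

int main() {
  Gather g([](int r) { std::cout << "all subs finished, r=" << r << std::endl; });
  auto a = g.new_sub();
  auto b = g.new_sub();
  a(0);        // nothing happens yet
  b(0);        // finisher fires here
  return 0;
}
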
-*/ - -class blobhash { -public: - size_t operator()(const char *p, unsigned len) { - static hash H; - long acc = 0; - while (len >= sizeof(long)) { - acc ^= *(long*)p; - p += sizeof(long); - len -= sizeof(long); - } - int sh = 0; - while (len) { - acc ^= (long)*p << sh; - sh += 8; - len--; - p++; - } - return H(acc); - } -}; - - -#endif diff --git a/branches/sage/pgs/include/buffer.h b/branches/sage/pgs/include/buffer.h deleted file mode 100644 index b3b37a7c1fb72..0000000000000 --- a/branches/sage/pgs/include/buffer.h +++ /dev/null @@ -1,982 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include - -#include "common/Mutex.h" - -#include -#include - -using std::cout; -using std::endl; - -#ifndef __CYGWIN__ -# include -#endif - -#define BUFFER_PAGE_SIZE 4096 // fixme. - -// -// these are in config.o -extern Mutex bufferlock; -extern long buffer_total_alloc; -// - -class buffer { -private: - - /* hack for memory utilization debugging. */ - static void inc_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc += len; - bufferlock.Unlock(); - } - static void dec_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc -= len; - bufferlock.Unlock(); - } - - /* - * an abstract raw buffer. with a reference count. - */ - class raw { - public: - char *data; - unsigned len; - int nref; - Mutex lock; // we'll make it non-recursive. - - raw(unsigned l) : len(l), nref(0), lock(false) {} - raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {} - virtual ~raw() {}; - - // no copying. 
- raw(const raw &other); - const raw& operator=(const raw &other); - - virtual raw* clone_empty() = 0; - raw *clone() { - raw *c = clone_empty(); - memcpy(c->data, data, len); - return c; - } - }; - - friend std::ostream& operator<<(std::ostream& out, const raw &r); - - /* - * primitive buffer types - */ - class raw_char : public raw { - public: - raw_char(unsigned l) : raw(l) { - data = new char[len]; - inc_total_alloc(len); - } - ~raw_char() { - delete[] data; - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_char(len); - } - }; - - class raw_static : public raw { - public: - raw_static(const char *d, unsigned l) : raw((char*)d, l) { } - ~raw_static() {} - raw* clone_empty() { - return new raw_char(len); - } - }; - -#ifndef __CYGWIN__ - class raw_mmap_pages : public raw { - public: - raw_mmap_pages(unsigned l) : raw(l) { - data = (char*)::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); - inc_total_alloc(len); - } - ~raw_mmap_pages() { - ::munmap(data, len); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_mmap_pages(len); - } - }; - - class raw_posix_aligned : public raw { - public: - raw_posix_aligned(unsigned l) : raw(l) { -#ifdef DARWIN - data = (char *) valloc (len); -#else - ::posix_memalign((void**)&data, BUFFER_PAGE_SIZE, len); -#endif /* DARWIN */ - inc_total_alloc(len); - } - ~raw_posix_aligned() { - ::free((void*)data); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_posix_aligned(len); - } - }; -#endif - -#ifdef __CYGWIN__ - class raw_hack_aligned : public raw { - char *realdata; - public: - raw_hack_aligned(unsigned l) : raw(l) { - realdata = new char[len+4095]; - unsigned off = ((unsigned)realdata) % 4096; - if (off) - data = realdata + 4096 - off; - else - data = realdata; - inc_total_alloc(len+4095); - //cout << "hack aligned " << (unsigned)data - //<< " in raw " << (unsigned)realdata - //<< " off " << off << endl; - assert(((unsigned)data & 4095) == 0); - } - ~raw_hack_aligned() { - delete[] realdata; - dec_total_alloc(len+4095); - } - raw* clone_empty() { - return new raw_hack_aligned(len); - } - }; -#endif - -public: - - /* - * named constructors - */ - - static raw* copy(const char *c, unsigned len) { - raw* r = new raw_char(len); - memcpy(r->data, c, len); - return r; - } - static raw* create(unsigned len) { - return new raw_char(len); - } - - static raw* create_page_aligned(unsigned len) { -#ifndef __CYGWIN__ - return new raw_mmap_pages(len); -#else - return new raw_hack_aligned(len); -#endif - } - - - /* - * a buffer pointer. references (a subsequence of) a raw buffer. - */ - class ptr { - raw *_raw; - unsigned _off, _len; - - public: - ptr() : _raw(0), _off(0), _len(0) {} - ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw. - ++r->nref; - } - ptr(unsigned l) : _off(0), _len(l) { - _raw = create(l); - ++_raw->nref; - } - ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto. - _raw = copy(d, l); - ++_raw->nref; - } - ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) { - if (_raw) { - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); - } - } - ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) { - assert(o+l <= p._len); - assert(_raw); - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); - } - ptr& operator= (const ptr& p) { - // be careful -- we need to properly handle self-assignment. 
- if (p._raw) { - p._raw->lock.Lock(); - ++p._raw->nref; // inc new - p._raw->lock.Unlock(); - } - release(); // dec (+ dealloc) old (if any) - _raw = p._raw; // change my ref - _off = p._off; - _len = p._len; - return *this; - } - ~ptr() { - release(); - } - - void release() { - if (_raw) { - _raw->lock.Lock(); - if (--_raw->nref == 0) { - //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; - _raw->lock.Unlock(); - delete _raw; // dealloc old (if any) - } else - _raw->lock.Unlock(); - _raw = 0; - } - } - - // misc - bool at_buffer_head() const { return _off == 0; } - bool at_buffer_tail() const { return _off + _len == _raw->len; } - - // accessors - const char *c_str() const { assert(_raw); return _raw->data + _off; } - char *c_str() { assert(_raw); return _raw->data + _off; } - unsigned length() const { return _len; } - unsigned offset() const { return _off; } - unsigned start() const { return _off; } - unsigned end() const { return _off + _len; } - unsigned unused_tail_length() const { - if (_raw) - return _raw->len - (_off+_len); - else - return 0; - } - const char& operator[](unsigned n) const { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - char& operator[](unsigned n) { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - - const char *raw_c_str() const { assert(_raw); return _raw->data; } - unsigned raw_length() const { assert(_raw); return _raw->len; } - int raw_nref() const { assert(_raw); return _raw->nref; } - - void copy_out(unsigned o, unsigned l, char *dest) const { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(dest, c_str()+o, l); - } - - unsigned wasted() { - assert(_raw); - return _raw->len - _len; - } - - // modifiers - void set_offset(unsigned o) { _off = o; } - void set_length(unsigned l) { _len = l; } - - void append(const char *p, unsigned l) { - assert(_raw); - assert(l <= unused_tail_length()); - memcpy(c_str() + _len, p, l); - _len += l; - } - - void copy_in(unsigned o, unsigned l, const char *src) { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(c_str()+o, src, l); - } - - void zero() { - memset(c_str(), 0, _len); - } - - void clean() { - //raw *newraw = _raw->makesib(_len); - } - }; - - friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); - - /* - * list - the useful bit! - */ - - class list { - // my private bits - std::list _buffers; - unsigned _len; - ptr append_buffer; // where i put small appends. 
- - public: - // cons/des - list() : _len(0) {} - list(const list& other) : _buffers(other._buffers), _len(other._len) { } - list(unsigned l) : _len(0) { - ptr bp(l); - push_back(bp); - } - ~list() {} - - list& operator= (const list& other) { - _buffers = other._buffers; - _len = other._len; - return *this; - } - - const std::list& buffers() const { return _buffers; } - - unsigned length() const { -#if 1 - // DEBUG: verify _len - unsigned len = 0; - for (std::list::const_iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); -#endif - return _len; - } - - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(ptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(raw *r) { - ptr bp(r); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(ptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void push_back(raw *r) { - ptr bp(r); - _buffers.push_back(bp); - _len += bp.length(); - } - void zero() { - for (std::list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - it->zero(); - } - - // sort-of-like-assignment-op - void claim(list& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(list& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - // crope lookalikes - void copy(unsigned off, unsigned len, char *dest) { - assert(off >= 0); - assert(off + len <= length()); - /*assert(off < length()); - if (off + len > length()) - len = length() - off; - */ - // advance to off - std::list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_out(off, len, dest); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_out(off, howmuch, dest); - - dest += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - // advance to off - std::list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_in(off, len, src); // yup, last bit! - break; - } - - // get as much as we can from this buffer. 
- unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_in(off, howmuch, src); - - src += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - void copy_in(unsigned off, unsigned len, const list& bl) { - unsigned left = len; - for (std::list::const_iterator i = bl._buffers.begin(); - i != bl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(off, l, (*i).c_str()); - left -= l; - if (left == 0) break; - off += l; - } - } - - - void append(const char *data, unsigned len) { - while (len > 0) { - // put what we can into the existing append_buffer. - if (append_buffer.unused_tail_length() > 0) { - unsigned gap = append_buffer.unused_tail_length(); - if (gap > len) gap = len; - append_buffer.append(data, gap); - append(append_buffer, append_buffer.end() - gap, gap); // add segment to the list - len -= gap; - data += gap; - } - if (len == 0) break; // done! - - // make a new append_buffer! - unsigned alen = BUFFER_PAGE_SIZE * (((len-1) / BUFFER_PAGE_SIZE) + 1); - append_buffer = create_page_aligned(alen); - append_buffer.set_length(0); // unused, so far. - } - } - void append(ptr& bp) { - push_back(bp); - } - void append(ptr& bp, unsigned off, unsigned len) { - assert(len+off <= bp.length()); - ptr tempbp(bp, off, len); - push_back(tempbp); - } - void append(const list& bl) { - _len += bl._len; - for (std::list::const_iterator p = bl._buffers.begin(); - p != bl._buffers.end(); - ++p) - _buffers.push_back(*p); - } - - - /* - * get a char - */ - const char& operator[](unsigned n) { - assert(n < _len); - for (std::list::iterator p = _buffers.begin(); - p != _buffers.end(); - p++) { - if (n >= p->length()) { - n -= p->length(); - continue; - } - return (*p)[n]; - } - assert(0); - } - - /* - * return a contiguous ptr to whole bufferlist contents. - */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. - } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - ptr newbuf = create(length()); // make one new contiguous buffer. - copy(0, length(), newbuf.c_str()); // copy myself into it. - clear(); - push_back(newbuf); - return newbuf.c_str(); // now it'll work. - } - } - - void substr_of(const list& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - std::list::const_iterator curbuf = other._buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "somewhere in " << *curbuf << endl; - break; - } - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "copying partial of " << *curbuf << endl; - _buffers.push_back( ptr( *curbuf, off, len ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) of " << *curbuf << endl; - unsigned howmuch = (*curbuf).length() - off; - _buffers.push_back( ptr( *curbuf, off, howmuch ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - - // funky modifer - void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... 
mylen = " << length() << endl; - - // skip off - std::list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "off = " << off << " somewhere in " << *curbuf << endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << endl; - _buffers.insert( curbuf, ptr( *curbuf, 0, off ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl; - if (claim_by) - claim_by->append( *curbuf, off, len ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << endl; - if (claim_by) - claim_by->append( *curbuf, off, howmuch ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) - } - - }; - -}; - -typedef buffer::ptr bufferptr; -typedef buffer::list bufferlist; - - -inline bool operator>(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (l.length() == p) return false; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator>=(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (r.length() == p && l.length() == p) return true; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator<(bufferlist& l, bufferlist& r) { - return r > l; -} -inline bool operator<=(bufferlist& l, bufferlist& r) { - return r >= l; -} - - -inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) { - return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref << ")"; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) { - out << "buffer::ptr(" << bp.offset() << "~" << bp.length() - << " " << (void*)bp.c_str() - << " in raw " << (void*)bp.raw_c_str() - << " len " << bp.raw_length() - << " nref " << bp.raw_nref() << ")"; - return out; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { - out << "buffer::list(len=" << bl.length() << "," << std::endl; - - std::list::const_iterator it = bl.buffers().begin(); - while (it != bl.buffers().end()) { - out << "\t" << *it; - if (++it == bl.buffers().end()) break; - out << "," << std::endl; - } - out << std::endl << ")"; - return out; -} - - - - -// ---------------------------------------------------------- -// new encoders - -// raw -template -inline void _encoderaw(const T& t, bufferlist& bl) -{ - bl.append((char*)&t, sizeof(t)); -} -template -inline void _decoderaw(T& t, bufferlist& bl, int& off) -{ - bl.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); -} - -#include -#include -#include -#include -#include -#include - -// list -template -inline void _encode(const std::list& 
ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::list& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode(v, bl, off); - ls.push_back(v); - } -} - -// deque -template -inline void _encode(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encoderaw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::deque& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode(v, bl, off); - ls.push_back(v); - } -} - -// set -template -inline void _encode(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encoderaw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::set& s, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - s.clear(); - while (n--) { - T v; - _decode(v, bl, off); - s.insert(v); - } -} - -// vector -template -inline void _encode(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encoderaw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::vector& v, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - v.resize(n); - for (uint32_t i=0; i -inline void _encode(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode(p->second, bl); - } -} -template -inline void _decode(std::map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode(m[k], bl, off); - } -} - -// hash_map -template -inline void _encode(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode(p->second, bl); - } -} -template -inline void _decode(__gnu_cxx::hash_map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode(m[k], bl, off); - } -} - -// string -inline void _encode(const std::string& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.append(s.c_str(), len+1); -} -inline void _decode(std::string& s, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - s = bl.c_str() + off; // FIXME someday to avoid a huge buffer copy? 
- off += len+1; -} - -// const char* (encode only, string compatible) -inline void _encode(const char *s, bufferlist& bl) -{ - uint32_t len = strlen(s); - _encoderaw(len, bl); - bl.append(s, len+1); -} - -// bufferptr (encapsulated) -inline void _encode(bufferptr& bp, bufferlist& bl) -{ - uint32_t len = bp.length(); - _encoderaw(len, bl); - bl.append(bp); -} -inline void _decode(bufferptr& bp, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - - bufferlist s; - s.substr_of(bl, off, len); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - s.substr_of(bl, off, len); - off += len; -} - -// base -template -inline void _encode(const T& t, bufferlist& bl) -{ - _encoderaw(t, bl); -} -template -inline void _decode(T& t, bufferlist& bl, int& off) -{ - _decoderaw(t, bl, off); -} - - - -#endif diff --git a/branches/sage/pgs/include/encodable.h b/branches/sage/pgs/include/encodable.h deleted file mode 100644 index 5d53c80adbda0..0000000000000 --- a/branches/sage/pgs/include/encodable.h +++ /dev/null @@ -1,172 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
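
All of the _encode/_decode helpers above share one wire format: a 32-bit count (or byte length) written first, followed by the raw elements, and decoding walks a running offset. A standalone sketch of that length-prefix scheme for plain-old-data element types, with std::vector<char> standing in for bufferlist; the encode_raw/encode_list names are illustrative only:

// encode_sketch.cc: length-prefixed encoding, mirroring the _encoderaw/_encode
// pattern above, with std::vector<char> standing in for bufferlist.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <list>
#include <vector>

typedef std::vector<char> buf_t;

template <class T>
void encode_raw(const T& t, buf_t& bl) {                // append raw bytes
  const char* p = reinterpret_cast<const char*>(&t);
  bl.insert(bl.end(), p, p + sizeof(t));
}
template <class T>
void decode_raw(T& t, const buf_t& bl, size_t& off) {   // copy out, advance offset
  std::memcpy(&t, bl.data() + off, sizeof(t));
  off += sizeof(t);
}

template <class T>
void encode_list(const std::list<T>& ls, buf_t& bl) {
  uint32_t n = ls.size();
  encode_raw(n, bl);                                    // count first...
  for (const T& v : ls) encode_raw(v, bl);              // ...then the elements
}
template <class T>
void decode_list(std::list<T>& ls, const buf_t& bl, size_t& off) {
  uint32_t n;
  decode_raw(n, bl, off);
  ls.clear();
  while (n--) { T v; decode_raw(v, bl, off); ls.push_back(v); }
}

int main() {
  std::list<uint32_t> in = {3, 1, 4, 1, 5}, out;
  buf_t bl;
  encode_list(in, bl);
  size_t off = 0;
  decode_list(out, bl, off);
  assert(in == out && off == bl.size());
  std::cout << "round-tripped " << out.size() << " elements" << std::endl;
  return 0;
}
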
- * - */ - -#ifndef __ENCODABLE_H -#define __ENCODABLE_H - -#include "buffer.h" - -#include -#include -#include -#include -#include -#include - -// list -template -inline void _encode_complex(const std::list& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::list& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode_complex(v, bl, off); - ls.push_back(v); - } -} - -// deque -template -inline void _encode_complex(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encoderaw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::deque& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode_complex(v, bl, off); - ls.push_back(v); - } -} - -// set -template -inline void _encode_complex(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encoderaw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::set& s, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - s.clear(); - while (n--) { - T v; - _decode_complex(v, bl, off); - s.insert(v); - } -} - -// vector -template -inline void _encode_complex(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encoderaw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::vector& v, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - v.resize(n); - for (uint32_t i=0; i -inline void _encode_complex(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode_complex(p->second, bl); - } -} -template -inline void _decode_complex(std::map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode_complex(m[k], bl, off); - } -} - -// hash_map -template -inline void _encode_complex(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode_complex(p->second, bl); - } -} -template -inline void _decode_complex(__gnu_cxx::hash_map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode_complex(m[k], bl, off); - } -} - -// base case -template -inline void _encode_complex(const T& t, bufferlist& bl) -{ - t._encode(bl); -} -template -inline void _decode_complex(T& t, bufferlist& bl, int& off) -{ - t._decode(bl, off); -} - -#endif diff --git a/branches/sage/pgs/include/error.h b/branches/sage/pgs/include/error.h deleted file mode 100644 index a548d9756b9b8..0000000000000 --- a/branches/sage/pgs/include/error.h +++ /dev/null @@ -1,41 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage 
Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) - -#define ASSERT(c) \ - ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) - -/* print usage error message and exit */ -extern void userror(const char *use, const char *fmt, ...); - -/* print system error message and exit */ -extern void syserror(const char *fmt, ...); - -/* print error message and exit */ -extern void exiterror(const char *fmt, ...); - -/* print error message */ -extern void error(const char *fmt, ...); - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/branches/sage/pgs/include/filepath.h b/branches/sage/pgs/include/filepath.h deleted file mode 100644 index 4425e1d7c5b3a..0000000000000 --- a/branches/sage/pgs/include/filepath.h +++ /dev/null @@ -1,184 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILEPATH_H -#define __FILEPATH_H - - -/* - * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. - * -> should it be different? how? should this[0] be "", with depth 4? - * - */ - - -#include -#include -#include -using namespace std; - -#include "buffer.h" - - -class filepath { - /** path - * can be relative "a/b/c" or absolute "/a/b/c". - */ - string path; - - /** bits - path segemtns - * this is ['a', 'b', 'c'] for both the aboslute and relative case. - * - * NOTE: this value is LAZILY maintained... i.e. it's a cache - */ - mutable vector bits; - - void rebuild_path() { - if (absolute()) - path = "/"; - else - path.clear(); - for (unsigned i=0; i 0) parse_bits(); - return bits.size(); - } - bool empty() const { - return path.length() == 0; - } - - // FIXME: const-edness - bool absolute() { return path.length() && path[0] == '/'; } - bool relative() { return !absolute(); } - - const string& operator[](int i) const { - if (bits.empty() && path.length() > 0) parse_bits(); - return bits[i]; - } - - const string& last_dentry() const { - if (bits.empty() && path.length() > 0) parse_bits(); - return bits[ bits.size()-1 ]; - } - - filepath prefixpath(int s) const { - filepath t; - for (int i=0; i 0) parse_bits(); - bits.pop_back(); - rebuild_path(); - } - void push_dentry(const string& s) { - if (bits.empty() && path.length() > 0) parse_bits(); - bits.push_back(s); - if (path.length() && path[path.length()-1] != '/') - path += "/"; - path += s; - } - void append(const filepath& a) { - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __FRAG_H -#define __FRAG_H - -#include -#include -#include -#include "buffer.h" - -/* - * - * the goal here is to use a binary split strategy to partition a namespace. - * frag_t represents a particular fragment. 
bits() tells you the size of the - * fragment, and value() it's name. this is roughly analogous to an ip address - * and netmask. - * - * fragtree_t represents an entire namespace and it's partition. it essentially - * tells you where fragments are split into other fragments, and by how much - * (i.e. by how many bits, resulting in a power of 2 number of child fragments). - * - * this vaguely resembles a btree, in that when a fragment becomes large or small - * we can split or merge, except that there is no guarantee of being balanced. - * presumably we are partitioning the output of a (perhaps specialized) hash - * function. - * - */ - -/** - * frag_t - * - * description of an individual fragment. that is, a particular piece - * of the overall namespace. - * - * this is conceptually analogous to an ip address and netmask. - * - * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). - * - * we write it as v/b, where v is a value and b is the number of bits. - * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, - * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. - */ - -typedef uint32_t _frag_t; - -class frag_t { - /* encoded value. - * 8 upper bits = "bits" - * 24 lower bits = "value" - */ - _frag_t _enc; - - public: - frag_t() : _enc(0) { } - frag_t(unsigned v, unsigned b) : _enc((b << 24) + v) { } - frag_t(_frag_t e) : _enc(e) { } - - // constructors - void from_unsigned(unsigned e) { _enc = e; } - - // accessors - unsigned value() const { return _enc & 0xffffff; } - unsigned bits() const { return _enc >> 24; } - unsigned mask() const { return 0xffffffff >> (32-bits()); } - operator _frag_t() const { return _enc; } - - // tests - bool contains(unsigned v) const { - return (v & mask()) == value(); - } - bool contains(frag_t sub) const { - return (sub.bits() >= bits() && // they are more specific than us, - (sub.value() & mask()) == value()); // and they are contained by us. - } - bool is_root() const { - return bits() == 0; - } - frag_t parent() const { - assert(bits() > 0); - return frag_t(value() & (mask() >> 1), bits()-1); - } - - // splitting - frag_t left_half() const { - return frag_t(value(), bits()+1); - } - frag_t right_half() const { - return frag_t(value() | (1<& fragments) const { - assert(nb > 0); - unsigned nway = 1 << (nb-1); - for (unsigned i=0; i: - // frag_t f is split by b bits. - // if child frag_t does not appear, it is not split. - std::map _splits; - - public: - // accessors - bool empty() { - return _splits.empty(); - } - int get_split(const frag_t hb) const { - std::map::const_iterator p = _splits.find(hb); - if (p == _splits.end()) - return 0; - else - return p->second; - } - void get_leaves(list& ls) const { - list q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = get_split(t); - if (nb) - t.split(nb, q); // queue up children - else - ls.push_back(t); // not spit, it's a leaf. - } - } - bool contains(frag_t fg) const { - list q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = get_split(t); - if (nb) { - if (t == fg) return false; // it's split. - t.split(nb, q); // queue up children - } else { - if (t == fg) return true; // it's there. - } - } - return false; - } - - frag_t operator[](unsigned v) const { - frag_t t; - while (1) { - assert(t.contains(v)); - int nb = get_split(t); - - // is this a leaf? - if (nb == 0) return t; // done. - - // pick appropriate child fragment. 
- unsigned nway = 1 << (nb-1); - unsigned i; - for (i=0; i copy; - std::list q; - q.push_back(frag_t()); - - while (1) { - frag_t cur = q.front(); - q.pop_front(); - int b = get_split(cur); - if (!b) continue; - copy[cur] = b; - cur.split(b, q); - } - - assert(copy == _splits); - } - - // encoding - void _encode(bufferlist& bl) { - ::_encode(_splits, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(_splits, bl, off); - } -}; - -inline ostream& operator<<(ostream& out, fragtree_t& ft) -{ - out << "fragtree_t("; - - bool first = true; - list q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = ft.get_split(t); - if (nb) { - if (first) - first = false; - else - out << ' '; - out << t << '%' << nb; - t.split(nb, q); // queue up children - } - } - return out << ")"; -} - -#endif diff --git a/branches/sage/pgs/include/interval_set.h b/branches/sage/pgs/include/interval_set.h deleted file mode 100644 index 632fd6498c910..0000000000000 --- a/branches/sage/pgs/include/interval_set.h +++ /dev/null @@ -1,306 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __INTERVAL_SET_H -#define __INTERVAL_SET_H - -#include -#include -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? (a):(b)) -#endif - - -template -class interval_set { - public: - map m; // map start -> len - - // helpers - private: - typename map::const_iterator find_inc(T start) const { - typename map::const_iterator p = m.lower_bound(start); // p->first >= start - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_inc_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::const_iterator find_adj(T start) const { - typename map::const_iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_adj_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. 
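
frag_t above packs a fragment descriptor into 32 bits, 8 bits of depth in the high byte and a 24-bit value below it, and tests containment as (v & mask()) == value(), much like an address and netmask; children are produced by setting the next low-order bit. A standalone sketch of that arithmetic, assuming only the standard library (the Frag name is illustrative, and the zero-depth mask is guarded here to avoid shifting a 32-bit word by 32):

// frag_sketch.cc: the value/bits/mask arithmetic described for frag_t above.
#include <cassert>
#include <cstdint>
#include <iostream>

struct Frag {
  uint32_t enc;                                    // 8 bits depth | 24 bits value
  Frag(uint32_t value, uint32_t bits) : enc((bits << 24) | value) {}
  uint32_t value() const { return enc & 0xffffff; }
  uint32_t bits()  const { return enc >> 24; }
  uint32_t mask()  const {                         // like a netmask over the hash space
    return bits() == 0 ? 0 : 0xffffffffu >> (32 - bits());
  }
  bool contains(uint32_t v) const { return (v & mask()) == value(); }
  Frag left()  const { return Frag(value(), bits() + 1); }                  // 0-child
  Frag right() const { return Frag(value() | (1u << bits()), bits() + 1); } // 1-child
};

int main() {
  Frag root(0, 0);                  // 0/0: the whole namespace
  Frag l = root.left();             // 0/1
  Frag r = root.right();            // 1/1
  assert(root.contains(0x12345) && l.contains(0x12344) && r.contains(0x12345));
  std::cout << "0x12345 falls in " << (r.contains(0x12345) ? "1/1" : "0/1") << std::endl;
  return 0;
}
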
- } - return p; - } - - public: - bool operator==(const interval_set& other) const { - return m == other.m; - } - - void clear() { - m.clear(); - } - - bool contains(T i) const { - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - if (p->first > i) return false; - if (p->first+p->second <= i) return false; - assert(p->first <= i && p->first+p->second > i); - return true; - } - bool contains(T start, T len) const { - typename map::const_iterator p = find_inc(start); - if (p == m.end()) return false; - if (p->first > start) return false; - if (p->first+p->second <= start) return false; - assert(p->first <= start && p->first+p->second > start); - if (p->first+p->second < start+len) return false; - return true; - } - bool intersects(T start, T len) const { - interval_set a; - a.insert(start, len); - interval_set i; - i.intersection_of( *this, a ); - if (i.empty()) return false; - return true; - } - - // outer range of set - bool empty() const { - return m.empty(); - } - T start() const { - assert(!empty()); - typename map::const_iterator p = m.begin(); - return p->first; - } - T end() const { - assert(!empty()); - typename map::const_iterator p = m.end(); - p--; - return p->first+p->second; - } - - // interval start after p (where p not in set) - bool starts_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - return true; - } - T start_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - return p->first; - } - - // interval end that contains start - T end_after(T start) const { - assert(contains(start)); - typename map::const_iterator p = find_inc(start); - return p->first+p->second; - } - - void insert(T val) { - insert(val, 1); - } - - void insert(T start, T len) { - //cout << "insert " << start << "~" << len << endl; - assert(len > 0); - typename map::iterator p = find_adj_m(start); - if (p == m.end()) { - m[start] = len; // new interval - } else { - if (p->first < start) { - - if (p->first + p->second != start) { - //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; - assert(0); - } - - assert(p->first + p->second == start); - p->second += len; // append to end - - typename map::iterator n = p; - n++; - if (n != m.end() && - start+len == n->first) { // combine with next, too! 
- p->second += n->second; - m.erase(n); - } - } else { - if (start+len == p->first) { - m[start] = len + p->second; // append to front - m.erase(p); - } else { - assert(p->first > start+len); - m[start] = len; // new interval - } - } - } - } - - void erase(T val) { - erase(val, 1); - } - - void erase(T start, T len) { - typename map::iterator p = find_inc_m(start); - - assert(p != m.end()); - assert(p->first <= start); - - T before = start - p->first; - assert(p->second >= before+len); - T after = p->second - before - len; - - if (before) - p->second = before; // shorten bit before - else - m.erase(p); - if (after) - m[start+len] = after; - } - - - void subtract(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - erase(p->first, p->second); - } - - void insert(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - insert(p->first, p->second); - } - - - void intersection_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - typename map::const_iterator pa = a.m.begin(); - typename map::const_iterator pb = b.m.begin(); - - while (pa != a.m.end() && pb != b.m.end()) { - // passing? - if (pa->first + pa->second <= pb->first) - { pa++; continue; } - if (pb->first + pb->second <= pa->first) - { pb++; continue; } - T start = MAX(pa->first, pb->first); - T end = MIN(pa->first+pa->second, pb->first+pb->second); - assert(end > start); - insert(start, end-start); - if (pa->first+pa->second > pb->first+pb->second) - pb++; - else - pa++; - } - } - - void union_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - //cout << "union_of" << endl; - - // a - m = a.m; - - // - (a*b) - interval_set ab; - ab.intersection_of(a, b); - subtract(ab); - - // + b - insert(b); - return; - } - void union_of(const interval_set &b) { - interval_set a; - a.m.swap(m); - union_of(a, b); - } - - bool subset_of(const interval_set &big) const { - for (typename map::const_iterator i = m.begin(); - i != m.end(); - i++) - if (!big.contains(i->first, i->second)) return false; - return true; - } - -}; - -template -inline ostream& operator<<(ostream& out, const interval_set &s) { - out << "["; - for (typename map::const_iterator i = s.m.begin(); - i != s.m.end(); - i++) { - if (i != s.m.begin()) out << ","; - out << i->first << "~" << i->second; - } - out << "]"; - return out; -} - - -#endif diff --git a/branches/sage/pgs/include/lru.h b/branches/sage/pgs/include/lru.h deleted file mode 100644 index 225204f151a0a..0000000000000 --- a/branches/sage/pgs/include/lru.h +++ /dev/null @@ -1,323 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
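
interval_set above keeps a sorted start-to-length map and preserves one invariant: intervals never overlap, and insert() coalesces a new extent with any neighbour it touches (erase() conversely may split one in two). A minimal standalone illustration of that coalescing step, assuming the inserted extent does not overlap anything already present, just as the original asserts; the iset_t and insert names here are illustrative:

// interval_sketch.cc: the start->length map invariant kept by interval_set above;
// adjacent extents are coalesced so the map stays minimal.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <map>

typedef std::map<uint64_t, uint64_t> iset_t;    // start -> length, non-overlapping

// simplified insert: assumes [start, start+len) overlaps nothing already present
// (the real interval_set asserts the same), and merges with touching neighbours.
void insert(iset_t& m, uint64_t start, uint64_t len) {
  iset_t::iterator p = m.lower_bound(start);
  if (p != m.begin()) {
    iset_t::iterator q = p; --q;
    if (q->first + q->second == start) {          // append to the predecessor
      q->second += len;
      if (p != m.end() && p->first == start + len) {  // and swallow the successor too
        q->second += p->second;
        m.erase(p);
      }
      return;
    }
  }
  if (p != m.end() && p->first == start + len) {  // prepend to the successor
    uint64_t newlen = len + p->second;
    m.erase(p);
    m[start] = newlen;
    return;
  }
  m[start] = len;                                 // isolated extent
}

int main() {
  iset_t s;
  insert(s, 0, 5);
  insert(s, 10, 5);
  insert(s, 5, 5);                                // bridges the two: one extent left
  assert(s.size() == 1 && s.begin()->first == 0 && s.begin()->second == 15);
  std::cout << "coalesced to " << s.begin()->first << "~" << s.begin()->second << std::endl;
  return 0;
}
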
- * - */ - - - -#ifndef __LRU_H -#define __LRU_H - -#include -#include -#include -using namespace std; - -#include "config.h" - - - -class LRUObject { - private: - LRUObject *lru_next, *lru_prev; - bool lru_pinned; - class LRU *lru; - class LRUList *lru_list; - - public: - LRUObject() { - lru_next = lru_prev = NULL; - lru_list = 0; - lru_pinned = false; - lru = 0; - } - - // pin/unpin item in cache - void lru_pin(); - void lru_unpin(); - bool lru_is_expireable() { return !lru_pinned; } - - friend class LRU; - friend class LRUList; -}; - - -class LRUList { - private: - LRUObject *head, *tail; - uint32_t len; - - public: - LRUList() { - head = tail = 0; - len = 0; - } - - uint32_t get_length() { return len; } - - LRUObject *get_head() { - return head; - } - LRUObject *get_tail() { - return tail; - } - - void insert_head(LRUObject *o) { - o->lru_next = head; - o->lru_prev = NULL; - if (head) { - head->lru_prev = o; - } else { - tail = o; - } - head = o; - o->lru_list = this; - len++; - } - void insert_tail(LRUObject *o) { - o->lru_next = NULL; - o->lru_prev = tail; - if (tail) { - tail->lru_next = o; - } else { - head = o; - } - tail = o; - o->lru_list = this; - len++; - } - - void remove(LRUObject *o) { - assert(o->lru_list == this); - if (o->lru_next) - o->lru_next->lru_prev = o->lru_prev; - else - tail = o->lru_prev; - if (o->lru_prev) - o->lru_prev->lru_next = o->lru_next; - else - head = o->lru_next; - o->lru_next = o->lru_prev = NULL; - o->lru_list = 0; - assert(len>0); - len--; - } - -}; - - -class LRU { - protected: - LRUList lru_top, lru_bot, lru_pintail; - uint32_t lru_num, lru_num_pinned; - uint32_t lru_max; // max items - double lru_midpoint; - - friend class LRUObject; - //friend class MDCache; // hack - - public: - LRU(int max = 0) { - lru_num = 0; - lru_num_pinned = 0; - lru_midpoint = .9; - lru_max = max; - } - - uint32_t lru_get_size() { return lru_num; } - uint32_t lru_get_top() { return lru_top.get_length(); } - uint32_t lru_get_bot() { return lru_bot.get_length(); } - uint32_t lru_get_pintail() { return lru_pintail.get_length(); } - uint32_t lru_get_max() { return lru_max; } - uint32_t lru_get_num_pinned() { return lru_num_pinned; } - - void lru_set_max(uint32_t m) { lru_max = m; } - void lru_set_midpoint(float f) { lru_midpoint = f; } - - - // insert at top of lru - void lru_insert_top(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_top.insert_head( o ); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - lru_adjust(); - } - - // insert at mid point in lru - void lru_insert_mid(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_bot.insert_head(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - // insert at bottom of lru - void lru_insert_bot(LRUObject *o) { - assert(!o->lru); - o->lru = this; - lru_bot.insert_tail(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - /* - // insert at bottom of lru - void lru_insert_pintail(LRUObject *o) { - assert(!o->lru); - o->lru = this; - - assert(o->lru_pinned); - - lru_pintail.insert_head(o); - lru_num++; - lru_num_pinned += o->lru_pinned; - } - */ - - - - - // adjust top/bot balance, as necessary - void lru_adjust() { - if (!lru_max) return; - - unsigned toplen = lru_top.get_length(); - unsigned topwant = (unsigned)(lru_midpoint * (double)lru_max); - while (toplen > 0 && - toplen > topwant) { - // remove from tail of top, stick at head of bot - // FIXME: this could be way more 
efficient by moving a whole chain of items. - - LRUObject *o = lru_top.get_tail(); - lru_top.remove(o); - lru_bot.insert_head(o); - toplen--; - } - } - - - // remove an item - LRUObject *lru_remove(LRUObject *o) { - // not in list - //assert(o->lru_in_lru); - //if (!o->lru_in_lru) return o; // might have expired and been removed that way. - if (!o->lru) return o; - - - if (o->lru_list == &lru_top) - lru_top.remove(o); - else if (o->lru_list == &lru_bot) - lru_bot.remove(o); - else if (o->lru_list == &lru_pintail) - lru_pintail.remove(o); - else - assert(0); - - lru_num--; - if (o->lru_pinned) lru_num_pinned--; - o->lru = 0; - return o; - } - - // touch item -- move to head of lru - bool lru_touch(LRUObject *o) { - lru_remove(o); - lru_insert_top(o); - return true; - } - - // touch item -- move to midpoint (unless already higher) - bool lru_midtouch(LRUObject *o) { - if (o->lru_list == &lru_top) return false; - - lru_remove(o); - lru_insert_mid(o); - return true; - } - - // touch item -- move to bottom - bool lru_bottouch(LRUObject *o) { - lru_remove(o); - lru_insert_bot(o); - return true; - } - - - // expire -- expire a single item - LRUObject *lru_get_next_expire() { - LRUObject *p; - - // look through tail of bot - while (lru_bot.get_length()) { - p = lru_bot.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_bot.remove(p); - lru_pintail.insert_head(p); - } - - // ok, try head then - while (lru_top.get_length()) { - p = lru_top.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_top.remove(p); - lru_pintail.insert_head(p); - } - - // no luck! - return NULL; - } - - LRUObject *lru_expire() { - LRUObject *p = lru_get_next_expire(); - if (p) - return lru_remove(p); - return NULL; - } - - - void lru_status() { - dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << endl; - } - -}; - - -inline void LRUObject::lru_pin() -{ - lru_pinned = true; - if (lru) lru->lru_num_pinned++; -} -inline void LRUObject::lru_unpin() { - lru_pinned = false; - if (lru) { - lru->lru_num_pinned--; - - // move from pintail -> bot - if (lru_list == &lru->lru_pintail) { - lru->lru_pintail.remove(this); - lru->lru_bot.insert_tail(this); - } - } -} - -#endif diff --git a/branches/sage/pgs/include/object.h b/branches/sage/pgs/include/object.h deleted file mode 100644 index 955a024c0dea6..0000000000000 --- a/branches/sage/pgs/include/object.h +++ /dev/null @@ -1,103 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OBJECT_H -#define __OBJECT_H - -#include - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -typedef uint32_t objectrev_t; - -struct object_t { - static const uint32_t MAXREV = 0xffffffffU; - - uint64_t ino; // "file" identifier - uint32_t bno; // "block" in that "file" - objectrev_t rev; // revision. normally ctime (as epoch). 
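The LRU removed above keeps a hot "top" list and a warm "bot" list split at lru_midpoint, parks pinned entries on a pintail, and hands back eviction candidates coldest-first. A rough usage sketch, assuming the deleted include/lru.h were still available; the Item type is hypothetical:

    #include "include/lru.h"   // removed by this patch; assumed still available

    struct Item : public LRUObject {   // hypothetical cache entry
      int id;
      explicit Item(int i) : id(i) {}
    };

    int main() {
      LRU cache(100);               // lru_max = 100 items
      cache.lru_set_midpoint(.7);   // keep ~70% of capacity in the hot list

      Item *a = new Item(1), *b = new Item(2);
      cache.lru_insert_top(a);      // brand-new hot entry
      cache.lru_insert_mid(b);      // less certain entry starts in the warm list
      a->lru_pin();                 // pinned entries are never handed out by lru_expire()
      cache.lru_touch(b);           // promote b to the head of the hot list

      while (LRUObject *o = cache.lru_expire())    // evict unpinned items, coldest first
        delete static_cast<Item*>(o);              // evicts b; pinned a gets parked on the pintail
      a->lru_unpin();                              // unpinning moves a from pintail back to bot
      delete static_cast<Item*>(cache.lru_expire());   // now a can be expired too
      return 0;
    }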
- - object_t() : ino(0), bno(0), rev(0) {} - object_t(uint64_t i, uint32_t b) : ino(i), bno(b), rev(0) {} - object_t(uint64_t i, uint32_t b, uint32_t r) : ino(i), bno(b), rev(r) {} -}; - - -inline bool operator==(const object_t l, const object_t r) { - return (l.ino == r.ino) && (l.bno == r.bno) && (l.rev == r.rev); -} -inline bool operator!=(const object_t l, const object_t r) { - return (l.ino != r.ino) || (l.bno != r.bno) || (l.rev != r.rev); -} -inline bool operator>(const object_t l, const object_t r) { - if (l.ino > r.ino) return true; - if (l.ino < r.ino) return false; - if (l.bno > r.bno) return true; - if (l.bno < r.bno) return false; - if (l.rev > r.rev) return true; - return false; -} -inline bool operator<(const object_t l, const object_t r) { - if (l.ino < r.ino) return true; - if (l.ino > r.ino) return false; - if (l.bno < r.bno) return true; - if (l.bno > r.bno) return false; - if (l.rev < r.rev) return true; - return false; -} -inline bool operator>=(const object_t l, const object_t r) { - return !(l < r); -} -inline bool operator<=(const object_t l, const object_t r) { - return !(l > r); -} -inline ostream& operator<<(ostream& out, const object_t o) { - out << hex << o.ino << '.'; - out.setf(ios::right); - out.fill('0'); - out << setw(8) << o.bno << dec; - out.unsetf(ios::right); - if (o.rev) - out << '.' << o.rev; - return out; -} - - -namespace __gnu_cxx { -#ifndef __LP64__ - template<> struct hash { - size_t operator()(uint64_t __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - - template<> struct hash { - size_t operator()(const object_t &r) const { - static hash H; - static hash I; - return H(r.ino) ^ I(r.bno); - } - }; -} - - -#endif diff --git a/branches/sage/pgs/include/oldbuffer.h b/branches/sage/pgs/include/oldbuffer.h deleted file mode 100644 index 12ddf688934bc..0000000000000 --- a/branches/sage/pgs/include/oldbuffer.h +++ /dev/null @@ -1,358 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include -#include - -#include -using namespace std; - -// bit masks -#define BUFFER_MODE_NOCOPY 0 -#define BUFFER_MODE_COPY 1 // copy on create, my buffer - -#define BUFFER_MODE_NOFREE 0 -#define BUFFER_MODE_FREE 2 - -#define BUFFER_MODE_CUSTOMFREE 4 - -#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE) - - -// debug crap -#include "config.h" -#define bdbout(x) if (x <= g_conf.debug_buffer) cout - -#include "common/Mutex.h" - -// HACK: in config.cc -/* - * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and - * buffer ever use buffer._ref, and only bufferptr should call ~buffer(). - * - * So, I only need to protect: - * - buffer()'s modification of buffer_total_alloc - * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc - * - * I don't protect - * - buffer._get() .. increment is atomic on any sane architecture - * - buffer._put() .. only called by ~bufferptr. - * - ~buffer .. only called by ~bufferptr *** I HOPE!! 
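object_t above is ordered by ino, then bno, then rev, and prints as a hex inode number, a dot, a zero-padded block number, and an optional revision. A small sketch, assuming the deleted include/object.h were still available; the values are illustrative:

    #include "include/object.h"   // removed by this patch; assumed still available
    #include <iostream>
    #include <set>

    int main() {
      object_t a(0x10, 0), b(0x10, 1), c(0x10, 1, 2);   // one file, blocks 0 and 1, c at rev 2
      std::set<object_t> blocks;                        // sorted via the operator< above
      blocks.insert(c); blocks.insert(b); blocks.insert(a);
      for (std::set<object_t>::iterator p = blocks.begin(); p != blocks.end(); ++p)
        std::cout << *p << std::endl;   // 10.00000000, 10.00000001, 10.00000001.2
      return 0;
    }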
- */ -extern Mutex bufferlock; -extern long buffer_total_alloc; - - -typedef void (buffer_free_func_t)(void*,char*,unsigned); - - -/* - * buffer - the underlying buffer container. with a reference count. - * - * the buffer never shrinks. - * - * some invariants: - * _len never shrinks - * _len <= _alloc_len - */ -class buffer { - protected: - //wtf - //static Mutex bufferlock; - //static long buffer_total_alloc;// = 0; - - private: - // raw buffer alloc - char *_dataptr; - bool _myptr; - unsigned _len; - unsigned _alloc_len; - - // ref counts - unsigned _ref; - int _get() { - bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl; - return ++_ref; - } - int _put() { - bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl; - assert(_ref > 0); - return --_ref; - } - - // custom (de!)allocator - buffer_free_func_t *free_func; - void *free_func_arg; - - friend class bufferptr; - - public: - // constructors - buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) { - bdbout(1) << "buffer.cons " << *this << endl; - } - buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) { - bdbout(1) << "buffer.cons " << *this << endl; - _dataptr = new char[a]; - bufferlock.Lock(); - buffer_total_alloc += _alloc_len; - bufferlock.Unlock(); - bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; - } - ~buffer() { - bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl; - if (free_func) { - bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl; - free_func( free_func_arg, _dataptr, _alloc_len ); - } - else if (_dataptr && _myptr) { - bdbout(1) << "buffer.free " << (void*)_dataptr << endl; - delete[] _dataptr; - buffer_total_alloc -= _alloc_len; - } - } - - buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0, - buffer_free_func_t free_func=0, void* free_func_arg=0) : - _dataptr(0), - _myptr(false), - _len(l), - _ref(0), - free_func(0), free_func_arg(0) { - - if (alloc_len) - _alloc_len = alloc_len; - else - _alloc_len = l; - - _myptr = mode & BUFFER_MODE_FREE ? true:false; - bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl; - if (mode & BUFFER_MODE_COPY) { - _dataptr = new char[_alloc_len]; - bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; - bufferlock.Lock(); - buffer_total_alloc += _alloc_len; - bufferlock.Unlock(); - memcpy(_dataptr, p, l); - bdbout(1) << "buffer.copy " << *this << endl; - } else { - _dataptr = (char*)p; // ugly - bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl; - } - - if (mode & BUFFER_MODE_CUSTOMFREE && free_func) { - this->free_func = free_func; - this->free_func_arg = free_func_arg; - } - } - - // operators - buffer& operator=(buffer& other) { - assert(0); // not implemented, no reasonable assignment semantics. 
- return *this; - } - - char *c_str() { - return _dataptr; - } - - bool has_free_func() { return free_func != 0; } - - // accessor - unsigned alloc_length() { - return _alloc_len; - } - void set_length(unsigned l) { - assert(l <= _alloc_len); - _len = l; - } - unsigned length() { return _len; } - unsigned unused_tail_length() { return _alloc_len - _len; } - - friend ostream& operator<<(ostream& out, buffer& b); -}; - -inline ostream& operator<<(ostream& out, buffer& b) { - return out << "buffer(this=" << &b << " len=" << b._len << ", alloc=" << b._alloc_len << ", data=" << (void*)b._dataptr << " ref=" << b._ref << ")"; -} - - -/* - * smart pointer class for buffer - * - * we reference count the actual buffer. - * we also let you refer to a subset of a buffer. - * we implement the high-level buffer accessor methods. - * - * some invariants: - * _off < _buffer->_len - * _off + _len <= _buffer->_len - */ -class bufferptr { - private: - buffer *_buffer; - unsigned _len, _off; - - public: - // empty cons - bufferptr() : - _buffer(0), - _len(0), - _off(0) { } - // main cons - the entire buffer - bufferptr(buffer *b) : - _buffer(b), - _len(b->_len), - _off(0) { - assert(_buffer->_ref == 0); - _buffer->_get(); // this is always the first one. - } - // subset cons - a subset of another bufferptr (subset) - bufferptr(const bufferptr& bp, unsigned len, unsigned off) { - bufferlock.Lock(); - _buffer = bp._buffer; - _len = len; - _off = bp._off + off; - _buffer->_get(); - assert(_off < _buffer->_len); // sanity checks - assert(_off + _len <= _buffer->_len); - bufferlock.Unlock(); - } - - // copy cons - bufferptr(const bufferptr &other) { - bufferlock.Lock(); - _buffer = other._buffer; - _len = other._len; - _off = other._off; - if (_buffer) _buffer->_get(); - bufferlock.Unlock(); - } - - // assignment operator - bufferptr& operator=(const bufferptr& other) { - //assert(0); - // discard old - discard_buffer(); - - // point to other - bufferlock.Lock(); - _buffer = other._buffer; - _len = other._len; - _off = other._off; - if (_buffer) _buffer->_get(); - bufferlock.Unlock(); - return *this; - } - - ~bufferptr() { - discard_buffer(); - } - - void discard_buffer() { - if (_buffer) { - bufferlock.Lock(); - if (_buffer->_put() == 0) - delete _buffer; - _buffer = 0; - bufferlock.Unlock(); - } - } - - - // dereference to get the actual buffer - buffer& operator*() { - return *_buffer; - } - - - bool at_buffer_head() const { - return _off == 0; - } - bool at_buffer_tail() const { - return _off + _len == _buffer->_len; - } - - // accessors for my subset - char *c_str() { - return _buffer->c_str() + _off; - } - unsigned length() const { - return _len; - } - unsigned offset() const { - return _off; - } - unsigned unused_tail_length() { - if (!at_buffer_tail()) return 0; - return _buffer->unused_tail_length(); - } - - - - // modifiers - void set_offset(unsigned off) { - assert(off <= _buffer->_alloc_len); - _off = off; - } - void set_length(unsigned len) { - assert(len >= 0 && _off + len <= _buffer->_alloc_len); - if (_buffer->_len < _off + len) - _buffer->_len = _off + len; // set new buffer len (_IF_ i'm expanding it) - _len = len; // my len too - } - void zero() { - //bzero((void*)c_str(), _len); - memset((void*)c_str(), 0, _len); - } - - - // crope lookalikes - void append(const char *p, unsigned len) { - assert(len + _len + _off <= _buffer->_alloc_len); // FIXME later for auto-expansion? 
- - // copy - memcpy(c_str() + _len, p, len); - _buffer->_len += len; - _len += len; - } - void copy_out(unsigned off, unsigned len, char *dest) { - assert(off >= 0 && off <= _len); - assert(len >= 0 && off + len <= _len); - memcpy(dest, c_str() + off, len); - } - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0 && off <= _len); - assert(len >= 0 && off + len <= _len); - memcpy(c_str() + off, src, len); - } - - friend ostream& operator<<(ostream& out, bufferptr& bp); -}; - - -inline ostream& operator<<(ostream& out, bufferptr& bp) { - return out << "bufferptr(len=" << bp._len << " off=" << bp._off - << " cstr=" << (void*)bp.c_str() - << " buf=" << *bp._buffer - << ")"; -} - - - -#endif diff --git a/branches/sage/pgs/include/oldbufferlist.h b/branches/sage/pgs/include/oldbufferlist.h deleted file mode 100644 index d6447dd6f6d20..0000000000000 --- a/branches/sage/pgs/include/oldbufferlist.h +++ /dev/null @@ -1,682 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BUFFERLIST_H -#define __BUFFERLIST_H - -#include "buffer.h" - -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// debug crap -#include "config.h" -#define bdbout(x) if (x <= g_conf.debug_buffer) cout - - - -class bufferlist { - private: - /* local state limited to _buffers, and _len. - * we maintain _len ourselves, so we must be careful when fiddling with buffers! - */ - list _buffers; - unsigned _len; - - public: - // cons/des - bufferlist() : _len(0) { - bdbout(1) << "bufferlist.cons " << this << endl; - } - bufferlist(const bufferlist& bl) : _len(0) { - //assert(0); // o(n) and stupid - bdbout(1) << "bufferlist.cons " << this << endl; - _buffers = bl._buffers; - _len = bl._len; - } - ~bufferlist() { - bdbout(1) << "bufferlist.des " << this << endl; - } - - bufferlist& operator=(bufferlist& bl) { - //assert(0); // actually, this should be fine, just slow (O(n)) and stupid. 
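The oldbuffer.h pair removed above splits storage from view: buffer owns the reference-counted allocation, bufferptr is a (len, off) window onto it, and the global bufferlock guards the count. A minimal sketch, assuming the deleted include/oldbuffer.h still compiled and linked (bufferlock and buffer_total_alloc live in config.cc, per the HACK comment above):

    #include "include/oldbuffer.h"   // removed by this patch; assumed still available
    #include <iostream>

    int main() {
      // copy 11 bytes into a fresh reference-counted buffer (default mode: copy + free)
      bufferptr bp(new buffer("hello world", 11));
      bufferptr word(bp, 5, 6);        // subset view: len=5, off=6 -> "world"

      char out[6] = {0};
      word.copy_out(0, 5, out);        // bounds-checked copy out of the view
      std::cout << out << std::endl;   // prints "world"
      std::cout << bp.length() << " bytes, unused tail " << bp.unused_tail_length() << std::endl;
      return 0;                        // last bufferptr out of scope drops the count and frees
    }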
- bdbout(1) << "bufferlist.= " << this << endl; - _buffers = bl._buffers; - _len = bl._len; - return *this; - } - - - // accessors - list& buffers() { - return _buffers; - } - //list::iterator begin() { return _buffers.begin(); } - //list::iterator end() { return _buffers.end(); } - - unsigned length() const { -#if 0 - { // DEBUG: verify _len - int len = 0; - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); - } -#endif - return _len; - } - - void _rope(crope& r) { - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - r.append((*it).c_str(), (*it).length()); - } - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(bufferptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(buffer *b) { - bufferptr bp(b); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(bufferptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void push_back(buffer *b) { - bufferptr bp(b); - - _buffers.push_back(bp); - _len += bp.length(); - - } - void zero() { - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - it->zero(); - } - - // sort-of-like-assignment-op - void claim(bufferlist& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(bufferlist& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - - - - // crope lookalikes - void copy(unsigned off, unsigned len, char *dest) { - assert(off >= 0); - assert(off + len <= length()); - /*assert(off < length()); - if (off + len > length()) - len = length() - off; - */ - // advance to off - list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_out(off, len, dest); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_out(off, howmuch, dest); - - dest += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - // advance to off - list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_in(off, len, src); // yup, last bit! - break; - } - - // get as much as we can from this buffer. 
- unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_in(off, howmuch, src); - - src += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - void copy_in(unsigned off, unsigned len, bufferlist& bl) { - unsigned left = len; - for (list::iterator i = bl._buffers.begin(); - i != bl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(off, l, (*i).c_str()); - left -= l; - if (left == 0) break; - off += l; - } - } - - - void append(const char *data, unsigned len) { - if (len == 0) return; - - unsigned alen = 0; - - // copy into the tail buffer? - if (!_buffers.empty()) { - unsigned avail = _buffers.back().unused_tail_length(); - if (avail > 0) { - //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl; - if (avail > len) - avail = len; - unsigned blen = _buffers.back().length(); - memcpy(_buffers.back().c_str() + blen, data, avail); - blen += avail; - _buffers.back().set_length(blen); - _len += avail; - data += avail; - len -= avail; - } - alen = _buffers.back().length(); - } - if (len == 0) return; - - // just add another buffer. - // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter! - if (alen < 1024) alen = 1024; - push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen)); - } - void append(bufferptr& bp) { - push_back(bp); - } - void append(bufferptr& bp, unsigned len, unsigned off) { - bufferptr tempbp(bp, len, off); - push_back(tempbp); - } - void append(const bufferlist& bl) { - bufferlist temp = bl; // copy list - claim_append(temp); // and append - } - - - /* - * return a contiguous ptr to whole bufferlist contents. - */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. - } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - // make one new contiguous buffer. - bufferptr newbuf = new buffer(length()); - unsigned off = 0; - - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - //assert((*(*it)).has_free_func() == false); // not allowed if there's a funky free_func.. -sage ...for debugging at least! - memcpy(newbuf.c_str() + off, - (*it).c_str(), (*it).length()); - off += (*it).length(); - } - assert(off == newbuf.length()); - - _buffers.clear(); - _buffers.push_back( newbuf ); - - // now it'll work. - return c_str(); - } - } - - - void substr_of(bufferlist& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - list::iterator curbuf = other._buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "somewhere in " << *curbuf << endl; - break; - } - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "copying partial of " << *curbuf << endl; - _buffers.push_back( bufferptr( *curbuf, len, off ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) of " << *curbuf << endl; - unsigned howmuch = (*curbuf).length() - off; - _buffers.push_back( bufferptr( *curbuf, howmuch, off ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - // funky modifer - void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme? 
- assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl; - - // skip off - list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "off = " << off << " somewhere in " << *curbuf << endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << endl; - _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl; - if (claim_by) - claim_by->append( *curbuf, len, off ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << endl; - if (claim_by) - claim_by->append( *curbuf, howmuch, off ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) - } - - friend ostream& operator<<(ostream& out, bufferlist& bl); - -}; - -inline ostream& operator<<(ostream& out, bufferlist& bl) { - out << "bufferlist(len=" << bl.length() << endl; - for (list::iterator it = bl._buffers.begin(); - it != bl._buffers.end(); - it++) - out << "\t" << *it << endl; - out << ")" << endl; - return out; -} - - - -// encoder/decode helpers - -// string -inline void _encode(const string& s, bufferlist& bl) -{ - bl.append(s.c_str(), s.length()+1); -} -inline void _decode(string& s, bufferlist& bl, int& off) -{ - s = bl.c_str() + off; - off += s.length() + 1; -} - -// bufferptr (encapsulated) -inline void _encode(bufferptr& bp, bufferlist& bl) -{ - size_t len = bp.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(bp); -} -inline void _decode(bufferptr& bp, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - bufferlist s; - s.substr_of(bl, off, len); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = new buffer(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - size_t len = s.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - s.substr_of(bl, off, len); - off += len; -} - - -// set -template -inline void _encode(set& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename set::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(set& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(vector& s, bufferlist& bl) -{ - int n = s.size(); - 
bl.append((char*)&n, sizeof(n)); - for (typename vector::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(vector& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - s = vector(n); - for (int i=0; i -template -inline void _encode(const list& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename list::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(list& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (map::iterator it = s.begin(); - it != s.end(); - it++) { - _encode(it->first, bl); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -template -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - U v = it->second; - bl.append((char*)&k, sizeof(k)); - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __RANGESET_H -#define __RANGESET_H - -/* - * - * my first container with iterator! it's pretty ugly. - * - */ - -#include -#include -#include -using namespace std; - -//typedef int T; - -template -struct _rangeset_base { - map ranges; // pair(first,last) (inclusive, e.g. [first,last]) - - typedef typename map::iterator mapit; - - // get iterator for range including val. or ranges.end(). 
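The _encode/_decode helpers above serialize containers as a raw element count followed by the elements themselves, memcpy'd for plain-old-data types and recursively encoded for strings and bufferlists; decoding walks an int offset through the bufferlist. A round-trip sketch for a map<int,int>, assuming the deleted include/oldbufferlist.h were still available:

    #include "include/oldbufferlist.h"   // removed by this patch; assumed still available
    #include <cassert>
    #include <map>

    int main() {
      std::map<int,int> in, out;
      in[1] = 10;
      in[2] = 20;

      bufferlist bl;
      _encode(in, bl);          // appends the count, then each (key, value) pair

      int off = 0;
      _decode(out, bl, off);    // advances off past everything it consumed
      assert(out == in);
      assert(off == (int)bl.length());
      return 0;
    }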
- mapit get_range_for(T val) { - mapit it = ranges.lower_bound(val); - if (it == ranges.end()) { - // search backwards - typename map::reverse_iterator it = ranges.rbegin(); - if (it == ranges.rend()) return ranges.end(); - if (it->first <= val && it->second >= val) - return ranges.find(it->first); - return ranges.end(); - } else { - if (it->first == val) return - it--; - if (it->first <= val && it->second >= val) - return it; - return ranges.end(); - } - } - -}; - - -template -class rangeset_iterator : - public std::iterator -{ - //typedef typename map::iterator mapit; - - map ranges; - typename map::iterator it; - T current; - -public: - // cons - rangeset_iterator() {} - - rangeset_iterator(typename map::iterator& it, map& ranges) { - this->ranges = ranges; - this->it = it; - if (this->it != ranges.end()) - current = it->first; - } - - bool operator==(rangeset_iterator rit) { - return (it == rit.it && rit.current == current); - } - bool operator!=(rangeset_iterator rit) { - return (it != rit.it) || (rit.current != current); - } - - T& operator*() { - return current; - } - - rangeset_iterator operator++(int) { - if (current < it->second) - current++; - else { - it++; - if (it != ranges.end()) - current = it->first; - } - - return *this; - } -}; - - -template -class rangeset -{ - typedef typename map::iterator map_iterator; - - _rangeset_base theset; - inodeno_t _size; - -public: - rangeset() { _size = 0; } - typedef rangeset_iterator iterator; - - iterator begin() { - map_iterator it = theset.ranges.begin(); - return iterator(it, theset.ranges); - } - - iterator end() { - map_iterator it = theset.ranges.end(); - return iterator(it, theset.ranges); - } - - map_iterator map_begin() { - return theset.ranges.begin(); - } - map_iterator map_end() { - return theset.ranges.end(); - } - int map_size() { - return theset.ranges.size(); - } - - void map_insert(T v1, T v2) { - theset.ranges.insert(pair(v1,v2)); - _size += v2 - v1+1; - } - - - // ... - bool contains(T val) { - if (theset.get_range_for(val) == theset.ranges.end()) return false; - assert(!empty()); - return true; - } - - void insert(T val) { - assert(!contains(val)); - - map_iterator left = theset.get_range_for(val-1); - map_iterator right = theset.get_range_for(val+1); - - if (left != theset.ranges.end() && - right != theset.ranges.end()) { - // join! 
- left->second = right->second; - theset.ranges.erase(right); - _size++; - return; - } - - if (left != theset.ranges.end()) { - // add to left range - left->second = val; - _size++; - return; - } - - if (right != theset.ranges.end()) { - // add to right range - theset.ranges.insert(pair(val, right->second)); - theset.ranges.erase(val+1); - _size++; - return; - } - - // new range - theset.ranges.insert(pair(val,val)); - _size++; - return; - } - - unsigned size() { - return size(); - } - - bool empty() { - if (theset.ranges.empty()) { - assert(_size == 0); - return true; - } - assert(_size>0); - return false; - } - - - T first() { - assert(!empty()); - map_iterator it = theset.ranges.begin(); - return it->first; - } - - void erase(T val) { - assert(contains(val)); - map_iterator it = theset.get_range_for(val); - assert(it != theset.ranges.end()); - - // entire range - if (val == it->first && val == it->second) { - theset.ranges.erase(it); - _size--; - return; - } - - // beginning - if (val == it->first) { - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - // end - if (val == it->second) { - it->second = val-1; - _size--; - return; - } - - // middle split - theset.ranges.insert(pair(it->first, val-1)); - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - void dump() { - for (typename map::iterator it = theset.ranges.begin(); - it != theset.ranges.end(); - it++) { - cout << " " << it->first << "-" << it->second << endl; - } - } - -}; - - -#endif diff --git a/branches/sage/pgs/include/statlite.h b/branches/sage/pgs/include/statlite.h deleted file mode 100644 index a9c0433e4a4e8..0000000000000 --- a/branches/sage/pgs/include/statlite.h +++ /dev/null @@ -1,72 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef _STATLITE_H -#define _STATLITE_H - -extern "C" { - -#include -#include -#include -#include -#include - -struct statlite { - dev_t st_dev; /* device */ - ino_t st_ino; /* inode */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device type (if inode device)*/ - unsigned long st_litemask; /* bit mask for optional fields */ - /***************************************************************/ - /**** Remaining fields are optional according to st_litemask ***/ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for filesystem I/O */ - blkcnt_t st_blocks; /* number of blocks allocated */ - struct timespec st_atim; /* Time of last access. */ - struct timespec st_mtim; /* Time of last modification. */ - struct timespec st_ctim; /* Time of last status change. 
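rangeset<T> above stores maximal inclusive runs as (first, last) map entries, joining neighbours on insert and trimming or splitting on erase; note that size() as written returns size() rather than _size, so the sketch below sticks to map_size(). A usage sketch, assuming the deleted include/rangeset.h (and include/types.h, which supplies the inodeno_t it uses internally) were still available:

    #include "include/types.h"      // for inodeno_t; removed by this patch, assumed still available
    #include "include/rangeset.h"   // removed by this patch; assumed still available
    #include <cassert>

    int main() {
      rangeset<int> rs;
      rs.insert(5);
      rs.insert(7);
      rs.insert(6);                 // joins [5,5] and [7,7] into one run [5,7]
      assert(rs.contains(6));
      assert(rs.map_size() == 1);   // a single merged run
      assert(rs.first() == 5);

      rs.erase(7);                  // trims the run back to [5,6]
      assert(!rs.contains(7));
      rs.dump();                    // prints " 5-6"
      return 0;
    }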
*/ - //time_t st_atime; /* time of last access */ - //time_t st_mtime; /* time of last modification */ - //time_t st_ctime; /* time of last change */ -}; - -#define S_STATLITE_SIZE 1 -#define S_STATLITE_BLKSIZE 2 -#define S_STATLITE_BLOCKS 4 -#define S_STATLITE_ATIME 8 -#define S_STATLITE_MTIME 16 -#define S_STATLITE_CTIME 32 - -#define S_REQUIRESIZE(m) (m | S_STATLITE_SIZE) -#define S_REQUIREBLKSIZE(m) (m | S_STATLITE_BLKSIZE) -#define S_REQUIREBLOCKS(m) (m | S_STATLITE_BLOCKS) -#define S_REQUIREATIME(m) (m | S_STATLITE_ATIME) -#define S_REQUIREMTIME(m) (m | S_STATLITE_MTIME) -#define S_REQUIRECTIME(m) (m | S_STATLITE_CTIME) - -#define S_ISVALIDSIZE(m) (m & S_STATLITE_SIZE) -#define S_ISVALIDBLKSIZE(m) (m & S_STATLITE_BLKSIZE) -#define S_ISVALIDBLOCKS(m) (m & S_STATLITE_BLOCKS) -#define S_ISVALIDATIME(m) (m & S_STATLITE_ATIME) -#define S_ISVALIDMTIME(m) (m & S_STATLITE_MTIME) -#define S_ISVALIDCTIME(m) (m & S_STATLITE_CTIME) - - -// readdirplus etc. - -struct dirent_plus { - struct dirent d_dirent; /* dirent struct for this entry */ - struct stat d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; -struct dirent_lite { - struct dirent d_dirent; /* dirent struct for this entry */ - struct statlite d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; - -} -#endif diff --git a/branches/sage/pgs/include/types.h b/branches/sage/pgs/include/types.h deleted file mode 100644 index d13937b39da3d..0000000000000 --- a/branches/sage/pgs/include/types.h +++ /dev/null @@ -1,336 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_TYPES_H -#define __MDS_TYPES_H - -extern "C" { -#include -#include -#include -#include -#include "statlite.h" -} - -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - - -#include "object.h" -#include "utime.h" - - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a) > (b) ? 
(a):(b)) -#endif - - -// -- stl crap -- - -namespace __gnu_cxx { - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; - -#ifndef __LP64__ - template<> struct hash { - size_t operator()(int64_t __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - -} - - -/* - * comparators for stl containers - */ -// for hash_map: -// hash_map, eqstr> vals; -struct eqstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) == 0; - } -}; - -// for set, map -struct ltstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } -}; - - - -// ---------------------- -// some basic types - -typedef uint64_t tid_t; // transaction id -typedef uint64_t version_t; -typedef uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) - - -// object and pg layout -// specified in g_conf.osd_* - -#define O_LAZY 01000000 - - -/** object layout - * how objects are mapped into PGs - */ -#define OBJECT_LAYOUT_HASH 1 -#define OBJECT_LAYOUT_LINEAR 2 -#define OBJECT_LAYOUT_HASHINO 3 - -/** pg layout - * how PGs are mapped into (sets of) OSDs - */ -#define PG_LAYOUT_CRUSH 0 -#define PG_LAYOUT_HASH 1 -#define PG_LAYOUT_LINEAR 2 -#define PG_LAYOUT_HYBRID 3 - - - -// ----------------------- -// FileLayout - -/** FileLayout - * specifies a striping and replication strategy - */ - -//#define FILE_LAYOUT_CRUSH 0 // stripe via crush -//#define FILE_LAYOUT_LINEAR 1 // stripe linearly across cluster - -struct FileLayout { - // -- file -> object mapping -- - int stripe_unit; // stripe unit, in bytes - int stripe_count; // over this many objects - int object_size; // until objects are this big, then move to new objects - - int stripe_width() { return stripe_unit * stripe_count; } - - // period = bytes before i start on a new set of objects. - int period() { return object_size * stripe_count; } - - // -- object -> pg layout -- - char pg_type; // pg type (replicated, raid, etc.) (see pg_t::TYPE_*) - char pg_size; // pg size (num replicas, or raid4 stripe width) - int preferred; // preferred primary osd? 
- - // -- pg -> disk layout -- - int object_stripe_unit; // for per-object raid - - FileLayout() { } - FileLayout(int su, int sc, int os, int pgt, int pgs, int o=-1) : - stripe_unit(su), stripe_count(sc), object_size(os), - pg_type(pgt), pg_size(pgs), preferred(o), - object_stripe_unit(su) // note: bad default, we pbly want su/(pgs-1) - { - assert(object_size % stripe_unit == 0); - } - -}; - - - - -// -------------------------------------- -// inode - -typedef uint64_t _inodeno_t; - -struct inodeno_t { - _inodeno_t val; - inodeno_t() : val(0) {} - inodeno_t(_inodeno_t v) : val(v) {} - inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } - operator _inodeno_t() const { return val; } -}; - -inline ostream& operator<<(ostream& out, inodeno_t ino) { - return out << hex << ino.val << dec; -} - -namespace __gnu_cxx { - template<> struct hash< inodeno_t > - { - size_t operator()( const inodeno_t& x ) const - { - static hash H; - return H(x.val); - } - }; -} - - -#define INODE_MODE_FILE 0100000 // S_IFREG -#define INODE_MODE_SYMLINK 0120000 // S_IFLNK -#define INODE_MODE_DIR 0040000 // S_IFDIR -#define INODE_TYPE_MASK 0170000 - -#define FILE_MODE_R 1 -#define FILE_MODE_W 2 -#define FILE_MODE_RW (1|2) -#define FILE_MODE_LAZY 4 - -#define INODE_MASK_BASE 1 // ino, layout, symlink value -#define INODE_MASK_AUTH 2 // uid, gid, mode -#define INODE_MASK_LINK 4 // nlink, anchored -#define INODE_MASK_FILE 8 // mtime, size. -// atime? - -#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_AUTH|INODE_MASK_LINK|INODE_MASK_FILE) - -#define INODE_MASK_SIZE INODE_MASK_FILE // size, blksize, blocks -#define INODE_MASK_MTIME INODE_MASK_FILE // mtime -#define INODE_MASK_ATIME INODE_MASK_FILE // atime -#define INODE_MASK_CTIME (INODE_MASK_FILE|INODE_MASK_AUTH|INODE_MASK_LINK) // ctime - -struct inode_t { - // base (immutable) - inodeno_t ino; - FileLayout layout; // ?immutable? - - // affected by any inode change... - utime_t ctime; // inode change time - - // perm (namespace permissions) - mode_t mode; - uid_t uid; - gid_t gid; - - // nlink - int nlink; - bool anchored; // auth only? - - // file (data access) - off_t size, max_size, allocated_size; - utime_t mtime; // file data modify time. - utime_t atime; // file data access time. - - // special stuff - int mask; // used for client stat. hack. 
- version_t version; // auth only - version_t file_data_version; // auth only - - bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } - bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } - bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } -}; - - - - - - -// dentries -#define MAX_DENTRY_LEN 255 - - - - -// -- io helpers -- - -template -inline ostream& operator<<(ostream& out, pair v) { - return out << v.first << "," << v.second; -} - -template -inline ostream& operator<<(ostream& out, vector& v) { - out << "["; - for (unsigned i=0; i -inline ostream& operator<<(ostream& out, const list& ilist) { - for (typename list::const_iterator it = ilist.begin(); - it != ilist.end(); - it++) { - if (it != ilist.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const set& iset) { - for (typename set::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const multiset& iset) { - for (typename multiset::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const map& m) -{ - out << "{"; - for (typename map::const_iterator it = m.begin(); - it != m.end(); - it++) { - if (it != m.begin()) out << ","; - out << it->first << "=" << it->second; - } - out << "}"; - return out; -} - - - -#endif diff --git a/branches/sage/pgs/include/uofs.h b/branches/sage/pgs/include/uofs.h deleted file mode 100644 index a4673aaa616ea..0000000000000 --- a/branches/sage/pgs/include/uofs.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
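FileLayout in the types.h removed above pins down the striping arithmetic: stripe_width() is stripe_unit * stripe_count bytes per stripe, and period() is object_size * stripe_count bytes before a new object set begins (the constructor also insists object_size be a multiple of stripe_unit). A worked sketch of just that arithmetic, assuming the deleted include/types.h were still available; the layout values are made up:

    #include "include/types.h"   // removed by this patch; assumed still available
    #include <iostream>

    int main() {
      FileLayout layout;                 // default-constructed; only the fields used below are set
      layout.stripe_unit  = 64*1024;     // 64 KB per stripe unit
      layout.stripe_count = 4;           // striped across 4 objects at a time
      layout.object_size  = 1024*1024;   // move to a new object set once 1 MB objects fill up

      std::cout << "stripe_width = " << layout.stripe_width() << " bytes" << std::endl;   // 262144
      std::cout << "period       = " << layout.period()       << " bytes" << std::endl;   // 4194304
      return 0;
    }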
- * - */ - - -/* - * uofs.h - * - * user-level object-based file system - */ - - #ifndef _UOFS_H_ - #define _UOFS_H_ - - #include - #include - #include - - - int device_open(char *path, int xflags); - void device_findsizes(int fd, long long *sz, int *bsz); - - int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size, - int nr_hash_table_buckets, int delay_allocation, int flush_interval); - - int uofs_mount(int bdev_id); - void uofs_shutdown(void); - - int uofs_read(long long oid, void *buf, off_t offset, size_t count); - int uofs_write(long long oid, void *buf, off_t offset, size_t count); - int uofs_del(long long oid); - int uofs_sync(long long oid); - int uofs_exist(long long oid); - - int uofs_get_size(long long oid); - - void uofs_superblock_printout(void); - int get_large_object_pages(void); - - int uofs_buffer_size(void); - #endif diff --git a/branches/sage/pgs/include/utime.h b/branches/sage/pgs/include/utime.h deleted file mode 100644 index 03fed6744f4e2..0000000000000 --- a/branches/sage/pgs/include/utime.h +++ /dev/null @@ -1,147 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __UTIME_H -#define __UTIME_H - -#include -#include -#include - -// -------- -// utime_t - -typedef struct timeval _utime_t; - -class utime_t { - private: - struct timeval tv; - - struct timeval& timeval() { return tv; } - friend class Clock; - - - public: - void normalize() { - if (tv.tv_usec > 1000*1000) { - tv.tv_sec += tv.tv_usec / (1000*1000); - tv.tv_usec %= 1000*1000; - } - } - - // cons - utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); } - //utime_t(time_t s) { tv.tv_sec = s; tv.tv_usec = 0; } - utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); } - utime_t(const _utime_t &v) : tv(v) {} - /* - utime_t(double d) { - tv.tv_sec = (time_t)trunc(d); - tv.tv_usec = (__suseconds_t)((d - tv.tv_sec) / (double)1000000.0); - } - */ - - // accessors - time_t sec() const { return tv.tv_sec; } - long usec() const { return tv.tv_usec; } - int nsec() const { return tv.tv_usec*1000; } - - // ref accessors/modifiers - time_t& sec_ref() { return tv.tv_sec; } - // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin. - // is just casting it to long& OK? 
- long& usec_ref() { return (long&) tv.tv_usec; } - - // cast to double - operator double() { - return (double)sec() + ((double)usec() / 1000000.0L); - } -}; - -// arithmetic operators -inline utime_t operator+(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, - (l.usec()+r.usec())%1000000L ); -} -inline utime_t& operator+=(utime_t& l, const utime_t& r) { - l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; - l.usec_ref() += r.usec(); - l.usec_ref() %= 1000000L; - return l; -} -inline utime_t& operator+=(utime_t& l, double f) { - double fs = trunc(f); - double us = (f - fs) / (double)1000000.0; - l.sec_ref() += (long)fs; - l.usec_ref() += (long)us; - l.normalize(); - return l; -} - -inline utime_t operator-(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() - r.sec() - (l.usec()= r.usec()) - l.usec_ref() -= r.usec(); - else { - l.usec_ref() += 1000000L - r.usec(); - l.sec_ref()--; - } - return l; -} -inline utime_t& operator-=(utime_t& l, double f) { - l += -f; - return l; -} - -inline bool operator>(const utime_t& a, const utime_t& b) -{ - return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); -} -inline bool operator<(const utime_t& a, const utime_t& b) -{ - return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); -} - -// ostream -inline std::ostream& operator<<(std::ostream& out, const utime_t& t) -{ - out.setf(std::ios::right); - out.fill('0'); - if (t.sec() < ((time_t)(60*60*24*365*10))) { - // raw seconds. this looks like a relative time. - out << (long)t.sec(); - } else { - // localtime. this looks like an absolute time. - struct tm bdt; - time_t tt = t.sec(); - localtime_r(&tt, &bdt); - out << std::setw(2) << (bdt.tm_year-100) // 2007 -> '07' - << std::setw(2) << bdt.tm_mon - << std::setw(2) << bdt.tm_mday - << "." - << std::setw(2) << bdt.tm_hour - << std::setw(2) << bdt.tm_min - << std::setw(2) << bdt.tm_sec; - } - out << "."; - out << std::setw(6) << t.usec(); - out.unsetf(std::ios::right); - return out; -} - -#endif diff --git a/branches/sage/pgs/jobs/alc.tp b/branches/sage/pgs/jobs/alc.tp deleted file mode 100644 index c600850c54be0..0000000000000 --- a/branches/sage/pgs/jobs/alc.tp +++ /dev/null @@ -1,38 +0,0 @@ -#PSUB -s /bin/bash # Sets your shell in batch -#PSUB -c alc # Where to run the job - -#PSUB -eo # Send std error & std out to the same file - -#PSUB -ln $NUM # Number of nodes to use -#PSUB -g $NUM # Total Number of tasks to use -#PSUB -cpn 1 # cpus per node - -####PSUB -c 1024Mb # memory limit -#PSUB -lc 1500 # Core file size per process -#PSUB -nr # Do not automatically resubmit job -#PSUB -tM 20m # Select time limit. The default time limit - # is only 30 minutes! Time can be HH:MM:SS or HH:MM - -#PSUB -o $CWD/$OUT # filename for output - -# Put your commands here. Remember to 'cd' to the appropriate -# directory, because the job will initially be in your home directory. -# To run a parallel job, you need to use the srun. 
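utime_t in the include/utime.h removed above wraps struct timeval with carry-aware arithmetic (normalize() folds microsecond overflow into whole seconds) and prints small values as raw seconds.microseconds. A small sketch, assuming the deleted header were still available:

    #include "include/utime.h"   // removed by this patch; assumed still available
    #include <iostream>

    int main() {
      utime_t a(10, 900000);     // 10.9 s
      utime_t b(0, 200000);      //  0.2 s
      utime_t sum  = a + b;      // microsecond carry into seconds: 11.100000
      utime_t diff = a - b;      // 10.700000

      std::cout << sum << " " << diff << std::endl;   // values this small print as raw seconds
      std::cout << (double)sum << std::endl;          // 11.1
      return 0;
    }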
- - - -echo job $PSUB_JOBID nodes $NUM name $NAME - -# environment -cd $CWD -export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib - -# create fakestore dirs -srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata ; uptime" - -# go -srun -l -N $NUM -ppbatch $CMD && touch $DONE - -# clean up fakestore -srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*' - diff --git a/branches/sage/pgs/jobs/alcdat/makedirs b/branches/sage/pgs/jobs/alcdat/makedirs deleted file mode 100644 index af5a098a254c9..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/makedirs +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds > 1 ? $nummds:2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => [2], - - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/makedirs.big b/branches/sage/pgs/jobs/alcdat/makedirs.big deleted file mode 100644 index c67b2b93dd742..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/makedirs.big +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .8', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/makedirs.tput b/branches/sage/pgs/jobs/alcdat/makedirs.tput deleted file mode 100644 index 8dd5ae4c47d8c..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/makedirs.tput +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208], - - #'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150], - 'cper' => [13, 30, 40], # just for final run... - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'cper',#nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/makefiles.shared b/branches/sage/pgs/jobs/alcdat/makefiles.shared deleted file mode 100644 index ab96702c73289..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/makefiles.shared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96], - - 'cper' => [25, 50, 100, 150],# 100, 150, 200], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_hash_wr' => 1000, - - 'until' => 180, # --syn until $n ... when to stop clients - 'kill_after' => 250, - 'start' => 30, - 'end' => 180, - - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/openshared b/branches/sage/pgs/jobs/alcdat/openshared deleted file mode 100644 index 5ed7ba95894b3..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/openshared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 4, 16, 64, 128, 192 ], - - 'cper' => [10, 50, 100, 150], - '_dep' => [ 'cnode' => '$nummds',# > 30 ? 30:$nummds', - 'numclient' => '$nummds*$cper', - 'numosd' => '$nummds > 30 ? 
30:$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_interval' => 10000, - 'mds_bal_hash_wr' => 1000, - - 'until' => 120, # --syn until $n ... when to stop clients - 'kill_after' => 180, - 'start' => 10, - 'end' => 120, - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/ossh.include b/branches/sage/pgs/jobs/alcdat/ossh.include deleted file mode 100644 index c9a368ba5c60f..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/ossh.include +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - 'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/ossh.include.big b/branches/sage/pgs/jobs/alcdat/ossh.include.big deleted file mode 100644 index b92895a53a763..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/ossh.include.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - 'nummds' => [160,200], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/ossh.lib b/branches/sage/pgs/jobs/alcdat/ossh.lib deleted file mode 100644 index 73372866f051f..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/ossh.lib +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/ossh.lib.big b/branches/sage/pgs/jobs/alcdat/ossh.lib.big deleted file mode 100644 index b9e0dd1ff68cd..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/ossh.lib.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [160,200], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => 0,#'30', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#'1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/alcdat/striping b/branches/sage/pgs/jobs/alcdat/striping deleted file mode 100644 index de71828d12bde..0000000000000 --- a/branches/sage/pgs/jobs/alcdat/striping +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 10, - - 'cnode' => 10, - 'cper' => [ 10, 25, 50, 100 ], - - '_dep' => [ 'numclient' => '$cper * $cnode', - 'n' => '1 + $cnode + $nummds + $numosd', - 'file_layout_osize' => '$writefile_size' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 160, # --syn until $n ... when to stop clients - 'kill_after' => 200, - 'start' => 100, - 'end' => 160, - - 'writefile' => 1, - 'writefile_size' => [ -# 4*1024*1024, - 1024*1024 ], -# 256*1024, -# 64*1024 - 'writefile_mb' => 100000, - - 'osd_pg_bits' => 10,#16, - #'osd_pg_bits' => [ 16, 20 ], - - #'osd_object_layout' => [ 'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', -# 'hash', - 'linear' ], - - 'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60', - - 'comb' => { - 'x' => 'cper',#writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.c_wr' ], - } -}; diff --git a/branches/sage/pgs/jobs/example b/branches/sage/pgs/jobs/example deleted file mode 100644 index 802a8b66e6332..0000000000000 --- a/branches/sage/pgs/jobs/example +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # number of mpi nodes - 'sleep' => 3, # seconds to sleep between runs (so you have time to control-c out) - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100, - - 'until' => 100, # --syn until $n ... synthetic client will stop itself after this many seconds. - 'kill_after' => 300, # seconds before everything commits suicide (in case something hangs) - - # stuff i want to vary - # here's a simple example: - - # do --syn writefile command - 'writefile' => 1, - # and very the write size - 'writefile_size' => [ # vary -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, # each client shoudl write 1GB (or more likely, keep going until time runs out) - - 'file_layout_num_rep'=> [1,2], # also vary the replication level - - # pass some other random things to newsyn - 'custom' => '--', - - # for final summation (script/sum.pl) - # specify time period to look at the results - 'start' => 30, # skip first 30 seconds, so that caches are full etc. - 'end' => 90, # go for 60 seconds - - # what should i parse/plot? - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.r_wrb' ], - } -}; diff --git a/branches/sage/pgs/jobs/mds/log_striping b/branches/sage/pgs/jobs/mds/log_striping deleted file mode 100644 index 46242cdda4f00..0000000000000 --- a/branches/sage/pgs/jobs/mds/log_striping +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - 'kill_after' => 300, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100, - 'n' => 16, - - # parameters - 'fs' => ['ebofs','fakestore'], - 'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ], - 'meta_log_scount' => 4,#[ 1, 2, 4, 8 ], - - 'until' => 200, # --syn until $n ... 
when to stop clients - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/mds/makedir_lat b/branches/sage/pgs/jobs/mds/makedir_lat deleted file mode 100644 index 63374f52a36c0..0000000000000 --- a/branches/sage/pgs/jobs/mds/makedir_lat +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 20, - - 'fs' => 'ebofs', - - 'start' => 20, - 'end' => 40, - 'until' => 40, - 'kill_after' => 60, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 5, - - 'mds_local_osd' => [ 0, 1 ], - 'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'meta_log_layout_num_rep', - 'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ] - } -}; diff --git a/branches/sage/pgs/jobs/mds/makedirs b/branches/sage/pgs/jobs/mds/makedirs deleted file mode 100644 index 4ca42d72fa37e..0000000000000 --- a/branches/sage/pgs/jobs/mds/makedirs +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_psub' => 'jobs/alc.tp', - - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], - - 'cper' => 50, - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$cnode * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - #'fs' => 'ebofs', - 'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/mds/opensshlib b/branches/sage/pgs/jobs/mds/opensshlib deleted file mode 100644 index d8b61ae52c655..0000000000000 --- a/branches/sage/pgs/jobs/mds/opensshlib +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 7], # googoo - #'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc - - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'mds_bal_interval' => 90,#[60, 90], - #'mds_bal_max' => [3,4,5], - 'mds_bal_max' => 4, - 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60], - 'mds_bal_rep' => 1500,#[1000, 1500, 2000], - - 'decay_hl' => 100,#[ 25, 50, 100, 150 ], - - 'cper' => 100, #[50, 75, 100, 125, 150, 200], - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd', - 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'], - - 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'nummds',#decay_hl',#'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/meta1 b/branches/sage/pgs/jobs/meta1 deleted file mode 100644 index 743212f1c3009..0000000000000 --- a/branches/sage/pgs/jobs/meta1 +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -# makedirs for 300 seconds -# first bit in memory -# second bit is commiting from journal too -# then walk fs for 300 seconds -# this should all be in memory. - -JOB="meta1" -ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000" - -#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1 -#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2 -#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4 -#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8 -#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12 -rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16 - - diff --git a/branches/sage/pgs/jobs/meta1.proc.sh b/branches/sage/pgs/jobs/meta1.proc.sh deleted file mode 100755 index 616acbefff619..0000000000000 --- a/branches/sage/pgs/jobs/meta1.proc.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -for d in 1 2 4 8 12 -do - echo $d - cd $d - ../../../script/sum.pl mds? mds?? > mds.sum - ../../../script/sum.pl -avg mds? mds?? > mds.avg - - ../../../script/sum.pl -start 90 -end 180 mds? mds?? > mds.sum.makedirs - ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk - - cd .. -done diff --git a/branches/sage/pgs/jobs/osd/ebofs b/branches/sage/pgs/jobs/osd/ebofs deleted file mode 100644 index 5d11523f6f832..0000000000000 --- a/branches/sage/pgs/jobs/osd/ebofs +++ /dev/null @@ -1,51 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100,#[10, 50, 100, 200, 400], - -'kill_after' => 200, - - # parameters - 'fs' => 'ebofs',#[ -# 'obfs', -# 'fakestore', -# 'ebofs' -# ], - 'until' => 100, # --syn until $n ... 
when to stop clients - 'writefile' => 1, - 'writefile_size' => [ -# 2560000, - 1024000, - 262144, -# 131072, -# 98304, - 65536, -# 16384, -# 4096, - 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'ebofs_idle_commit_ms' => [ 100, 500 ], - 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - -# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0', - 'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - -'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/branches/sage/pgs/jobs/osd/mds_log b/branches/sage/pgs/jobs/osd/mds_log deleted file mode 100644 index 0f99f6998dcfc..0000000000000 --- a/branches/sage/pgs/jobs/osd/mds_log +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - #'_psub' => 'jobs/alc.tp', - 'sleep' => 3, - - 'nummds' => 1, - 'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400], - #'numclient' => [ 50, 100, 200 ], - 'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ], - 'n' => 12, - - # parameters - 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'], - #'fs' => 'ebofs', - #'ebofs_commit_ms' => [ 1000, 5000 ], - #'osd_maxthreads' => [ 0, 1, 2, 4, 8 ], - - 'until' => 100, # --syn until $n ... when to stop clients - 'kill_after' => 300, - 'start' => 20, - 'end' => 90, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - - #'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400], - #'meta_log_layout_scount' => [2, 4, 8], - #'meta_log_layout_num_rep' => [1, 2], - #'meta_log_layout_num_rep' => 1, - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'numclient',#'meta_log_layout_ssize', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/pgs/jobs/osd/osd_threads b/branches/sage/pgs/jobs/osd/osd_threads deleted file mode 100644 index ef271f9e88710..0000000000000 --- a/branches/sage/pgs/jobs/osd/osd_threads +++ /dev/null @@ -1,33 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 50, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', - 'ebofs' - ], - 'until' => 100, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ - 1024000, - 131072, - 65536, - 16 - ], - 'writefile_mb' => 1000, - - 'osd_maxthreads' => [0, 1, 2, 4, 8], - - 'custom' => '--tcp_skip_rank0', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90 -}; diff --git a/branches/sage/pgs/jobs/osd/striping b/branches/sage/pgs/jobs/osd/striping deleted file mode 100644 index ea8cabe643274..0000000000000 --- a/branches/sage/pgs/jobs/osd/striping +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - #'n' => 28, # mpi nodes - - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - - 'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16], - 'numosd' => [14], - #'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64], - #'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64], - 'cper' => [30], - - '_dep' => [ 'cnode' => '$numosd', - 'numclient' => '$cnode * $cper', - 'n' => 38],#'$nummds + $numosd + $cnode'], - #'numclient' => [5, 10, 20, 50, 75, 100, 150 ], - - 'start' => 30, - 'end' => 90, - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 260, - - # parameters - 'fs' => 'ebofs', - 'writefile' => 1, - - 'writefile_size' => [# 4096, - # 16*1024, - # 64*1024, - # 256*1024, - 1024*1024 ], -# 'writefile_size' => [ -# 2048*1024, -# 1048576, -# 512*1024, -# 262144, -# 65536, -# 16384 -# ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> [1,2,3], - - 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14], - - 'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ], - - #'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'numosd',#'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; - - -=item some googoo notes - -for 1mb 1x writes, - - with numosd=6, min cper=6 to saturate (cper_saturate) - googoo saturates at numosd=8. (osd_saturate) - - -> so, numosd=6 or 7 is a safe size! - - - - -=cut diff --git a/branches/sage/pgs/jobs/osd/wr_lat2 b/branches/sage/pgs/jobs/osd/wr_lat2 deleted file mode 100644 index 47053dd61f3ab..0000000000000 --- a/branches/sage/pgs/jobs/osd/wr_lat2 +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [12], - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 16, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 90, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - 512*1024, - 1024*1024], - 'writefile_mb' => 10000, - - #'tcp_multi_out' => [0,1], - -# 'mds_local_osd' => [ 0, 1 ], - 'file_layout_num_rep' => [1,2,3],#, 2, 3, 4], - - 'client_oc' => [0,1], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size',#'file_layout_num_rep', - 'vars' => [ 'osd.c_wrb','cl.wrlat' ] - } -}; diff --git a/branches/sage/pgs/jobs/osd/write_sizes b/branches/sage/pgs/jobs/osd/write_sizes deleted file mode 100644 index 57369f3a97c50..0000000000000 --- a/branches/sage/pgs/jobs/osd/write_sizes +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', -# 'ebofs' - ], - 'writefile' => 1, - 'writefile_size' => [ -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> 1,#[1,2], - - -# 'ebofs_idle_commit_ms' => [ 100, 500 ], -# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - - 'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/branches/sage/pgs/jobs/rados/map_dist b/branches/sage/pgs/jobs/rados/map_dist deleted file mode 100644 index 39f16daa1cdc2..0000000000000 --- a/branches/sage/pgs/jobs/rados/map_dist +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'osdbits' => [6,7,8],#,9],10,11], - 'pgperbits' => [3],#,4,5],#[4,6,8], - - 'nummds' => 1, - - '_dep' => [ 'numosd' => '1 << $osdbits', - 'osd_pg_bits' => '$pgperbits + $osdbits', - 'n' => '3 + $numosd / 32'], - 'numclient' => 0, - - 'fake_osdmap_updates' => [30], - - 'fs' => 'ebofs', - - 'start' => 30, - 'end' => 300, - 'kill_after' => 300, - - 'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0', - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'osdbits', - 'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ] - } -}; diff --git a/branches/sage/pgs/jobs/rados/rep_lat b/branches/sage/pgs/jobs/rados/rep_lat deleted file mode 100644 index 3f5ab0c8a7d87..0000000000000 --- a/branches/sage/pgs/jobs/rados/rep_lat +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, #[6], - 'numclient' => 1,#, 40, 80, 160 ], - 'n' => 10, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 45, - - 'writefile' => 1, - 'writefile_size' => [4096, -# 8*1024, -# 16*1024, -# 32*1024, - 64*1024, -# 128*1024, -# 256*1024, -# 512*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'osd_rep' => [0,1,2], - - 'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4], - - 'osd_pg_bits' => 4, - 'custom' => '--osd_max_rep 8', - - 'comb' => { - 'x' => 'file_layout_num_rep', - 'vars' => [ 'cl.wrlat' ] - } -}; diff --git a/branches/sage/pgs/jobs/rados/wr_sizes b/branches/sage/pgs/jobs/rados/wr_sizes deleted file mode 100644 index 9b73477ea6142..0000000000000 --- a/branches/sage/pgs/jobs/rados/wr_sizes +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [8],#10,14,16], - 'numclient' => [10*16], - 'n' => 15, - - 'fs' => 'ebofs', - - 'start' => 60, - 'end' => 90, - 'until' => 90, - 'kill_after' => 190, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - # 512*1024, -# 4*1024*1024, -# 2*1024*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'file_layout_num_rep' => 1, - 'file_layout_ssize' => 4*1024*1024, - 'file_layout_osize' => 4*1024*1024, - - 'osd_pg_bits' => 12, - -# 'ebofs_freelist' => [0, 1080, 65400], - - 'custom' => '--objecter_buffer_uncommitted 0', - - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ] - } -}; 
diff --git a/branches/sage/pgs/mds/Anchor.h b/branches/sage/pgs/mds/Anchor.h
deleted file mode 100644
index 9ead7bb599c7f..0000000000000
--- a/branches/sage/pgs/mds/Anchor.h
+++ /dev/null
@@ -1,108 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef __ANCHOR_H
-#define __ANCHOR_H
-
-#include <string>
-using std::string;
-
-#include "include/types.h"
-#include "mdstypes.h"
-#include "include/buffer.h"
-
-
-// anchor ops
-#define ANCHOR_OP_LOOKUP 1
-#define ANCHOR_OP_LOOKUP_REPLY 2
-
-#define ANCHOR_OP_CREATE_PREPARE 11
-#define ANCHOR_OP_CREATE_AGREE 12
-
-#define ANCHOR_OP_DESTROY_PREPARE 21
-#define ANCHOR_OP_DESTROY_AGREE 22
-
-#define ANCHOR_OP_UPDATE_PREPARE 31
-#define ANCHOR_OP_UPDATE_AGREE 32
-
-#define ANCHOR_OP_COMMIT 41
-#define ANCHOR_OP_ACK 42
-#define ANCHOR_OP_ROLLBACK 43
-
-
-
-inline const char* get_anchor_opname(int o) {
-  switch (o) {
-  case ANCHOR_OP_LOOKUP: return "lookup";
-  case ANCHOR_OP_LOOKUP_REPLY: return "lookup_reply";
-
-  case ANCHOR_OP_CREATE_PREPARE: return "create_prepare";
-  case ANCHOR_OP_CREATE_AGREE: return "create_agree";
-  case ANCHOR_OP_DESTROY_PREPARE: return "destroy_prepare";
-  case ANCHOR_OP_DESTROY_AGREE: return "destroy_agree";
-  case ANCHOR_OP_UPDATE_PREPARE: return "update_prepare";
-  case ANCHOR_OP_UPDATE_AGREE: return "update_agree";
-
-  case ANCHOR_OP_COMMIT: return "commit";
-  case ANCHOR_OP_ACK: return "ack";
-  case ANCHOR_OP_ROLLBACK: return "rollback";
-  default: assert(0);
-  }
-}
-
-
-// identifies a anchor table mutation
-
-
-
-// anchor type
-
-class Anchor {
-public:
-  inodeno_t ino;      // anchored ino
-  dirfrag_t dirfrag;  // containing dirfrag
-  //string ref_dn;    // referring dentry
-  int nref;           // reference count
-
-  Anchor() {}
-  Anchor(inodeno_t i, dirfrag_t df,
-         //string& rd,
-         int nr=0) :
-    ino(i), dirfrag(df),
-    //ref_dn(rd),
-    nref(nr) { }
-
-  void _encode(bufferlist &bl) {
-    bl.append((char*)&ino, sizeof(ino));
-    bl.append((char*)&dirfrag, sizeof(dirfrag));
-    bl.append((char*)&nref, sizeof(nref));
-    //::_encode(ref_dn, bl);
-  }
-  void _decode(bufferlist& bl, int& off) {
-    bl.copy(off, sizeof(ino), (char*)&ino);
-    off += sizeof(ino);
-    bl.copy(off, sizeof(dirfrag), (char*)&dirfrag);
-    off += sizeof(dirfrag);
-    bl.copy(off, sizeof(nref), (char*)&nref);
-    off += sizeof(nref);
-    //::_decode(ref_dn, bl, off);
-  }
-};
-
-inline ostream& operator<<(ostream& out, Anchor& a)
-{
-  return out << "a(" << a.ino << " " << a.dirfrag << " " << a.nref << ")";
-}
-
-#endif
diff --git a/branches/sage/pgs/mds/AnchorClient.cc b/branches/sage/pgs/mds/AnchorClient.cc
deleted file mode 100644
index 13182f1cadf95..0000000000000
--- a/branches/sage/pgs/mds/AnchorClient.cc
+++ /dev/null
@@ -1,372 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- * - */ - -#include -using std::cout; -using std::cerr; -using std::endl; - -#include "Anchor.h" -#include "AnchorClient.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "msg/Messenger.h" - -#include "MDS.h" -#include "MDLog.h" - -#include "events/EAnchorClient.h" -#include "messages/MAnchor.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " -#define derr(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " - - -void AnchorClient::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHOR: - handle_anchor_reply((MAnchor*)m); - break; - - default: - assert(0); - } -} - -void AnchorClient::handle_anchor_reply(class MAnchor *m) -{ - inodeno_t ino = m->get_ino(); - version_t atid = m->get_atid(); - - dout(10) << "handle_anchor_reply " << *m << endl; - - switch (m->get_op()) { - - // lookup - case ANCHOR_OP_LOOKUP_REPLY: - assert(pending_lookup.count(ino)); - { - *pending_lookup[ino].trace = m->get_trace(); - Context *onfinish = pending_lookup[ino].onfinish; - pending_lookup.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - break; - - // prepare -> agree - case ANCHOR_OP_CREATE_AGREE: - if (pending_create_prepare.count(ino)) { - dout(10) << "got create_agree on " << ino << " atid " << atid << endl; - Context *onfinish = pending_create_prepare[ino].onfinish; - *pending_create_prepare[ino].patid = atid; - pending_create_prepare.erase(ino); - - pending_commit.insert(atid); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray create_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << endl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - else { - dout(10) << "stray create_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << endl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - case ANCHOR_OP_DESTROY_AGREE: - if (pending_destroy_prepare.count(ino)) { - dout(10) << "got destroy_agree on " << ino << " atid " << atid << endl; - Context *onfinish = pending_destroy_prepare[ino].onfinish; - *pending_destroy_prepare[ino].patid = atid; - pending_destroy_prepare.erase(ino); - - pending_commit.insert(atid); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray destroy_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << endl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - else { - dout(10) << "stray destroy_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << endl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - case ANCHOR_OP_UPDATE_AGREE: - if 
(pending_update_prepare.count(ino)) { - dout(10) << "got update_agree on " << ino << " atid " << atid << endl; - Context *onfinish = pending_update_prepare[ino].onfinish; - *pending_update_prepare[ino].patid = atid; - pending_update_prepare.erase(ino); - - pending_commit.insert(atid); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray update_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << endl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - else { - dout(10) << "stray update_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << endl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - // commit -> ack - case ANCHOR_OP_ACK: - { - dout(10) << "got ack on atid " << atid << ", logging" << endl; - - // remove from committing list - assert(pending_commit.count(atid)); - pending_commit.erase(atid); - - // log ACK. - mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid)); - - // kick any waiters - if (ack_waiters.count(atid)) { - dout(15) << "kicking waiters on atid " << atid << endl; - mds->queue_waiters(ack_waiters[atid]); - ack_waiters.erase(atid); - } - } - break; - - default: - assert(0); - } - - delete m; -} - - - -/* - * public async interface - */ - - -/* - * FIXME: we need to be able to resubmit messages if the anchortable mds fails. - */ - - -void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_LOOKUP, ino); - - assert(pending_lookup.count(ino) == 0); - pending_lookup[ino].onfinish = onfinish; - pending_lookup[ino].trace = &trace; - - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -// PREPARE - -void AnchorClient::prepare_create(inodeno_t ino, vector& trace, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_create " << ino << " " << trace << endl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_CREATE_PREPARE, ino); - req->set_trace(trace); - - pending_create_prepare[ino].trace = trace; - pending_create_prepare[ino].patid = patid; - pending_create_prepare[ino].onfinish = onfinish; - - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - -void AnchorClient::prepare_destroy(inodeno_t ino, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_destroy " << ino << endl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_DESTROY_PREPARE, ino); - pending_destroy_prepare[ino].onfinish = onfinish; - pending_destroy_prepare[ino].patid = patid; - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -void AnchorClient::prepare_update(inodeno_t ino, vector& trace, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_update " << ino << " " << trace << endl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_UPDATE_PREPARE, ino); - req->set_trace(trace); - - pending_update_prepare[ino].trace = trace; - pending_update_prepare[ino].patid = patid; - 
pending_update_prepare[ino].onfinish = onfinish; - - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -// COMMIT - -void AnchorClient::commit(version_t atid) -{ - dout(10) << "commit " << atid << endl; - - assert(pending_commit.count(atid)); - pending_commit.insert(atid); - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - - -// RECOVERY - -void AnchorClient::finish_recovery() -{ - dout(7) << "finish_recovery" << endl; - - resend_commits(); -} - -void AnchorClient::resend_commits() -{ - for (set::iterator p = pending_commit.begin(); - p != pending_commit.end(); - ++p) { - dout(10) << "resending commit on " << *p << endl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, *p); - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } -} - -void AnchorClient::resend_prepares(hash_map& prepares, int op) -{ - for (hash_map::iterator p = prepares.begin(); - p != prepares.end(); - p++) { - dout(10) << "resending " << get_anchor_opname(op) << " on " << p->first << endl; - MAnchor *req = new MAnchor(op, p->first); - req->set_trace(p->second.trace); - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } -} - - -void AnchorClient::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << endl; - - if (who != mds->mdsmap->get_anchortable()) - return; // do nothing. - - // resend any pending lookups. - for (hash_map::iterator p = pending_lookup.begin(); - p != pending_lookup.end(); - p++) { - dout(10) << "resending lookup on " << p->first << endl; - mds->send_message_mds(new MAnchor(ANCHOR_OP_LOOKUP, p->first), - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - - // resend any pending prepares. - resend_prepares(pending_create_prepare, ANCHOR_OP_CREATE_PREPARE); - resend_prepares(pending_update_prepare, ANCHOR_OP_UPDATE_PREPARE); - resend_prepares(pending_destroy_prepare, ANCHOR_OP_DESTROY_PREPARE); - - // resend any pending commits. - resend_commits(); -} diff --git a/branches/sage/pgs/mds/AnchorClient.h b/branches/sage/pgs/mds/AnchorClient.h deleted file mode 100644 index 6ec5603b0bc7e..0000000000000 --- a/branches/sage/pgs/mds/AnchorClient.h +++ /dev/null @@ -1,95 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHORCLIENT_H -#define __ANCHORCLIENT_H - -#include -using std::vector; -#include -using __gnu_cxx::hash_map; - -#include "include/types.h" -#include "msg/Dispatcher.h" - -#include "Anchor.h" - -class Context; -class MDS; - -class AnchorClient : public Dispatcher { - MDS *mds; - - // lookups - struct _pending_lookup { - vector *trace; - Context *onfinish; - }; - hash_map pending_lookup; - - // prepares - struct _pending_prepare { - vector trace; - Context *onfinish; - version_t *patid; // ptr to atid - }; - hash_map pending_create_prepare; - hash_map pending_destroy_prepare; - hash_map pending_update_prepare; - - // pending commits - set pending_commit; - map > ack_waiters; - - void handle_anchor_reply(class MAnchor *m); - -public: - AnchorClient(MDS *m) : mds(m) {} - - void dispatch(Message *m); - - // async user interface - void lookup(inodeno_t ino, vector& trace, Context *onfinish); - - void prepare_create(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - void prepare_destroy(inodeno_t ino, version_t *atid, Context *onfinish); - void prepare_update(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - - void commit(version_t atid); - - // for recovery (by other nodes) - void handle_mds_recovery(int mds); // called when someone else recovers - - void resend_commits(); - void resend_prepares(hash_map& prepares, int op); - - // for recovery (by me) - void got_journaled_agree(version_t atid) { - pending_commit.insert(atid); - } - void got_journaled_ack(version_t atid) { - pending_commit.erase(atid); - } - bool has_committed(version_t atid) { - return pending_commit.count(atid) == 0; - } - void wait_for_ack(version_t atid, Context *c) { - ack_waiters[atid].push_back(c); - } - void finish_recovery(); // called when i recover and go active - - -}; - -#endif diff --git a/branches/sage/pgs/mds/AnchorTable.cc b/branches/sage/pgs/mds/AnchorTable.cc deleted file mode 100644 index fbf866e5a7aa1..0000000000000 --- a/branches/sage/pgs/mds/AnchorTable.cc +++ /dev/null @@ -1,715 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "AnchorTable.h" -#include "MDS.h" - -#include "osdc/Filer.h" - -#include "msg/Messenger.h" -#include "messages/MAnchor.h" - -#include "common/Clock.h" - -#include "MDLog.h" -#include "events/EAnchor.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " -#define derr(x) if (x <= g_conf.debug_mds) cerr << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " - - -void AnchorTable::dump() -{ - dout(7) << "dump v " << version << endl; - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) - dout(15) << "dump " << it->second << endl; -} - - -/* - * basic updates - */ - -bool AnchorTable::add(inodeno_t ino, dirfrag_t dirfrag) -{ - //dout(17) << "add " << ino << " dirfrag " << dirfrag << endl; - - // parent should be there - assert(dirfrag.ino < MDS_INO_BASE || // system dirino - anchor_map.count(dirfrag.ino)); // have - - if (anchor_map.count(ino) == 0) { - // new item - anchor_map[ino] = Anchor(ino, dirfrag); - dout(7) << "add added " << anchor_map[ino] << endl; - return true; - } else { - dout(7) << "add had " << anchor_map[ino] << endl; - return false; - } -} - -void AnchorTable::inc(inodeno_t ino) -{ - dout(7) << "inc " << ino << endl; - - assert(anchor_map.count(ino)); - - while (1) { - Anchor &anchor = anchor_map[ino]; - anchor.nref++; - - dout(10) << "inc now " << anchor << endl; - ino = anchor.dirfrag.ino; - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - } -} - -void AnchorTable::dec(inodeno_t ino) -{ - dout(7) << "dec " << ino << endl; - assert(anchor_map.count(ino)); - - while (true) { - Anchor &anchor = anchor_map[ino]; - anchor.nref--; - - if (anchor.nref == 0) { - dout(10) << "dec removing " << anchor << endl; - dirfrag_t dirfrag = anchor.dirfrag; - anchor_map.erase(ino); - ino = dirfrag.ino; - } else { - dout(10) << "dec now " << anchor << endl; - ino = anchor.dirfrag.ino; - } - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - } -} - - -/* - * high level - */ - - -// LOOKUP - -void AnchorTable::handle_lookup(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "handle_lookup " << ino << endl; - - assert(anchor_map.count(ino) == 1); - Anchor &anchor = anchor_map[ino]; - - vector trace; - while (true) { - dout(10) << "handle_lookup adding " << anchor << endl; - trace.insert(trace.begin(), anchor); // lame FIXME - - if (anchor.dirfrag.ino < MDS_INO_BASE) break; - - assert(anchor_map.count(anchor.dirfrag.ino) == 1); - anchor = anchor_map[anchor.dirfrag.ino]; - } - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_LOOKUP_REPLY, ino); - reply->set_trace(trace); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - - delete req; -} - - -// MIDLEVEL - -void AnchorTable::create_prepare(inodeno_t ino, vector& trace, int reqmds) -{ - // make sure trace is in table - for (unsigned i=0; i& trace, int reqmds) -{ - version++; - pending_update[version].first = ino; - pending_update[version].second = trace; - pending_reqmds[version] = reqmds; - //dump(); -} - -void AnchorTable::commit(version_t atid) -{ - if (pending_create.count(atid)) { - dout(7) << "commit " << atid << " create " << pending_create[atid] << endl; - pending_create.erase(atid); - } - - else if (pending_destroy.count(atid)) { - inodeno_t ino = pending_destroy[atid]; - dout(7) << "commit " << atid << " destroy " << ino << endl; - - dec(ino); // destroy - - 
pending_destroy.erase(atid); - } - - else if (pending_update.count(atid)) { - inodeno_t ino = pending_update[atid].first; - vector &trace = pending_update[atid].second; - - dout(7) << "commit " << atid << " update " << ino << endl; - - // remove old - dec(ino); - - // add new - for (unsigned i=0; i_create_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_create_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - vector& trace = req->get_trace(); - - dout(7) << "handle_create_prepare " << ino << endl; - - create_prepare(ino, trace, req->get_source().num()); - - // log it - EAnchor *le = new EAnchor(ANCHOR_OP_CREATE_PREPARE, ino, version, req->get_source().num()); - le->set_trace(trace); - mds->mdlog->submit_entry(le, - new C_AT_CreatePrepare(this, req, version)); -} - -void AnchorTable::_create_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_create_prepare_logged " << ino << " atid " << atid << endl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - - delete req; -} - - - - -// DESTROY - -class C_AT_DestroyPrepare : public Context { - AnchorTable *at; - MAnchor *req; - version_t atid; -public: - C_AT_DestroyPrepare(AnchorTable *a, MAnchor *r, version_t t) : - at(a), req(r), atid(t) { } - void finish(int r) { - at->_destroy_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_destroy_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "handle_destroy_prepare " << ino << endl; - - destroy_prepare(ino, req->get_source().num()); - - mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_DESTROY_PREPARE, ino, version, req->get_source().num()), - new C_AT_DestroyPrepare(this, req, version)); -} - -void AnchorTable::_destroy_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_destroy_prepare_logged " << ino << " atid " << atid << endl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// UPDATE - -class C_AT_UpdatePrepare : public Context { - AnchorTable *at; - MAnchor *req; - version_t atid; -public: - C_AT_UpdatePrepare(AnchorTable *a, MAnchor *r, version_t t) : - at(a), req(r), atid(t) { } - void finish(int r) { - at->_update_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_update_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - vector& trace = req->get_trace(); - - dout(7) << "handle_update_prepare " << ino << endl; - - update_prepare(ino, trace, req->get_source().num()); - - // log it - EAnchor *le = new EAnchor(ANCHOR_OP_UPDATE_PREPARE, ino, version, req->get_source().num()); - le->set_trace(trace); - mds->mdlog->submit_entry(le, - new C_AT_UpdatePrepare(this, req, version)); -} - -void AnchorTable::_update_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_update_prepare_logged " << ino << " atid " << atid << endl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// COMMIT - -class C_AT_Commit : public Context { - AnchorTable *at; - MAnchor *req; -public: - C_AT_Commit(AnchorTable *a, MAnchor *r) : - at(a), req(r) { } - void finish(int r) { - at->_commit_logged(req); - } -}; - -void 
AnchorTable::handle_commit(MAnchor *req) -{ - version_t atid = req->get_atid(); - dout(7) << "handle_commit " << atid << endl; - - if (pending_create.count(atid) || - pending_destroy.count(atid) || - pending_update.count(atid)) { - commit(atid); - mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_COMMIT, atid, version)); - } - else if (atid <= version) { - dout(0) << "got commit for atid " << atid << " <= " << version - << ", already committed, sending ack." - << endl; - MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, 0, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; - return; - } - else { - // wtf. - dout(0) << "got commit for atid " << atid << " > " << version << endl; - assert(atid <= version); - } - - // wait for it to journal - mds->mdlog->wait_for_sync(new C_AT_Commit(this, req)); -} - - -void AnchorTable::_commit_logged(MAnchor *req) -{ - dout(7) << "_commit_logged, sending ACK" << endl; - MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, req->get_ino(), req->get_atid()); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// ROLLBACK - -void AnchorTable::handle_rollback(MAnchor *req) -{ - version_t atid = req->get_atid(); - dout(7) << "handle_rollback " << atid << endl; - rollback(atid); - delete req; -} - - - -/* - * messages - */ - -void AnchorTable::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHOR: - handle_anchor_request((MAnchor*)m); - break; - - default: - assert(0); - } -} - - -void AnchorTable::handle_anchor_request(class MAnchor *req) -{ - // make sure i'm open! - if (!opened) { - dout(7) << "not open yet" << endl; - - waiting_for_open.push_back(new C_MDS_RetryMessage(mds, req)); - - if (!opening) { - opening = true; - load(0); - } - return; - } - - dout(10) << "handle_anchor_request " << *req << endl; - - // go - switch (req->get_op()) { - - case ANCHOR_OP_LOOKUP: - handle_lookup(req); - break; - - case ANCHOR_OP_CREATE_PREPARE: - handle_create_prepare(req); - break; - case ANCHOR_OP_DESTROY_PREPARE: - handle_destroy_prepare(req); - break; - case ANCHOR_OP_UPDATE_PREPARE: - handle_update_prepare(req); - break; - - case ANCHOR_OP_COMMIT: - handle_commit(req); - break; - - case ANCHOR_OP_ROLLBACK: - handle_rollback(req); - break; - - default: - assert(0); - } - -} - - - - -// primitive load/save for now! - -// load/save entire table for now! 
- -class C_AT_Saved : public Context { - AnchorTable *at; - version_t version; -public: - C_AT_Saved(AnchorTable *a, version_t v) : at(a), version(v) {} - void finish(int r) { - at->_saved(version); - } -}; - -void AnchorTable::save(Context *onfinish) -{ - dout(7) << "save v " << version << endl; - if (!opened) { - assert(!onfinish); - return; - } - - if (onfinish) - waiting_for_save[version].push_back(onfinish); - - if (committing_version == version) { - dout(7) << "save already committing v " << version << endl; - return; - } - committing_version = version; - - // build up write - bufferlist bl; - - // version - bl.append((char*)&version, sizeof(version)); - - // # anchors - size_t size = anchor_map.size(); - bl.append((char*)&size, sizeof(size)); - - // anchors - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) { - it->second._encode(bl); - dout(15) << "save encoded " << it->second << endl; - } - - // pending - ::_encode(pending_reqmds, bl); - ::_encode(pending_create, bl); - ::_encode(pending_destroy, bl); - - size_t s = pending_update.size(); - bl.append((char*)&s, sizeof(s)); - for (map > >::iterator p = pending_update.begin(); - p != pending_update.end(); - ++p) { - bl.append((char*)&p->first, sizeof(p->first)); - bl.append((char*)&p->second.first, sizeof(p->second.first)); - ::_encode(p->second.second, bl); - } - - // write! - object_t oid = object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0); - mds->objecter->write(oid, - 0, bl.length(), - mds->objecter->osdmap->file_to_object_layout(oid, g_OSD_MDAnchorTableLayout), - bl, - NULL, new C_AT_Saved(this, version)); -} - -void AnchorTable::_saved(version_t v) -{ - dout(7) << "_saved v " << v << endl; - - assert(v <= committing_version); - assert(committed_version < v); - committed_version = v; - - finish_contexts(waiting_for_save[v], 0); - waiting_for_save.erase(v); -} - - - -class C_AT_Load : public Context { - AnchorTable *at; -public: - bufferlist bl; - C_AT_Load(AnchorTable *a) : at(a) {} - void finish(int result) { - assert(result > 0); - at->_loaded(bl); - } -}; - -void AnchorTable::load(Context *onfinish) -{ - dout(7) << "load" << endl; - assert(!opened); - - waiting_for_open.push_back(onfinish); - - C_AT_Load *fin = new C_AT_Load(this); - object_t oid = object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0); - mds->objecter->read(oid, - 0, 0, - mds->objecter->osdmap->file_to_object_layout(oid, g_OSD_MDAnchorTableLayout), - &fin->bl, fin); -} - -void AnchorTable::_loaded(bufferlist& bl) -{ - dout(10) << "_loaded got " << bl.length() << " bytes" << endl; - - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - - size_t size; - bl.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - - for (size_t n=0; n::iterator p = pending_reqmds.begin(); - p != pending_reqmds.end(); - p++) - resend_agree(p->first, p->second); -} - - -void AnchorTable::resend_agree(version_t v, int who) -{ - if (pending_create.count(v)) { - MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, pending_create[v], v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } - else if (pending_destroy.count(v)) { - MAnchor *reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, pending_destroy[v], v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } - else { - assert(pending_update.count(v)); - MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, pending_update[v].first, v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } -} - -void 
AnchorTable::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << endl; - - // resend agrees for recovered mds - for (map::iterator p = pending_reqmds.begin(); - p != pending_reqmds.end(); - p++) { - if (p->second != who) continue; - resend_agree(p->first, p->second); - } -} diff --git a/branches/sage/pgs/mds/AnchorTable.h b/branches/sage/pgs/mds/AnchorTable.h deleted file mode 100644 index 64a2002ba7c85..0000000000000 --- a/branches/sage/pgs/mds/AnchorTable.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ANCHORTABLE_H -#define __ANCHORTABLE_H - -#include "Anchor.h" -#include "include/Context.h" - -#include -using namespace __gnu_cxx; - -class MDS; -class MAnchor; - -class AnchorTable { - MDS *mds; - - // keep the entire table in memory. - hash_map anchor_map; - - // uncommitted operations - map pending_reqmds; - map pending_create; - map pending_destroy; - map > > pending_update; - - version_t version; // this includes anchor_map AND pending_* state. - version_t committing_version; - version_t committed_version; - - // load/save state - bool opening, opened; - - // waiters - list waiting_for_open; - map > waiting_for_save; - -protected: - - // basic updates - bool add(inodeno_t ino, dirfrag_t dirfrag); - void inc(inodeno_t ino); - void dec(inodeno_t ino); - - // mid-level - void create_prepare(inodeno_t ino, vector& trace, int reqmds); - void destroy_prepare(inodeno_t ino, int reqmds); - void update_prepare(inodeno_t ino, vector& trace, int reqmds); - void commit(version_t atid); - void rollback(version_t atid); - friend class EAnchor; // used for journal replay. - - // high level interface - void handle_lookup(MAnchor *req); - - void handle_create_prepare(MAnchor *req); - void _create_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_CreatePrepare; - - void handle_destroy_prepare(MAnchor *req); - void _destroy_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_DestroyPrepare; - - void handle_update_prepare(MAnchor *req); - void _update_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_UpdatePrepare; - - void handle_commit(MAnchor *req); - void _commit_logged(MAnchor *req); - friend class C_AT_Commit; - - void handle_rollback(MAnchor *req); - - // messages - void handle_anchor_request(MAnchor *m); - - void dump(); - -public: - AnchorTable(MDS *m) : - mds(m), - version(0), committing_version(0), committed_version(0), - opening(false), opened(false) { } - - void dispatch(class Message *m); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - - void create_fresh() { - // reset (i.e. on mkfs) to empty, but unsaved table. - version = 1; - opened = true; - opening = false; - anchor_map.clear(); - pending_create.clear(); - pending_destroy.clear(); - pending_update.clear(); - } - - // load/save entire table for now! 
- void save(Context *onfinish); - void _saved(version_t v); - void load(Context *onfinish); - void _loaded(bufferlist& bl); - - // recovery - void handle_mds_recovery(int who); - void finish_recovery(); - void resend_agree(version_t v, int who); - -}; - -#endif diff --git a/branches/sage/pgs/mds/CDentry.cc b/branches/sage/pgs/mds/CDentry.cc deleted file mode 100644 index 2db36a7a187d8..0000000000000 --- a/branches/sage/pgs/mds/CDentry.cc +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDentry.h" -#include "CInode.h" -#include "CDir.h" -#include "Anchor.h" - -#include "MDS.h" -#include "MDCache.h" - -#include "messages/MLock.h" - -#include - -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") " - -ostream& CDentry::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") "; -} - - -// CDentry - -ostream& operator<<(ostream& out, CDentry& dn) -{ - string path; - dn.make_path(path); - - out << "[dentry " << path; - if (dn.is_auth()) { - out << " auth"; - if (dn.is_replicated()) - out << dn.get_replicas(); - } else { - out << " rep@" << dn.authority(); - out << "." << dn.get_replica_nonce(); - assert(dn.get_replica_nonce() >= 0); - } - - if (dn.is_null()) out << " NULL"; - if (dn.is_remote()) out << " REMOTE"; - - out << " " << dn.lock; - - out << " v=" << dn.get_version(); - out << " pv=" << dn.get_projected_version(); - - out << " inode=" << dn.get_inode(); - - if (dn.is_new()) out << " state=new"; - - if (dn.get_num_ref()) { - out << " |"; - dn.print_pin_set(out); - } - - out << " " << &dn; - out << "]"; - return out; -} - - -bool operator<(const CDentry& l, const CDentry& r) -{ - if (l.get_dir()->ino() < r.get_dir()->ino()) return true; - if (l.get_dir()->ino() == r.get_dir()->ino() && - l.get_name() < r.get_name()) return true; - return false; -} - - -void CDentry::print(ostream& out) -{ - out << *this; -} - - -inodeno_t CDentry::get_ino() -{ - if (inode) - return inode->ino(); - return inodeno_t(); -} - - -pair CDentry::authority() -{ - return dir->authority(); -} - - -void CDentry::add_waiter(int tag, Context *c) -{ - // wait on the directory? 
- if (tag & (WAIT_AUTHPINNABLE|WAIT_SINGLEAUTH)) { - dir->add_waiter(tag, c); - return; - } - MDSCacheObject::add_waiter(tag, c); -} - - -version_t CDentry::pre_dirty(version_t min) -{ - projected_version = dir->pre_dirty(min); - dout(10) << " pre_dirty " << *this << endl; - return projected_version; -} - - -void CDentry::_mark_dirty() -{ - // state+pin - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - dir->inc_num_dirty(); - get(PIN_DIRTY); - } -} - -void CDentry::mark_dirty(version_t pv) -{ - dout(10) << " mark_dirty " << *this << endl; - - // i now live in this new dir version - assert(pv == projected_version); - version = pv; - _mark_dirty(); - - // mark dir too - dir->mark_dirty(pv); -} - - -void CDentry::mark_clean() -{ - dout(10) << " mark_clean " << *this << endl; - assert(is_dirty()); - assert(version <= dir->get_version()); - - // state+pin - state_clear(STATE_DIRTY); - dir->dec_num_dirty(); - put(PIN_DIRTY); - - if (state_test(STATE_NEW)) - state_clear(STATE_NEW); -} - -void CDentry::mark_new() -{ - dout(10) << " mark_new " << *this << endl; - state_set(STATE_NEW); -} - -void CDentry::make_path(string& s) -{ - if (dir) { - dir->inode->make_path(s); - } else { - s = "???"; - } - s += "/"; - s += name; -} - -void CDentry::make_path(string& s, inodeno_t tobase) -{ - assert(dir); - - if (dir->inode->is_root()) { - s += "/"; // make it an absolute path (no matter what) if we hit the root. - } - else if (dir->inode->get_parent_dn() && - dir->inode->ino() != tobase) { - dir->inode->get_parent_dn()->make_path(s, tobase); - s += "/"; - } - s += name; -} - -/** make_anchor_trace - * construct an anchor trace for this dentry, as if it were linked to *in. - */ -void CDentry::make_anchor_trace(vector& trace, CInode *in) -{ - // start with parent dir inode - if (dir) - dir->inode->make_anchor_trace(trace); - - // add this inode (in my dirfrag) to the end - trace.push_back(Anchor(in->ino(), dir->dirfrag())); - dout(10) << "make_anchor_trace added " << trace.back() << endl; -} - - - -void CDentry::link_remote(CInode *in) -{ - assert(is_remote()); - assert(in->ino() == remote_ino); - - inode = in; - in->add_remote_parent(this); -} - -void CDentry::unlink_remote() -{ - assert(is_remote()); - assert(inode); - - inode->remove_remote_parent(this); - inode = 0; -} - - -CDentryDiscover *CDentry::replicate_to(int who) -{ - int nonce = add_replica(who); - return new CDentryDiscover(this, nonce); -} - - -// ---------------------------- -// auth pins - -bool CDentry::can_auth_pin() -{ - assert(dir); - return dir->can_auth_pin(); -} - -void CDentry::auth_pin() -{ - assert(dir); - dir->auth_pin(); -} - -void CDentry::auth_unpin() -{ - assert(dir); - dir->auth_unpin(); -} - - -// ---------------------------- -// locking - -void CDentry::set_object_info(MDSCacheObjectInfo &info) -{ - info.dirfrag = dir->dirfrag(); - info.dname = name; -} - -void CDentry::encode_lock_state(int type, bufferlist& bl) -{ - // null, ino, or remote_ino? - int c; - if (is_primary()) { - c = 1; - ::_encode(c, bl); - ::_encode(inode->inode.ino, bl); - } - else if (is_remote()) { - c = 2; - ::_encode(c, bl); - ::_encode(remote_ino, bl); - } - else if (is_null()) { - // encode nothing. - } - else assert(0); -} - -void CDentry::decode_lock_state(int type, bufferlist& bl) -{ - if (bl.length() == 0) { - // null - assert(is_null()); - return; - } - - int off = 0; - char c; - inodeno_t ino; - ::_decode(c, bl, off); - - switch (c) { - case 1: - case 2: - _decode(ino, bl, off); - // newly linked? 
- if (is_null() && !is_auth()) { - // force trim from cache! - dout(10) << "decode_lock_state replica dentry null -> non-null, must trim" << endl; - //assert(get_num_ref() == 0); - } else { - // verify? - - } - break; - default: - assert(0); - } -} diff --git a/branches/sage/pgs/mds/CDentry.h b/branches/sage/pgs/mds/CDentry.h deleted file mode 100644 index 96eac0a44f32d..0000000000000 --- a/branches/sage/pgs/mds/CDentry.h +++ /dev/null @@ -1,291 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CDENTRY_H -#define __CDENTRY_H - -#include -#include -#include -using namespace std; - -#include "include/types.h" -#include "include/buffer.h" -#include "include/lru.h" -#include "mdstypes.h" - -#include "SimpleLock.h" - -class CInode; -class CDir; -class MDRequest; - -class Message; -class CDentryDiscover; -class Anchor; - -class CDentry; - -// define an ordering -bool operator<(const CDentry& l, const CDentry& r); - -// dentry -class CDentry : public MDSCacheObject, public LRUObject { - public: - // -- state -- - static const int STATE_NEW = 1; - - // -- pins -- - static const int PIN_INODEPIN = 1; // linked inode is pinned - const char *pin_name(int p) { - switch (p) { - case PIN_INODEPIN: return "inodepin"; - default: return generic_pin_name(p); - } - }; - - // -- wait -- - static const int WAIT_LOCK_OFFSET = 8; - - void add_waiter(int tag, Context *c); - - static const int EXPORT_NONCE = 1; - - bool is_lt(const MDSCacheObject *r) const { - return *this < *(CDentry*)r; - } - - protected: - string name; - CInode *inode; - CDir *dir; - - inodeno_t remote_ino; // if remote dentry - - version_t version; // dir version when last touched. - version_t projected_version; // what it will be when i unlock/commit. - - - friend class Migrator; - friend class Locker; - friend class Renamer; - friend class Server; - friend class MDCache; - friend class MDS; - friend class CInode; - friend class C_MDC_XlockRequest; - - -public: - // lock - SimpleLock lock; - - - - public: - // cons - CDentry() : - inode(0), - dir(0), - remote_ino(0), - version(0), - projected_version(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, inodeno_t ino, CInode *in=0) : - name(n), - inode(in), - dir(0), - remote_ino(ino), - version(0), - projected_version(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, CInode *in) : - name(n), - inode(in), - dir(0), - remote_ino(0), - version(0), - projected_version(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - - CInode *get_inode() const { return inode; } - CDir *get_dir() const { return dir; } - const string& get_name() const { return name; } - inodeno_t get_ino(); - inodeno_t get_remote_ino() { return remote_ino; } - - void set_remote_ino(inodeno_t ino) { remote_ino = ino; } - - - // ref counts: pin ourselves in the LRU when we're pinned. 
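// "Pin ourselves in the LRU" means the dentry is exempt from LRU trimming while
// anything references it: the first reference calls lru_pin(), the last release
// calls lru_unpin(), as first_get()/last_put() below do.  A stripped-down sketch
// of that idiom (the LRU hooks here are illustrative stubs, not the real API):

#include <cassert>

struct LRUStub {
  bool pinned;
  LRUStub() : pinned(false) {}
  void lru_pin()   { pinned = true;  }   // stands in for the real LRU bookkeeping
  void lru_unpin() { pinned = false; }
};

struct RefCounted : public LRUStub {
  int nref;
  RefCounted() : nref(0) {}
  void get() { if (nref++ == 0) lru_pin(); }                      // first ref pins
  void put() { assert(nref > 0); if (--nref == 0) lru_unpin(); }  // last ref unpins
};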
- void first_get() { - lru_pin(); - } - void last_put() { - lru_unpin(); - } - - // auth pins - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - - - // dentry type is primary || remote || null - // inode ptr is required for primary, optional for remote, undefined for null - bool is_primary() { return remote_ino == 0 && inode != 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return (remote_ino == 0 && inode == 0) ? true:false; } - - // remote links - void link_remote(CInode *in); - void unlink_remote(); - - - // copy cons - CDentry(const CDentry& m); - const CDentry& operator= (const CDentry& right); - - // misc - void make_path(string& p); - void make_path(string& p, inodeno_t tobase); - void make_anchor_trace(vector& trace, CInode *in); - - // -- version -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - void set_projected_version(version_t v) { projected_version = v; } - - pair authority(); - - version_t pre_dirty(version_t min=0); - void _mark_dirty(); - void mark_dirty(version_t projected_dirv); - void mark_clean(); - - void mark_new(); - bool is_new() { return state_test(STATE_NEW); } - - // -- replication - CDentryDiscover *replicate_to(int rep); - - - // -- exporting - // note: this assumes the dentry already exists. - // i.e., the name is already extracted... so we just need the other state. - void encode_export_state(bufferlist& bl) { - bl.append((char*)&state, sizeof(state)); - bl.append((char*)&version, sizeof(version)); - bl.append((char*)&projected_version, sizeof(projected_version)); - lock._encode(bl); - ::_encode(replicas, bl); - - // twiddle - clear_replicas(); - replica_nonce = EXPORT_NONCE; - state_clear(CDentry::STATE_AUTH); - if (is_dirty()) - mark_clean(); - } - void decode_import_state(bufferlist& bl, int& off, int from, int to) { - int nstate; - bl.copy(off, sizeof(nstate), (char*)&nstate); - off += sizeof(nstate); - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - bl.copy(off, sizeof(projected_version), (char*)&projected_version); - off += sizeof(projected_version); - lock._decode(bl, off); - ::_decode(replicas, bl, off); - - // twiddle - state = 0; - state_set(CDentry::STATE_AUTH); - if (nstate & STATE_DIRTY) - _mark_dirty(); - if (!replicas.empty()) - get(PIN_REPLICATED); - add_replica(from, EXPORT_NONCE); - if (is_replica(to)) - remove_replica(to); - } - - // -- locking -- - SimpleLock* get_lock(int type) { - assert(type == LOCK_OTYPE_DN); - return &lock; - } - void set_object_info(MDSCacheObjectInfo &info); - void encode_lock_state(int type, bufferlist& bl); - void decode_lock_state(int type, bufferlist& bl); - - - ostream& print_db_line_prefix(ostream& out); - void print(ostream& out); - - friend class CDir; -}; - -ostream& operator<<(ostream& out, CDentry& dn); - - - -class CDentryDiscover { - string dname; - int replica_nonce; - int lockstate; - - inodeno_t remote_ino; - -public: - CDentryDiscover() {} - CDentryDiscover(CDentry *dn, int nonce) : - dname(dn->get_name()), replica_nonce(nonce), - lockstate(dn->lock.get_replica_state()), - remote_ino(dn->get_remote_ino()) { } - - string& get_dname() { return dname; } - int get_nonce() { return replica_nonce; } - bool is_remote() { return remote_ino ? 
true:false; } - inodeno_t get_remote_ino() { return remote_ino; } - - void update_dentry(CDentry *dn) { - dn->set_replica_nonce( replica_nonce ); - } - void init_dentry_lock(CDentry *dn) { - dn->lock.set_state( lockstate ); - } - - void _encode(bufferlist& bl) { - ::_encode(dname, bl); - ::_encode(remote_ino, bl); - ::_encode(replica_nonce, bl); - ::_encode(lockstate, bl); - } - - void _decode(bufferlist& bl, int& off) { - ::_decode(dname, bl, off); - ::_decode(remote_ino, bl, off); - ::_decode(replica_nonce, bl, off); - ::_decode(lockstate, bl, off); - } - -}; - - - -#endif diff --git a/branches/sage/pgs/mds/CDir.cc b/branches/sage/pgs/mds/CDir.cc deleted file mode 100644 index ed8d7e222a599..0000000000000 --- a/branches/sage/pgs/mds/CDir.cc +++ /dev/null @@ -1,1423 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDir.h" -#include "CDentry.h" -#include "CInode.h" - -#include "MDS.h" -#include "MDCache.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "common/Clock.h" - -#include "osdc/Objecter.h" - -#include - - -// PINS -//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - - - -ostream& operator<<(ostream& out, CDir& dir) -{ - string path; - dir.get_inode()->make_path(path); - out << "[dir " << dir.ino(); - if (!dir.frag.is_root()) out << "%" << dir.frag; - out << " " << path << "/"; - if (dir.is_auth()) { - out << " auth"; - if (dir.is_replicated()) - out << dir.get_replicas(); - - out << " pv=" << dir.get_projected_version(); - out << " v=" << dir.get_version(); - out << " cv=" << dir.get_committing_version(); - out << "/" << dir.get_committed_version(); - out << "/" << dir.get_committed_version_equivalent(); - } else { - out << " rep@" << dir.authority(); - if (dir.get_replica_nonce() > 1) - out << "." 
<< dir.get_replica_nonce(); - } - - if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT) { - if (dir.get_dir_auth().second == CDIR_AUTH_UNKNOWN) - out << " dir_auth=" << dir.get_dir_auth().first; - else - out << " dir_auth=" << dir.get_dir_auth(); - } - - if (dir.get_cum_auth_pins()) - out << " ap=" << dir.get_auth_pins() << "+" << dir.get_nested_auth_pins(); - - out << " state=" << dir.get_state(); - if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete"; - if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree"; - if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree"; - //if (dir.state_test(CDir::STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; - if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir"; - if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir"; - if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound"; - if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound"; - - out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); - if (dir.get_num_dirty()) - out << " dirty=" << dir.get_num_dirty(); - - - if (dir.get_num_ref()) { - out << " |"; - dir.print_pin_set(out); - } - - out << " " << &dir; - return out << "]"; -} - - -void CDir::print(ostream& out) -{ - out << *this; -} - - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << get_inode()->inode.ino << ") " -//#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache." << *this << " " - - -ostream& CDir::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << get_inode()->inode.ino << ") "; -} - - - -// ------------------------------------------------------------------- -// CDir - -CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) -{ - inode = in; - frag = fg; - this->cache = mdcache; - - nitems = 0; - nnull = 0; - num_dirty = 0; - - state = STATE_INITIAL; - - projected_version = version = 0; - committing_version = 0; - committed_version = 0; - - // dir_auth - dir_auth = CDIR_AUTH_DEFAULT; - - // auth - assert(in->is_dir()); - if (auth) - state |= STATE_AUTH; - - auth_pins = 0; - nested_auth_pins = 0; - request_pins = 0; - - dir_rep = REP_NONE; - //dir_rep = REP_ALL; // hack: to wring out some bugs! FIXME FIXME -} - - - - -/*** - * linking fun - */ - -CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, ino); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nitems++; - - dout(12) << "add_dentry " << *dn << endl; - - // pin? 
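// The PIN_CHILD reference taken just below is held exactly while this directory
// contains at least one dentry (null or not): add_dentry grabs it when the
// combined count goes from zero to one, remove_dentry drops it when the count
// returns to zero, and nnull + nitems == items.size() is asserted throughout.
// The same bookkeeping in isolation (names illustrative):

#include <map>
#include <string>

class ChildPinCounter {
  std::map<std::string, int> items;   // stands in for the dentry map
  int pins;                           // stands in for get/put(PIN_CHILD)
public:
  ChildPinCounter() : pins(0) {}
  void add(const std::string &name) {
    if (items.empty()) ++pins;        // 0 -> 1: take the pin
    items[name] = 1;
  }
  void remove(const std::string &name) {
    if (items.erase(name) && items.empty())
      --pins;                         // last entry gone: drop the pin
  }
};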
- if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - - -CDentry* CDir::add_dentry( const string& dname, CInode *in) -{ - // primary - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, in); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - - if (in) { - link_inode_work( dn, in ); - } else { - assert(dn->inode == 0); - //null_items[dn->name] = dn; - nnull++; - } - - dout(12) << "add_dentry " << *dn << endl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - - - -void CDir::remove_dentry(CDentry *dn) -{ - dout(12) << "remove_dentry " << *dn << endl; - - if (dn->inode) { - // detach inode and dentry - unlink_inode_work(dn); - } else { - // remove from null list - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - } - - // remove from list - assert(items.count(dn->name) == 1); - items.erase(dn->name); - - // adjust dirty counter? - if (dn->state_test(CDentry::STATE_DIRTY)) - num_dirty--; - - cache->lru.lru_remove(dn); - delete dn; - - // unpin? - if (nnull + nitems == 0) put(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::link_inode( CDentry *dn, inodeno_t ino) -{ - dout(12) << "link_inode " << *dn << " remote " << ino << endl; - - assert(dn->is_null()); - dn->set_remote_ino(ino); - nitems++; - - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - assert(nnull + nitems == items.size()); -} - -void CDir::link_inode( CDentry *dn, CInode *in ) -{ - dout(12) << "link_inode " << *dn << " " << *in << endl; - assert(!dn->is_remote()); - - link_inode_work(dn,in); - - // remove from null list - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::link_inode_work( CDentry *dn, CInode *in ) -{ - dn->inode = in; - in->set_primary_parent(dn); - - nitems++; // adjust dir size - - // set inode version - //in->inode.version = dn->get_version(); - - // pin dentry? - if (in->get_num_ref()) - dn->get(CDentry::PIN_INODEPIN); - - // adjust auth pin count - if (in->auth_pins + in->nested_auth_pins) - adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins ); -} - -void CDir::unlink_inode( CDentry *dn ) -{ - if (dn->is_remote()) { - dout(12) << "unlink_inode " << *dn << endl; - } else { - dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl; - } - - unlink_inode_work(dn); - - // add to null list - //assert(null_items.count(dn->name) == 0); - //null_items[dn->name] = dn; - nnull++; - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::unlink_inode_work( CDentry *dn ) -{ - CInode *in = dn->inode; - - if (dn->is_remote()) { - // remote - if (in) - dn->unlink_remote(); - - dn->set_remote_ino(0); - } else { - // primary - assert(dn->is_primary()); - - // unpin dentry? 
- if (in->get_num_ref()) - dn->put(CDentry::PIN_INODEPIN); - - // unlink auth_pin count - if (in->auth_pins + in->nested_auth_pins) - adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) ); - - // detach inode - in->remove_primary_parent(dn); - dn->inode = 0; - } - - nitems--; // adjust dir size -} - -void CDir::remove_null_dentries() { - dout(12) << "remove_null_dentries " << *this << endl; - - list dns; - for (CDir_map_t::iterator it = items.begin(); - it != items.end(); - it++) { - if (it->second->is_null()) - dns.push_back(it->second); - } - - for (list::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - remove_dentry(dn); - } - //assert(null_items.empty()); - assert(nnull == 0); - assert(nnull + nitems == items.size()); -} - - -void CDir::try_remove_unlinked_dn(CDentry *dn) -{ - assert(dn->dir == this); - - if (dn->is_new() && dn->is_dirty() && - dn->get_num_ref() == 1) { - dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << endl; - dn->mark_clean(); - remove_dentry(dn); - - if (version == projected_version && - committing_version == committed_version && - num_dirty == 0) { - dout(10) << "try_remove_unlinked_dn committed_equivalent now " << version - << " vs committed " << committed_version - << endl; - committed_version_equivalent = committed_version; - } - } -} - - - - -CDirDiscover *CDir::replicate_to(int mds) -{ - assert(is_auth()); - return new CDirDiscover( this, add_replica(mds) ); -} - - - - - -/**************************************** - * WAITING - */ - -void CDir::add_dentry_waiter(const string& dname, Context *c) -{ - if (waiting_on_dentry.empty()) - get(PIN_DNWAITER); - waiting_on_dentry[dname].push_back(c); - dout(10) << "add_dentry_waiter dentry " << dname << " " << c << " on " << *this << endl; -} - -void CDir::take_dentry_waiting(const string& dname, list& ls) -{ - if (waiting_on_dentry.empty()) return; - if (waiting_on_dentry.count(dname) == 0) return; - dout(10) << "take_dentry_waiting dentry " << dname - << " x " << waiting_on_dentry[dname].size() - << " on " << *this << endl; - ls.splice(ls.end(), waiting_on_dentry[dname]); - waiting_on_dentry.erase(dname); - if (waiting_on_dentry.empty()) - put(PIN_DNWAITER); -} - - -void CDir::add_waiter(int tag, Context *c) -{ - // hierarchical? - - // at free root? - if (tag & WAIT_ATFREEZEROOT) { - if (!(is_freezing_tree_root() || is_frozen_tree_root() || - is_freezing_dir() || is_frozen_dir())) { - // try parent - dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << endl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - // at subtree root? 
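// Waiters throughout this file are plain Context callbacks parked under bitmask
// tags; take_waiting() later collects every queue whose tag intersects the
// caller's mask, and finish_waiting() hands the collected contexts back to the
// MDS with a result code.  A simplified model of that mechanism (Waiter and
// WaitQueue are illustrative stand-ins, not the real Context/MDSCacheObject API):

#include <list>
#include <map>

struct Waiter { virtual ~Waiter() {} virtual void finish(int r) = 0; };

class WaitQueue {
  std::map<int, std::list<Waiter*> > waiting;        // tag bit(s) -> parked callbacks
public:
  void add_waiter(int tag, Waiter *w) { waiting[tag].push_back(w); }
  void take_waiting(int mask, std::list<Waiter*> &out) {
    std::map<int, std::list<Waiter*> >::iterator p = waiting.begin();
    while (p != waiting.end()) {
      if (p->first & mask) {
        out.splice(out.end(), p->second);            // steal the whole list
        waiting.erase(p++);
      } else
        ++p;
    }
  }
  void finish_waiting(int mask, int result) {
    std::list<Waiter*> ls;
    take_waiting(mask, ls);
    for (std::list<Waiter*>::iterator q = ls.begin(); q != ls.end(); ++q) {
      (*q)->finish(result);
      delete *q;
    }
  }
};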
- if (tag & WAIT_ATSUBTREEROOT) { - if (!is_subtree_root()) { - // try parent - dout(10) << "add_waiter " << tag << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << endl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - MDSCacheObject::add_waiter(tag, c); -} - - - -/* NOTE: this checks dentry waiters too */ -void CDir::take_waiting(int mask, list& ls) -{ - if (mask & WAIT_DENTRY) { - // take each each dentry waiter - hash_map >::iterator it = - waiting_on_dentry.begin(); - while (it != waiting_on_dentry.end()) { - take_dentry_waiting((it++)->first, ls); // not post-inc - } - } - - // waiting - MDSCacheObject::take_waiting(mask, ls); -} - - -void CDir::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << hex << mask << dec << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, finished); - //finish_contexts(finished, result); - cache->mds->queue_waiters(finished); -} - - - -// dirty/clean - -version_t CDir::pre_dirty(version_t min) -{ - if (min > projected_version) - projected_version = min; - ++projected_version; - dout(10) << "pre_dirty " << projected_version << endl; - return projected_version; -} - -void CDir::_mark_dirty() -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - dout(10) << "mark_dirty (was clean) " << *this << " version " << version << endl; - get(PIN_DIRTY); - } else { - dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << endl; - } -} - -void CDir::mark_dirty(version_t pv) -{ - assert(version < pv); - version = pv; - _mark_dirty(); -} - -void CDir::mark_clean() -{ - dout(10) << "mark_clean " << *this << " version " << version << endl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - } -} - - - - -void CDir::first_get() -{ - inode->get(CInode::PIN_DIR); -} - -void CDir::last_put() -{ - inode->put(CInode::PIN_DIR); -} - - - -/****************************************************************************** - * FETCH and COMMIT - */ - -// ----------------------- -// FETCH - -class C_Dir_Fetch : public Context { - protected: - CDir *dir; - public: - bufferlist bl; - - C_Dir_Fetch(CDir *d) : dir(d) { } - void finish(int result) { - dir->_fetched(bl); - } -}; - -void CDir::fetch(Context *c) -{ - dout(10) << "fetch on " << *this << endl; - - assert(is_auth()); - assert(!is_complete()); - - if (c) add_waiter(WAIT_COMPLETE, c); - - // already fetching? - if (state_test(CDir::STATE_FETCHING)) { - dout(7) << "already fetching; waiting" << endl; - return; - } - - state_set(CDir::STATE_FETCHING); - - if (cache->mds->logger) cache->mds->logger->inc("fdir"); - - // start by reading the first hunk of it - C_Dir_Fetch *fin = new C_Dir_Fetch(this); - cache->mds->objecter->read( get_ondisk_object(), - 0, 0, // whole object - cache->mds->objecter->osdmap->file_to_object_layout( get_ondisk_object(), - g_OSD_MDDirLayout ), - &fin->bl, - fin ); -} - -void CDir::_fetched(bufferlist &bl) -{ - dout(10) << "_fetched " << 0 << "~" << bl.length() - << " on " << *this - << endl; - - // give up? - if (!is_auth() || is_frozen()) { - dout(10) << "_fetched canceling (!auth or frozen)" << endl; - //ondisk_bl.clear(); - //ondisk_size = 0; - - // kick waiters? - state_clear(CDir::STATE_FETCHING); - finish_waiting(WAIT_COMPLETE, -1); - return; - } - - // decode. 
- int len = bl.length(); - int off = 0; - version_t got_version; - - bl.copy(off, sizeof(got_version), (char*)&got_version); - off += sizeof(got_version); - - dout(10) << "_fetched version " << got_version - << ", " << len << " bytes" - << endl; - - while (off < len) { - // marker - char type = bl[off]; - ++off; - - // dname - string dname; - ::_decode(dname, bl, off); - dout(24) << "_fetched parsed marker '" << type << "' dname '" << dname << "'" << endl; - - CDentry *dn = lookup(dname); // existing dentry? - - if (type == 'L') { - // hard link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - if (dn) { - if (dn->get_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << endl; - } else { - dout(12) << "_fetched had dentry " << *dn << endl; - } - } else { - // (remote) link - CDentry *dn = add_dentry( dname, ino ); - - // link to inode? - CInode *in = cache->get_inode(ino); // we may or may not have it. - if (in) { - dn->link_remote(in); - dout(12) << "_fetched got remote link " << ino << " which we have " << *in << endl; - } else { - dout(12) << "_fetched got remote link " << ino << " (dont' have it)" << endl; - } - } - } - else if (type == 'I') { - // inode - - // parse out inode - inode_t inode; - ::_decode(inode, bl, off); - - string symlink; - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - - fragtree_t fragtree; - fragtree._decode(bl, off); - - if (dn) { - if (dn->get_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << endl; - } else { - dout(12) << "_fetched had dentry " << *dn << endl; - } - } else { - // add inode - CInode *in = 0; - if (cache->have_inode(inode.ino)) { - in = cache->get_inode(inode.ino); - dout(12) << "_fetched got (but i already had) " << *in - << " mode " << in->inode.mode - << " mtime " << in->inode.mtime << endl; - assert(0); // this shouldn't happen!! - } else { - // inode - in = new CInode(cache); - in->inode = inode; - - // symlink? - if (in->is_symlink()) - in->symlink = symlink; - - // dirfragtree - in->dirfragtree.swap(fragtree); - - // add - cache->add_inode( in ); - - // link - add_dentry( dname, in ); - dout(12) << "_fetched got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; - } - } - } else { - dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) - << " at pos " << off << endl; - assert(0); - } - - /** clean underwater item? - * Underwater item is something that is dirty in our cache from - * journal replay, but was previously flushed to disk before the - * mds failed. - * - * We only do this is committed_version == 0. that implies either - * - this is a fetch after from a clean/empty CDir is created - * (and has no effect, since the dn won't exist); or - * - this is a fetch after _recovery_, which is what we're worried - * about. Items that are marked dirty from the journal should be - * marked clean if they appear on disk. - */ - if (committed_version == 0 && - dn && - dn->get_version() <= got_version && - dn->is_dirty()) { - dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << endl; - dn->mark_clean(); - - if (dn->get_inode()) { - assert(dn->get_inode()->get_version() <= got_version); - dout(10) << "_fetched had underwater inode " << *dn->get_inode() << ", marking clean" << endl; - dn->get_inode()->mark_clean(); - } - } - } - assert(off == len); - - // take the loaded version? - // only if we are a fresh CDir* with no prior state. 
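// The per-dirfrag counters are kept ordered
//   committed_version <= committing_version <= version <= projected_version,
// and the branch below, taken only for a brand-new CDir (version == 0), seeds
// all four at once from the version read off disk.  The invariant spelled out
// as a small sketch (names illustrative):

#include <cassert>

struct DirVersionsSketch {
  unsigned long long committed, committing, version, projected;
  DirVersionsSketch() : committed(0), committing(0), version(0), projected(0) {}
  void check() const {
    assert(committed <= committing);
    assert(committing <= version);
    assert(version <= projected);
  }
  void seed_from_disk(unsigned long long got) {
    assert(version == 0 && projected == 0);   // only a fresh, never-dirtied dirfrag
    committed = committing = version = projected = got;
  }
};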
- if (version == 0) { - assert(projected_version == 0); - assert(!state_test(STATE_COMMITTING)); - projected_version = version = committing_version = committed_version = got_version; - } - - // mark complete, !fetching - state_set(STATE_COMPLETE); - state_clear(STATE_FETCHING); - - // kick waiters - finish_waiting(WAIT_COMPLETE, 0); - /* - list waiters; - take_waiting(WAIT_COMPLETE, waiters); - cache->mds->queue_finished(waiters); - */ -} - - - -// ----------------------- -// COMMIT - -/** - * commit - * - * @param want min version i want committed - * @param c callback for completion - */ -void CDir::commit(version_t want, Context *c) -{ - dout(10) << "commit want " << want << " on " << *this << endl; - if (want == 0) want = version; - - // preconditions - assert(want <= version || version == 0); // can't commit the future - assert(committed_version < want); // the caller is stupid - assert(is_auth()); - assert(can_auth_pin()); - - // note: queue up a noop if necessary, so that we always - // get an auth_pin. - if (!c) - c = new C_NoopContext; - - // auth_pin on first waiter - if (waiting_for_commit.empty()) - auth_pin(); - waiting_for_commit[want].push_back(c); - - // ok. - _commit(want); -} - - -class C_Dir_RetryCommit : public Context { - CDir *dir; - version_t want; -public: - C_Dir_RetryCommit(CDir *d, version_t v) : - dir(d), want(v) { } - void finish(int r) { - dir->_commit(want); - } -}; - -class C_Dir_Committed : public Context { - CDir *dir; - version_t version; -public: - C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { } - void finish(int r) { - dir->_committed(version); - } -}; - -void CDir::_commit(version_t want) -{ - dout(10) << "_commit want " << want << " on " << *this << endl; - - // we can't commit things in the future. - // (even the projected future.) - assert(want <= version || version == 0); - - // check pre+postconditions. - assert(is_auth()); - - // already committed? - if (committed_version >= want) { - dout(10) << "already committed " << committed_version << " >= " << want << endl; - return; - } - // already committing >= want? - if (committing_version >= want) { - dout(10) << "already committing " << committing_version << " >= " << want << endl; - assert(state_test(STATE_COMMITTING)); - return; - } - - // complete? - if (!is_complete()) { - dout(7) << "commit not complete, fetching first" << endl; - fetch(new C_Dir_RetryCommit(this, want)); - return; - } - - // commit. - committing_version = version; - - // mark committing (if not already) - if (!state_test(STATE_COMMITTING)) { - dout(10) << "marking committing" << endl; - state_set(STATE_COMMITTING); - } - - if (cache->mds->logger) cache->mds->logger->inc("cdir"); - - // encode dentries - bufferlist bl; - bl.append((char*)&version, sizeof(version)); - - for (CDir_map_t::iterator it = items.begin(); - it != items.end(); - it++) { - CDentry *dn = it->second; - - if (dn->is_null()) - continue; // skip negative entries - - // primary or remote? 
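// Each non-null dentry is written as one tagged record, the same layout the
// _fetched() parser above walks:
//   'L'  dname  ino                             (remote hard link)
//   'I'  dname  inode_t  [symlink]  fragtree    (primary link)
// A toy writer for the remote-link case, using std::string in place of the real
// bufferlist and a simple length-prefixed name encoding (illustration only):

#include <cstddef>
#include <string>

static void append_raw(std::string &out, const void *p, size_t len) {
  out.append(static_cast<const char *>(p), len);
}

static void encode_remote_link(std::string &out,
                               const std::string &dname,
                               unsigned long long ino) {
  out.push_back('L');                        // record marker
  unsigned nlen = dname.size();
  append_raw(out, &nlen, sizeof(nlen));      // name length
  out.append(dname);                         // name bytes
  append_raw(out, &ino, sizeof(ino));        // link target's inode number
}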
- if (dn->is_remote()) { - inodeno_t ino = dn->get_remote_ino(); - dout(14) << " pos " << bl.length() << " dn '" << it->first << "' remote ino " << ino << endl; - - // marker, name, ino - bl.append( "L", 1 ); // remote link - ::_encode(it->first, bl); - ::_encode(ino, bl); - } else { - // primary link - CInode *in = dn->get_inode(); - assert(in); - - dout(14) << " pos " << bl.length() << " dn '" << it->first << "' inode " << *in << endl; - - // marker, name, inode, [symlink string] - bl.append( "I", 1 ); // inode - ::_encode(it->first, bl); - ::_encode(in->inode, bl); - - if (in->is_symlink()) { - // include symlink destination! - dout(18) << " inlcuding symlink ptr " << in->symlink << endl; - ::_encode(in->symlink, bl); - } - - in->dirfragtree._encode(bl); - } - } - - // write it. - cache->mds->objecter->write( get_ondisk_object(), - 0, bl.length(), - cache->mds->objecter->osdmap->file_to_object_layout( get_ondisk_object(), - g_OSD_MDDirLayout ), - bl, - NULL, new C_Dir_Committed(this, version) ); -} - - -/** - * _committed - * - * @param v version i just committed - */ -void CDir::_committed(version_t v) -{ - dout(10) << "_committed v " << v << " on " << *this << endl; - assert(is_auth()); - - // take note. - assert(v > committed_version); - assert(v <= committing_version); - committed_version = v; - - // _all_ commits done? - if (committing_version == committed_version) - state_clear(CDir::STATE_COMMITTING); - - // dir clean? - if (committed_version == version) - mark_clean(); - - // dentries clean? - for (CDir_map_t::iterator it = items.begin(); - it != items.end(); ) { - CDentry *dn = it->second; - it++; - - // dentry - if (committed_version >= dn->get_version()) { - if (dn->is_dirty()) { - dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << endl; - dn->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << endl; - } - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - assert(in); - assert(in->is_auth()); - - if (committed_version >= in->get_version()) { - if (in->is_dirty()) { - dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << endl; - in->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << endl; - assert(in->is_dirty()); - } - } - } - - // finishers? - bool were_waiters = !waiting_for_commit.empty(); - - map >::iterator p = waiting_for_commit.begin(); - while (p != waiting_for_commit.end()) { - map >::iterator n = p; - n++; - if (p->first > committed_version) break; // haven't committed this far yet. - cache->mds->queue_waiters(p->second); - waiting_for_commit.erase(p); - p = n; - } - - // unpin if we kicked the last waiter. - if (were_waiters && - waiting_for_commit.empty()) - auth_unpin(); -} - - - - - - -/******************************** - * AUTHORITY - */ - -/* - * if dir_auth.first == parent, auth is same as inode. - * unless .second != unknown, in which case that sticks. - */ -pair CDir::authority() -{ - if (is_subtree_root()) - return dir_auth; - else - return inode->authority(); -} - -/** is_subtree_root() - * true if this is an auth delegation point. - * that is, dir_auth != default (parent,unknown) - * - * some key observations: - * if i am auth: - * - any region bound will be an export, or frozen. 
- * - * note that this DOES heed dir_auth.pending - */ -bool CDir::is_subtree_root() -{ - if (dir_auth == CDIR_AUTH_DEFAULT) { - //dout(10) << "is_subtree_root false " << dir_auth << " != " << CDIR_AUTH_DEFAULT - //<< " on " << ino() << endl; - return false; - } else { - //dout(10) << "is_subtree_root true " << dir_auth << " != " << CDIR_AUTH_DEFAULT - //<< " on " << ino() << endl; - return true; - } -} - - - - -/** set_dir_auth - * - * always list ourselves first. - * - * accept 'iamauth' param so that i can intelligently adjust freeze auth_pins - * even when the auth bit isn't correct. - * as when calling MDCache::import_subtree(...). - */ -void CDir::set_dir_auth(pair a, bool iamauth) -{ - dout(10) << "setting dir_auth=" << a - << " from " << dir_auth - << " on " << *this << endl; - - bool was_subtree = is_subtree_root(); - bool was_ambiguous = dir_auth.second >= 0; - - // set it. - dir_auth = a; - - // new subtree root? - if (!was_subtree && is_subtree_root()) { - dout(10) << " new subtree root, adjusting auth_pins" << endl; - - // adjust nested auth pins - inode->adjust_nested_auth_pins(-get_cum_auth_pins()); - - // unpin parent of frozen dir/tree? - if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) - inode->auth_unpin(); - } - if (was_subtree && !is_subtree_root()) { - dout(10) << " old subtree root, adjusting auth_pins" << endl; - - // adjust nested auth pins - inode->adjust_nested_auth_pins(get_cum_auth_pins()); - - // pin parent of frozen dir/tree? - if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) - inode->auth_pin(); - } - - // newly single auth? - if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) { - list ls; - take_waiting(WAIT_SINGLEAUTH, ls); - cache->mds->queue_waiters(ls); - } -} - - -/***************************************** - * AUTH PINS and FREEZING - * - * the basic plan is that auth_pins only exist in auth regions, and they - * prevent a freeze (and subsequent auth change). - * - * however, we also need to prevent a parent from freezing if a child is frozen. - * for that reason, the parent inode of a frozen directory is auth_pinned. - * - * the oddity is when the frozen directory is a subtree root. if that's the case, - * the parent inode isn't frozen. which means that when subtree authority is adjusted - * at the bounds, inodes for any frozen bound directories need to get auth_pins at that - * time. - * - */ - -void CDir::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - // nest pins? - if (is_subtree_root()) return; // no. - //assert(!is_import()); - - inode->nested_auth_pins++; - if (inode->parent) - inode->parent->dir->adjust_nested_auth_pins( 1 ); -} - -void CDir::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - assert(auth_pins >= 0); - - // pending freeze? - if (auth_pins + nested_auth_pins == 0) - on_freezeable(); - - // nest? - if (is_subtree_root()) return; // no. 
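// Auth pins are counted both locally and as "nested" pins on every ancestor up
// to, but not across, the enclosing subtree root; a tree or dir can only freeze
// once auth_pins + nested_auth_pins drains to zero.  A stripped-down model of
// that upward propagation (PinNode collapses the alternating CDir/CInode chain
// into one type, so it shows the shape of the accounting, not the real layering):

struct PinNode {
  PinNode *parent;
  bool subtree_root;
  int auth_pins, nested_auth_pins;

  PinNode(PinNode *p, bool root)
    : parent(p), subtree_root(root), auth_pins(0), nested_auth_pins(0) {}

  bool freezeable() const { return auth_pins + nested_auth_pins == 0; }

  void adjust_nested(int inc) {
    nested_auth_pins += inc;
    if (!subtree_root && parent)             // stop at the subtree root
      parent->adjust_nested(inc);
  }
  void auth_pin()   { ++auth_pins;  if (!subtree_root && parent) parent->adjust_nested(+1); }
  void auth_unpin() { --auth_pins;  if (!subtree_root && parent) parent->adjust_nested(-1); }
};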
- //assert(!is_import()); - - inode->nested_auth_pins--; - if (inode->parent) - inode->parent->dir->adjust_nested_auth_pins( -1 ); -} - -void CDir::adjust_nested_auth_pins(int inc) -{ - CDir *dir = this; - - // dir - dir->nested_auth_pins += inc; - - dout(10) << "adjust_nested_auth_pins " << inc << " on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl; - assert(dir->nested_auth_pins >= 0); - - // pending freeze? - if (is_freezeable()) - dir->on_freezeable(); - // on freezeable_dir too? FIXME - - // adjust my inode? - if (dir->is_subtree_root()) - return; // no, stop. - - // yes. - dir->inode->adjust_nested_auth_pins(inc); -} - - - -/***************************************************************************** - * FREEZING - */ - -void CDir::on_freezeable() -{ - // check for anything pending freezeable - - /* NOTE: this will be called on deeper dirs first, walking up toward - the root, meaning that deeper freeze attempts will succeed first. - */ - /* NOTE: the first of these will likely freeze the dir, and unmark - FREEZING. additional ones will re-flag FREEZING. this isn't - particularly graceful, and might cause problems if the first one - needs to know about other waiters.... FIXME? */ - - finish_waiting(WAIT_FREEZEABLE); -} - -// FREEZE TREE - -class C_MDS_FreezeTree : public Context { - CDir *dir; - Context *con; -public: - C_MDS_FreezeTree(CDir *dir, Context *c) { - this->dir = dir; - this->con = c; - } - virtual void finish(int r) { - dir->freeze_tree_finish(con); - } -}; - -void CDir::freeze_tree(Context *c) -{ - assert(!is_frozen()); - assert(!is_freezing()); - - if (is_freezeable()) { - dout(10) << "freeze_tree " << *this << endl; - _freeze_tree(c); - } else { - state_set(STATE_FREEZINGTREE); - dout(10) << "freeze_tree + wait " << *this << endl; - - // need to wait for auth pins to expire - add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); - } -} - -void CDir::freeze_tree_finish(Context *c) -{ - // still freezing? (we may have been canceled) - if (!is_freezing()) { - dout(10) << "freeze_tree_finish no longer freezing, done on " << *this << endl; - c->finish(-1); - delete c; - return; - } - - // freezeable now? - if (!is_freezeable()) { - // wait again! - dout(10) << "freeze_tree_finish still waiting " << *this << endl; - state_set(STATE_FREEZINGTREE); - add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); - return; - } - - dout(10) << "freeze_tree_finish " << *this << endl; - _freeze_tree(c); -} - -void CDir::_freeze_tree(Context *c) -{ - dout(10) << "_freeze_tree " << *this << endl; - - // there shouldn't be any conflicting auth_pins. - assert(is_freezeable_dir()); - - // twiddle state - state_clear(STATE_FREEZINGTREE); // actually, this may get set again by next context? - state_set(STATE_FROZENTREE); - - // auth_pin inode for duration of freeze, if we are not a subtree root. - if (is_auth() && !is_subtree_root()) - inode->auth_pin(); - - // continue to frozen land - if (c) { - c->finish(0); - delete c; - } -} - -void CDir::unfreeze_tree() -{ - dout(10) << "unfreeze_tree " << *this << endl; - - if (state_test(STATE_FROZENTREE)) { - // frozen. unfreeze. - state_clear(STATE_FROZENTREE); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - if (is_auth() && !is_subtree_root()) - inode->auth_unpin(); - - // waiters? - finish_waiting(WAIT_UNFREEZE); - } else { - // freezing. stop it. 
- assert(state_test(STATE_FREEZINGTREE)); - state_clear(STATE_FREEZINGTREE); - - // cancel freeze waiters - finish_waiting(WAIT_UNFREEZE); - finish_waiting(WAIT_FREEZEABLE, -1); - } -} - -bool CDir::is_freezing_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_freezing_tree_root()) return true; - if (dir->is_subtree_root()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -bool CDir::is_frozen_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) return true; - if (dir->is_subtree_root()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -CDir *CDir::get_frozen_tree_root() -{ - assert(is_frozen()); - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) - return dir; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - assert(0); - } -} - - - -// FREEZE DIR - -class C_MDS_FreezeDir : public Context { - CDir *dir; - Context *con; -public: - C_MDS_FreezeDir(CDir *dir, Context *c) { - this->dir = dir; - this->con = c; - } - virtual void finish(int r) { - dir->freeze_dir_finish(con); - } -}; - -void CDir::freeze_dir(Context *c) -{ - assert(!is_frozen()); - assert(!is_freezing()); - - if (is_freezeable_dir()) { - dout(10) << "freeze_dir " << *this << endl; - _freeze_dir(c); - } else { - state_set(STATE_FREEZINGDIR); - dout(10) << "freeze_dir + wait " << *this << endl; - - // need to wait for auth pins to expire - add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); - } -} - -void CDir::_freeze_dir(Context *c) -{ - dout(10) << "_freeze_dir " << *this << endl; - - state_set(STATE_FROZENDIR); - - if (is_auth() && !is_subtree_root()) - inode->auth_pin(); // auth_pin for duration of freeze - - if (c) { - c->finish(0); - delete c; - } -} - -void CDir::freeze_dir_finish(Context *c) -{ - // freezeable now? - if (is_freezeable_dir()) { - // freeze now - _freeze_dir(c); - } else { - // wait again! - dout(10) << "freeze_dir_finish still waiting " << *this << endl; - state_set(STATE_FREEZINGDIR); - add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); - } -} - -void CDir::unfreeze_dir() -{ - dout(10) << "unfreeze_dir " << *this << endl; - state_clear(STATE_FROZENDIR); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - if (is_auth() && !is_subtree_root()) - inode->auth_unpin(); - - // waiters? - finish_waiting(WAIT_UNFREEZE); -} - - - - - - - - diff --git a/branches/sage/pgs/mds/CDir.h b/branches/sage/pgs/mds/CDir.h deleted file mode 100644 index 6fef0fda4364b..0000000000000 --- a/branches/sage/pgs/mds/CDir.h +++ /dev/null @@ -1,576 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CDIR_H -#define __CDIR_H - -#include "include/types.h" -#include "include/buffer.h" -#include "mdstypes.h" -#include "config.h" -#include "common/DecayCounter.h" - -#include -#include - -#include -#include -#include -#include -using namespace std; - -#include -using __gnu_cxx::hash_map; - - -#include "CInode.h" - -class CDentry; -class MDCache; -class MDCluster; -class Context; -class CDirDiscover; - - - -ostream& operator<<(ostream& out, class CDir& dir); - - -// CDir -typedef map CDir_map_t; - - -//extern int cdir_pins[CDIR_NUM_PINS]; - - -class CDir : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_DNWAITER = 1; - static const int PIN_CHILD = 2; - static const int PIN_EXPORT = 4; - static const int PIN_AUTHPIN = 8; - static const int PIN_IMPORTING = 9; - static const int PIN_EXPORTING = 10; - static const int PIN_IMPORTBOUND = 11; - static const int PIN_EXPORTBOUND = 12; - static const int PIN_LOGGINGEXPORTFINISH = 17; - const char *pin_name(int p) { - switch (p) { - case PIN_DNWAITER: return "dnwaiter"; - case PIN_CHILD: return "child"; - case PIN_EXPORT: return "export"; - case PIN_EXPORTING: return "exporting"; - case PIN_IMPORTING: return "importing"; - case PIN_IMPORTBOUND: return "importbound"; - case PIN_EXPORTBOUND: return "exportbound"; - case PIN_AUTHPIN: return "authpin"; - case PIN_LOGGINGEXPORTFINISH: return "loggingexportfinish"; - default: return generic_pin_name(p); - } - } - - // -- state -- - static const unsigned STATE_COMPLETE = (1<< 2); // the complete contents are in cache - static const unsigned STATE_FROZENTREE = (1<< 4); // root of tree (bounded by exports) - static const unsigned STATE_FREEZINGTREE = (1<< 5); // in process of freezing - static const unsigned STATE_FROZENDIR = (1<< 6); - static const unsigned STATE_FREEZINGDIR = (1<< 7); - static const unsigned STATE_COMMITTING = (1<< 8); // mid-commit - static const unsigned STATE_FETCHING = (1<< 9); // currenting fetching - static const unsigned STATE_DELETED = (1<<10); - //static const unsigned STATE_IMPORT = (1<<11); // flag set if this is an import. - static const unsigned STATE_EXPORT = (1<<12); - static const unsigned STATE_IMPORTBOUND = (1<<13); - static const unsigned STATE_EXPORTBOUND = (1<<14); - static const unsigned STATE_EXPORTING = (1<<15); - static const unsigned STATE_IMPORTING = (1<<16); - - // common states - static const unsigned STATE_CLEAN = 0; - static const unsigned STATE_INITIAL = 0; - - // these state bits are preserved by an import/export - // ...except if the directory is hashed, in which case none of them are! 
- static const unsigned MASK_STATE_EXPORTED = - STATE_COMPLETE|STATE_DIRTY; - static const unsigned MASK_STATE_IMPORT_KEPT = - //STATE_IMPORT| - STATE_EXPORT - |STATE_IMPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE; - static const unsigned MASK_STATE_EXPORT_KEPT = - STATE_EXPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_FROZENDIR - |STATE_EXPORT; - - - // -- rep spec -- - static const int REP_NONE = 0; - static const int REP_ALL = 1; - static const int REP_LIST = 2; - - - static const int NONCE_EXPORT = 1; - - - // -- wait masks -- - static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache - static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents - static const int WAIT_FREEZEABLE = (1<<2); // hard_pins removed - static const int WAIT_UNFREEZE = WAIT_AUTHPINNABLE; // unfreeze - static const int WAIT_IMPORTED = (1<<3); // import finish - - static const int WAIT_DNLOCK_OFFSET = 4; - - static const int WAIT_ANY = (0xffffffff); - static const int WAIT_ATFREEZEROOT = (WAIT_AUTHPINNABLE|WAIT_UNFREEZE); - static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); - - - - - public: - // context - MDCache *cache; - - CInode *inode; // my inode - frag_t frag; // my frag - - bool is_lt(const MDSCacheObject *r) const { - return dirfrag() < ((const CDir*)r)->dirfrag(); - } - -protected: - // contents - CDir_map_t items; // non-null AND null - size_t nitems; // # non-null - size_t nnull; // # null - - int num_dirty; - - // state - version_t version; - version_t committing_version; - version_t committed_version; - version_t committed_version_equivalent; // in case of, e.g., temporary file - version_t projected_version; - - // lock nesting, freeze - int auth_pins; - int nested_auth_pins; - int request_pins; - - - // cache control (defined for authority; hints for replicas) - int dir_rep; - set dir_rep_by; // if dir_rep == REP_LIST - - // popularity - meta_load_t popularity[MDS_NPOP]; - - // friends - friend class Migrator; - friend class CInode; - friend class MDCache; - friend class MDiscover; - friend class MDBalancer; - - friend class CDirDiscover; - friend class CDirExport; - - public: - CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); - - - - // -- accessors -- - inodeno_t ino() const { return inode->ino(); } // deprecate me? 
- dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); } - - CInode *get_inode() { return inode; } - CDir *get_parent_dir() { return inode->get_parent_dir(); } - - CDir_map_t::iterator begin() { return items.begin(); } - CDir_map_t::iterator end() { return items.end(); } - size_t get_size() { - return nitems; - } - size_t get_nitems() { return nitems; } - size_t get_nnull() { return nnull; } - - void inc_num_dirty() { num_dirty++; } - void dec_num_dirty() { - assert(num_dirty > 0); - num_dirty--; - } - int get_num_dirty() { - return num_dirty; - } - - void try_remove_unlinked_dn(CDentry *dn); - - // -- dentries and inodes -- - public: - CDentry* lookup(const string& n) { - map::iterator iter = items.find(n); - if (iter == items.end()) - return 0; - else - return iter->second; - } - - CDentry* add_dentry( const string& dname, CInode *in=0 ); - CDentry* add_dentry( const string& dname, inodeno_t ino ); - void remove_dentry( CDentry *dn ); // delete dentry - void link_inode( CDentry *dn, inodeno_t ino ); - void link_inode( CDentry *dn, CInode *in ); - void unlink_inode( CDentry *dn ); - private: - void link_inode_work( CDentry *dn, CInode *in ); - void unlink_inode_work( CDentry *dn ); - - void remove_null_dentries(); - - // -- authority -- - /* - * normal: !subtree_root - * delegation: subtree_root - * ambiguous: subtree_root - * subtree_root - */ - pair dir_auth; - - public: - pair authority(); - pair get_dir_auth() { return dir_auth; } - void set_dir_auth(pair a, bool iamauth=false); - void set_dir_auth(int a) { - set_dir_auth(pair(a, CDIR_AUTH_UNKNOWN), false); - } - bool is_ambiguous_dir_auth() { - return dir_auth.second != CDIR_AUTH_UNKNOWN; - } - bool is_full_dir_auth() { - return is_auth() && !is_ambiguous_dir_auth(); - } - bool is_full_dir_nonauth() { - return !is_auth() && !is_ambiguous_dir_auth(); - } - - bool is_subtree_root(); - - - - // for giving to clients - void get_dist_spec(set& ls, int auth) { - if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > - g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; - for (map::iterator p = replicas_begin(); - p != replicas_end(); - ++p) - ls.insert(p->first); - if (!ls.empty()) - ls.insert(auth); - } - } - - CDirDiscover *replicate_to(int mds); - - - // -- state -- - bool is_complete() { return state & STATE_COMPLETE; } - bool is_exporting() { return state & STATE_EXPORTING; } - bool is_importing() { return state & STATE_IMPORTING; } - - bool is_rep() { - if (dir_rep == REP_NONE) return false; - return true; - } - - // -- fetch -- - object_t get_ondisk_object() { return object_t(ino(), frag); } - void fetch(Context *c); - void _fetched(bufferlist &bl); - - // -- commit -- - map > waiting_for_commit; - - void commit_to(version_t want); - void commit(version_t want, Context *c); - void _commit(version_t want); - void _committed(version_t v); - void wait_for_commit(Context *c, version_t v=0); - - // -- dirtyness -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - version_t get_committing_version() { return committing_version; } - version_t get_committed_version() { return committed_version; } - version_t get_committed_version_equivalent() { return committed_version_equivalent; } - void set_committed_version(version_t v) { committed_version = v; } - - version_t pre_dirty(version_t min=0); - void 
_mark_dirty(); - void mark_dirty(version_t pv); - void mark_clean(); - void mark_complete() { state_set(STATE_COMPLETE); } - - - // -- reference counting -- - void first_get(); - void last_put(); - - void request_pin_get() { - if (request_pins == 0) get(PIN_REQUEST); - request_pins++; - } - void request_pin_put() { - request_pins--; - if (request_pins == 0) put(PIN_REQUEST); - } - - - // -- waiters -- -protected: - hash_map< string, list > waiting_on_dentry; - -public: - bool is_waiting_for_dentry(const string& dn) { - return waiting_on_dentry.count(dn); - } - void add_dentry_waiter(const string& dentry, Context *c); - void take_dentry_waiting(const string& dentry, list& ls); - - void add_waiter(int mask, Context *c); - void take_waiting(int mask, list& ls); // may include dentry waiters - void finish_waiting(int mask, int result = 0); // ditto - - - // -- auth pins -- - bool can_auth_pin() { return is_auth() && !(is_frozen() || is_freezing()); } - int is_auth_pinned() { return auth_pins; } - int get_cum_auth_pins() { return auth_pins + nested_auth_pins; } - int get_auth_pins() { return auth_pins; } - int get_nested_auth_pins() { return nested_auth_pins; } - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int inc); - void on_freezeable(); - - // -- freezing -- - void freeze_tree(Context *c); - void freeze_tree_finish(Context *c); - void unfreeze_tree(); - void _freeze_tree(Context *c=0); - - void freeze_dir(Context *c); - void freeze_dir_finish(Context *c); - void _freeze_dir(Context *c=0); - void unfreeze_dir(); - - bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } - bool is_freezing_tree(); - bool is_freezing_tree_root() { return state & STATE_FREEZINGTREE; } - bool is_freezing_dir() { return state & STATE_FREEZINGDIR; } - - bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } - bool is_frozen_tree(); - bool is_frozen_tree_root() { return state & STATE_FROZENTREE; } - bool is_frozen_dir() { return state & STATE_FROZENDIR; } - - bool is_freezeable() { - // no nested auth pins. - if (auth_pins > 0 || nested_auth_pins > 0) - return false; - - // inode must not be frozen. - if (!is_subtree_root() && inode->is_frozen()) - return false; - - return true; - } - bool is_freezeable_dir() { - if (auth_pins > 0) - return false; - - // if not subtree root, inode must not be frozen. 
- if (!is_subtree_root() && inode->is_frozen()) - return false; - - return true; - } - - CDir *get_frozen_tree_root(); - - - - ostream& print_db_line_prefix(ostream& out); - void print(ostream& out); -}; - - - -// -- encoded state -- - -// discover - -class CDirDiscover { - dirfrag_t dirfrag; - int nonce; - int dir_rep; - set rep_by; - - public: - CDirDiscover() {} - CDirDiscover(CDir *dir, int nonce) { - dirfrag = dir->dirfrag(); - this->nonce = nonce; - dir_rep = dir->dir_rep; - rep_by = dir->dir_rep_by; - } - - void update_dir(CDir *dir) { - assert(dir->dirfrag() == dirfrag); - assert(!dir->is_auth()); - - dir->replica_nonce = nonce; - dir->dir_rep = dir_rep; - dir->dir_rep_by = rep_by; - } - - dirfrag_t get_dirfrag() { return dirfrag; } - - - void _encode(bufferlist& bl) { - bl.append((char*)&dirfrag, sizeof(dirfrag)); - bl.append((char*)&nonce, sizeof(nonce)); - bl.append((char*)&dir_rep, sizeof(dir_rep)); - ::_encode(rep_by, bl); - } - - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - bl.copy(off, sizeof(nonce), (char*)&nonce); - off += sizeof(nonce); - bl.copy(off, sizeof(dir_rep), (char*)&dir_rep); - off += sizeof(dir_rep); - ::_decode(rep_by, bl, off); - } - -}; - - -// export - -class CDirExport { - struct { - dirfrag_t dirfrag; - uint32_t nden; // num dentries (including null ones) - version_t version; - version_t committed_version; - uint32_t state; - meta_load_t popularity_justme; - meta_load_t popularity_curdom; - int32_t dir_rep; - } st; - map replicas; - set rep_by; - - public: - CDirExport() {} - CDirExport(CDir *dir) { - memset(&st, 0, sizeof(st)); - - assert(dir->get_version() == dir->get_projected_version()); - - st.dirfrag = dir->dirfrag(); - st.nden = dir->items.size(); - st.version = dir->version; - st.committed_version = dir->committed_version; - st.state = dir->state; - st.dir_rep = dir->dir_rep; - - st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] ); - st.popularity_curdom.take( dir->popularity[MDS_POP_CURDOM] ); - dir->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; - dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom; - - rep_by = dir->dir_rep_by; - replicas = dir->replicas; - } - - dirfrag_t get_dirfrag() { return st.dirfrag; } - uint32_t get_nden() { return st.nden; } - - void update_dir(CDir *dir) { - assert(dir->dirfrag() == st.dirfrag); - - // set committed_version at old version - dir->committing_version = dir->committed_version = st.committed_version; - dir->projected_version = dir->version = st.version; - - // twiddle state - dir->state = (dir->state & CDir::MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. 
- (st.state & CDir::MASK_STATE_EXPORTED); - dir->dir_rep = st.dir_rep; - - dir->popularity[MDS_POP_JUSTME] += st.popularity_justme; - dir->popularity[MDS_POP_CURDOM] += st.popularity_curdom; - dir->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; - dir->popularity[MDS_POP_NESTED] += st.popularity_curdom; - - dir->replica_nonce = 0; // no longer defined - - if (!dir->replicas.empty()) - dout(0) << "replicas not empty non import, " << *dir << ", " << dir->replicas << endl; - - dir->dir_rep_by = rep_by; - dir->replicas = replicas; - dout(12) << "replicas in export is " << replicas << ", dir now " << dir->replicas << endl; - if (!replicas.empty()) - dir->get(CDir::PIN_REPLICATED); - if (dir->is_dirty()) { - dir->get(CDir::PIN_DIRTY); - } - } - - - void _encode(bufferlist& bl) { - bl.append((char*)&st, sizeof(st)); - ::_encode(replicas, bl); - ::_encode(rep_by, bl); - } - - int _decode(bufferlist& bl, int off = 0) { - bl.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(replicas, bl, off); - ::_decode(rep_by, bl, off); - return off; - } - -}; - - - -#endif diff --git a/branches/sage/pgs/mds/CInode.cc b/branches/sage/pgs/mds/CInode.cc deleted file mode 100644 index 5eb3f3f947785..0000000000000 --- a/branches/sage/pgs/mds/CInode.cc +++ /dev/null @@ -1,590 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDS.h" -#include "MDCache.h" -#include "AnchorTable.h" - -#include "common/Clock.h" - -#include "messages/MLock.h" - -#include -#include - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " - - -//int cinode_pins[CINODE_NUM_PINS]; // counts -ostream& CInode::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "; -} - - - -ostream& operator<<(ostream& out, CInode& in) -{ - string path; - in.make_path(path); - out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" "); - if (in.is_auth()) { - out << "auth"; - if (in.is_replicated()) - out << in.get_replicas(); - } else { - out << "rep@" << in.authority(); - out << "." 
<< in.get_replica_nonce(); - assert(in.get_replica_nonce() >= 0); - } - - if (in.is_symlink()) out << " symlink"; - - out << " v" << in.get_version(); - - // locks - out << " " << in.authlock; - out << " " << in.linklock; - out << " " << in.dirfragtreelock; - out << " " << in.filelock; - out << " " << in.dirlock; - - // hack: spit out crap on which clients have caps - if (!in.get_client_caps().empty()) { - out << " caps={"; - for (map::iterator it = in.get_client_caps().begin(); - it != in.get_client_caps().end(); - it++) { - if (it != in.get_client_caps().begin()) out << ","; - out << it->first; - } - out << "}"; - } - - if (in.get_num_ref()) { - out << " |"; - in.print_pin_set(out); - } - - out << " " << ∈ - out << "]"; - return out; -} - - -void CInode::print(ostream& out) -{ - out << *this; -} - - -inode_t *CInode::project_inode() -{ - if (projected_inode.empty()) { - projected_inode.push_back(new inode_t(inode)); - } else { - projected_inode.push_back(new inode_t(*projected_inode.back())); - } - dout(15) << "project_inode " << projected_inode.back() << endl; - return projected_inode.back(); -} - -void CInode::pop_and_dirty_projected_inode() -{ - assert(!projected_inode.empty()); - dout(15) << "pop_and_dirty_projected_inode " << projected_inode.front() - << " v" << projected_inode.front()->version << endl; - mark_dirty(projected_inode.front()->version); - inode = *projected_inode.front(); - delete projected_inode.front(); - projected_inode.pop_front(); -} - - -// ====== CInode ======= - -// dirfrags - -frag_t CInode::pick_dirfrag(const string& dn) -{ - if (dirfragtree.empty()) - return frag_t(); // avoid the string hash if we can. - - static hash H; - return dirfragtree[H(dn)]; -} - -void CInode::get_dirfrags(list& ls) -{ - // all dirfrags - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - ls.push_back(p->second); -} -void CInode::get_nested_dirfrags(list& ls) -{ - // dirfrags in same subtree - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (!p->second->is_subtree_root()) - ls.push_back(p->second); -} -void CInode::get_subtree_dirfrags(list& ls) -{ - // dirfrags that are roots of new subtrees - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_subtree_root()) - ls.push_back(p->second); -} - - -CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) -{ - assert(is_dir()); - - // have it? - CDir *dir = get_dirfrag(fg); - if (dir) return dir; - - // create it. 
- assert(is_auth()); - dir = dirfrags[fg] = new CDir(this, fg, mdcache, true); - return dir; -} - -CDir *CInode::add_dirfrag(CDir *dir) -{ - assert(dirfrags.count(dir->dirfrag().frag) == 0); - dirfrags[dir->dirfrag().frag] = dir; - return dir; -} - -void CInode::close_dirfrag(frag_t fg) -{ - dout(14) << "close_dirfrag " << fg << endl; - assert(dirfrags.count(fg)); - - CDir *dir = dirfrags[fg]; - dir->remove_null_dentries(); - - // clear dirty flag - if (dir->is_dirty()) - dir->mark_clean(); - - // dump any remaining dentries, for debugging purposes - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) - dout(14) << "close_dirfrag LEFTOVER dn " << *p->second << endl; - - assert(dir->get_num_ref() == 0); - delete dir; - dirfrags.erase(fg); -} - -void CInode::close_dirfrags() -{ - while (!dirfrags.empty()) - close_dirfrag(dirfrags.begin()->first); -} - -bool CInode::has_subtree_root_dirfrag() -{ - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_subtree_root()) - return true; - return false; -} - - - - -// pins - -void CInode::first_get() -{ - // pin my dentry? - if (parent) - parent->get(CDentry::PIN_INODEPIN); -} - -void CInode::last_put() -{ - // unpin my dentry? - if (parent) { - parent->put(CDentry::PIN_INODEPIN); - } - //if (num_parents == 0 && get_num_ref() == 0) - //mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection -} - -/* -void CInode::get_parent() -{ - num_parents++; -} -void CInode::put_parent() -{ - num_parents--; - if (num_parents == 0 && get_num_ref() == 0) - mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection -} -*/ - -void CInode::add_remote_parent(CDentry *p) -{ - if (remote_parents.empty()) - get(PIN_REMOTEPARENT); - remote_parents.insert(p); -} -void CInode::remove_remote_parent(CDentry *p) -{ - remote_parents.erase(p); - if (remote_parents.empty()) - put(PIN_REMOTEPARENT); -} - - - - -CDir *CInode::get_parent_dir() -{ - if (parent) - return parent->dir; - return NULL; -} -CInode *CInode::get_parent_inode() -{ - if (parent) - return parent->dir->inode; - return NULL; -} - - - -void CInode::make_path(string& s) -{ - if (parent) { - parent->make_path(s); - } - else if (is_root()) { - s = ""; // root - } - else if (is_stray()) { - s = "~stray"; - char n[10]; - sprintf(n, "%d", (int)(ino()-MDS_INO_STRAY_OFFSET)); - s += n; - } - else { - s = "(dangling)"; // dangling - } -} - -void CInode::make_anchor_trace(vector& trace) -{ - if (parent) { - parent->dir->inode->make_anchor_trace(trace); - trace.push_back(Anchor(ino(), parent->dir->dirfrag())); - dout(10) << "make_anchor_trace added " << trace.back() << endl; - } - else - assert(is_root() || is_stray()); -} - -void CInode::name_stray_dentry(string& dname) -{ - char s[20]; - sprintf(s, "%ld", inode.ino.val); - dname = s; -} - - -version_t CInode::pre_dirty() -{ - assert(parent); - return parent->pre_dirty(); -} - -void CInode::_mark_dirty() -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - get(PIN_DIRTY); - } -} - -void CInode::mark_dirty(version_t pv) { - - dout(10) << "mark_dirty " << *this << endl; - - assert(parent); - - /* - NOTE: I may already be dirty, but this fn _still_ needs to be called so that - the directory is (perhaps newly) dirtied, and so that parent_dir_version is - updated below. - */ - - // only auth can get dirty. "dirty" async data in replicas is relative to - // filelock state, not the dirty flag. 
- assert(is_auth()); - - // touch my private version - assert(inode.version < pv); - inode.version = pv; - _mark_dirty(); - - // mark dentry too - parent->mark_dirty(pv); -} - - -void CInode::mark_clean() -{ - dout(10) << " mark_clean " << *this << endl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - } -} - - - -// ------------------ -// locking - -void CInode::set_object_info(MDSCacheObjectInfo &info) -{ - info.ino = ino(); -} - -void CInode::encode_lock_state(int type, bufferlist& bl) -{ - switch (type) { - case LOCK_OTYPE_IAUTH: - _encode(inode.ctime, bl); - _encode(inode.mode, bl); - _encode(inode.uid, bl); - _encode(inode.gid, bl); - break; - - case LOCK_OTYPE_ILINK: - _encode(inode.ctime, bl); - _encode(inode.nlink, bl); - _encode(inode.anchored, bl); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - dirfragtree._encode(bl); - break; - - case LOCK_OTYPE_IFILE: - _encode(inode.size, bl); - _encode(inode.mtime, bl); - _encode(inode.atime, bl); - break; - - case LOCK_OTYPE_IDIR: - _encode(inode.mtime, bl); - if (0) { - map dfsz; - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_auth()) - dfsz[p->first] = p->second->get_nitems(); - _encode(dfsz, bl); - } - break; - - default: - assert(0); - } -} - -void CInode::decode_lock_state(int type, bufferlist& bl) -{ - int off = 0; - utime_t tm; - - switch (type) { - case LOCK_OTYPE_IAUTH: - _decode(tm, bl, off); - if (inode.ctime < tm) inode.ctime = tm; - _decode(inode.mode, bl, off); - _decode(inode.uid, bl, off); - _decode(inode.gid, bl, off); - break; - - case LOCK_OTYPE_ILINK: - _decode(tm, bl, off); - if (inode.ctime < tm) inode.ctime = tm; - _decode(inode.nlink, bl, off); - _decode(inode.anchored, bl, off); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - dirfragtree._decode(bl, off); - break; - - case LOCK_OTYPE_IFILE: - _decode(inode.size, bl, off); - _decode(inode.mtime, bl, off); - _decode(inode.atime, bl, off); - break; - - case LOCK_OTYPE_IDIR: - //::_decode(inode.size, bl, off); - _decode(tm, bl, off); - if (inode.mtime < tm) { - inode.mtime = tm; - dirlock.set_updated(); - } - if (0) { - map dfsz; - ::_decode(dfsz, bl, off); - // hmm which to keep? - } - break; - - default: - assert(0); - } -} - - - - -// waiting - -bool CInode::is_frozen() -{ - if (parent && parent->dir->is_frozen()) - return true; - return false; -} - -bool CInode::is_frozen_dir() -{ - if (parent && parent->dir->is_frozen_dir()) - return true; - return false; -} - -bool CInode::is_freezing() -{ - if (parent && parent->dir->is_freezing()) - return true; - return false; -} - -void CInode::add_waiter(int tag, Context *c) -{ - // wait on the directory? 
- if (tag & (WAIT_AUTHPINNABLE|WAIT_SINGLEAUTH)) { - parent->dir->add_waiter(tag, c); - return; - } - MDSCacheObject::add_waiter(tag, c); -} - - -// auth_pins -bool CInode::can_auth_pin() { - if (parent) - return parent->dir->can_auth_pin(); - return true; -} - -void CInode::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - if (parent) - parent->dir->adjust_nested_auth_pins( 1 ); -} - -void CInode::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - assert(auth_pins >= 0); - - if (parent) - parent->dir->adjust_nested_auth_pins( -1 ); -} - -void CInode::adjust_nested_auth_pins(int a) -{ - if (!parent) return; - nested_auth_pins += a; - parent->get_dir()->adjust_nested_auth_pins(a); -} - - - -// authority - -pair CInode::authority() -{ - //if (is_root()) - //return CDIR_AUTH_ROOTINODE; // root _inode_ is locked to mds0. - if (force_auth.first >= 0) - return force_auth; - - if (parent) - return parent->dir->authority(); - - return CDIR_AUTH_UNDEF; -} - - -CInodeDiscover* CInode::replicate_to( int rep ) -{ - assert(is_auth()); - - // relax locks? - if (!is_replicated()) - replicate_relax_locks(); - - // return the thinger - int nonce = add_replica( rep ); - return new CInodeDiscover( this, nonce ); -} - - - diff --git a/branches/sage/pgs/mds/CInode.h b/branches/sage/pgs/mds/CInode.h deleted file mode 100644 index a64d2819d03b7..0000000000000 --- a/branches/sage/pgs/mds/CInode.h +++ /dev/null @@ -1,659 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CINODE_H -#define __CINODE_H - -#include "config.h" -#include "include/types.h" -#include "include/lru.h" - -#include "mdstypes.h" - -#include "CDentry.h" -#include "SimpleLock.h" -#include "FileLock.h" -#include "ScatterLock.h" -#include "LocalLock.h" -#include "Capability.h" - - -#include -#include -#include -#include -#include -#include -using namespace std; - -class Context; -class CDentry; -class CDir; -class Message; -class CInode; -class CInodeDiscover; -class MDCache; - - -ostream& operator<<(ostream& out, CInode& in); - - -// cached inode wrapper -class CInode : public MDSCacheObject { - public: - // -- pins -- - //static const int PIN_REPLICATED = 1; - static const int PIN_DIR = 2; - static const int PIN_CAPS = 7; // client caps - static const int PIN_AUTHPIN = 8; - static const int PIN_IMPORTING = -9; // importing - static const int PIN_ANCHORING = 12; - static const int PIN_UNANCHORING = 13; - static const int PIN_OPENINGDIR = 14; - static const int PIN_REMOTEPARENT = 15; - static const int PIN_BATCHOPENJOURNAL = 16; - static const int PIN_SCATTERED = 17; - - const char *pin_name(int p) { - switch (p) { - case PIN_DIR: return "dir"; - case PIN_CAPS: return "caps"; - case PIN_AUTHPIN: return "authpin"; - case PIN_IMPORTING: return "importing"; - case PIN_ANCHORING: return "anchoring"; - case PIN_UNANCHORING: return "unanchoring"; - case PIN_OPENINGDIR: return "openingdir"; - case PIN_REMOTEPARENT: return "remoteparent"; - case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; - case PIN_SCATTERED: return "scattered"; - default: return generic_pin_name(p); - } - } - - // -- state -- - static const int STATE_ROOT = (1<<1); - static const int STATE_EXPORTING = (1<<2); // on nonauth bystander. - static const int STATE_ANCHORING = (1<<3); - static const int STATE_UNANCHORING = (1<<4); - static const int STATE_OPENINGDIR = (1<<5); - static const int STATE_REJOINUNDEF = (1<<6); // inode contents undefined. 
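Aside: the STATE_* constants above are single-bit masks packed into one integer state word, and the surrounding classes flip and test them through state_set/state_test/state_clear. The following is a minimal stand-alone sketch of that idiom only, assuming nothing from the Ceph tree; the struct name and flag values are invented for illustration and do not mirror the real MDSCacheObject interface.

#include <cassert>

// Toy model of bit-flag state tracking as used by the cache objects above.
// Flag names/values here are placeholders, not the real CInode/CDir flags.
struct CacheObjectSketch {
  static const int STATE_AUTH  = (1<<0);
  static const int STATE_DIRTY = (1<<1);
  static const int STATE_ROOT  = (1<<2);

  int state = 0;

  void state_set(int mask)   { state |= mask; }    // turn bits on
  void state_clear(int mask) { state &= ~mask; }   // turn bits off
  bool state_test(int mask)  { return (state & mask) != 0; }
};

int main() {
  CacheObjectSketch o;
  o.state_set(CacheObjectSketch::STATE_AUTH);
  o.state_set(CacheObjectSketch::STATE_DIRTY);
  assert(o.state_test(CacheObjectSketch::STATE_AUTH));
  o.state_clear(CacheObjectSketch::STATE_DIRTY);
  assert(!o.state_test(CacheObjectSketch::STATE_DIRTY));
  return 0;
}

The payoff of the idiom is that a whole family of boolean properties fits in one word, so saving and restoring object state (as the export/import structures above do) is a single integer copy plus a mask.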
- - // -- waiters -- - //static const int WAIT_SLAVEAGREE = (1<<0); - static const int WAIT_DIR = (1<<1); - static const int WAIT_ANCHORED = (1<<2); - static const int WAIT_UNANCHORED = (1<<3); - static const int WAIT_CAPS = (1<<4); - - static const int WAIT_AUTHLOCK_OFFSET = 5; - static const int WAIT_LINKLOCK_OFFSET = 5 + SimpleLock::WAIT_BITS; - static const int WAIT_DIRFRAGTREELOCK_OFFSET = 5 + 2*SimpleLock::WAIT_BITS; - static const int WAIT_FILELOCK_OFFSET = 5 + 3*SimpleLock::WAIT_BITS; - static const int WAIT_DIRLOCK_OFFSET = 5 + 4*SimpleLock::WAIT_BITS; - static const int WAIT_VERSIONLOCK_OFFSET = 5 + 5*SimpleLock::WAIT_BITS; - - static const int WAIT_ANY = 0xffffffff; - - // misc - static const int EXPORT_NONCE = 1; // nonce given to replicas created by export - - ostream& print_db_line_prefix(ostream& out); - - public: - MDCache *mdcache; - - // inode contents proper - inode_t inode; // the inode itself - string symlink; // symlink dest, if symlink - fragtree_t dirfragtree; // dir frag tree, if any - map dirfrag_size; // size of each dirfrag - - off_t last_open_journaled; // log offset for the last journaled EOpen - - // projected values (only defined while dirty) - list projected_inode; - list projected_dirfragtree; - - version_t get_projected_version() { - if (projected_inode.empty()) - return inode.version; - else - return projected_inode.back()->version; - } - - inode_t *project_inode(); - void pop_and_dirty_projected_inode(); - - - // -- cache infrastructure -- - map dirfrags; // cached dir fragments - - frag_t pick_dirfrag(const string &dn); - bool has_dirfrags() { return !dirfrags.empty(); } - CDir* get_dirfrag(frag_t fg) { - if (dirfrags.count(fg)) - return dirfrags[fg]; - else - return 0; - } - void get_dirfrags(list& ls); - void get_nested_dirfrags(list& ls); - void get_subtree_dirfrags(list& ls); - CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg); - CDir *add_dirfrag(CDir *dir); - void close_dirfrag(frag_t fg); - void close_dirfrags(); - bool has_subtree_root_dirfrag(); - - protected: - // parent dentries in cache - CDentry *parent; // primary link - set remote_parents; // if hard linked - - pair force_auth; - - // -- distributed state -- -protected: - // file capabilities - map client_caps; // client -> caps - map mds_caps_wanted; // [auth] mds -> caps wanted - int replica_caps_wanted; // [replica] what i've requested from auth - utime_t replica_caps_wanted_keep_until; - - - private: - // auth pin - int auth_pins; - int nested_auth_pins; - - public: - meta_load_t popularity[MDS_NPOP]; - - // friends - friend class Server; - friend class Locker; - friend class Migrator; - friend class MDCache; - friend class CDir; - friend class CInodeExport; - friend class CInodeDiscover; - - public: - // --------------------------- - CInode(MDCache *c, bool auth=true) : - mdcache(c), - last_open_journaled(0), - parent(0), force_auth(CDIR_AUTH_DEFAULT), - replica_caps_wanted(0), - auth_pins(0), nested_auth_pins(0), - versionlock(this, LOCK_OTYPE_IVERSION, WAIT_VERSIONLOCK_OFFSET), - authlock(this, LOCK_OTYPE_IAUTH, WAIT_AUTHLOCK_OFFSET), - linklock(this, LOCK_OTYPE_ILINK, WAIT_LINKLOCK_OFFSET), - dirfragtreelock(this, LOCK_OTYPE_IDIRFRAGTREE, WAIT_DIRFRAGTREELOCK_OFFSET), - filelock(this, LOCK_OTYPE_IFILE, WAIT_FILELOCK_OFFSET), - dirlock(this, LOCK_OTYPE_IDIR, WAIT_DIRLOCK_OFFSET) - { - state = 0; - if (auth) state_set(STATE_AUTH); - }; - ~CInode() { - close_dirfrags(); - } - - - // -- accessors -- - bool is_file() { return inode.is_file(); } - bool is_symlink() { return 
inode.is_symlink(); } - bool is_dir() { return inode.is_dir(); } - - bool is_anchored() { return inode.anchored; } - bool is_anchoring() { return state_test(STATE_ANCHORING); } - bool is_unanchoring() { return state_test(STATE_UNANCHORING); } - - bool is_root() { return state & STATE_ROOT; } - bool is_stray() { return MDS_INO_IS_STRAY(inode.ino); } - - - inodeno_t ino() const { return inode.ino; } - inode_t& get_inode() { return inode; } - CDentry* get_parent_dn() { return parent; } - CDir *get_parent_dir(); - CInode *get_parent_inode(); - - bool is_lt(const MDSCacheObject *r) const { - return ino() < ((CInode*)r)->ino(); - } - - - - // -- misc -- - void make_path(string& s); - void make_anchor_trace(vector& trace); - void name_stray_dentry(string& dname); - - - - // -- dirtyness -- - version_t get_version() { return inode.version; } - - version_t pre_dirty(); - void _mark_dirty(); - void mark_dirty(version_t projected_dirv); - void mark_clean(); - - - CInodeDiscover* replicate_to(int rep); - - - // -- waiting -- - void add_waiter(int tag, Context *c); - - - // -- locks -- -public: - LocalLock versionlock; - SimpleLock authlock; - SimpleLock linklock; - SimpleLock dirfragtreelock; - FileLock filelock; - ScatterLock dirlock; - - SimpleLock* get_lock(int type) { - switch (type) { - case LOCK_OTYPE_IFILE: return &filelock; - case LOCK_OTYPE_IAUTH: return &authlock; - case LOCK_OTYPE_ILINK: return &linklock; - case LOCK_OTYPE_IDIRFRAGTREE: return &dirfragtreelock; - case LOCK_OTYPE_IDIR: return &dirlock; - default: assert(0); - } - } - void set_object_info(MDSCacheObjectInfo &info); - void encode_lock_state(int type, bufferlist& bl); - void decode_lock_state(int type, bufferlist& bl); - - - // -- caps -- (new) - // client caps - bool is_any_caps() { return !client_caps.empty(); } - map& get_client_caps() { return client_caps; } - void add_client_cap(int client, Capability& cap) { - if (client_caps.empty()) - get(PIN_CAPS); - assert(client_caps.count(client) == 0); - client_caps[client] = cap; - } - void remove_client_cap(int client) { - assert(client_caps.count(client) == 1); - client_caps.erase(client); - if (client_caps.empty()) - put(PIN_CAPS); - } - Capability* get_client_cap(int client) { - if (client_caps.count(client)) - return &client_caps[client]; - return 0; - } - void reconnect_cap(int client, inode_caps_reconnect_t& icr) { - Capability *cap = get_client_cap(client); - if (cap) { - cap->merge(icr.wanted, icr.issued); - } else { - Capability newcap(icr.wanted, 0); - newcap.issue(icr.issued); - add_client_cap(client, newcap); - } - inode.size = MAX(inode.size, icr.size); - inode.mtime = MAX(inode.mtime, icr.mtime); - inode.atime = MAX(inode.atime, icr.atime); - } - /* - void set_client_caps(map& cl) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - client_caps.clear(); - client_caps = cl; - } - */ - void take_client_caps(map& cl) { - if (!client_caps.empty()) - put(PIN_CAPS); - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - cl[it->first] = it->second.make_export(); - } - client_caps.clear(); - } - void merge_client_caps(map& cl, set& new_client_caps) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - - for (map::iterator it = cl.begin(); - it != cl.end(); - it++) { - new_client_caps.insert(it->first); - if (client_caps.count(it->first)) { - // merge - client_caps[it->first].merge(it->second); - } else { - // new - client_caps[it->first] = Capability(it->second); - } - } - } - - // caps issued, wanted - int get_caps_issued() 
{ - int c = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) - c |= it->second.issued(); - return c; - } - int get_caps_wanted() { - int w = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - w |= it->second.wanted(); - //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; - } - if (is_auth()) - for (map::iterator it = mds_caps_wanted.begin(); - it != mds_caps_wanted.end(); - it++) { - w |= it->second; - //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; - } - return w; - } - - - void replicate_relax_locks() { - dout(10) << " relaxing locks on " << *this << endl; - assert(is_auth()); - assert(!is_replicated()); - - authlock.replicate_relax(); - linklock.replicate_relax(); - dirfragtreelock.replicate_relax(); - - if (get_caps_issued() & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0) - filelock.replicate_relax(); - - dirlock.replicate_relax(); - } - - - // -- authority -- - pair authority(); - - - // -- auth pins -- - int is_auth_pinned() { - return auth_pins; - } - void adjust_nested_auth_pins(int a); - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - - - // -- freeze -- - bool is_frozen(); - bool is_frozen_dir(); - bool is_freezing(); - - - // -- reference counting -- - void bad_put(int by) { - dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 1); - assert(ref > 0); - } - void bad_get(int by) { - dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 0); - } - void first_get(); - void last_put(); - - - // -- hierarchy stuff -- -public: - void set_primary_parent(CDentry *p) { - assert(parent == 0); - parent = p; - } - void remove_primary_parent(CDentry *dn) { - assert(dn == parent); - parent = 0; - } - void add_remote_parent(CDentry *p); - void remove_remote_parent(CDentry *p); - int num_remote_parents() { - return remote_parents.size(); - } - - - /* - // for giving to clients - void get_dist_spec(set& ls, int auth, timepair_t& now) { - if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) || - (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; - ls = cached_by; - } - } - */ - - void print(ostream& out); - -}; - - - - -// -- encoded state - -// discover - -class CInodeDiscover { - - inode_t inode; - string symlink; - fragtree_t dirfragtree; - - int replica_nonce; - - int authlock_state; - int linklock_state; - int dirfragtreelock_state; - int filelock_state; - int dirlock_state; - - public: - CInodeDiscover() {} - CInodeDiscover(CInode *in, int nonce) { - inode = in->inode; - symlink = in->symlink; - dirfragtree = in->dirfragtree; - - replica_nonce = nonce; - - authlock_state = in->authlock.get_replica_state(); - linklock_state = in->linklock.get_replica_state(); - dirfragtreelock_state = in->dirfragtreelock.get_replica_state(); - filelock_state = in->filelock.get_replica_state(); - dirlock_state = in->dirlock.get_replica_state(); - } - - inodeno_t get_ino() { return inode.ino; } - int get_replica_nonce() { return replica_nonce; } - - void update_inode(CInode *in) { - in->inode = inode; - in->symlink = symlink; - in->dirfragtree = dirfragtree; - 
in->replica_nonce = replica_nonce; - } - void init_inode_locks(CInode *in) { - in->authlock.set_state(authlock_state); - in->linklock.set_state(linklock_state); - in->dirfragtreelock.set_state(dirfragtreelock_state); - in->filelock.set_state(filelock_state); - in->dirlock.set_state(dirlock_state); - } - - void _encode(bufferlist& bl) { - ::_encode(inode, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - ::_encode(replica_nonce, bl); - ::_encode(authlock_state, bl); - ::_encode(linklock_state, bl); - ::_encode(dirfragtreelock_state, bl); - ::_encode(filelock_state, bl); - ::_encode(dirlock_state, bl); - } - - void _decode(bufferlist& bl, int& off) { - ::_decode(inode, bl, off); - ::_decode(symlink, bl, off); - dirfragtree._decode(bl, off); - ::_decode(replica_nonce, bl, off); - ::_decode(authlock_state, bl, off); - ::_decode(linklock_state, bl, off); - ::_decode(dirfragtreelock_state, bl, off); - ::_decode(filelock_state, bl, off); - ::_decode(dirlock_state, bl, off); - } - -}; - - -// export - -class CInodeExport { - - struct st_ { - inode_t inode; - - meta_load_t popularity_justme; - meta_load_t popularity_curdom; - bool is_dirty; // dirty inode? - - int num_caps; - } st; - - string symlink; - fragtree_t dirfragtree; - - map replicas; - map cap_map; - - bufferlist locks; - -public: - CInodeExport() {} - CInodeExport(CInode *in) { - st.inode = in->inode; - symlink = in->symlink; - dirfragtree = in->dirfragtree; - - st.is_dirty = in->is_dirty(); - replicas = in->replicas; - - in->authlock._encode(locks); - in->linklock._encode(locks); - in->dirfragtreelock._encode(locks); - in->filelock._encode(locks); - in->dirlock._encode(locks); - - st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); - st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); - in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; - in->popularity[MDS_POP_NESTED] -= st.popularity_curdom; - - // steal WRITER caps from inode - in->take_client_caps(cap_map); - //remaining_issued = in->get_caps_issued(); - } - - inodeno_t get_ino() { return st.inode.ino; } - - void update_inode(CInode *in, set& new_client_caps) { - // treat scatterlocked mtime special, since replica may have newer info - if (in->dirlock.get_state() == LOCK_SCATTER || - in->dirlock.get_state() == LOCK_GLOCKC || - in->dirlock.get_state() == LOCK_GTEMPSYNCC) - st.inode.mtime = MAX(in->inode.mtime, st.inode.mtime); - - in->inode = st.inode; - in->symlink = symlink; - in->dirfragtree = dirfragtree; - - in->popularity[MDS_POP_JUSTME] += st.popularity_justme; - in->popularity[MDS_POP_CURDOM] += st.popularity_curdom; - in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; - in->popularity[MDS_POP_NESTED] += st.popularity_curdom; - - if (st.is_dirty) - in->_mark_dirty(); - - in->replicas = replicas; - if (!replicas.empty()) - in->get(CInode::PIN_REPLICATED); - - int off = 0; - in->authlock._decode(locks, off); - in->linklock._decode(locks, off); - in->dirfragtreelock._decode(locks, off); - in->filelock._decode(locks, off); - in->dirlock._decode(locks, off); - - // caps - in->merge_client_caps(cap_map, new_client_caps); - } - - void _encode(bufferlist& bl) { - st.num_caps = cap_map.size(); - - ::_encode(st, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - ::_encode(replicas, bl); - ::_encode(locks, bl); - ::_encode(cap_map, bl); - } - - int _decode(bufferlist& bl, int off = 0) { - ::_decode(st, bl, off); - ::_decode(symlink, bl, off); - dirfragtree._decode(bl, off); - ::_decode(replicas, bl, off); - ::_decode(locks, bl, off); - 
::_decode(cap_map, bl, off); - - return off; - } -}; - - - -#endif diff --git a/branches/sage/pgs/mds/Capability.h b/branches/sage/pgs/mds/Capability.h deleted file mode 100644 index eab6aa84b08bc..0000000000000 --- a/branches/sage/pgs/mds/Capability.h +++ /dev/null @@ -1,246 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CAPABILITY_H -#define __CAPABILITY_H - -#include "include/buffer.h" - -#include -using namespace std; - -#include "config.h" - - -// definite caps -#define CAP_FILE_RDCACHE 1 // client can safely cache reads -#define CAP_FILE_RD 2 // client can read -#define CAP_FILE_WR 4 // client can write -#define CAP_FILE_WREXTEND 8 // client can extend file -#define CAP_FILE_WRBUFFER 16 // client can safely buffer writes -#define CAP_FILE_LAZYIO 32 // client can perform lazy io - - -// heuristics -//#define CAP_FILE_DELAYFLUSH 32 - -inline string cap_string(int cap) -{ - string s; - s = "["; - if (cap & CAP_FILE_RDCACHE) s += " rdcache"; - if (cap & CAP_FILE_RD) s += " rd"; - if (cap & CAP_FILE_WR) s += " wr"; - if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer"; - if (cap & CAP_FILE_WRBUFFER) s += " wrextend"; - if (cap & CAP_FILE_LAZYIO) s += " lazyio"; - s += " ]"; - return s; -} - -typedef uint32_t capseq_t; - -class Capability { -public: - struct Export { - int wanted; - int issued; - int pending; - Export() {} - Export(int w, int i, int p) : wanted(w), issued(i), pending(p) {} - }; - -private: - int wanted_caps; // what the client wants (ideally) - - map cap_history; // seq -> cap - capseq_t last_sent, last_recv; - - bool suppress; - -public: - Capability(int want=0, capseq_t s=0) : - wanted_caps(want), - last_sent(s), - last_recv(s), - suppress(false) { - //cap_history[last_sent] = 0; - } - Capability(Export& other) : - wanted_caps(other.wanted), - last_sent(0), last_recv(0) { - // issued vs pending - if (other.issued & ~other.pending) - issue(other.issued); - issue(other.pending); - } - - bool is_suppress() { return suppress; } - void set_suppress(bool b) { suppress = b; } - - bool is_null() { return cap_history.empty() && wanted_caps == 0; } - - // most recently issued caps. 
- int pending() { - if (cap_history.count(last_sent)) - return cap_history[ last_sent ]; - return 0; - } - - // caps client has confirmed receipt of - int confirmed() { - if (cap_history.count(last_recv)) - return cap_history[ last_recv ]; - return 0; - } - - // caps potentially issued - int issued() { - int c = 0; - for (capseq_t seq = last_recv; seq <= last_sent; seq++) { - if (cap_history.count(seq)) { - c |= cap_history[seq]; - dout(10) << " cap issued: seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(c) << endl; - } - } - return c; - } - - // caps this client wants to hold - int wanted() { return wanted_caps; } - void set_wanted(int w) { - wanted_caps = w; - } - - // needed - static int needed(int from) { - // strip out wrbuffer, rdcache - return from & (CAP_FILE_WR|CAP_FILE_RD); - } - int needed() { return needed(wanted_caps); } - - // conflicts - static int conflicts(int from) { - int c = 0; - if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD; - if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE; - if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER; - if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR; - return c; - } - int wanted_conflicts() { return conflicts(wanted()); } - int needed_conflicts() { return conflicts(needed()); } - int issued_conflicts() { return conflicts(issued()); } - - // issue caps; return seq number. - capseq_t issue(int c) { - //int was = pending(); - //no! if (c == was && last_sent) return -1; // repeat of previous? - - ++last_sent; - cap_history[last_sent] = c; - - /* no! - // not recalling, just adding? - if (c & ~was && - cap_history.count(last_sent-1)) { - cap_history.erase(last_sent-1); - } - */ - return last_sent; - } - capseq_t get_last_seq() { return last_sent; } - - Export make_export() { - return Export(wanted_caps, issued(), pending()); - } - void merge(Export& other) { - // issued + pending - int newpending = other.pending | pending(); - if (other.issued & ~newpending) - issue(other.issued | newpending); - issue(newpending); - - // wanted - wanted_caps = wanted_caps | other.wanted; - } - void merge(int otherwanted, int otherissued) { - // issued + pending - int newpending = pending(); - if (otherissued & ~newpending) - issue(otherissued | newpending); - issue(newpending); - - // wanted - wanted_caps = wanted_caps | otherwanted; - } - - // confirm receipt of a previous sent/issued seq. - int confirm_receipt(capseq_t seq, int caps) { - int r = 0; - - // old seqs - while (last_recv < seq) { - dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << endl; - r |= cap_history[last_recv]; - cap_history.erase(last_recv); - ++last_recv; - } - - // release current? - if (cap_history.count(seq) && - cap_history[seq] != caps) { - dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << endl; - // note what we're releasing.. - assert(cap_history[seq] & ~caps); - r |= cap_history[seq] & ~caps; - - cap_history[seq] = caps; // confirmed() now less than before.. - } - - // null? - if (caps == 0 && - cap_history.size() == 1 && - cap_history.count(seq)) { - cap_history.clear(); // viola, null! 
- } - - return r; - } - - // serializers - void _encode(bufferlist& bl) { - bl.append((char*)&wanted_caps, sizeof(wanted_caps)); - bl.append((char*)&last_sent, sizeof(last_sent)); - bl.append((char*)&last_recv, sizeof(last_recv)); - ::_encode(cap_history, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps); - off += sizeof(wanted_caps); - bl.copy(off, sizeof(last_sent), (char*)&last_sent); - off += sizeof(last_sent); - bl.copy(off, sizeof(last_recv), (char*)&last_recv); - off += sizeof(last_recv); - ::_decode(cap_history, bl, off); - } - -}; - - - - - -#endif diff --git a/branches/sage/pgs/mds/ClientMap.cc b/branches/sage/pgs/mds/ClientMap.cc deleted file mode 100644 index f2a791d7d5f0b..0000000000000 --- a/branches/sage/pgs/mds/ClientMap.cc +++ /dev/null @@ -1,121 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#define DBLEVEL 20 - -#include "include/types.h" - -#include "MDS.h" -#include "ClientMap.h" - -#include "osdc/Filer.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".clientmap " - - - -void ClientMap::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_CLIENTMAP_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -// ---------------- -// LOAD - -class C_CM_Load : public Context { - ClientMap *clientmap; -public: - bufferlist bl; - C_CM_Load(ClientMap *cm) : clientmap(cm) {} - void finish(int r) { - clientmap->_load_finish(bl); - } -}; - -void ClientMap::load(Context *onload) -{ - dout(10) << "load" << endl; - - init_inode(); - - if (onload) - waiting_for_load.push_back(onload); - - C_CM_Load *c = new C_CM_Load(this); - mds->filer->read(inode, - 0, inode.layout.stripe_unit, - &c->bl, - c); - -} - -void ClientMap::_load_finish(bufferlist &bl) -{ - int off = 0; - decode(bl, off); - dout(10) << "_load_finish v " << version - << ", " << client_inst.size() << " clients, " - << bl.length() << " bytes" - << endl; - projected = committing = committed = version; - finish_contexts(waiting_for_load); -} - - -// ---------------- -// SAVE - -class C_CM_Save : public Context { - ClientMap *clientmap; - version_t version; -public: - C_CM_Save(ClientMap *cm, version_t v) : clientmap(cm), version(v) {} - void finish(int r) { - clientmap->_save_finish(version); - } -}; - -void ClientMap::save(Context *onsave, version_t needv) -{ - dout(10) << "save needv " << needv << ", v " << version << endl; - commit_waiters[version].push_back(onsave); - - if (needv && committing >= needv) return; - - bufferlist bl; - - init_inode(); - encode(bl); - committing = version; - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_CM_Save(this, version)); -} - -void ClientMap::_save_finish(version_t v) -{ - dout(10) << "_save_finish v" << v << endl; - committed = v; - - finish_contexts(commit_waiters[v]); - commit_waiters.erase(v); -} diff --git a/branches/sage/pgs/mds/ClientMap.h b/branches/sage/pgs/mds/ClientMap.h deleted file mode 100644 index 6fa68e207f5a4..0000000000000 --- a/branches/sage/pgs/mds/ClientMap.h +++ /dev/null @@ -1,188 +0,0 @@ 
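Aside: the ClientMap load/save code above keeps three version counters: version advances with each in-memory change (sessions opening and closing), committing records the version covered by the write currently in flight, and committed records the last version known durable; callers needing durability park a Context keyed by the version they care about and are finished when a save covering it lands. Below is a toy sketch of just that bookkeeping, with a plain function pointer standing in for Context and no Filer I/O; all names here are invented and this is not the ClientMap API.

#include <cassert>
#include <list>
#include <map>

typedef unsigned long version_t;
typedef void (*Waiter)();            // stand-in for Context*

// Simplified version/committing/committed pattern for a persisted table.
struct TableSketch {
  version_t version = 1;             // current in-memory state version
  version_t committing = 0;          // version of the save in flight
  version_t committed = 0;           // last version known durable
  std::map<version_t, std::list<Waiter> > commit_waiters;

  void save(Waiter onsave, version_t needv) {
    commit_waiters[version].push_back(onsave);
    if (needv && committing >= needv)
      return;                        // a save covering needv is already in flight
    committing = version;            // pretend an asynchronous write of 'version' was submitted
  }

  void save_finish(version_t v) {    // called when the write completes
    committed = v;
    std::map<version_t, std::list<Waiter> >::iterator p = commit_waiters.find(v);
    if (p != commit_waiters.end()) {
      for (std::list<Waiter>::iterator q = p->second.begin(); q != p->second.end(); ++q)
        (*q)();                      // run the parked continuations
      commit_waiters.erase(p);
    }
  }
};

static bool done = false;
static void mark_done() { done = true; }

int main() {
  TableSketch t;
  t.save(mark_done, 0);              // request that the current version be saved
  assert(!done);
  t.save_finish(t.committing);       // the asynchronous write "completes"
  assert(done && t.committed == 1);
  return 0;
}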
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMAP_H -#define __CLIENTMAP_H - -#include "msg/Message.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "mdstypes.h" - -class MDS; - -/* - * this structure is used by the MDS purely so that - * it can remember client addresses (entity_inst_t) - * for clients with an active session. - * - * it is also used to keep track of recently completed - * operations, should the client have to resubmit them - * (after a connection failure, etc.) - */ -class ClientMap { -private: - MDS *mds; - - version_t version; - version_t projected; - version_t committing; - version_t committed; - map > commit_waiters; - -public: - ClientMap(MDS *m) : mds(m), - version(0), projected(0), committing(0), committed(0) {} - - version_t get_version() { return version; } - version_t get_projected() { return projected; } - version_t get_committing() { return committing; } - version_t get_committed() { return committed; } - - version_t inc_projected() { return ++projected; } - void reset_projected() { projected = version; } - void set_committing(version_t v) { committing = v; } - void set_committed(version_t v) { committed = v; } - -private: - // affects version - hash_map client_inst; - - // does not affect version - set sessions; - set opening; - set closing; - -public: - bool empty() { - return client_inst.empty(); - } - - const entity_inst_t& get_inst(int client) { - assert(client_inst.count(client)); - return client_inst[client]; - } - const set& get_session_set() { return sessions; } - - bool is_opening(int c) { return opening.count(c); } - void add_opening(int c) { opening.insert(c); } - bool is_closing(int c) { return closing.count(c); } - void add_closing(int c) { closing.insert(c); } - bool have_session(int client) { - return client_inst.count(client); - } - void open_session(const entity_inst_t& inst) { - opening.erase(inst.name.num()); - client_inst[inst.name.num()] = inst; - sessions.insert(inst.name.num()); - version++; - } - void close_session(int client) { - closing.erase(client); - sessions.erase(client); - client_inst.erase(client); - version++; - } - -private: - // -- push sequence -- - hash_map client_push_seq; // seq # for messages pushed to client. - -public: - version_t inc_push_seq(int client) { - return ++client_push_seq[client]; - } - version_t get_push_seq(int client) { - return client_push_seq[client]; - } - - -private: - // -- completed requests -- - // client id -> tid -> result code - map > completed_requests; // completed client requests - map > waiting_for_trim; - -public: - void add_completed_request(metareqid_t ri) { - completed_requests[ri.client].insert(ri.tid); - } - void trim_completed_requests(int client, - tid_t mintid) { // zero means trim all! 
- map >::iterator p = completed_requests.find(client); - if (p == completed_requests.end()) - return; - - // trim - while (!p->second.empty() && (mintid == 0 || *p->second.begin() < mintid)) - p->second.erase(p->second.begin()); - if (p->second.empty()) - completed_requests.erase(p); - - // kick waiters - map >::iterator q = waiting_for_trim.find(client); - if (q != waiting_for_trim.end()) { - list fls; - while (!q->second.empty() && - (mintid == 0 || q->second.begin()->first < mintid)) { - fls.push_back(q->second.begin()->second); - q->second.erase(q->second.begin()); - } - if (q->second.empty()) - waiting_for_trim.erase(q); - finish_contexts(fls); - } - } - void add_trim_waiter(metareqid_t ri, Context *c) { - waiting_for_trim[ri.client][ri.tid] = c; - } - bool have_completed_request(metareqid_t ri) { - return completed_requests.count(ri.client) && - completed_requests[ri.client].count(ri.tid); - } - - - // -- encoding -- - void encode(bufferlist& bl) { - bl.append((char*)&version, sizeof(version)); - ::_encode(client_inst, bl); - ::_encode(sessions, bl); - } - void decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(client_inst, bl, off); - ::_decode(sessions, bl, off); - - projected = committing = committed = version; - } - - // -- loading, saving -- - inode_t inode; - list waiting_for_load; - - void init_inode(); - void load(Context *onload); - void _load_finish(bufferlist &bl); - void save(Context *onsave, version_t needv=0); - void _save_finish(version_t v); -}; - -#endif diff --git a/branches/sage/pgs/mds/FileLock.h b/branches/sage/pgs/mds/FileLock.h deleted file mode 100644 index adb2130e86541..0000000000000 --- a/branches/sage/pgs/mds/FileLock.h +++ /dev/null @@ -1,227 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILELOCK_H -#define __FILELOCK_H - -#include -#include -using namespace std; - -#include "include/buffer.h" - -#include "SimpleLock.h" -#include "Capability.h" - -// states and such. -// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio - -// -----auth-------- ---replica------- -#define LOCK_SYNC_ 1 // AR R . / C R . . . L R . / C R . . . L stat() -#define LOCK_GSYNCL -12 // A . . / C ? . . . L loner -> sync (*) -#define LOCK_GSYNCM -13 // A . . / . R . . . L - -#define LOCK_LOCK_ 2 // AR R W / C . . . . . . . / C . . . . . truncate() -#define LOCK_GLOCKR_ -3 // AR R . / C . . . . . . . / C . . . . . -#define LOCK_GLOCKL -4 // A . . / C . . . . . loner -> lock -#define LOCK_GLOCKM -5 // A . . / . . . . . . - -#define LOCK_MIXED 6 // AR . . / . R W A . L . . / . R . . . L -#define LOCK_GMIXEDR -7 // AR R . / . R . . . L . . / . R . . . L -#define LOCK_GMIXEDL -8 // A . . / . . . . . L loner -> mixed - -#define LOCK_LONER 9 // A . . / C R W A B L (lock) -#define LOCK_GLONERR -10 // A . . / . R . . . L -#define LOCK_GLONERM -11 // A . . / . R W A . 
L - -// (*) FIXME: how to let old loner keep R, somehow, during GSYNCL - -// 4 stable -// +9 transition -// 13 total - -inline const char *get_filelock_state_name(int n) { - switch (n) { - case LOCK_SYNC: return "sync"; - case LOCK_GSYNCL: return "gsyncl"; - case LOCK_GSYNCM: return "gsyncm"; - case LOCK_LOCK: return "lock"; - case LOCK_GLOCKR: return "glockr"; - case LOCK_GLOCKL: return "glockl"; - case LOCK_GLOCKM: return "glockm"; - case LOCK_MIXED: return "mixed"; - case LOCK_GMIXEDR: return "gmixedr"; - case LOCK_GMIXEDL: return "gmixedl"; - case LOCK_LONER: return "loner"; - case LOCK_GLONERR: return "glonerr"; - case LOCK_GLONERM: return "glonerm"; - default: assert(0); - } -} - - -/* no append scenarios: - -loner + truncate(): - - loner needs to lose A (?unless it's the loner doing the truncate?) -loner + statlite(size): - - loner needs to lose A - -any + statlite(size) - - all lose A - -any + statlite(mtime) - - all lose W - --> we need to add lonerfixed and mixedfixed states (and associated transitions) - in order to efficiently support statlite(size) and truncate(). until then, - we have to LOCK. - - */ - -// -- lock... hard or file - -class MDRequest; - -class FileLock : public SimpleLock { - public: - FileLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo) { } - - int get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - case LOCK_GLOCKR: - case LOCK_LONER: - case LOCK_GLONERR: - case LOCK_GLONERM: - return LOCK_LOCK; - case LOCK_MIXED: - case LOCK_GMIXEDR: - return LOCK_MIXED; - case LOCK_SYNC: - return LOCK_SYNC; - - // after gather auth will bc LOCK_AC_MIXED or whatever - case LOCK_GSYNCM: - return LOCK_MIXED; - case LOCK_GSYNCL: - case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. - return LOCK_LOCK; - - default: - assert(0); - } - return 0; - } - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - // read/write access - bool can_rdlock(MDRequest *mdr) { - if (!parent->is_auth()) return (state == LOCK_SYNC); - //if (state == LOCK_LOCK && mdr && xlock_by == mdr) return true; - if (state == LOCK_LOCK && !xlock_by) return true; - return - (state == LOCK_SYNC) || - (state == LOCK_GMIXEDR) || - (state == LOCK_GLOCKR); - } - bool can_rdlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKL); - else - return false; - } - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) - || (state == LOCK_GLOCKM); - else - return false; - } - - // client caps allowed - int caps_allowed_ever() { - if (parent->is_auth()) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - else - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - } - int caps_allowed() { - if (parent->is_auth()) - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - case LOCK_GLOCKL: - return CAP_FILE_RDCACHE; - - case LOCK_GLOCKM: - return 0; - - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - case LOCK_GMIXEDR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GMIXEDL: - return 0; - - case LOCK_LONER: // single client writer, of course. 
- return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - case LOCK_GLONERR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - - case LOCK_GSYNCL: - return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; - case LOCK_GSYNCM: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - else - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - case LOCK_GMIXEDR: - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - assert(0); - return 0; - } - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_filelock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - out << ")"; - } -}; - - -#endif diff --git a/branches/sage/pgs/mds/Hasher.cc b/branches/sage/pgs/mds/Hasher.cc deleted file mode 100644 index 308aaa0dc976c..0000000000000 --- a/branches/sage/pgs/mds/Hasher.cc +++ /dev/null @@ -1,1582 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - - -// ======================================================================= -// HASHING - - -void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth) -{ - int off = 0; - - for (; nden>0; nden--) { - // dentry - string dname; - _decode(dname, bl, off); - dout(15) << "dname is " << dname << endl; - - char icode; - bl.copy(off, 1, &icode); - off++; - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // mark dn dirty _after_ we link the inode (scroll down) - - if (icode == 'N') { - - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - dir->link_inode(dn, ino); - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, bl, off, oldauth); - - // fix up subdir export? - if (dn->inode->dir) { - assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTBOUND)); - dn->inode->dir->put(CDir::PIN_IMPORTBOUND); - dn->inode->dir->state_clear(CDIR_STATE_IMPORTBOUND); - - if (dn->inode->dir->is_auth()) { - // mine. must have been an import. - assert(dn->inode->dir->is_import()); - dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl; - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - cache->imports.erase(dn->inode->dir); - dn->inode->dir->put(CDir::PIN_IMPORT); - dn->inode->dir->state_clear(CDIR_STATE_IMPORT); - - // move nested under hashdir - for (set::iterator it = cache->nested_exports[dn->inode->dir].begin(); - it != cache->nested_exports[dn->inode->dir].end(); - it++) - cache->nested_exports[dir].insert(*it); - cache->nested_exports.erase(dn->inode->dir); - - // now it matches the inode - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - else { - // not mine. make it an export. 
- dout(7) << "making subdir into export " << *dn->inode->dir << endl; - dn->inode->dir->get(CDir::PIN_EXPORT); - dn->inode->dir->state_set(CDIR_STATE_EXPORT); - cache->exports.insert(dn->inode->dir); - cache->nested_exports[dir].insert(dn->inode->dir); - - if (dn->inode->dir->get_dir_auth().first == CDIR_AUTH_PARENT) - dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode - assert(dn->inode->dir->get_dir_auth().first >= 0); - } - } - } - - // mark dentry dirty? (only _after_ we link the inode!) - dn->_mark_dirty(); // fixme - } -} - -/* - - notes on interaction of hashing and export/import: - - - dir->is_auth() is completely independent of hashing. for a hashed dir, - - all nodes are partially authoritative - - all nodes dir->is_hashed() == true - - all nodes dir->inode->dir_is_hashed() == true - - one node dir->is_auth() == true, the rest == false - - dir_auth for all subdirs in a hashed dir will (likely?) be explicit. - - - remember simple rule: dir auth follows inode, unless dir_auth is explicit. - - - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export) - - on export, -1 is changed to mds->get_nodeid() - - on import, nothing special, actually. - - - hashed dir files aren't included in export; subdirs are converted to imports - or exports as necessary. - - hashed dir subdirs are discovered on export. this is important - because dirs are needed to tie together auth hierarchy, for auth to know about - imports/exports, etc. - - - dir state is maintained on auth. - - COMPLETE and HASHED are transfered to importers. - - DIRTY is set everywhere. - - - hashed dir is like an import: hashed dir used for nested_exports map. - - nested_exports is updated appropriately on auth and replicas. - - a subtree terminates as a hashed dir, since the hashing explicitly - redelegates all inodes. thus export_dir_walk includes hashed dirs, but - not their inodes. -*/ - -// HASH on auth - -class C_MDC_HashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->hash_dir_frozen(dir); - } -}; - -class C_MDC_HashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashComplete(Migrator *mig, CDir *dir) { - this->mig = mig; - this->dir = dir; - } - virtual void finish(int r) { - mig->hash_dir_complete(dir); - } -}; - - -/** hash_dir(dir) - * start hashing a directory. - */ -void Migrator::hash_dir(CDir *dir) -{ - dout(-7) << "hash_dir " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't hash, freezing|frozen." << endl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "hash_dir couldn't pin path, failing." << endl; - return; - } - - // ok, go - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // discover on all mds - assert(hash_gather.count(dir) == 0); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; // except me - hash_gather[dir].insert(i); - mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR); - } - dir->auth_pin(); // pin until discovers are all acked. 
- - // start freeze - dir->freeze_dir(new C_MDC_HashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_HashComplete(this, dir)); - } else - hash_dir_complete(dir); -} - - -/* - * wait for everybody to discover and open the hashing dir - * then auth_unpin, to let the freeze happen - */ -void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; - dir->auth_unpin(); // unpin to allow freeze to complete - } else { - dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl; - } - - delete m; // done -} - - - -/* - * once the dir is completely in memory, - * mark all migrating inodes dirty (to pin in cache) - */ -void Migrator::hash_dir_complete(CDir *dir) -{ - dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - in->_mark_dirty(); // fixme - } - - if (dir->is_frozen_dir()) - hash_dir_go(dir); -} - - -/* - * once the dir is frozen, - * make sure it's complete - * send the prep messages! - */ -void Migrator::hash_dir_frozen(CDir *dir) -{ - dout(7) << "hash_dir_frozen " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl; - return; - } - - // send prep messages w/ export directories to open - vector msgs(mds->get_mds_map()->get_num_mds()); - - // check for subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) continue; - - // msg? - if (msgs[dentryhashcode] == 0) { - msgs[dentryhashcode] = new MHashDirPrep(dir->ino()); - } - msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode)); - } - - // send them! - assert(hash_gather[dir].empty()); - for (unsigned i=0; isend_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - hash_gather[dir].insert(i); - } - } - - if (hash_gather[dir].empty()) { - // no subdirs! continue! - hash_gather.erase(dir); - hash_dir_go(dir); - } else { - // wait! 
- } -} - -/* - * wait for peers to open all subdirs - */ -void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl; - hash_dir_go(dir); - } else { - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * once the dir is frozen, - * make sure it's complete - * do the hashing! - */ -void Migrator::hash_dir_go(CDir *dir) -{ - dout(7) << "hash_dir_go " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - // get messages to other nodes ready - vector msgs(mds->get_mds_map()->get_num_mds()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - msgs[i] = new MHashDir(dir->ino()); - } - - // pick a hash seed. - dir->inode->inode.hash_seed = 1;//dir->ino(); - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // get containing import. might be me. - CDir *containing_import = cache->get_auth_container(dir); - assert(containing_import != dir || dir->is_import()); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) { - continue; // still mine! - } - - bufferlist *bl = msgs[dentryhashcode]->get_state_ptr(); - assert(bl); - - // -- dentry - dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, *bl); - - // null dentry? - if (dn->is_null()) { - bl->append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl->append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl->append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl->append("I", 1); // inode dentry - - encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export - msgs[dentryhashcode]->inc_nden(); - - if (dn->is_dirty()) - dn->mark_clean(); - - // add to proxy - hash_proxy_inos[dir].push_back(in); - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // fix up subdirs - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. - dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - - // fix nested bits - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == in->dir) { - dout(10) << "moving nested export " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[in->dir].insert(ex); - } - } - } - else { - // not mine. 
- dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[containing_import].erase(in->dir); - if (in->dir->authority() == dentryhashcode) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( in->dir->authority() ); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // inode state - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // fix up nested_exports? - if (containing_import != dir) { - dout(7) << "moving nested exports under hashed dir" << endl; - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == dir) { - dout(7) << " moving nested export under hashed dir: " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[dir].insert(ex); - } else { - dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl; - } - } - } - - // send hash messages - assert(hash_gather[dir].empty()); - assert(hash_notify_gather[dir].empty()); - assert(dir->hashed_subset.empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - // all nodes hashed locally.. - dir->hashed_subset.insert(i); - - if (i == mds->get_nodeid()) continue; - - // init hash_gather and hash_notify_gather sets - hash_gather[dir].insert(i); - - assert(hash_notify_gather[dir][i].empty()); - for (int j=0; jget_mds_map()->get_num_mds(); j++) { - if (j == mds->get_nodeid()) continue; - if (j == i) continue; - hash_notify_gather[dir][i].insert(j); - } - - mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - } - - // wait for all the acks. -} - - -void Migrator::handle_hash_dir_ack(MHashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl; - - if (hash_notify_gather[dir].empty()) { - dout(7) << "got notifies too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "waiting on notifies " << endl; - } - - } else { - dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -void Migrator::hash_dir_finish(CDir *dir) -{ - dout(7) << "hash_dir_finish finishing " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - // dir state - hash_gather.erase(dir); - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - - // unproxy inodes - // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds. 
- for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - hash_proxy_inos.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // unfreeze - dir->unfreeze_dir(); - - show_imports(); - assert(hash_gather.count(dir) == 0); - - // stats - //if (mds->logger) mds->logger->inc("nh", 1); - -} - - - - -// HASH on auth and non-auth - -void Migrator::handle_hash_dir_notify(MHashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir_notify " << *dir << endl; - int from = m->get_from(); - - int source = m->get_source().num(); - if (dir->is_auth()) { - // gather notifies - assert(dir->is_hashed()); - - assert( hash_notify_gather[dir][from].count(source) ); - hash_notify_gather[dir][from].erase(source); - - if (hash_notify_gather[dir][from].empty()) { - dout(7) << "last notify from " << from << endl; - hash_notify_gather[dir].erase(from); - - if (hash_notify_gather[dir].empty()) { - dout(7) << "last notify!" << endl; - hash_notify_gather.erase(dir); - - if (hash_gather[dir].empty()) { - dout(7) << "got acks too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "still waiting on acks from " << hash_gather[dir] << endl; - } - } else { - dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl; - } - } else { - dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl; - } - - // delete msg - delete m; - } else { - // update dir hashed_subset - assert(dir->hashed_subset.count(from) == 0); - dir->hashed_subset.insert(from); - - // update open subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - if (!in) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != from) continue; // we'll import these in a minute - - if (in->dir->authority() != dentryhashcode) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - - // remove from notify gather set - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - // last notify? 
- if (hash_gather[dir].empty()) { - dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl; - hash_gather.erase(dir); - - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - } else { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - } - - // fw notify to auth - mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR); - } -} - - - - -// HASH on non-auth - -/* - * discover step: - * each peer needs to open up the directory and pin it before we start - */ -class C_MDC_HashDirDiscover : public Context { - Migrator *mig; - MHashDirDiscover *m; -public: - vector trace; - C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) { - this->mig = mig; - this->m = m; - } - void finish(int r) { - CInode *in = 0; - if (r >= 0) { - if (trace.size()) - in = trace[trace.size()-1]->get_inode(); - else - in = mig->cache->get_root(); - } - mig->handle_hash_dir_discover_2(m, in, r); - } -}; - -void Migrator::handle_hash_dir_discover(MHashDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl; - - // must discover it! - C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m); - filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} - -void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_hash_dir_discover_2 has " << *in << endl; - } - - if (r < 0 || !in->is_dir()) { - dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - assert(in->is_dir()); - - // is dir open? - if (!in->dir) { - dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - return; - } - CDir *dir = in->dir; - - // pin dir, set hashing flag - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // inode state - dir->inode->inode.hash_seed = 1;// dir->ino(); - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // get gather set ready for notifies - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == dir->authority()) continue; - hash_gather[dir].insert(i); - } - - // reply - dout(7) << " sending hash_dir_discover_ack on " << *dir << endl; - mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; -} - -/* - * prep step: - * peers need to open up all subdirs of the hashed dir - */ - -void Migrator::handle_hash_dir_prep(MHashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_hash_dir_prep " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! 
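// A minimal sketch of the Context completion idiom used throughout this file
// (C_MDC_HashFreeze, C_MDC_HashDirDiscover, C_MDS_RetryMessage): an async
// operation is handed a heap-allocated callback whose finish(r) runs exactly
// once on completion, after which the operation deletes it. The toy names
// below are placeholders, not the real Context class.
#include <iostream>

struct ToyContext {
  virtual ~ToyContext() {}
  virtual void finish(int r) = 0;
};

struct PrintOnFinish : public ToyContext {
  virtual void finish(int r) { std::cout << "async op finished, r=" << r << std::endl; }
};

// the async operation owns the callback: fire it once, then delete it
static void complete_async_op(ToyContext *onfinish, int result)
{
  if (onfinish) {
    onfinish->finish(result);
    delete onfinish;
  }
}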
- - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! - if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTBOUND); - in->dir->state_set(CDIR_STATE_IMPORTBOUND); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ack! - mds->send_message_mds(new MHashDirPrepAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; -} - - -/* - * hash step: - */ - -void Migrator::handle_hash_dir(MHashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(!dir->is_auth()); - assert(!dir->is_hashed()); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir " << *dir << endl; - int oldauth = m->get_source().num(); - - // content - import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth); - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->hashed_subset.insert(mds->get_nodeid()); - - // dir is complete - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // commit - mds->mdstore->commit_dir(dir, 0); - - // send notifies - dout(7) << "sending notifies" << endl; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == m->get_source().num()) continue; - mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()), - i, MDS_PORT_MIGRATOR); - } - - // ack - dout(7) << "acking" << endl; - mds->send_message_mds(new MHashDirAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; - - show_imports(); -} - - - - - -// UNHASH on auth - -class C_MDC_UnhashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_frozen(dir); - } -}; - -class C_MDC_UnhashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_complete(dir); - } -}; - - -void Migrator::unhash_dir(CDir *dir) -{ - dout(-7) << "unhash_dir " << *dir << endl; - - assert(dir->is_hashed()); - assert(!dir->is_unhashing()); - assert(dir->is_auth()); - assert(hash_gather.count(dir)==0); - - // pin path? 
- vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "unhash_dir couldn't pin path, failing." << endl; - return; - } - - // twiddle state - dir->state_set(CDIR_STATE_UNHASHING); - - // first, freeze the dir. - dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else - unhash_dir_complete(dir); - -} - -void Migrator::unhash_dir_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep(dir); -} - - -/* - * ask peers to freeze and complete hashed dir - */ -void Migrator::unhash_dir_prep(CDir *dir) -{ - dout(7) << "unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDirPrep(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * wait for peers to freeze and complete hashed dirs - */ -void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! - - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! - if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! 
- int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTBOUND); - in->dir->state_set(CDIR_STATE_IMPORTBOUND); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ok, done with this PrepAck - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl; - unhash_dir_go(dir); - } else { - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * auth: - * send out MHashDir's to peers - */ -void Migrator::unhash_dir_go(CDir *dir) -{ - dout(7) << "unhash_dir_go " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDir(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * auth: - * assimilate unhashing content - */ -void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(dir->is_hashed()); - - // assimilate content - int from = m->get_source().num(); - import_hashed_content(dir, m->get_state(), m->get_nden(), from); - delete m; - - // done? - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl; - return; - } - - // done! - - // fix up nested_exports - CDir *containing_import = cache->get_auth_container(dir); - if (containing_import != dir) { - for (set::iterator it = cache->nested_exports[dir].begin(); - it != cache->nested_exports[dir].end(); - it++) { - dout(7) << "moving nested export out from under hashed dir : " << **it << endl; - cache->nested_exports[containing_import].insert(*it); - } - cache->nested_exports.erase(dir); - } - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); //later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - - // commit! 
- assert(dir->is_complete()); - //dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdstore->commit_dir(dir, 0); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash inode dirty fixme")); - } - - // notify - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - - hash_gather[dir].insert(i); - - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - - -/* - * sent by peer to flush mds links. unfreeze when all gathered. - */ -void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(dir->is_frozen_dir()); - - // done? - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - delete m; - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl; - } else { - unhash_dir_finish(dir); - } -} - - -/* - * all mds links are flushed. unfreeze dir! - */ -void Migrator::unhash_dir_finish(CDir *dir) -{ - dout(7) << "unhash_dir_finish " << *dir << endl; - hash_gather.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // state - dir->state_clear(CDIR_STATE_UNHASHING); - - // unfreeze - dir->unfreeze_dir(); - -} - - - -// UNHASH on all - -/* - * hashed dir is complete. - * mark all migrating inodes dirty (to pin in cache) - * if frozen too, then go to next step (depending on auth) - */ -void Migrator::unhash_dir_complete(CDir *dir) -{ - dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(dir->is_hashed()); - assert(dir->is_complete()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - if (in->is_auth()) { - in->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash dirty fixme")); - } - } - - if (!dir->is_frozen_dir()) { - dout(7) << "dir complete but !frozen, waiting " << *dir << endl; - } else { - if (dir->is_auth()) - unhash_dir_prep(dir); // auth - else - unhash_dir_prep_finish(dir); // nonauth - } -} - - -// UNHASH on non-auth - -class C_MDC_UnhashPrepFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_prep_frozen(dir); - } -}; - - -/* - * peers need to freeze their dir and make them complete - */ -void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - - // freeze - dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else { - unhash_dir_complete(dir); - } - - delete m; -} - -/* - * peer has hashed dir frozen. - * complete too? 
- */ -void Migrator::unhash_dir_prep_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_prep_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_frozen_dir()); - assert(!dir->is_auth()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep_finish(dir); -} - -/* - * peer has hashed dir complete and frozen. ack. - */ -void Migrator::unhash_dir_prep_finish(CDir *dir) -{ - dout(7) << "unhash_dir_prep_finish " << *dir << endl; - assert(dir->is_hashed()); - assert(!dir->is_auth()); - assert(dir->is_frozen()); - assert(dir->is_complete()); - - // twiddle state - if (dir->is_unhashing()) - return; // already replied. - dir->state_set(CDIR_STATE_UNHASHING); - - // send subdirs back to auth - MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino()); - int auth = dir->authority(); - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) continue; - - // msg? - ack->add_inode(it->first, in->replicate_to(auth)); - } - - // ack - mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR); -} - - - -/* - * peer needs to send hashed dir content back to auth. - * unhash dir. - */ -void Migrator::handle_unhash_dir(MUnhashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl; - assert(dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - // get message ready - bufferlist bl; - int nden = 0; - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) { - // not mine! - // twiddle dir_auth? - if (in->dir) { - if (in->dir->authority() != dir->authority()) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - continue; - } - - // -- dentry - dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, bl); - - // null dentry? - if (dn->is_null()) { - bl.append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl.append("I", 1); // inode dentry - - encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export - nden++; - - if (dn->is_dirty()) - dn->mark_clean(); - - // proxy - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - hash_proxy_inos[dir].push_back(in); - - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. 
- dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - } - else { - // not mine. - dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[dir].erase(in->dir); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // we should have no nested exports; we're not auth for the dir! - assert(cache->nested_exports[dir].empty()); - cache->nested_exports.erase(dir); - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); // later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - dir->mark_clean(); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash inode dirty fixme")); - } - - // init gather set - mds->get_mds_map()->get_active_mds_set( hash_gather[dir] ); - hash_gather[dir].erase(mds->get_nodeid()); - - // send unhash message - mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden), - dir->authority(), MDS_PORT_MIGRATOR); -} - - -/* - * first notify comes from auth. - * send notifies to all other peers, with peer = self - * if we get notify from peer=other, remove from our gather list. - * when we've gotten notifies from everyone, - * unpin proxies, - * send notify_ack to auth. - * this ensures that all mds links are flushed of cache_expire type messages. - */ -void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_finish " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - delete m; - - // did we send our shout out? - if (from == dir->authority()) { - // send notify to everyone else in weird chatter storm - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == from) continue; - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR); - } - } - - // are we done? - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - return; - } - hash_gather.erase(dir); - - // all done! 
- dout(7) << "all mds links flushed, unpinning unhash proxies" << endl; - - // unpin proxies - for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - - // unfreeze - dir->unfreeze_dir(); - - // ack - dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl; - mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); - -} diff --git a/branches/sage/pgs/mds/IdAllocator.cc b/branches/sage/pgs/mds/IdAllocator.cc deleted file mode 100644 index e5ddae7cc3e6e..0000000000000 --- a/branches/sage/pgs/mds/IdAllocator.cc +++ /dev/null @@ -1,198 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#define DBLEVEL 20 - -#include "IdAllocator.h" -#include "MDS.h" -#include "MDLog.h" - -#include "osdc/Filer.h" - -#include "include/types.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: " - - -void IdAllocator::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_IDS_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -inodeno_t IdAllocator::alloc_id() -{ - assert(is_active()); - - // pick one - inodeno_t id = free.start(); - free.erase(id); - dout(10) << "idalloc " << this << ": alloc id " << id << endl; - - version++; - - // log it - /* - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_ALLOC, version)); - */ - - return id; -} - -void IdAllocator::reclaim_id(inodeno_t id) -{ - assert(is_active()); - - dout(10) << "idalloc " << this << ": reclaim id " << id << endl; - free.insert(id); - - version++; - - /* - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version)); - */ -} - - - -class C_ID_Save : public Context { - IdAllocator *ida; - version_t version; -public: - C_ID_Save(IdAllocator *i, version_t v) : ida(i), version(v) {} - void finish(int r) { - ida->save_2(version); - } -}; - -void IdAllocator::save(Context *onfinish, version_t v) -{ - if (v > 0 && v <= committing_version) { - dout(10) << "save v " << version << " - already saving " - << committing_version << " >= needed " << v << endl; - waitfor_save[v].push_back(onfinish); - return; - } - - dout(10) << "save v " << version << endl; - assert(is_active()); - - bufferlist bl; - - bl.append((char*)&version, sizeof(version)); - ::_encode(free.m, bl); - - committing_version = version; - - if (onfinish) - waitfor_save[version].push_back(onfinish); - - // write (async) - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_ID_Save(this, version)); -} - -void IdAllocator::save_2(version_t v) -{ - dout(10) << "save_2 v " << v << endl; - - committed_version = v; - - list ls; - while (!waitfor_save.empty()) { - if (waitfor_save.begin()->first > v) break; - ls.splice(ls.end(), waitfor_save.begin()->second); - waitfor_save.erase(waitfor_save.begin()); - } - finish_contexts(ls,0); -} - - -void IdAllocator::reset() -{ - init_inode(); - - // use 
generic range. FIXME THIS IS CRAP - free.clear(); - free.insert((uint64_t)0x10000000000 * (uint64_t)(mds->get_nodeid()+1), - (uint64_t)0x10000000000 * (uint64_t)(mds->get_nodeid()+2) - (uint64_t)1); - - state = STATE_ACTIVE; -} - - - -// ----------------------- - -class C_ID_Load : public Context { -public: - IdAllocator *ida; - Context *onfinish; - bufferlist bl; - C_ID_Load(IdAllocator *i, Context *o) : ida(i), onfinish(o) {} - void finish(int r) { - ida->load_2(r, bl, onfinish); - } -}; - -void IdAllocator::load(Context *onfinish) -{ - dout(10) << "load" << endl; - - init_inode(); - - assert(is_undef()); - state = STATE_OPENING; - - C_ID_Load *c = new C_ID_Load(this, onfinish); - mds->filer->read(inode, - 0, inode.layout.stripe_unit, - &c->bl, - c); -} - -void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish) -{ - assert(is_opening()); - state = STATE_ACTIVE; - - if (r > 0) { - dout(10) << "load_2 got " << bl.length() << " bytes" << endl; - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(free.m, bl, off); - committed_version = version; - } - else { - dout(10) << "load_2 found no alloc file" << endl; - assert(0); // this shouldn't happen if mkfs finished. - reset(); - } - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } -} diff --git a/branches/sage/pgs/mds/IdAllocator.h b/branches/sage/pgs/mds/IdAllocator.h deleted file mode 100644 index e8a0f5436938f..0000000000000 --- a/branches/sage/pgs/mds/IdAllocator.h +++ /dev/null @@ -1,77 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __IDALLOCATOR_H -#define __IDALLOCATOR_H - -#include "include/types.h" -#include "include/interval_set.h" -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -class IdAllocator { - MDS *mds; - inode_t inode; - - static const int STATE_UNDEF = 0; - static const int STATE_OPENING = 1; - static const int STATE_ACTIVE = 2; - //static const int STATE_COMMITTING = 3; - int state; - - version_t version, committing_version, committed_version; - - interval_set free; // unused ids - - map > waitfor_save; - - public: - IdAllocator(MDS *m) : - mds(m), - state(STATE_UNDEF), - version(0), committing_version(0), committed_version(0) - { - } - - void init_inode(); - - // alloc or reclaim ids - inodeno_t alloc_id(); - void reclaim_id(inodeno_t ino); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - - // load/save from disk (hack) - bool is_undef() { return state == STATE_UNDEF; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_opening() { return state == STATE_OPENING; } - - void reset(); - void save(Context *onfinish=0, version_t need=0); - void save_2(version_t v); - - void shutdown() { - if (is_active()) save(0); - } - - void load(Context *onfinish); - void load_2(int, bufferlist&, Context *onfinish); - -}; - -#endif diff --git a/branches/sage/pgs/mds/LocalLock.h b/branches/sage/pgs/mds/LocalLock.h deleted file mode 100644 index 752fdcb4d3fd1..0000000000000 --- a/branches/sage/pgs/mds/LocalLock.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOCALLOCK_H -#define __LOCALLOCK_H - -#include "SimpleLock.h" - -class LocalLock : public SimpleLock { -protected: - int num_wrlock; - -public: - LocalLock(MDSCacheObject *o, int t, int wo) : - SimpleLock(o, t, wo), - num_wrlock(0) { - set_state(LOCK_LOCK); // always. - } - - bool can_wrlock() { - return !is_xlocked(); - } - void get_wrlock() { - assert(can_wrlock()); - if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); - ++num_wrlock; - } - void put_wrlock() { - --num_wrlock; - if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - } - bool is_wrlocked() { return num_wrlock > 0; } - int get_num_wrlocks() { return num_wrlock; } - - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - if (is_wrlocked()) - out << " wr=" << get_num_wrlocks(); - out << ")"; - } - -}; - - -#endif diff --git a/branches/sage/pgs/mds/Locker.cc b/branches/sage/pgs/mds/Locker.cc deleted file mode 100644 index 8b29ac5a77723..0000000000000 --- a/branches/sage/pgs/mds/Locker.cc +++ /dev/null @@ -1,2781 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "MDS.h" -#include "MDCache.h" -#include "Locker.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EUpdate.h" - -#include "msg/Messenger.h" - -#include "messages/MGenericMessage.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MDirUpdate.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include -#include - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".locker " - - - -void Locker::dispatch(Message *m) -{ - - switch (m->get_type()) { - - // locking - case MSG_MDS_LOCK: - handle_lock((MLock*)m); - break; - - // cache fun - case MSG_MDS_INODEFILECAPS: - handle_inode_file_caps((MInodeFileCaps*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_client_file_caps((MClientFileCaps*)m); - break; - - - - default: - assert(0); - } -} - - -void Locker::send_lock_message(SimpleLock *lock, int msg) -{ - for (map::iterator it = lock->get_parent()->replicas_begin(); - it != lock->get_parent()->replicas_end(); - it++) { - if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) - continue; - MLock *m = new MLock(lock, msg, mds->get_nodeid()); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - -void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data) -{ - for (map::iterator it = lock->get_parent()->replicas_begin(); - it != lock->get_parent()->replicas_end(); - it++) { - if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) - continue; - MLock *m = new MLock(lock, msg, mds->get_nodeid()); - m->set_data(data); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - - - - - - - - - - - -bool Locker::acquire_locks(MDRequest *mdr, - set &rdlocks, - set &wrlocks, - set &xlocks) -{ - if (mdr->done_locking) { - dout(10) << "acquire_locks " << *mdr << " -- done locking" << endl; - return true; // at least we had better be! - } - dout(10) << "acquire_locks " << *mdr << endl; - - set sorted; // sort everything we will lock - set mustpin = xlocks; // items to authpin - - // xlocks - for (set::iterator p = xlocks.begin(); p != xlocks.end(); ++p) { - dout(20) << " must xlock " << **p << " " << *(*p)->get_parent() << endl; - sorted.insert(*p); - - // augment xlock with a versionlock? - if ((*p)->get_type() > LOCK_OTYPE_IVERSION) { - // inode version lock? - CInode *in = (CInode*)(*p)->get_parent(); - if (mdr->is_master()) { - // master. wrlock versionlock so we can pipeline inode updates to journal. - wrlocks.insert(&in->versionlock); - } else { - // slave. exclusively lock the inode version (i.e. 
block other journal updates) - xlocks.insert(&in->versionlock); - sorted.insert(&in->versionlock); - } - } - } - - // wrlocks - for (set::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p) { - dout(20) << " must wrlock " << **p << " " << *(*p)->get_parent() << endl; - sorted.insert(*p); - if ((*p)->get_parent()->is_auth()) - mustpin.insert(*p); - else if ((*p)->get_type() == LOCK_OTYPE_IDIR && - !(*p)->get_parent()->is_auth() && !((ScatterLock*)(*p))->can_wrlock()) { // we might have to request a scatter - dout(15) << " will also auth_pin " << *(*p)->get_parent() << " in case we need to request a scatter" << endl; - mustpin.insert(*p); - } - } - - // rdlocks - for (set::iterator p = rdlocks.begin(); - p != rdlocks.end(); - ++p) { - dout(20) << " must rdlock " << **p << " " << *(*p)->get_parent() << endl; - sorted.insert(*p); - } - - - // AUTH PINS - map > mustpin_remote; // mds -> (object set) - - // can i auth pin them all now? - for (set::iterator p = mustpin.begin(); - p != mustpin.end(); - ++p) { - MDSCacheObject *object = (*p)->get_parent(); - - dout(10) << " must authpin " << *object << endl; - - if (mdr->is_auth_pinned(object)) - continue; - - if (!object->is_auth()) { - if (object->is_ambiguous_auth()) { - // wait - dout(10) << " ambiguous auth, waiting to authpin " << *object << endl; - object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - mds->locker->drop_locks(mdr); - mdr->drop_local_auth_pins(); - return false; - } - mustpin_remote[object->authority().first].insert(object); - continue; - } - if (!object->can_auth_pin()) { - // wait - dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << endl; - object->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); - mds->locker->drop_locks(mdr); - mdr->drop_local_auth_pins(); - return false; - } - } - - // ok, grab local auth pins - for (set::iterator p = mustpin.begin(); - p != mustpin.end(); - ++p) { - MDSCacheObject *object = (*p)->get_parent(); - if (mdr->is_auth_pinned(object)) { - dout(10) << " already auth_pinned " << *object << endl; - } else if (object->is_auth()) { - dout(10) << " auth_pinning " << *object << endl; - mdr->auth_pin(object); - } - } - - // request remote auth_pins - if (!mustpin_remote.empty()) { - for (map >::iterator p = mustpin_remote.begin(); - p != mustpin_remote.end(); - ++p) { - dout(10) << "requesting remote auth_pins from mds" << p->first << endl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPIN); - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " req remote auth_pin of " << **q << endl; - MDSCacheObjectInfo info; - (*q)->set_object_info(info); - req->get_authpins().push_back(info); - } - mds->send_message_mds(req, p->first, MDS_PORT_SERVER); - - // put in waiting list - assert(mdr->waiting_on_slave.count(p->first) == 0); - mdr->waiting_on_slave.insert(p->first); - } - return false; - } - - // acquire locks. - // make sure they match currently acquired locks. - set::iterator existing = mdr->locks.begin(); - for (set::iterator p = sorted.begin(); - p != sorted.end(); - ++p) { - - // already locked? - if (existing != mdr->locks.end() && *existing == *p) { - // right kind? 
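// A minimal sketch of the deadlock-avoidance idea behind acquire_locks():
// every request collects all locks it will need, sorts them into one
// globally consistent order, and takes them in that order, backing off and
// retrying if any lock is unavailable. ToyLock and the try-acquire helper are
// illustrative stand-ins, not the real SimpleLock hierarchy.
#include <set>
#include <vector>

struct ToyLock { int id; bool held; };
struct ToyLockOrder {
  bool operator()(const ToyLock *a, const ToyLock *b) const { return a->id < b->id; }
};

static bool toy_acquire_in_order(const std::vector<ToyLock*> &needed)
{
  std::set<ToyLock*, ToyLockOrder> sorted(needed.begin(), needed.end());
  std::vector<ToyLock*> taken;
  for (std::set<ToyLock*, ToyLockOrder>::iterator p = sorted.begin(); p != sorted.end(); ++p) {
    if ((*p)->held) {                       // contended: drop everything and retry later
      for (size_t i = 0; i < taken.size(); i++) taken[i]->held = false;
      return false;
    }
    (*p)->held = true;
    taken.push_back(*p);
  }
  return true;                              // all locks acquired in sorted order
}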
- SimpleLock *have = *existing; - existing++; - if (xlocks.count(*p) && mdr->xlocks.count(*p)) { - dout(10) << " already xlocked " << *have << " " << *have->get_parent() << endl; - } - else if (wrlocks.count(*p) && mdr->wrlocks.count(*p)) { - dout(10) << " already wrlocked " << *have << " " << *have->get_parent() << endl; - } - else if (rdlocks.count(*p) && mdr->rdlocks.count(*p)) { - dout(10) << " already rdlocked " << *have << " " << *have->get_parent() << endl; - } - else assert(0); - continue; - } - - // hose any stray locks - while (existing != mdr->locks.end()) { - SimpleLock *stray = *existing; - existing++; - dout(10) << " unlocking out-of-order " << *stray << " " << *stray->get_parent() << endl; - if (mdr->xlocks.count(stray)) - xlock_finish(stray, mdr); - else if (mdr->wrlocks.count(stray)) - wrlock_finish(stray, mdr); - else - rdlock_finish(stray, mdr); - } - - // lock - if (xlocks.count(*p)) { - if (!xlock_start(*p, mdr)) - return false; - dout(10) << " got xlock on " << **p << " " << *(*p)->get_parent() << endl; - } else if (wrlocks.count(*p)) { - if (!wrlock_start(*p, mdr)) - return false; - dout(10) << " got wrlock on " << **p << " " << *(*p)->get_parent() << endl; - } else { - if (!rdlock_start(*p, mdr)) - return false; - dout(10) << " got rdlock on " << **p << " " << *(*p)->get_parent() << endl; - } - } - - // any extra unneeded locks? - while (existing != mdr->locks.end()) { - SimpleLock *stray = *existing; - existing++; - dout(10) << " unlocking extra " << *stray << " " << *stray->get_parent() << endl; - if (mdr->xlocks.count(stray)) - xlock_finish(stray, mdr); - else if (mdr->wrlocks.count(stray)) - wrlock_finish(stray, mdr); - else - rdlock_finish(stray, mdr); - } - - return true; -} - - -void Locker::drop_locks(MDRequest *mdr) -{ - // leftover locks - while (!mdr->xlocks.empty()) - xlock_finish(*mdr->xlocks.begin(), mdr); - while (!mdr->rdlocks.empty()) - rdlock_finish(*mdr->rdlocks.begin(), mdr); - while (!mdr->wrlocks.empty()) - wrlock_finish(*mdr->wrlocks.begin(), mdr); -} - - -// generics - -bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_rdlock_start((FileLock*)lock, mdr); - case LOCK_OTYPE_IDIR: - return scatter_rdlock_start((ScatterLock*)lock, mdr); - default: - return simple_rdlock_start(lock, mdr); - } -} - -void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_rdlock_finish((FileLock*)lock, mdr); - case LOCK_OTYPE_IDIR: - return scatter_rdlock_finish((ScatterLock*)lock, mdr); - default: - return simple_rdlock_finish(lock, mdr); - } -} - -bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IDIR: - return scatter_wrlock_start((ScatterLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_wrlock_start((LocalLock*)lock, mdr); - default: - assert(0); - } -} - -void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IDIR: - return scatter_wrlock_finish((ScatterLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_wrlock_finish((LocalLock*)lock, mdr); - default: - assert(0); - } -} - -bool Locker::xlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_xlock_start((FileLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_xlock_start((LocalLock*)lock, mdr); - case LOCK_OTYPE_IDIR: - assert(0); - default: - 
return simple_xlock_start(lock, mdr); - } -} - -void Locker::xlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_xlock_finish((FileLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_xlock_finish((LocalLock*)lock, mdr); - case LOCK_OTYPE_IDIR: - assert(0); - default: - return simple_xlock_finish(lock, mdr); - } -} - - - -/** rejoin_set_state - * @lock the lock - * @s the new state - * @waiters list for anybody waiting on this lock - */ -void Locker::rejoin_set_state(SimpleLock *lock, int s, list& waiters) -{ - if (!lock->is_stable()) { - lock->set_state(s); - lock->get_parent()->auth_unpin(); - } else { - lock->set_state(s); - } - lock->take_waiting(SimpleLock::WAIT_ALL, waiters); -} - - - - -// file i/o ----------------------------------------- - -version_t Locker::issue_file_data_version(CInode *in) -{ - dout(7) << "issue_file_data_version on " << *in << endl; - return in->inode.file_data_version; -} - - -Capability* Locker::issue_new_caps(CInode *in, - int mode, - MClientRequest *req) -{ - dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl; - - // my needs - int my_client = req->get_client(); - int my_want = 0; - if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD; - if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR; - - // register a capability - Capability *cap = in->get_client_cap(my_client); - if (!cap) { - // new cap - Capability c(my_want); - in->add_client_cap(my_client, c); - cap = in->get_client_cap(my_client); - - // suppress file cap messages for new cap (we'll bundle with the open() reply) - cap->set_suppress(true); - } else { - // make sure it has sufficient caps - if (cap->wanted() & ~my_want) { - // augment wanted caps for this client - cap->set_wanted( cap->wanted() | my_want ); - } - } - - int before = cap->pending(); - - if (in->is_auth()) { - // [auth] twiddle mode? - if (in->filelock.is_stable()) - file_eval(&in->filelock); - } else { - // [replica] tell auth about any new caps wanted - request_inode_file_caps(in); - } - - // issue caps (pot. incl new one) - issue_caps(in); // note: _eval above may have done this already... - - // re-issue whatever we can - cap->issue(cap->pending()); - - // ok, stop suppressing. - cap->set_suppress(false); - - int now = cap->pending(); - if (before != now && - (before & CAP_FILE_WR) == 0 && - (now & CAP_FILE_WR)) { - // FIXME FIXME FIXME - } - - // twiddle file_data_version? - if ((before & CAP_FILE_WRBUFFER) == 0 && - (now & CAP_FILE_WRBUFFER)) { - in->inode.file_data_version++; - dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl; - } - - return cap; -} - - - -bool Locker::issue_caps(CInode *in) -{ - // allowed caps are determined by the lock mode. - int allowed = in->filelock.caps_allowed(); - dout(7) << "issue_caps filelock allows=" << cap_string(allowed) - << " on " << *in << endl; - - // count conflicts with - int nissued = 0; - - // client caps - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - if (it->second.pending() != (it->second.wanted() & allowed)) { - // issue - nissued++; - - int before = it->second.pending(); - long seq = it->second.issue(it->second.wanted() & allowed); - int after = it->second.pending(); - - // twiddle file_data_version? 
- if (!(before & CAP_FILE_WRBUFFER) && - (after & CAP_FILE_WRBUFFER)) { - dout(7) << " incrementing file_data_version for " << *in << endl; - in->inode.file_data_version++; - } - - if (seq > 0 && - !it->second.is_suppress()) { - dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << endl; - mds->send_message_client(new MClientFileCaps(MClientFileCaps::OP_GRANT, - in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()), - it->first); - } - } - } - - return (nissued == 0); // true if no re-issued, no callbacks -} - - - -void Locker::request_inode_file_caps(CInode *in) -{ - int wanted = in->get_caps_wanted(); - if (wanted != in->replica_caps_wanted) { - - if (wanted == 0) { - if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) { - // ok, release them finally! - in->replica_caps_wanted_keep_until.sec_ref() = 0; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " no keeping anymore " - << " on " << *in - << endl; - } - else if (in->replica_caps_wanted_keep_until.sec() == 0) { - in->replica_caps_wanted_keep_until = g_clock.recent_now(); - in->replica_caps_wanted_keep_until.sec_ref() += 2; - - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " keeping until " << in->replica_caps_wanted_keep_until - << " on " << *in - << endl; - return; - } else { - // wait longer - return; - } - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } - assert(!in->is_auth()); - - int auth = in->authority().first; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " on " << *in << " to mds" << auth << endl; - assert(!in->is_auth()); - - in->replica_caps_wanted = wanted; - - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) - mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(), - in->replica_caps_wanted), - auth, MDS_PORT_LOCKER); - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } -} - -void Locker::handle_inode_file_caps(MInodeFileCaps *m) -{ - // nobody should be talking to us during recovery. 
- assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - // ok - CInode *in = mdcache->get_inode(m->get_ino()); - assert(in); - assert(in->is_auth()); - - if (mds->is_rejoin() && - in->is_rejoining()) { - dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << endl; - delete m; - return; - } - - - dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl; - - if (m->get_caps()) - in->mds_caps_wanted[m->get_from()] = m->get_caps(); - else - in->mds_caps_wanted.erase(m->get_from()); - - if (in->filelock.is_stable()) - try_file_eval(&in->filelock); // ** may or may not be auth_pinned ** - delete m; -} - - -/* - * note: we only get these from the client if - * - we are calling back previously issued caps (fewer than the client previously had) - * - or if the client releases (any of) its caps on its own - */ -void Locker::handle_client_file_caps(MClientFileCaps *m) -{ - int client = m->get_source().num(); - CInode *in = mdcache->get_inode(m->get_ino()); - Capability *cap = 0; - if (in) - cap = in->get_client_cap(client); - - if (!in || !cap) { - if (!in) { - dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl; - } else { - dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl; - } - delete m; - return; - } - - assert(cap); - - // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(); - - dout(7) << "handle_client_file_caps seq " << m->get_seq() - << " confirms caps " << cap_string(m->get_caps()) - << " wants " << cap_string(wanted) - << " from client" << client - << " on " << *in - << endl; - - // update wanted - if (cap->wanted() != wanted) - cap->set_wanted(wanted); - - // confirm caps - int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); - int has = cap->confirmed(); - if (cap->is_null()) { - dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl; - in->remove_client_cap(client); - if (!in->is_auth()) - request_inode_file_caps(in); - - // tell client. - MClientFileCaps *r = new MClientFileCaps(MClientFileCaps::OP_RELEASE, - in->inode, - 0, 0, 0); - mds->send_message_client(r, m->get_source_inst()); - } - - // merge in atime? - if (m->get_inode().atime > in->inode.atime) { - dout(7) << " taking atime " << m->get_inode().atime << " > " - << in->inode.atime << " for " << *in << endl; - in->inode.atime = m->get_inode().atime; - } - - if ((has|had) & CAP_FILE_WR) { - bool dirty = false; - - // mtime - if (m->get_inode().mtime > in->inode.mtime) { - dout(7) << " taking mtime " << m->get_inode().mtime << " > " - << in->inode.mtime << " for " << *in << endl; - in->inode.mtime = m->get_inode().mtime; - dirty = true; - } - // size - if (m->get_inode().size > in->inode.size) { - dout(7) << " taking size " << m->get_inode().size << " > " - << in->inode.size << " for " << *in << endl; - in->inode.size = m->get_inode().size; - dirty = true; - } - - if (dirty) - mds->mdlog->submit_entry(new EString("cap inode update dirty fixme")); - } - - // reevaluate, waiters - if (!in->filelock.is_stable()) - file_eval_gather(&in->filelock); - else - file_eval(&in->filelock); - - //in->finish_waiting(CInode::WAIT_CAPS, 0); // note: any users for this? 
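// A minimal sketch of the capability arithmetic used in issue_caps() and the
// wanted-filtering above: a client is granted the intersection of what it
// wants and what the current lock state allows, and a change in that
// intersection is what triggers a cap message. The bit values here are
// illustrative placeholders, not the real CAP_FILE_* constants.
#include <cstdio>

enum { TOY_CAP_RDCACHE = 1, TOY_CAP_RD = 2, TOY_CAP_WR = 4, TOY_CAP_WRBUFFER = 8 };

static int toy_issue_caps(int wanted, int allowed, int pending)
{
  int now = wanted & allowed;              // what may actually be granted
  if (now != pending)
    std::printf("re-issue caps: 0x%x -> 0x%x\n", pending, now);
  return now;                              // becomes the new pending set
}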
- - delete m; -} - - - - - - - - - - -// locks ---------------------------------------------------------------- - -SimpleLock *Locker::get_lock(int lock_type, MDSCacheObjectInfo &info) -{ - switch (lock_type) { - case LOCK_OTYPE_DN: - { - CDir *dir = mdcache->get_dirfrag(info.dirfrag); - CDentry *dn = 0; - if (dir) - dn = dir->lookup(info.dname); - if (!dn) { - dout(7) << "get_lock don't have dn " << info.dirfrag << " " << info.dname << endl; - return 0; - } - return &dn->lock; - } - - case LOCK_OTYPE_IAUTH: - case LOCK_OTYPE_ILINK: - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IFILE: - case LOCK_OTYPE_IDIR: - { - CInode *in = mdcache->get_inode(info.ino); - if (!in) { - dout(7) << "get_lock don't have ino " << info.ino << endl; - return 0; - } - switch (lock_type) { - case LOCK_OTYPE_IAUTH: return &in->authlock; - case LOCK_OTYPE_ILINK: return &in->linklock; - case LOCK_OTYPE_IDIRFRAGTREE: return &in->dirfragtreelock; - case LOCK_OTYPE_IFILE: return &in->filelock; - case LOCK_OTYPE_IDIR: return &in->dirlock; - } - } - - default: - dout(7) << "get_lock don't know lock_type " << lock_type << endl; - assert(0); - break; - } - - return 0; -} - - -void Locker::handle_lock(MLock *m) -{ - // nobody should be talking to us during recovery. - assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info()); - if (!lock) { - dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << endl; - delete m; - return; - } - - switch (lock->get_type()) { - case LOCK_OTYPE_DN: - case LOCK_OTYPE_IAUTH: - case LOCK_OTYPE_ILINK: - case LOCK_OTYPE_IDIRFRAGTREE: - handle_simple_lock(lock, m); - break; - - case LOCK_OTYPE_IFILE: - handle_file_lock((FileLock*)lock, m); - break; - - case LOCK_OTYPE_IDIR: - handle_scatter_lock((ScatterLock*)lock, m); - break; - - default: - dout(7) << "handle_lock got otype " << m->get_lock_type() << endl; - assert(0); - break; - } -} - - - - - -// ========================================================================== -// simple lock - -void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) -{ - int from = m->get_asker(); - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << endl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - - // special case: trim replica no-longer-null dentry? - if (lock->get_type() == LOCK_OTYPE_DN) { - CDentry *dn = (CDentry*)lock->get_parent(); - if (dn->is_null() && m->get_data().length() > 0) { - dout(10) << "handle_simple_lock replica dentry null -> non-null, must trim " - << *dn << endl; - assert(dn->get_num_ref() == 0); - map expiremap; - mdcache->trim_dentry(dn, expiremap); - mdcache->send_expire_messages(expiremap); - } - } - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC); - //|| lock->get_state() == LOCK_GLOCKR); - - // wait for readers to finish? 
- if (lock->is_rdlocked()) { - dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock - << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_GLOCKR); - } else { - // update lock and reply - lock->set_state(LOCK_LOCK); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), - from, MDS_PORT_LOCKER); - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from - << ", still gathering " << lock->get_gather_set() << endl; - } else { - dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from - << ", last one" << endl; - simple_eval_gather(lock); - } - break; - - } - - delete m; -} - -/* unused, currently. - -class C_Locker_SimpleEval : public Context { - Locker *locker; - SimpleLock *lock; -public: - C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - locker->try_simple_eval(lock); - } -}; - -void Locker::try_simple_eval(SimpleLock *lock) -{ - // unstable and ambiguous auth? - if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << endl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_Locker_SimpleEval(this, lock)); - return; - } - - if (lock->is_stable()) - simple_eval(lock); -} -*/ - -void Locker::simple_eval_gather(SimpleLock *lock) -{ - dout(10) << "simple_eval_gather " << *lock << " on " << *lock->get_parent() << endl; - - // finished gathering? - if (lock->get_state() == LOCK_GLOCKR && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "simple_eval finished gather on " << *lock << " on " << *lock->get_parent() << endl; - - // replica: tell auth - if (!lock->get_parent()->is_auth()) { - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), - lock->get_parent()->authority().first, MDS_PORT_LOCKER); - } - - lock->set_state(LOCK_LOCK); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); - - if (lock->get_parent()->is_auth()) { - lock->get_parent()->auth_unpin(); - - // re-eval? - simple_eval(lock); - } - } -} - -void Locker::simple_eval(SimpleLock *lock) -{ - dout(10) << "simple_eval " << *lock << " on " << *lock->get_parent() << endl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // stable -> sync? 
- if (!lock->is_xlocked() && - lock->get_state() != LOCK_SYNC && - !lock->is_waiter_for(SimpleLock::WAIT_WR)) { - dout(7) << "simple_eval stable, syncing " << *lock - << " on " << *lock->get_parent() << endl; - simple_sync(lock); - } - -} - - -// mid - -void Locker::simple_sync(SimpleLock *lock) -{ - dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << endl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // check state - if (lock->get_state() == LOCK_SYNC) - return; // already sync - assert(lock->get_state() == LOCK_LOCK); - - // sync. - if (lock->get_parent()->is_replicated()) { - // hard data - bufferlist data; - lock->encode_locked_state(data); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_SYNC, data); - } - - // change lock - lock->set_state(LOCK_SYNC); - - // waiters? - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); -} - -void Locker::simple_lock(SimpleLock *lock) -{ - dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << endl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // check state - if (lock->get_state() == LOCK_LOCK) return; - assert(lock->get_state() == LOCK_SYNC); - - if (lock->get_parent()->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - - // change lock - lock->set_state(LOCK_GLOCKR); - lock->init_gather(); - lock->get_parent()->auth_pin(); - } else { - lock->set_state(LOCK_LOCK); - } -} - - -// top - -bool Locker::simple_rdlock_try(SimpleLock *lock, Context *con) -{ - dout(7) << "simple_rdlock_try on " << *lock << " on " << *lock->get_parent() << endl; - - // can read? grab ref. - if (lock->can_rdlock(0)) - return true; - - assert(!lock->get_parent()->is_auth()); - - // wait! - dout(7) << "simple_rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << endl; - lock->add_waiter(SimpleLock::WAIT_RD, con); - return false; -} - -bool Locker::simple_rdlock_start(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_rdlock_start on " << *lock << " on " << *lock->get_parent() << endl; - - // can read? grab ref. - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait! - dout(7) << "simple_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << endl; - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return false; -} - -void Locker::simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - // drop ref - lock->put_rdlock(); - if (mdr) { - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - } - - dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - - // last one? - if (!lock->is_rdlocked()) - simple_eval_gather(lock); -} - -bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_xlock_start on " << *lock << " on " << *lock->get_parent() << endl; - - // xlock by me? - if (lock->is_xlocked() && - lock->get_xlocked_by() == mdr) - return true; - - // auth? - if (lock->get_parent()->is_auth()) { - // auth - - // lock. - if (lock->get_state() == LOCK_SYNC) - simple_lock(lock); - - // already locked? - if (lock->get_state() == LOCK_LOCK) { - if (lock->is_xlocked()) { - // by someone else. - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // xlock. 
-      lock->get_xlock(mdr);
-      mdr->xlocks.insert(lock);
-      mdr->locks.insert(lock);
-      return true;
-    } else {
-      // wait for lock
-      lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr));
-      return false;
-    }
-  } else {
-    // replica
-    // this had better not be a remote xlock attempt!
-    assert(!mdr->slave_request);
-
-    // wait for single auth
-    if (lock->get_parent()->is_ambiguous_auth()) {
-      lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
-                                     new C_MDS_RetryRequest(mdcache, mdr));
-      return false;
-    }
-
-    // send lock request
-    int auth = lock->get_parent()->authority().first;
-    mdr->slaves.insert(auth);
-    MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCK);
-    r->set_lock_type(lock->get_type());
-    lock->get_parent()->set_object_info(r->get_object_info());
-    mds->send_message_mds(r, auth, MDS_PORT_SERVER);
-
-    // wait
-    lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mdr));
-    return false;
-  }
-}
-
-
-void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr)
-{
-  dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl;
-
-  // drop ref
-  assert(lock->can_xlock(mdr));
-  lock->put_xlock();
-  assert(mdr);
-  mdr->xlocks.erase(lock);
-  mdr->locks.erase(lock);
-
-  // remote xlock?
-  if (!lock->get_parent()->is_auth()) {
-    // tell auth
-    dout(7) << "simple_xlock_finish releasing remote xlock on " << *lock->get_parent() << endl;
-    int auth = lock->get_parent()->authority().first;
-    if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
-      MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNXLOCK);
-      slavereq->set_lock_type(lock->get_type());
-      lock->get_parent()->set_object_info(slavereq->get_object_info());
-      mds->send_message_mds(slavereq, auth, MDS_PORT_SERVER);
-    }
-  }
-
-  // others waiting?
-  lock->finish_waiters(SimpleLock::WAIT_WR, 0);
-
-  // eval?
-  if (lock->get_parent()->is_auth())
-    simple_eval(lock);
-}
-
-
-
-// dentry specific helpers
-
-// trace helpers
-
-/** dentry_can_rdlock_trace
- * see if we can _anonymously_ rdlock an entire trace.
- * if not, and req is specified, wait and retry that message.
- */
-bool Locker::dentry_can_rdlock_trace(vector<CDentry*>& trace)
-{
-  // verify dentries are rdlockable.
-  // we do this because
-  //  - we're being less aggressive about locks acquisition, and
-  //  - we're not acquiring the locks in order!
-  for (vector<CDentry*>::iterator it = trace.begin();
-       it != trace.end();
-       it++) {
-    CDentry *dn = *it;
-    if (!dn->lock.can_rdlock(0)) {
-      dout(10) << "can_rdlock_trace can't rdlock " << *dn << endl;
-      return false;
-    }
-  }
-  return true;
-}
-
-void Locker::dentry_anon_rdlock_trace_start(vector<CDentry*>& trace)
-{
-  // grab dentry rdlocks
-  for (vector<CDentry*>::iterator it = trace.begin();
-       it != trace.end();
-       it++) {
-    dout(10) << "dentry_anon_rdlock_trace_start rdlocking " << (*it)->lock << " " << **it << endl;
-    (*it)->lock.get_rdlock();
-  }
-}
-
-
-void Locker::dentry_anon_rdlock_trace_finish(vector<CDentry*>& trace)
-{
-  for (vector<CDentry*>::iterator it = trace.begin();
-       it != trace.end();
-       it++)
-    simple_rdlock_finish(&(*it)->lock, 0);
-}
-
-
-
-// ==========================================================================
-// scatter lock
-
-bool Locker::scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr)
-{
-  dout(7) << "scatter_rdlock_start on " << *lock
-          << " on " << *lock->get_parent() << endl;
-
-  // read on stable scattered replica?
- if (lock->get_state() == LOCK_SCATTER && - !lock->get_parent()->is_auth()) { - dout(7) << "scatter_rdlock_start scatterlock read on a stable scattered replica, fw to auth" << endl; - mdcache->request_forward(mdr, lock->get_parent()->authority().first); - return false; - } - - // pre-twiddle? - if (lock->get_state() == LOCK_SCATTER && - lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - !lock->is_wrlocked()) - scatter_sync(lock); - - // can rdlock? - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait for read. - lock->add_waiter(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate sync or tempsync? - if (lock->is_stable() && - lock->get_parent()->is_auth()) { - if (lock->get_parent()->is_replicated()) - scatter_tempsync(lock); - else - scatter_sync(lock); - } - - return false; -} - -void Locker::scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_rdlock_finish on " << *lock - << " on " << *lock->get_parent() << endl; - lock->put_rdlock(); - if (mdr) { - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - } - - scatter_eval_gather(lock); -} - - -bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_wrlock_start on " << *lock - << " on " << *lock->get_parent() << endl; - - // pre-twiddle? - if (lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - !lock->is_rdlocked() && - !lock->is_xlocked() && - lock->get_state() == LOCK_SYNC) - scatter_lock(lock); - - // can wrlock? - if (lock->can_wrlock()) { - lock->get_wrlock(); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait for write. - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate scatter or lock? - if (lock->is_stable()) { - if (lock->get_parent()->is_auth()) { - // auth. scatter or lock? - if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) - scatter_scatter(lock); - else - scatter_lock(lock); - } else { - // replica. - // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case). - int auth = lock->get_parent()->authority().first; - dout(10) << "requesting scatter from auth on " - << *lock << " on " << *lock->get_parent() << endl; - mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), - auth, MDS_PORT_LOCKER); - } - } - - return false; -} - -void Locker::scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_wrlock_finish on " << *lock - << " on " << *lock->get_parent() << endl; - lock->put_wrlock(); - if (mdr) { - mdr->wrlocks.erase(lock); - mdr->locks.erase(lock); - } - - scatter_eval_gather(lock); -} - - -class C_Locker_ScatterEval : public Context { - Locker *locker; - ScatterLock *lock; -public: - C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - locker->try_scatter_eval(lock); - } -}; - - -void Locker::try_scatter_eval(ScatterLock *lock) -{ - // unstable and ambiguous auth? 
- if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_scatter_eval not auth for " << *lock->get_parent() << endl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_scatter_eval can't auth_pin, waiting on " << *lock->get_parent() << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_Locker_ScatterEval(this, lock)); - return; - } - - if (lock->is_stable()) - scatter_eval(lock); -} - - -void Locker::scatter_eval_gather(ScatterLock *lock) -{ - dout(10) << "scatter_eval_gather " << *lock << " on " << *lock->get_parent() << endl; - - if (!lock->get_parent()->is_auth()) { - // REPLICA - - if (lock->get_state() == LOCK_GLOCKC && - !lock->is_wrlocked()) { - dout(10) << "scatter_eval no wrlocks, acking lock" << endl; - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), - auth, MDS_PORT_LOCKER); - } - lock->set_state(LOCK_LOCK); - //lock->get_parent()->put(CInode::PIN_SCATTERED); - } - - } else { - // AUTH - - // glocks|glockt -> lock? - if ((lock->get_state() == LOCK_GLOCKS || - lock->get_state() == LOCK_GLOCKT) && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "scatter_eval finished lock gather/un-rdlock on " << *lock - << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // glockc -> lock? - else if (lock->get_state() == LOCK_GLOCKC && - !lock->is_gathering() && - !lock->is_wrlocked()) { - if (lock->is_updated()) { - scatter_writebehind(lock); - } else { - dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock - << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_LOCK); - //lock->get_parent()->put(CInode::PIN_SCATTERED); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - } - - // gSyncL -> sync? - else if (lock->get_state() == LOCK_GSYNCL && - !lock->is_wrlocked()) { - dout(7) << "scatter_eval finished sync un-wrlock on " << *lock - << " on " << *lock->get_parent() << endl; - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SYNC, data); - } - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // gscattert|gscatters -> scatter? 
- else if ((lock->get_state() == LOCK_GSCATTERT || - lock->get_state() == LOCK_GSCATTERS) && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "scatter_eval finished scatter un-rdlock(/gather) on " << *lock - << " on " << *lock->get_parent() << endl; - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - lock->set_state(LOCK_SCATTER); - //lock->get_parent()->get(CInode::PIN_SCATTERED); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // gTempsyncC|gTempsyncL -> tempsync - else if ((lock->get_state() == LOCK_GTEMPSYNCC || - lock->get_state() == LOCK_GTEMPSYNCL) && - !lock->is_gathering() && - !lock->is_wrlocked()) { - if (lock->is_updated()) { - scatter_writebehind(lock); - } else { - dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock - << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_TEMPSYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - } - - - // re-eval? - if (lock->is_stable()) // && lock->get_parent()->can_auth_pin()) - scatter_eval(lock); - } -} - -void Locker::scatter_writebehind(ScatterLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind on " << *lock << " on " << *in << endl; - - // journal write-behind. - inode_t *pi = in->project_inode(); - pi->version = in->pre_dirty(); - - EUpdate *le = new EUpdate("dir.mtime writebehind"); - le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); - le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); - - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock)); -} - -void Locker::scatter_writebehind_finish(ScatterLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << endl; - in->pop_and_dirty_projected_inode(); - lock->clear_updated(); - scatter_eval_gather(lock); -} - -void Locker::scatter_eval(ScatterLock *lock) -{ - dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << endl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) { - // i _should_ be scattered. - if (!lock->is_rdlocked() && - !lock->is_xlocked()) { - dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << endl; - scatter_scatter(lock); - } - } else { - // i _should_ be sync. - if (!lock->is_wrlocked() && - !lock->is_xlocked()) { - dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << endl; - scatter_sync(lock); - } - } -} - - -void Locker::scatter_sync(ScatterLock *lock) -{ - dout(10) << "scatter_sync " << *lock - << " on " << *lock->get_parent() << endl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - return; // already sync. - - case LOCK_TEMPSYNC: - break; // just do it. - - case LOCK_LOCK: - if (lock->is_wrlocked() || lock->is_xlocked()) { - lock->set_state(LOCK_GSYNCL); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - - case LOCK_SCATTER: - // lock first. this is the slow way, incidentally. 
- if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } else { - if (!lock->is_wrlocked()) { - //lock->get_parent()->put(CInode::PIN_SCATTERED); - break; // do it now, we're fine - } - } - lock->set_state(LOCK_GLOCKC); - lock->get_parent()->auth_pin(); - return; - - default: - assert(0); - } - - // do sync - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SYNC, data); - } - - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_scatter(ScatterLock *lock) -{ - dout(10) << "scatter_scatter " << *lock - << " on " << *lock->get_parent() << endl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - if (!lock->is_rdlocked() && - !lock->get_parent()->is_replicated()) - break; // do it - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GSCATTERS); - lock->get_parent()->auth_pin(); - return; - - case LOCK_LOCK: - if (lock->is_xlocked()) - return; // sorry - break; // do it. - - case LOCK_SCATTER: - return; // did it. - - case LOCK_TEMPSYNC: - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GSCATTERT); - lock->get_parent()->auth_pin(); - return; - } - break; // do it - - default: - assert(0); - } - - // do scatter - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - lock->set_state(LOCK_SCATTER); - //lock->get_parent()->get(CInode::PIN_SCATTERED); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_lock(ScatterLock *lock) -{ - dout(10) << "scatter_lock " << *lock - << " on " << *lock->get_parent() << endl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - if (!lock->is_rdlocked() && - !lock->get_parent()->is_replicated()) - break; // do it. - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GLOCKS); - lock->get_parent()->auth_pin(); - return; - - case LOCK_LOCK: - return; // done. - - case LOCK_SCATTER: - if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) { - //lock->get_parent()->put(CInode::PIN_SCATTERED); - break; // do it. - } - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GLOCKC); - lock->get_parent()->auth_pin(); - return; - - case LOCK_TEMPSYNC: - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GLOCKT); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - } - - // do lock - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_tempsync(ScatterLock *lock) -{ - dout(10) << "scatter_tempsync " << *lock - << " on " << *lock->get_parent() << endl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - break; // do it. - - case LOCK_LOCK: - if (lock->is_wrlocked() || - lock->is_xlocked()) { - lock->set_state(LOCK_GTEMPSYNCL); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. 
- - case LOCK_SCATTER: - if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) { - //lock->get_parent()->put(CInode::PIN_SCATTERED); - break; // do it. - } - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GTEMPSYNCC); - lock->get_parent()->auth_pin(); - return; - - case LOCK_TEMPSYNC: - return; // done - } - - // do tempsync - lock->set_state(LOCK_TEMPSYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); -} - - - - - - -void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) -{ - int from = m->get_asker(); - dout(10) << "handle_scatter_lock " << *m << " on " << *lock << " on " << *lock->get_parent() << endl; - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_scatter_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << endl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - - lock->set_state(LOCK_SYNC); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SCATTER || - lock->get_state() == LOCK_SYNC); - - // wait for wrlocks to close? - if (lock->is_wrlocked()) { - assert(lock->get_state() == LOCK_SCATTER); - dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock - << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_GLOCKC); - } else if (lock->is_rdlocked()) { - assert(lock->get_state() == LOCK_SYNC); - dout(7) << "handle_scatter_lock has rdlocks, waiting on " << *lock - << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_GLOCKS); - } else { - //if (lock->get_state() == LOCK_SCATTER) - //lock->get_parent()->put(CInode::PIN_SCATTERED); - - // encode and reply - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), - from, MDS_PORT_LOCKER); - lock->set_state(LOCK_LOCK); - } - break; - - case LOCK_AC_SCATTER: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->set_state(LOCK_SCATTER); - //lock->get_parent()->get(CInode::PIN_SCATTERED); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - break; - - // -- for auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKS || - lock->get_state() == LOCK_GLOCKC || - lock->get_state() == LOCK_GSCATTERS); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - lock->decode_locked_state(m->get_data()); - - if (lock->is_gathering()) { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", still gathering " << lock->get_gather_set() - << endl; - } else { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", last one" - << endl; - scatter_eval_gather(lock); - } - break; - - case LOCK_AC_REQSCATTER: - if (lock->is_stable()) { - dout(7) << "handle_scatter_lock got scatter request on " << *lock << " on " << *lock->get_parent() - << endl; - scatter_scatter(lock); - } else { - dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock << " on " << *lock->get_parent() - << endl; - } - break; - - } - - delete m; -} - - - - - -// 
========================================================================== -// local lock - - -bool Locker::local_wrlock_start(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_wrlock_start on " << *lock - << " on " << *lock->get_parent() << endl; - - if (lock->can_wrlock()) { - lock->get_wrlock(); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } else { - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - -void Locker::local_wrlock_finish(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_wrlock_finish on " << *lock - << " on " << *lock->get_parent() << endl; - lock->put_wrlock(); - mdr->wrlocks.erase(lock); - mdr->locks.erase(lock); -} - -bool Locker::local_xlock_start(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_xlock_start on " << *lock - << " on " << *lock->get_parent() << endl; - - if (lock->is_xlocked_by_other(mdr)) { - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - return true; -} - -void Locker::local_xlock_finish(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_xlock_finish on " << *lock - << " on " << *lock->get_parent() << endl; - lock->put_xlock(); - mdr->xlocks.erase(lock); - mdr->locks.erase(lock); - - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); -} - - - -// ========================================================================== -// file lock - - -bool Locker::file_rdlock_start(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << endl; - - // can read? grab ref. - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // can't read, and replicated. - if (lock->can_rdlock_soon()) { - // wait - dout(7) << "file_rdlock_start can_rdlock_soon " << *lock << " on " << *lock->get_parent() << endl; - } else { - if (lock->get_parent()->is_auth()) { - // auth - - // FIXME or qsync? - - if (lock->is_stable()) { - file_lock(lock); // lock, bc easiest to back off ... 
FIXME - - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - - lock->finish_waiters(SimpleLock::WAIT_STABLE); - return true; - } - } else { - dout(7) << "file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << endl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } else { - // replica - if (lock->is_stable()) { - - // fw to auth - CInode *in = (CInode*)lock->get_parent(); - int auth = in->authority().first; - dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << " on replica and async, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(mdr, auth); - return false; - - } else { - // wait until stable - dout(7) << "inode_file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << endl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } - } - - // wait - dout(7) << "file_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << endl; - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - - return false; -} - - - -void Locker::file_rdlock_finish(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - - // drop ref - lock->put_rdlock(); - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - - if (!lock->is_rdlocked()) - file_eval_gather(lock); -} - - -bool Locker::file_xlock_start(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_xlock_start on " << *lock << " on " << *lock->get_parent() << endl; - - assert(lock->get_parent()->is_auth()); // remote file xlock not implemented - - // already xlocked by me? - if (lock->get_xlocked_by() == mdr) - return true; - - // can't write? - if (!lock->can_xlock(mdr)) { - - // auth - if (!lock->can_xlock_soon()) { - if (!lock->is_stable()) { - dout(7) << "file_xlock_start on auth, waiting for stable on " << *lock << " on " << *lock->get_parent() << endl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // initiate lock - file_lock(lock); - - // fall-thru to below. - } - } - - // check again - if (lock->can_xlock(mdr)) { - assert(lock->get_parent()->is_auth()); - lock->get_xlock(mdr); - mdr->locks.insert(lock); - mdr->xlocks.insert(lock); - return true; - } else { - dout(7) << "file_xlock_start on auth, waiting for write on " << *lock << " on " << *lock->get_parent() << endl; - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - - -void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - - // drop ref - assert(lock->can_xlock(mdr)); - lock->put_xlock(); - mdr->locks.erase(lock); - mdr->xlocks.erase(lock); - - assert(lock->get_parent()->is_auth()); // or implement remote xlocks - - // others waiting? - lock->finish_waiters(SimpleLock::WAIT_WR, 0); - - if (lock->get_parent()->is_auth()) - file_eval(lock); -} - - -/* - * ... 
- * - * also called after client caps are acked to us - * - checks if we're in unstable sfot state and can now move on to next state - * - checks if soft state should change (eg bc last writer closed) - */ -class C_Locker_FileEval : public Context { - Locker *locker; - FileLock *lock; -public: - C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - locker->try_file_eval(lock); - } -}; - -void Locker::try_file_eval(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - - // unstable and ambiguous auth? - if (!lock->is_stable() && - in->is_ambiguous_auth()) { - dout(7) << "try_file_eval not stable and ambiguous auth, waiting on " << *in << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_file_eval not auth for " << *lock->get_parent() << endl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_file_eval can't auth_pin, waiting on " << *in << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_Locker_FileEval(this, lock)); - return; - } - - if (lock->is_stable()) - file_eval(lock); -} - - - -void Locker::file_eval_gather(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - int issued = in->get_caps_issued(); - - dout(7) << "file_eval_gather issued " << cap_string(issued) - << " vs " << cap_string(lock->caps_allowed()) - << " on " << *lock << " on " << *lock->get_parent() - << endl; - - if (lock->is_stable()) - return; // nothing for us to do here! - - // [auth] finished gather? - if (in->is_auth() && - !lock->is_gathering() && - ((issued & ~lock->caps_allowed()) == 0)) { - dout(7) << "file_eval_gather finished gather" << endl; - - switch (lock->get_state()) { - // to lock - case LOCK_GLOCKR: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - if ((issued & ~CAP_FILE_RDCACHE) == 0) { - lock->set_state(LOCK_LOCK); - - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD); - lock->put_rdlock(); - lock->get_parent()->auth_unpin(); - } - break; - - // to mixed - case LOCK_GMIXEDR: - if ((issued & ~(CAP_FILE_RD)) == 0) { - lock->set_state(LOCK_MIXED); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - break; - - case LOCK_GMIXEDL: - if ((issued & ~(CAP_FILE_WR)) == 0) { - lock->set_state(LOCK_MIXED); - - if (in->is_replicated()) { - // data - bufferlist softdata; - lock->encode_locked_state(softdata); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); - } - - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - break; - - // to loner - case LOCK_GLONERR: - if ((issued & ~lock->caps_allowed()) == 0) { - lock->set_state(LOCK_LONER); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - break; - - case LOCK_GLONERM: - if ((issued & ~CAP_FILE_WR) == 0) { - lock->set_state(LOCK_LONER); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - break; - - // to sync - case LOCK_GSYNCL: - case LOCK_GSYNCM: - if ((issued & ~(CAP_FILE_RD)) == 0) { - lock->set_state(LOCK_SYNC); - - { // bcast data to replicas - bufferlist softdata; - lock->encode_locked_state(softdata); - - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // waiters - 
lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - lock->get_parent()->auth_unpin(); - } - break; - - default: - assert(0); - } - - issue_caps(in); - - // stable re-eval? - if (lock->is_stable()) //&& lock->get_parent()->can_auth_pin()) - file_eval(lock); - } - - // [replica] finished caps gather? - if (!in->is_auth()) { - switch (lock->get_state()) { - case LOCK_GMIXEDR: - if ((issued & ~(CAP_FILE_RD)) == 0) { - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); - } - break; - - case LOCK_GLOCKR: - if (issued == 0) { - lock->set_state(LOCK_LOCK); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); - } - break; - - default: - assert(0); - } - } - - -} - -void Locker::file_eval(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - int wanted = in->get_caps_wanted(); - bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "file_eval wanted=" << cap_string(wanted) - << " filelock=" << *lock << " on " << *lock->get_parent() - << " loner=" << loner - << endl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // not xlocked! - if (lock->is_xlocked()) return; - - // * -> loner? - if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_WR) && - loner && - lock->get_state() != LOCK_LONER) { - dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << endl; - file_loner(lock); - } - - // * -> mixed? - else if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_RD) && - (wanted & CAP_FILE_WR) && - !(loner && lock->get_state() == LOCK_LONER) && - lock->get_state() != LOCK_MIXED) { - dout(7) << "file_eval stable, bump to mixed " << *lock << " on " << *lock->get_parent() << endl; - file_mixed(lock); - } - - // * -> sync? - else if (!in->filelock.is_waiter_for(SimpleLock::WAIT_WR) && - !(wanted & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) && - ((wanted & CAP_FILE_RD) || - in->is_replicated() || - (!loner && lock->get_state() == LOCK_LONER)) && - lock->get_state() != LOCK_SYNC) { - dout(7) << "file_eval stable, bump to sync " << *lock << " on " << *lock->get_parent() << endl; - file_sync(lock); - } - - // * -> lock? (if not replicated or open) - else if (!in->is_replicated() && - wanted == 0 && - lock->get_state() != LOCK_LOCK) { - file_lock(lock); - } -} - - -// mid - -bool Locker::file_sync(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "file_sync " << *lock << " on " << *lock->get_parent() << endl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); - - if (lock->get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - - issue_caps(in); // reissue caps - return true; - } - - else if (lock->get_state() == LOCK_MIXED) { - // writers? 
- if (issued & CAP_FILE_WR) { - // gather client write caps - lock->set_state(LOCK_GSYNCM); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - } - return false; - } - - else if (lock->get_state() == LOCK_LONER) { - // writers? - if (issued & CAP_FILE_WR) { - // gather client write caps - lock->set_state(LOCK_GSYNCL); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - } - return false; - } - else - assert(0); // wtf. - - return false; -} - - - -void Locker::file_lock(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "inode_file_lock " << *lock << " on " << *lock->get_parent() << endl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLOCKR); - lock->get_parent()->auth_pin(); - - // call back caps - if (issued) - issue_caps(in); - } else { - if (issued) { - // call back caps - lock->set_state(LOCK_GLOCKR); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - } - - else if (lock->get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLOCKM); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - //assert(issued); // ??? -sage 2/19/06 - if (issued) { - // change lock - lock->set_state(LOCK_GLOCKM); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - - } - else if (lock->get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WR) { - // change lock - lock->set_state(LOCK_GLOCKL); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - else - assert(0); // wtf. 
-} - - -void Locker::file_mixed(FileLock *lock) -{ - dout(7) << "file_mixed " << *lock << " on " << *lock->get_parent() << endl; - - CInode *in = (CInode*)lock->get_parent(); - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED); - lock->init_gather(); - - lock->set_state(LOCK_GMIXEDR); - lock->get_parent()->auth_pin(); - - issue_caps(in); - } else { - if (issued) { - lock->set_state(LOCK_GMIXEDR); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - lock->set_state(LOCK_MIXED); - } - } - } - - else if (lock->get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - // data - bufferlist softdata; - lock->encode_locked_state(softdata); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); - } - - // change lock - lock->set_state(LOCK_MIXED); - issue_caps(in); - } - - else if (lock->get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WRBUFFER) { - // gather up WRBUFFER caps - lock->set_state(LOCK_GMIXEDL); - lock->get_parent()->auth_pin(); - issue_caps(in); - } - else if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED); - lock->set_state(LOCK_MIXED); - issue_caps(in); - } else { - lock->set_state(LOCK_MIXED); - issue_caps(in); - } - } - - else - assert(0); // wtf. -} - - -void Locker::file_loner(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "inode_file_loner " << *lock << " on " << *lock->get_parent() << endl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLONERR); - lock->get_parent()->auth_pin(); - } else { - // only one guy with file open, who gets it all, so - lock->set_state(LOCK_LONER); - issue_caps(in); - } - } - - else if (lock->get_state() == LOCK_LOCK) { - // change lock. ignore replicas; they don't know about LONER. - lock->set_state(LOCK_LONER); - issue_caps(in); - } - - else if (lock->get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLONERM); - lock->get_parent()->auth_pin(); - } else { - lock->set_state(LOCK_LONER); - issue_caps(in); - } - } - - else - assert(0); -} - - - -// messenger - -void Locker::handle_file_lock(FileLock *lock, MLock *m) -{ - if (mds->logger) mds->logger->inc("lif"); - - CInode *in = (CInode*)lock->get_parent(); - int from = m->get_asker(); - - if (mds->is_rejoin()) { - if (in->is_rejoining()) { - dout(7) << "handle_file_lock still rejoining " << *in - << ", dropping " << *m << endl; - delete m; - return; - } - } - - - dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " " - << *in << " filelock=" << *lock << endl; - - int issued = in->get_caps_issued(); - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_MIXED); - - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - - // no need to reply. 
- - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - file_eval_gather(lock); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_MIXED); - - lock->set_state(LOCK_GLOCKR); - - // call back caps? - if (issued & CAP_FILE_RD) { - dout(7) << "handle_file_lock client readers, gathering caps on " << *in << endl; - issue_caps(in); - break; - } - else if (lock->is_rdlocked()) { - dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << endl; - break; - } - - // nothing to wait for, lock and ack. - { - lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - break; - - case LOCK_AC_MIXED: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_LOCK); - - if (lock->get_state() == LOCK_SYNC) { - // MIXED - if (issued & CAP_FILE_RD) { - // call back client caps - lock->set_state(LOCK_GMIXEDR); - issue_caps(in); - break; - } else { - // no clients, go straight to mixed - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - } else { - // LOCK - lock->set_state(LOCK_MIXED); - - // no ack needed. - } - - issue_caps(in); - - // waiters - lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); - file_eval_gather(lock); - break; - - - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKR || - lock->get_state() == LOCK_GLOCKM || - lock->get_state() == LOCK_GLONERM || - lock->get_state() == LOCK_GLONERR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << endl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << endl; - file_eval_gather(lock); - } - break; - - case LOCK_AC_SYNCACK: - assert(lock->get_state() == LOCK_GSYNCM); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - /* not used currently - { - // merge data (keep largest size, mtime, etc.) 
-      int off = 0;
-      in->decode_merge_file_state(m->get_data(), off);
-    }
-    */
-
-    if (lock->is_gathering()) {
-      dout(7) << "handle_lock_inode_file " << *in << " from " << from
-              << ", still gathering " << lock->get_gather_set() << endl;
-    } else {
-      dout(7) << "handle_lock_inode_file " << *in << " from " << from
-              << ", last one" << endl;
-      file_eval_gather(lock);
-    }
-    break;
-
-  case LOCK_AC_MIXEDACK:
-    assert(lock->get_state() == LOCK_GMIXEDR);
-    assert(lock->is_gathering(from));
-    lock->remove_gather(from);
-
-    if (lock->is_gathering()) {
-      dout(7) << "handle_lock_inode_file " << *in << " from " << from
-              << ", still gathering " << lock->get_gather_set() << endl;
-    } else {
-      dout(7) << "handle_lock_inode_file " << *in << " from " << from
-              << ", last one" << endl;
-      file_eval_gather(lock);
-    }
-    break;
-
-
-  default:
-    assert(0);
-  }
-
-  delete m;
-}
-
-
-
-
-
-
diff --git a/branches/sage/pgs/mds/Locker.h b/branches/sage/pgs/mds/Locker.h
deleted file mode 100644
index b54b0b7b2cafd..0000000000000
--- a/branches/sage/pgs/mds/Locker.h
+++ /dev/null
@@ -1,183 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#ifndef __MDS_LOCKER_H
-#define __MDS_LOCKER_H
-
-#include "include/types.h"
-
-#include <map>
-#include <list>
-#include <set>
-using std::map;
-using std::list;
-using std::set;
-
-class MDS;
-class CDir;
-class CInode;
-class CDentry;
-
-class Message;
-
-class MDiscover;
-class MDiscoverReply;
-class MCacheExpire;
-class MDirUpdate;
-class MDentryUnlink;
-class MLock;
-
-class MClientRequest;
-
-
-class Anchor;
-class Capability;
-
-class SimpleLock;
-class FileLock;
-class ScatterLock;
-class LocalLock;
-
-class Locker {
-private:
-  MDS *mds;
-  MDCache *mdcache;
-
- public:
-  Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {}
-
-  SimpleLock *get_lock(int lock_type, MDSCacheObjectInfo &info);
-
-  void dispatch(Message *m);
-  void handle_lock(MLock *m);
-
-protected:
-  void send_lock_message(SimpleLock *lock, int msg);
-  void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data);
-
-  // -- locks --
-public:
-  bool acquire_locks(MDRequest *mdr,
-                     set<SimpleLock*> &rdlocks,
-                     set<SimpleLock*> &wrlocks,
-                     set<SimpleLock*> &xlocks);
-
-  void drop_locks(MDRequest *mdr);
-
-protected:
-  bool rdlock_start(SimpleLock *lock, MDRequest *mdr);
-  void rdlock_finish(SimpleLock *lock, MDRequest *mdr);
-  bool xlock_start(SimpleLock *lock, MDRequest *mdr);
-public:
-  void xlock_finish(SimpleLock *lock, MDRequest *mdr);  // public for Server's slave UNXLOCK
-protected:
-  bool wrlock_start(SimpleLock *lock, MDRequest *mdr);
-  void wrlock_finish(SimpleLock *lock, MDRequest *mdr);
-
-public:
-  void rejoin_set_state(SimpleLock *lock, int s, list<Context*>& waiters);
-
-  // simple
-public:
-  void try_simple_eval(SimpleLock *lock);
-  void simple_eval_gather(SimpleLock *lock);
-  bool simple_rdlock_try(SimpleLock *lock, Context *con);
-protected:
-  void simple_eval(SimpleLock *lock);
-  void handle_simple_lock(SimpleLock *lock, MLock *m);
-  void simple_sync(SimpleLock *lock);
-  void simple_lock(SimpleLock *lock);
-  bool simple_rdlock_start(SimpleLock *lock, MDRequest *mdr);
-  void simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr);
-  bool simple_xlock_start(SimpleLock *lock, MDRequest *mdr);
-  void simple_xlock_finish(SimpleLock *lock, MDRequest *mdr);
-
-public:
-  bool dentry_can_rdlock_trace(vector<CDentry*>& trace);
-  void dentry_anon_rdlock_trace_start(vector<CDentry*>& trace);
-  void dentry_anon_rdlock_trace_finish(vector<CDentry*>& trace);
-
-  // scatter
-public:
-  void try_scatter_eval(ScatterLock *lock);
-  void scatter_eval(ScatterLock *lock);        // public for MDCache::adjust_subtree_auth()
-  void scatter_eval_gather(ScatterLock *lock);
-
-protected:
-  void handle_scatter_lock(ScatterLock *lock, MLock *m);
-  void scatter_sync(ScatterLock *lock);
-  void scatter_lock(ScatterLock *lock);
-  void scatter_scatter(ScatterLock *lock);
-  void scatter_tempsync(ScatterLock *lock);
-  bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr);
-  void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr);
-  bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr);
-  void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr);
-
-  void scatter_writebehind(ScatterLock *lock);
-  class C_Locker_ScatterWB : public Context {
-    Locker *locker;
-    ScatterLock *lock;
-  public:
-    C_Locker_ScatterWB(Locker *l, ScatterLock *sl) : locker(l), lock(sl) {}
-    void finish(int r) {
-      locker->scatter_writebehind_finish(lock);
-    }
-  };
-  void scatter_writebehind_finish(ScatterLock *lock);
-
-  // local
-protected:
-  bool local_wrlock_start(LocalLock *lock, MDRequest *mdr);
-  void local_wrlock_finish(LocalLock *lock, MDRequest *mdr);
-  bool local_xlock_start(LocalLock *lock, MDRequest *mdr);
-  void local_xlock_finish(LocalLock *lock, MDRequest *mdr);
-
-
-  // file
-public:
-  void file_eval_gather(FileLock *lock);
-  void try_file_eval(FileLock *lock);
-protected:
-  void file_eval(FileLock *lock);
-  void handle_file_lock(FileLock *lock, MLock *m);
-  bool file_sync(FileLock *lock);
-  void file_lock(FileLock *lock);
-  void file_mixed(FileLock *lock);
-  void file_loner(FileLock *lock);
-  bool file_rdlock_try(FileLock *lock, Context *con);
-  bool file_rdlock_start(FileLock *lock, MDRequest *mdr);
-  void file_rdlock_finish(FileLock *lock, MDRequest *mdr);
-  bool file_xlock_start(FileLock *lock, MDRequest *mdr);
-  void file_xlock_finish(FileLock *lock, MDRequest *mdr);
-
-
-
-  // -- file i/o --
- public:
-  version_t issue_file_data_version(CInode *in);
-  Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req);
-  bool issue_caps(CInode *in);
-
- protected:
-  void handle_client_file_caps(class MClientFileCaps *m);
-
-  void request_inode_file_caps(CInode *in);
-  void handle_inode_file_caps(class MInodeFileCaps *m);
-
-
-};
-
-
-#endif
diff --git a/branches/sage/pgs/mds/LogEvent.cc b/branches/sage/pgs/mds/LogEvent.cc
deleted file mode 100644
index 687428e47b959..0000000000000
--- a/branches/sage/pgs/mds/LogEvent.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- * - */ - -#include "LogEvent.h" - -#include "MDS.h" - -// events i know of -#include "events/EString.h" - -#include "events/ESession.h" -#include "events/ESubtreeMap.h" -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EOpen.h" - -#include "events/EPurgeFinish.h" - -#include "events/EAnchor.h" -#include "events/EAnchorClient.h" - - -LogEvent *LogEvent::decode(bufferlist& bl) -{ - // parse type, length - int off = 0; - int type; - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - - int length = bl.length() - off; - dout(15) << "decode_log_event type " << type << ", size " << length << endl; - - assert(type > 0); - - // create event - LogEvent *le; - switch (type) { - case EVENT_STRING: le = new EString; break; - - case EVENT_SESSION: le = new ESession; break; - case EVENT_SUBTREEMAP: le = new ESubtreeMap; break; - case EVENT_EXPORT: le = new EExport; break; - case EVENT_IMPORTSTART: le = new EImportStart; break; - case EVENT_IMPORTFINISH: le = new EImportFinish; break; - - case EVENT_UPDATE: le = new EUpdate; break; - case EVENT_SLAVEUPDATE: le = new ESlaveUpdate; break; - case EVENT_OPEN: le = new EOpen; break; - - case EVENT_PURGEFINISH: le = new EPurgeFinish; break; - - case EVENT_ANCHOR: le = new EAnchor; break; - case EVENT_ANCHORCLIENT: le = new EAnchorClient; break; - default: - dout(1) << "uh oh, unknown log event type " << type << endl; - assert(0); - } - - // decode - le->decode_payload(bl, off); - - return le; -} - diff --git a/branches/sage/pgs/mds/LogEvent.h b/branches/sage/pgs/mds/LogEvent.h deleted file mode 100644 index 917fdbf1af962..0000000000000 --- a/branches/sage/pgs/mds/LogEvent.h +++ /dev/null @@ -1,104 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __LOGEVENT_H -#define __LOGEVENT_H - -#define EVENT_STRING 1 - -#define EVENT_SESSION 7 -#define EVENT_SUBTREEMAP 2 -#define EVENT_EXPORT 30 -#define EVENT_IMPORTSTART 31 -#define EVENT_IMPORTFINISH 32 - -#define EVENT_UPDATE 3 -#define EVENT_SLAVEUPDATE 4 -#define EVENT_OPEN 5 - -#define EVENT_ALLOC 10 -#define EVENT_PURGEFINISH 22 - -#define EVENT_ANCHOR 40 -#define EVENT_ANCHORCLIENT 41 - - - - -#include -using namespace std; - -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -// generic log event -class LogEvent { - private: - int _type; - off_t _start_off,_end_off; - friend class MDLog; - - public: - LogEvent(int t) : _type(t), _start_off(0), _end_off(0) { } - virtual ~LogEvent() { } - - int get_type() { return _type; } - off_t get_start_off() { return _start_off; } - off_t get_end_off() { return _end_off; } - - // encoding - virtual void encode_payload(bufferlist& bl) = 0; - virtual void decode_payload(bufferlist& bl, int& off) = 0; - static LogEvent *decode(bufferlist &bl); - - - virtual void print(ostream& out) { - out << "event(" << _type << ")"; - } - - - /*** live journal ***/ - - /* obsolete() - is this entry committed to primary store, such that - * we can expire it from the journal? 
- */ - virtual bool has_expired(MDS *m) { - return true; - } - - /* expire() - prod MDS into committing the relevant state so that this - * entry can be expired from the jorunal. - */ - virtual void expire(MDS *m, Context *c) { - assert(0); - c->finish(0); - delete c; - } - - - /*** recovery ***/ - /* replay() - replay given event. this is idempotent. - */ - virtual void replay(MDS *m) { assert(0); } - -}; - -inline ostream& operator<<(ostream& out, LogEvent& le) { - le.print(out); - return out; -} - -#endif diff --git a/branches/sage/pgs/mds/MDBalancer.cc b/branches/sage/pgs/mds/MDBalancer.cc deleted file mode 100644 index 58de2647753e7..0000000000000 --- a/branches/sage/pgs/mds/MDBalancer.cc +++ /dev/null @@ -1,910 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "mdstypes.h" - -#include "MDBalancer.h" -#include "MDS.h" -#include "MDSMap.h" -#include "CInode.h" -#include "CDir.h" -#include "MDCache.h" -#include "Migrator.h" - -#include "include/Context.h" -#include "msg/Messenger.h" -#include "messages/MHeartbeat.h" - -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l<=g_conf.debug_mds_balancer) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".bal " - -#define MIN_LOAD 50 // ?? -#define MIN_REEXPORT 5 // will automatically reexport -#define MIN_OFFLOAD 10 // point at which i stop trying, close enough - - - -int MDBalancer::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_HEARTBEAT: - handle_heartbeat((MHeartbeat*)m); - break; - - default: - dout(1) << " balancer unknown message " << m->get_type() << endl; - assert(0); - break; - } - - return 0; -} - - - - -void MDBalancer::tick() -{ - static int num_bal_times = g_conf.mds_bal_max; - static utime_t first = g_clock.now(); - utime_t now = g_clock.now(); - utime_t elapsed = now; - elapsed -= first; - - // balance? - if (true && - mds->get_nodeid() == 0 && - g_conf.mds_bal_interval > 0 && - (num_bal_times || - (g_conf.mds_bal_max_until >= 0 && - elapsed.sec() > g_conf.mds_bal_max_until)) && - mds->is_active() && - now.sec() - last_heartbeat.sec() >= g_conf.mds_bal_interval) { - last_heartbeat = now; - send_heartbeat(); - num_bal_times--; - } - - // hash? 
- if (true && - g_conf.num_mds > 1 && - now.sec() - last_hash.sec() > g_conf.mds_bal_hash_interval) { - last_hash = now; - do_hashing(); - } -} - - - - -class C_Bal_SendHeartbeat : public Context { -public: - MDS *mds; - C_Bal_SendHeartbeat(MDS *mds) { - this->mds = mds; - } - virtual void finish(int f) { - mds->balancer->send_heartbeat(); - } -}; - -mds_load_t MDBalancer::get_load() -{ - mds_load_t load; - if (mds->mdcache->get_root()) - load.root = - mds->mdcache->get_root()->popularity[MDS_POP_ANYDOM]; - // + - // mds->mdcache->get_root()->popularity[MDS_POP_NESTED]; - - load.req_rate = mds->get_req_rate(); - load.queue_len = mds->messenger->get_dispatch_queue_len(); - return load; -} - -void MDBalancer::send_heartbeat() -{ - if (!mds->mdcache->get_root()) { - dout(5) << "no root on send_heartbeat" << endl; - mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds)); - return; - } - - mds_load.clear(); - if (mds->get_nodeid() == 0) - beat_epoch++; - - // load - mds_load_t load = get_load(); - mds_load[ mds->get_nodeid() ] = load; - - // import_map -- how much do i import from whom - map import_map; - set authsubs; - mds->mdcache->get_auth_subtrees(authsubs); - for (set::iterator it = authsubs.begin(); - it != authsubs.end(); - it++) { - CDir *im = *it; - int from = im->inode->authority().first; - if (from == mds->get_nodeid()) continue; - if (im->get_inode()->is_stray()) continue; - import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load(); - } - mds_import_map[ mds->get_nodeid() ] = import_map; - - - dout(5) << "mds" << mds->get_nodeid() << " sending heartbeat " << beat_epoch << " " << load << endl; - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - dout(5) << " import_map from " << it->first << " -> " << it->second << endl; - } - - - set up; - mds->get_mds_map()->get_in_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (*p == mds->get_nodeid()) continue; - MHeartbeat *hb = new MHeartbeat(load, beat_epoch); - hb->get_import_map() = import_map; - mds->messenger->send_message(hb, - mds->mdsmap->get_inst(*p), - MDS_PORT_BALANCER, MDS_PORT_BALANCER); - } -} - -void MDBalancer::handle_heartbeat(MHeartbeat *m) -{ - dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << endl; - - if (!mds->mdcache->get_root()) { - dout(10) << "no root on handle" << endl; - mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m)); - return; - } - - int who = m->get_source().num(); - - if (who == 0) { - dout(20) << " from mds0, new epoch" << endl; - beat_epoch = m->get_beat(); - send_heartbeat(); - - show_imports(); - } - - mds_load[ who ] = m->get_load(); - mds_import_map[ who ] = m->get_import_map(); - - //cout << " load is " << load << " have " << mds_load.size() << endl; - - unsigned cluster_size = mds->get_mds_map()->get_num_mds(); - if (mds_load.size() == cluster_size) { - // let's go! - //export_empties(); // no! 
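In isolation, the gather step above — each MDS records the sender's load and import map, and rebalancing only starts once mds_load holds an entry for every member of the cluster for the current beat epoch — reduces to a small piece of bookkeeping. The following is a standalone simplification, not the Ceph interface: LoadTracker and report are invented names, and plain std containers stand in for mds_load_t and MHeartbeat.

#include <cstdio>
#include <map>

// Per-epoch load gathering: each node reports one load value per epoch;
// once every node in the cluster has reported, a rebalance pass can run
// against the complete picture.
struct LoadTracker {
  int epoch = 0;
  unsigned cluster_size;
  std::map<int, double> loads;          // mds rank -> last reported load

  explicit LoadTracker(unsigned n) : cluster_size(n) {}

  void new_epoch(int e) { epoch = e; loads.clear(); }

  // Returns true when this epoch's picture is complete.
  bool report(int who, double load) {
    loads[who] = load;
    return loads.size() == cluster_size;
  }
};

int main() {
  LoadTracker t(3);
  t.new_epoch(1);
  t.report(0, 12.0);
  t.report(1, 48.0);
  if (t.report(2, 3.0))                 // last report in -> "let's go"
    std::printf("epoch %d complete, rebalance with %zu loads\n",
                t.epoch, t.loads.size());
  return 0;
}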
- do_rebalance(m->get_beat()); - } - - // done - delete m; -} - - -void MDBalancer::export_empties() -{ - dout(5) << "export_empties checking for empty imports" << endl; - dout(0) << "IMPLEMENT ME" << endl; - /* - for (set::iterator it = mds->mdcache->subtrees.begin(); - it != mds->mdcache->subtrees.end(); - it++) { - CDir *dir = *it; - - if (!dir->inode->is_root() && dir->get_size() == 0) - mds->mdcache->migrator->export_empty_import(dir); - } - */ -} - - - -double MDBalancer::try_match(int ex, double& maxex, - int im, double& maxim) -{ - if (maxex <= 0 || maxim <= 0) return 0.0; - - double howmuch = MIN(maxex, maxim); - if (howmuch <= 0) return 0.0; - - dout(5) << " - mds" << ex << " exports " << howmuch << " to mds" << im << endl; - - if (ex == mds->get_nodeid()) - my_targets[im] += howmuch; - - exported[ex] += howmuch; - imported[im] += howmuch; - - maxex -= howmuch; - maxim -= howmuch; - - return howmuch; -} - - - -void MDBalancer::do_hashing() -{ - if (hash_queue.empty()) { - dout(20) << "do_hashing has nothing to do" << endl; - return; - } - - dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl; - - for (set::iterator i = hash_queue.begin(); - i != hash_queue.end(); - i++) { - inodeno_t dirino = *i; - CInode *in = mds->mdcache->get_inode(dirino); - if (!in) continue; - /* - CDir *dir = in->dir; - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(0) << "do_hashing hashing " << *dir << endl; - mds->mdcache->migrator->hash_dir(dir); - */ - } - hash_queue.clear(); -} - - - -void MDBalancer::do_rebalance(int beat) -{ - int cluster_size = mds->get_mds_map()->get_num_mds(); - int whoami = mds->get_nodeid(); - - // reset - my_targets.clear(); - imported.clear(); - exported.clear(); - - dout(5) << " do_rebalance: cluster loads are" << endl; - - // rescale! turn my mds_load back into meta_load units - double load_fac = 1.0; - if (mds_load[whoami].mds_load() > 0) { - load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load(); - dout(7) << " load_fac is " << load_fac - << " <- " << mds_load[whoami].root.meta_load() - << " / " << mds_load[whoami].mds_load() - << endl; - } - - double total_load = 0; - multimap load_map; - for (int i=0; i " << l << endl; - - if (whoami == i) my_load = l; - total_load += l; - - load_map.insert(pair( l, i )); - } - - // target load - target_load = total_load / (double)cluster_size; - dout(5) << "do_rebalance: my load " << my_load - << " target " << target_load - << " total " << total_load - << endl; - - // under or over? - if (my_load < target_load) { - dout(5) << " i am underloaded, doing nothing." 
<< endl; - show_imports(); - return; - } - - dout(5) << " i am overloaded" << endl; - - - // first separate exporters and importers - multimap importers; - multimap exporters; - set importer_set; - set exporter_set; - - for (multimap::iterator it = load_map.begin(); - it != load_map.end(); - it++) { - if (it->first < target_load) { - dout(15) << " mds" << it->second << " is importer" << endl; - importers.insert(pair(it->first,it->second)); - importer_set.insert(it->second); - } else { - dout(15) << " mds" << it->second << " is exporter" << endl; - exporters.insert(pair(it->first,it->second)); - exporter_set.insert(it->second); - } - } - - - // determine load transfer mapping - - if (true) { - // analyze import_map; do any matches i can - - dout(5) << " matching exporters to import sources" << endl; - - // big -> small exporters - for (multimap::reverse_iterator ex = exporters.rbegin(); - ex != exporters.rend(); - ex++) { - double maxex = get_maxex(ex->second); - if (maxex <= .001) continue; - - // check importers. for now, just in arbitrary order (no intelligent matching). - for (map::iterator im = mds_import_map[ex->second].begin(); - im != mds_import_map[ex->second].end(); - im++) { - double maxim = get_maxim(im->first); - if (maxim <= .001) continue; - try_match(ex->second, maxex, - im->first, maxim); - if (maxex <= .001) break;; - } - } - } - - - if (1) { - if (beat % 2 == 1) { - // old way - dout(5) << " matching big exporters to big importers" << endl; - // big exporters to big importers - multimap::reverse_iterator ex = exporters.rbegin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.rend() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } else { - // new way - dout(5) << " matching small exporters to big importers" << endl; - // small exporters to big importers - multimap::iterator ex = exporters.begin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.end() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } - } - - - - // make a sorted list of my imports - map import_pop_map; - multimap import_from_map; - set fullauthsubs; - - mds->mdcache->get_fullauth_subtrees(fullauthsubs); - for (set::iterator it = fullauthsubs.begin(); - it != fullauthsubs.end(); - it++) { - CDir *im = *it; - if (im->get_inode()->is_stray()) continue; - - double pop = im->popularity[MDS_POP_CURDOM].meta_load(); - if (pop < g_conf.mds_bal_idle_threshold && - im->inode != mds->mdcache->get_root() && - im->inode->authority().first != mds->get_nodeid()) { - dout(-5) << " exporting idle import " << *im - << " back to mds" << im->inode->authority().first - << endl; - mds->mdcache->migrator->export_dir(im, im->inode->authority().first); - continue; - } - import_pop_map[ pop ] = im; - int from = im->inode->authority().first; - dout(15) << " map: i imported " << *im << " from " << from << endl; - import_from_map.insert(pair(from, im)); - } - - - - // do my exports! 
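Before the export loop that follows, the matching arithmetic above can be seen in isolation: nodes over the target load are paired with nodes under it, and each pair transfers min(excess, deficit), which is exactly what try_match() records for one exporter/importer pair. The sketch below is a simplified standalone version; Transfer and greedy_match are invented names, and the ordering heuristics (big-to-big vs. small-to-big, the import-source matching) are elided.

#include <algorithm>
#include <cstdio>
#include <map>
#include <vector>

struct Transfer { int from, to; double amount; };

// Greedily match overloaded nodes (load > target) with underloaded ones,
// moving min(excess, deficit) per pair -- the same arithmetic try_match()
// applies to a single exporter/importer pair.
std::vector<Transfer> greedy_match(const std::map<int, double>& load, double target) {
  std::vector<Transfer> out;
  std::map<int, double> excess, deficit;
  for (auto& [who, l] : load) {
    if (l > target)      excess[who]  = l - target;
    else if (l < target) deficit[who] = target - l;
  }
  for (auto& [ex, maxex] : excess) {
    for (auto& [im, maxim] : deficit) {
      double howmuch = std::min(maxex, maxim);
      if (howmuch <= 0) continue;
      out.push_back({ex, im, howmuch});
      maxex -= howmuch;
      maxim -= howmuch;
      if (maxex <= 0) break;
    }
  }
  return out;
}

int main() {
  std::map<int, double> load = {{0, 90}, {1, 10}, {2, 20}};
  for (auto& t : greedy_match(load, 40))
    std::printf("mds%d -> mds%d : %g\n", t.from, t.to, t.amount);
  return 0;
}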
- set already_exporting; - double total_sent = 0; - double total_goal = 0; - - for (map::iterator it = my_targets.begin(); - it != my_targets.end(); - it++) { - - /* - double fac = 1.0; - if (false && total_goal > 0 && total_sent > 0) { - fac = total_goal / total_sent; - dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << endl; - if (fac > 1.0) fac = 1.0; - } - fac = .9 - .4 * ((float)g_conf.num_mds / 128.0); // hack magic fixme - */ - - int target = (*it).first; - double amount = (*it).second;// * load_fac; - total_goal += amount; - - if (amount < MIN_OFFLOAD) continue; - - dout(-5) << " sending " << amount << " to mds" << target - //<< " .. " << (*it).second << " * " << load_fac - << " -> " << amount - << endl;//" .. fudge is " << fudge << endl; - double have = 0; - - show_imports(); - - // search imports from target - if (import_from_map.count(target)) { - dout(5) << " aha, looking through imports from target mds" << target << endl; - pair::iterator, multimap::iterator> p = - import_from_map.equal_range(target); - while (p.first != p.second) { - CDir *dir = (*p.first).second; - dout(5) << "considering " << *dir << " from " << (*p.first).first << endl; - multimap::iterator plast = p.first++; - - if (dir->inode->is_root()) continue; - if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress - double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy - - if (pop <= amount-have) { - dout(-5) << "reexporting " << *dir - << " pop " << pop - << " back to mds" << target << endl; - mds->mdcache->migrator->export_dir(dir, target); - have += pop; - import_from_map.erase(plast); - import_pop_map.erase(pop); - } else { - dout(5) << "can't reexport " << *dir << ", too big " << pop << endl; - } - if (amount-have < MIN_OFFLOAD) break; - } - } - if (amount-have < MIN_OFFLOAD) { - total_sent += have; - continue; - } - - // any other imports - if (false) - for (map::iterator import = import_pop_map.begin(); - import != import_pop_map.end(); - import++) { - CDir *imp = (*import).second; - if (imp->inode->is_root()) continue; - - double pop = (*import).first; - if (pop < amount-have || pop < MIN_REEXPORT) { - dout(-5) << "reexporting " << *imp - << " pop " << pop - << " back to mds" << imp->inode->authority() - << endl; - have += pop; - mds->mdcache->migrator->export_dir(imp, imp->inode->authority().first); - } - if (amount-have < MIN_OFFLOAD) break; - } - if (amount-have < MIN_OFFLOAD) { - //fudge = amount-have; - total_sent += have; - continue; - } - - // okay, search for fragments of my workload - set candidates; - mds->mdcache->get_fullauth_subtrees(candidates); - - list exports; - - for (set::iterator pot = candidates.begin(); - pot != candidates.end(); - pot++) { - if ((*pot)->get_inode()->is_stray()) continue; - find_exports(*pot, amount, exports, have, already_exporting); - if (have > amount-MIN_OFFLOAD) { - break; - } - } - //fudge = amount - have; - total_sent += have; - - for (list::iterator it = exports.begin(); it != exports.end(); it++) { - dout(-5) << " exporting to mds" << target - << " fragment " << **it - << " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load() - << endl; - mds->mdcache->migrator->export_dir(*it, target); - - // hack! only do one dir. 
- break; - } - } - - dout(5) << "rebalance done" << endl; - show_imports(); - -} - - - -void MDBalancer::find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting) -{ - double need = amount - have; - if (need < amount * g_conf.mds_bal_min_start) - return; // good enough! - double needmax = need * g_conf.mds_bal_need_max; - double needmin = need * g_conf.mds_bal_need_min; - double midchunk = need * g_conf.mds_bal_midchunk; - double minchunk = need * g_conf.mds_bal_minchunk; - - list bigger; - multimap smaller; - - double dir_pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - double dir_sum = 0; - dout(-7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl; - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->get_inode(); - if (!in) continue; - if (!in->is_dir()) continue; - - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *dir = *p; - if (!dir->is_auth()) continue; - if (already_exporting.count(dir)) continue; - - if (dir->is_frozen()) continue; // can't export this right now! - //if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now! - - // how popular? - double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - dir_sum += pop; - dout(20) << " pop " << pop << " " << *dir << endl; - - if (pop < minchunk) continue; - - // lucky find? - if (pop > needmin && pop < needmax) { - exports.push_back(dir); - have += pop; - return; - } - - if (pop > need) - bigger.push_back(dir); - else - smaller.insert(pair(pop, dir)); - } - } - dout(7) << " .. sum " << dir_sum << " / " << dir_pop << endl; - - // grab some sufficiently big small items - multimap::reverse_iterator it; - for (it = smaller.rbegin(); - it != smaller.rend(); - it++) { - - if ((*it).first < midchunk) - break; // try later - - dout(7) << " taking smaller " << *(*it).second << endl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // apprently not enough; drill deeper into the hierarchy (if non-replicated) - for (list::iterator it = bigger.begin(); - it != bigger.end(); - it++) { - if ((*it)->is_rep()) continue; - dout(7) << " descending into " << **it << endl; - find_exports(*it, amount, exports, have, already_exporting); - if (have > needmin) - return; - } - - // ok fine, use smaller bits - for (; - it != smaller.rend(); - it++) { - - dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // ok fine, drill inot replicated dirs - for (list::iterator it = bigger.begin(); - it != bigger.end(); - it++) { - if (!(*it)->is_rep()) continue; - dout(7) << " descending into replicated " << **it << endl; - find_exports(*it, amount, exports, have, already_exporting); - if (have > needmin) - return; - } - -} - - - - -void MDBalancer::hit_inode(CInode *in, int type) -{ - // hit me - float me = in->popularity[MDS_POP_JUSTME].pop[type].hit(); - float nested = in->popularity[MDS_POP_NESTED].pop[type].hit(); - float curdom = 0; - float anydom = 0; - if (in->is_auth()) { - curdom = in->popularity[MDS_POP_CURDOM].pop[type].hit(); - anydom = in->popularity[MDS_POP_ANYDOM].pop[type].hit(); - } - - dout(20) << "hit_inode 
" << type << " pop " << me << " me, " - << nested << " nested, " - << curdom << " curdom, " - << anydom << " anydom" - << " on " << *in - << endl; - - // hit auth up to import - CDir *dir = in->get_parent_dir(); - if (dir) hit_dir(dir, type); -} - - -void MDBalancer::hit_dir(CDir *dir, int type) -{ - // hit me - float v = dir->popularity[MDS_POP_JUSTME].pop[type].hit(); - - // hit modify counter, if this was a modify - if (g_conf.num_mds > 2 && // FIXME >2 thing - !dir->inode->is_root() && // not root (for now at least) - dir->is_auth()) { - dout(20) << "hit_dir " << type << " pop " << v << " me " - << *dir << endl; - - // hash this dir? (later?) - if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) || - //(v > g_conf.mds_bal_hash_wr && type == META_POP_IWR) || - (v > g_conf.mds_bal_hash_wr && type == META_POP_DWR)) && - hash_queue.count(dir->ino()) == 0) { - dout(0) << "hit_dir " << type << " pop is " << v << ", putting in hash_queue: " << *dir << endl; - hash_queue.insert(dir->ino()); - } - - } - - hit_recursive(dir, type); -} - - - -void MDBalancer::hit_recursive(CDir *dir, int type) -{ - bool anydom = dir->is_auth(); - bool curdom = dir->is_auth(); - - float rd_adj = 0.0; - - // replicate? - float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm?? - - dout(20) << "hit_recursive " << type << " pop " << dir_pop << " curdom " << *dir << endl; - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].get(); - rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; - rd_adj /= 2.0; // temper somewhat - - dout(2) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl; - - dir->dir_rep = CDir::REP_ALL; - mds->mdcache->send_dir_updates(dir, true); - - dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj); - dir->popularity[MDS_POP_CURDOM].pop[META_POP_IRD].adjust(rd_adj); - } - - if (!dir->ino() != 1 && - dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(2) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; - - dir->dir_rep = CDir::REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - - - while (dir) { - CInode *in = dir->inode; - - dir->popularity[MDS_POP_NESTED].pop[type].hit(); - in->popularity[MDS_POP_NESTED].pop[type].hit(); - - if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_IRD].adjust(rd_adj); - - if (anydom) { - dir->popularity[MDS_POP_ANYDOM].pop[type].hit(); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(); - } - - if (curdom) { - dir->popularity[MDS_POP_CURDOM].pop[type].hit(); - in->popularity[MDS_POP_CURDOM].pop[type].hit(); - } - - if (dir->is_subtree_root()) - curdom = false; // end of auth domain, stop hitting auth counters. 
- dir = dir->inode->get_parent_dir(); - } -} - - -/* - * subtract off an exported chunk - */ -void MDBalancer::subtract_export(CDir *dir) -{ - meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - - bool in_domain = !dir->is_subtree_root(); - - while (true) { - CInode *in = dir->inode; - - in->popularity[MDS_POP_ANYDOM] -= curdom; - if (in_domain) in->popularity[MDS_POP_CURDOM] -= curdom; - - dir = in->get_parent_dir(); - if (!dir) break; - - if (dir->is_subtree_root()) in_domain = false; - - dir->popularity[MDS_POP_ANYDOM] -= curdom; - if (in_domain) dir->popularity[MDS_POP_CURDOM] -= curdom; - } -} - - -void MDBalancer::add_import(CDir *dir) -{ - meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - - bool in_domain = !dir->is_subtree_root(); - - while (true) { - CInode *in = dir->inode; - - in->popularity[MDS_POP_ANYDOM] += curdom; - if (in_domain) in->popularity[MDS_POP_CURDOM] += curdom; - - dir = in->get_parent_dir(); - if (!dir) break; - - if (dir->is_subtree_root()) in_domain = false; - - dir->popularity[MDS_POP_ANYDOM] += curdom; - if (in_domain) dir->popularity[MDS_POP_CURDOM] += curdom; - } - -} - - - - - - -void MDBalancer::show_imports(bool external) -{ - mds->mdcache->show_subtrees(); -} - - - -/* replicate? - - float dir_pop = dir->get_popularity(); - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - dout(5) << "replicating dir " << *in << " pop " << dir_pop << endl; - - dir->dir_rep = CDIR_REP_ALL; - mds->mdcache->send_dir_updates(dir); - } - - if (dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << endl; - - dir->dir_rep = CDIR_REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - -*/ diff --git a/branches/sage/pgs/mds/MDBalancer.h b/branches/sage/pgs/mds/MDBalancer.h deleted file mode 100644 index e6a9488e04f7b..0000000000000 --- a/branches/sage/pgs/mds/MDBalancer.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDBALANCER_H -#define __MDBALANCER_H - -#include -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - -#include "include/types.h" -#include "common/Clock.h" -#include "CInode.h" - - -class MDS; -class Message; -class MHeartbeat; -class CInode; -class Context; -class CDir; - -class MDBalancer { - protected: - MDS *mds; - int beat_epoch; - - utime_t last_heartbeat; - utime_t last_hash; - - // todo - set hash_queue; - - // per-epoch scatter/gathered info - hash_map mds_load; - hash_map mds_meta_load; - map > mds_import_map; - - // per-epoch state - double my_load, target_load; - map my_targets; - map imported; - map exported; - - double try_match(int ex, double& maxex, - int im, double& maxim); - double get_maxim(int im) { - return target_load - mds_meta_load[im] - imported[im]; - } - double get_maxex(int ex) { - return mds_meta_load[ex] - target_load - exported[ex]; - } - - public: - MDBalancer(MDS *m) : - mds(m), - beat_epoch(0) { } - - mds_load_t get_load(); - - int proc_message(Message *m); - - void send_heartbeat(); - void handle_heartbeat(MHeartbeat *m); - - void tick(); - - void do_hashing(); - - void export_empties(); - void do_rebalance(int beat); - void find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting); - - - void subtract_export(class CDir *ex); - void add_import(class CDir *im); - - void hit_inode(class CInode *in, int type=0); - void hit_dir(class CDir *dir, int type=0); - void hit_recursive(class CDir *dir, int type=0); - - - void show_imports(bool external=false); - -}; - - - -#endif diff --git a/branches/sage/pgs/mds/MDCache.cc b/branches/sage/pgs/mds/MDCache.cc deleted file mode 100644 index 028a7f7baa799..0000000000000 --- a/branches/sage/pgs/mds/MDCache.cc +++ /dev/null @@ -1,5541 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "MDCache.h" -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "Migrator.h" - -#include "MDSMap.h" - -#include "CInode.h" -#include "CDir.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "common/Logger.h" - -#include "osdc/Filer.h" - -#include "events/ESubtreeMap.h" -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EString.h" -#include "events/EPurgeFinish.h" -#include "events/EImportFinish.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSResolve.h" -#include "messages/MMDSResolveAck.h" -#include "messages/MMDSCacheRejoin.h" - -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MDirUpdate.h" -#include "messages/MCacheExpire.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "IdAllocator.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache " - - - - -MDCache::MDCache(MDS *m) -{ - mds = m; - migrator = new Migrator(mds, this); - // renamer = new Renamer(mds, this); - root = NULL; - stray = NULL; - lru.lru_set_max(g_conf.mds_cache_size); - lru.lru_set_midpoint(g_conf.mds_cache_mid); - - did_shutdown_log_cap = false; - shutdown_commits = 0; -} - -MDCache::~MDCache() -{ - delete migrator; - //delete renamer; -} - - - -void MDCache::log_stat(Logger *logger) -{ - if (get_root()) { - logger->set("popanyd", (int)get_root()->popularity[MDS_POP_ANYDOM].meta_load()); - logger->set("popnest", (int)get_root()->popularity[MDS_POP_NESTED].meta_load()); - } - logger->set("c", lru.lru_get_size()); - logger->set("cpin", lru.lru_get_num_pinned()); - logger->set("ctop", lru.lru_get_top()); - logger->set("cbot", lru.lru_get_bot()); - logger->set("cptail", lru.lru_get_pintail()); -} - - -// - -bool MDCache::shutdown() -{ - if (lru.lru_get_size() > 0) { - dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl; - //show_cache(); - show_subtrees(); - //dump(); - } - return true; -} - - -// ==================================================================== -// some inode functions - -CInode *MDCache::create_inode() -{ - CInode *in = new CInode(this); - - // zero - memset(&in->inode, 0, sizeof(inode_t)); - - // assign ino - in->inode.ino = mds->idalloc->alloc_id(); - - in->inode.nlink = 1; // FIXME - - in->inode.layout = g_OSD_FileLayout; - - add_inode(in); // add - return in; -} - - -void MDCache::add_inode(CInode *in) -{ - // add to lru, inode map - assert(inode_map.count(in->ino()) == 0); // should be no dup inos! - inode_map[ in->ino() ] = in; -} - -void MDCache::remove_inode(CInode *o) -{ - dout(14) << "remove_inode " << *o << endl; - - if (o->get_parent_dn()) { - // FIXME: multiple parents? - CDentry *dn = o->get_parent_dn(); - assert(!dn->is_dirty()); - dn->dir->unlink_inode(dn); // leave dentry ... FIXME? 
- } - - // remove from inode map - inode_map.erase(o->ino()); - - // delete it - delete o; - - if (o == root) root = 0; - if (o == stray) stray = 0; -} - - - -CInode *MDCache::create_root_inode() -{ - CInode *root = new CInode(this); - memset(&root->inode, 0, sizeof(inode_t)); - root->inode.ino = MDS_INO_ROOT; - - // make it up (FIXME) - root->inode.mode = 0755 | INODE_MODE_DIR; - root->inode.size = 0; - root->inode.ctime = - root->inode.mtime = g_clock.now(); - - root->inode.nlink = 1; - root->inode.layout = g_OSD_MDDirLayout; - - root->force_auth = pair(0, CDIR_AUTH_UNKNOWN); - - set_root( root ); - add_inode( root ); - - return root; -} - - -void MDCache::open_root(Context *c) -{ - int whoami = mds->get_nodeid(); - - // open root inode - if (whoami == 0) { - // i am root inode - CInode *root = create_root_inode(); - - // root directory too - CDir *dir = root->get_or_open_dirfrag(this, frag_t()); - adjust_subtree_auth(dir, 0); - dir->dir_rep = CDir::REP_ALL; //NONE; - - show_subtrees(); - - if (c) { - c->finish(0); - delete c; - } - } else { - // request inode from root mds - if (waiting_for_root.empty()) { - dout(7) << "discovering root" << endl; - - filepath want; - MDiscover *req = new MDiscover(whoami, - MDS_INO_ROOT, - want, - false); // there _is_ no base dir for the root inode - mds->send_message_mds(req, 0, MDS_PORT_CACHE); - } else { - dout(7) << "waiting for root" << endl; - } - - // wait - waiting_for_root.push_back(c); - - } -} - -CInode *MDCache::create_stray_inode(int whose) -{ - if (whose < 0) whose = mds->get_nodeid(); - stray = new CInode(this, whose == mds->get_nodeid()); - memset(&stray->inode, 0, sizeof(inode_t)); - stray->inode.ino = MDS_INO_STRAY(whose); - - // make it up (FIXME) - stray->inode.mode = 0755 | INODE_MODE_DIR; - stray->inode.size = 0; - stray->inode.ctime = - stray->inode.mtime = g_clock.now(); - - stray->inode.nlink = 1; - stray->inode.layout = g_OSD_MDDirLayout; - - add_inode( stray ); - - return stray; -} - -void MDCache::open_local_stray() -{ - create_stray_inode(); - CDir *dir = stray->get_or_open_dirfrag(this, frag_t()); - adjust_subtree_auth(dir, mds->get_nodeid()); -} - -void MDCache::open_foreign_stray(int who, Context *c) -{ - inodeno_t ino = MDS_INO_STRAY(who); - dout(10) << "open_foreign_stray mds" << who << " " << ino << endl; - assert(!have_inode(ino)); - - // discover - filepath want; - MDiscover *req = new MDiscover(mds->get_nodeid(), - ino, - want, - false); // there _is_ no base dir for the stray inode - mds->send_message_mds(req, who, MDS_PORT_CACHE); - - // wait - waiting_for_stray[ino].push_back(c); -} - - -CDentry *MDCache::get_or_create_stray_dentry(CInode *in) -{ - string straydname; - in->name_stray_dentry(straydname); - frag_t fg = stray->pick_dirfrag(straydname); - - CDir *straydir = stray->get_or_open_dirfrag(this, fg); - - CDentry *straydn = straydir->lookup(straydname); - if (!straydn) - straydn = straydir->add_dentry(straydname, 0); - - return straydn; -} - - - -MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info) -{ - // inode? - if (info.ino) - return get_inode(info.ino); - - // dir or dentry. 
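open_root() and open_foreign_stray() above share one pattern: if the inode is not local, issue a single discover to its authority and park the callback on a per-ino waiter list, to be run when the reply populates the cache. The sketch below is a standalone reduction of that waiter-list pattern and not the Ceph interface: WaiterTable, wait_for and arrived are invented names, and std::function stands in for Context. (get_object() resumes just below with the dirfrag and dentry cases.)

#include <cstdio>
#include <functional>
#include <list>
#include <map>

// Park callbacks until the object identified by 'ino' shows up; the first
// waiter is the signal to actually issue the (single) discover request.
struct WaiterTable {
  std::map<unsigned long, std::list<std::function<void(int)>>> waiting;

  void wait_for(unsigned long ino, std::function<void(int)> fin,
                const std::function<void(unsigned long)>& send_discover) {
    if (waiting[ino].empty())
      send_discover(ino);                // only the first waiter triggers I/O
    waiting[ino].push_back(std::move(fin));
  }

  void arrived(unsigned long ino) {      // discover reply came back
    for (auto& fin : waiting[ino]) fin(0);
    waiting.erase(ino);
  }
};

int main() {
  WaiterTable wt;
  auto discover = [](unsigned long ino) { std::printf("discover %#lx\n", ino); };
  wt.wait_for(1, [](int r) { std::printf("waiter a woke, r=%d\n", r); }, discover);
  wt.wait_for(1, [](int r) { std::printf("waiter b woke, r=%d\n", r); }, discover);
  wt.arrived(1);                         // runs both waiters; discover sent once
  return 0;
}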
- CDir *dir = get_dirfrag(info.dirfrag); - if (!dir) return 0; - - if (info.dname.length()) - return dir->lookup(info.dname); - else - return dir; -} - - - - -// ==================================================================== -// subtree management - -void MDCache::list_subtrees(list& ls) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) - ls.push_back(p->first); -} - -/* - * adjust the dir_auth of a subtree. - * merge with parent and/or child subtrees, if is it appropriate. - * merge can ONLY happen if both parent and child have unambiguous auth. - */ -void MDCache::adjust_subtree_auth(CDir *dir, pair auth) -{ - dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir << endl; - - show_subtrees(); - - CDir *root; - if (dir->ino() < MDS_INO_BASE) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) - subtrees[root].clear(); - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << endl; - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. - dout(10) << " new subtree at " << *dir << endl; - assert(subtrees.count(dir) == 0); - subtrees[dir].clear(); // create empty subtree bounds list for me. - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - next++; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << endl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. - root = dir; - - eval_subtree_root(dir); - } - - // adjust export pins - adjust_export_state(dir); - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - adjust_export_state(*p); - - show_subtrees(); -} - - -/* - * any "export" point must be pinned in cache to ensure a proper - * chain of delegation. we do this by pinning when a dir is nonauth - * but the inode is auth. - * - * import points don't need to be pinned the same way simply because the - * exporting mds is pinning the exprot (as above) thus the dir is - * always open on the importer. - */ -void MDCache::adjust_export_state(CDir *dir) -{ - // be auth bit agnostic, so that we work during recovery - // (before recalc_auth_bits) - if (dir->authority().first != mds->get_nodeid() && - dir->inode->authority().first == mds->get_nodeid()) { - // export. - if (!dir->state_test(CDir::STATE_EXPORT)) { - dout(10) << "adjust_export_state pinning new export " << *dir << endl; - dir->state_set(CDir::STATE_EXPORT); - dir->get(CDir::PIN_EXPORT); - } - } - else { - // not export. 
- if (dir->state_test(CDir::STATE_EXPORT)) { - dout(10) << "adjust_export_state unpinning old export " << *dir << endl; - dir->state_clear(CDir::STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } - } -} - -void MDCache::try_subtree_merge(CDir *dir) -{ - dout(7) << "try_subtree_merge " << *dir << endl; - assert(subtrees.count(dir)); - set oldbounds = subtrees[dir]; - - // try merge at my root - try_subtree_merge_at(dir); - - // try merge at my old bounds - for (set::iterator p = oldbounds.begin(); - p != oldbounds.end(); - ++p) - try_subtree_merge_at(*p); -} - -void MDCache::try_subtree_merge_at(CDir *dir) -{ - dout(10) << "try_subtree_merge_at " << *dir << endl; - assert(subtrees.count(dir)); - - // merge with parent? - CDir *parent = dir; - if (dir->ino() >= MDS_INO_BASE) - parent = get_subtree_root(dir->get_parent_dir()); - - if (parent != dir && // we have a parent, - parent->dir_auth == dir->dir_auth && // auth matches, - dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous, - !dir->state_test(CDir::STATE_EXPORTBOUND)) { // not an exportbound, - // merge with parent. - dout(10) << " subtree merge at " << *dir << endl; - dir->set_dir_auth(CDIR_AUTH_DEFAULT); - - // move our bounds under the parent - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - subtrees[parent].insert(*p); - - // we are no longer a subtree or bound - subtrees.erase(dir); - subtrees[parent].erase(dir); - - eval_subtree_root(dir); - } - - show_subtrees(15); -} - -void MDCache::eval_subtree_root(CDir *dir) -{ - // evaluate subtree inode dirlock? - // (we should scatter the dirlock on subtree bounds) - if (dir->inode->is_auth() && - dir->inode->dirlock.is_stable()) { - // force the issue a bit - if (!dir->inode->is_frozen()) - mds->locker->scatter_eval(&dir->inode->dirlock); - else - mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned ** - } -} - - -void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth) -{ - dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir - << " bounds " << bounds - << endl; - - show_subtrees(); - - CDir *root; - if (dir->ino() < MDS_INO_BASE) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) - subtrees[root].clear(); - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << endl; - - pair oldauth = dir->authority(); - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. - dout(10) << " new subtree at " << *dir << endl; - assert(subtrees.count(dir) == 0); - subtrees[dir].clear(); // create empty subtree bounds list for me. - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - next++; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << endl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. - root = dir; - } - - // verify/adjust bounds. - // - these may be new, or - // - beneath existing ambiguous bounds (which will be collapsed), - // - but NOT beneath unambiguous bounds. 
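The merge test in try_subtree_merge_at() above — collapse a subtree boundary only when the parent subtree has the same, unambiguous authority and the dir is not pinned as an export bound, then hand the child's bounds to the parent — can be sketched over plain ids. SubtreeMap and merge_at below are invented names for the sketch, and the ambiguity and export-bound checks are reduced to a single auth comparison; this is not the MDCache interface.

#include <cstdio>
#include <map>
#include <set>

// Subtree bookkeeping reduced to ids: each subtree root maps to the set of
// bounds nested beneath it.  merge_at() removes a root whose auth matches
// its parent's, re-parenting its bounds -- the core of try_subtree_merge_at().
struct SubtreeMap {
  std::map<int, std::set<int>> bounds;   // root id -> bound ids
  std::map<int, int> auth;               // root id -> owning mds
  std::map<int, int> parent;             // root id -> enclosing root id

  void merge_at(int dir) {
    auto p = parent.find(dir);
    if (p == parent.end()) return;       // no parent subtree: nothing to merge with
    int par = p->second;
    if (auth[par] != auth[dir]) return;  // auth differs: keep the boundary
    bounds[par].insert(bounds[dir].begin(), bounds[dir].end());
    bounds[par].erase(dir);              // no longer a bound of the parent
    bounds.erase(dir);                   // no longer a subtree root
  }
};

int main() {
  SubtreeMap s;
  s.bounds[1] = {2};  s.auth[1] = 0;                  // root subtree 1, bounded by 2
  s.bounds[2] = {3};  s.auth[2] = 0;  s.parent[2] = 1;
  s.merge_at(2);                                      // same auth -> collapse boundary
  std::printf("subtree 1 now has %zu bound(s)\n",     // prints 1: bound 3 moved up
              s.bounds[1].size());
  return 0;
}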
- for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bound = *p; - - // new bound? - if (subtrees[dir].count(bound) == 0) { - if (get_subtree_root(bound) == dir) { - dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << endl; - adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound. - } - else { - dout(10) << " want bound " << *bound << endl; - // make sure it's nested beneath ambiguous subtree(s) - while (1) { - CDir *t = get_subtree_root(bound->get_parent_dir()); - if (t == dir) break; - while (subtrees[dir].count(t) == 0) - t = get_subtree_root(t->get_parent_dir()); - dout(10) << " swallowing intervening subtree at " << *t << endl; - adjust_subtree_auth(t, auth); - try_subtree_merge_at(t); - } - } - } - else { - dout(10) << " already have bound " << *bound << endl; - } - } - // merge stray bounds? - set::iterator p = subtrees[dir].begin(); - while (p != subtrees[dir].end()) { - set::iterator n = p; - n++; - if (bounds.count(*p) == 0) { - CDir *stray = *p; - dout(10) << " swallowing extra subtree at " << *stray << endl; - adjust_subtree_auth(stray, auth); - try_subtree_merge_at(stray); - } - p = n; - } - - // adjust export pins - adjust_export_state(dir); - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - adjust_export_state(*p); - - // bound should now match. - verify_subtree_bounds(dir, bounds); - - show_subtrees(); -} - -void MDCache::adjust_bounded_subtree_auth(CDir *dir, list& bound_dfs, pair auth) -{ - dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir - << " bound_dfs " << bound_dfs - << endl; - - // make bounds list - set bounds; - for (list::iterator p = bound_dfs.begin(); - p != bound_dfs.end(); - ++p) { - CDir *bd = get_dirfrag(*p); - if (bd) - bounds.insert(bd); - } - - adjust_bounded_subtree_auth(dir, bounds, auth); -} - - - -CDir *MDCache::get_subtree_root(CDir *dir) -{ - // find the underlying dir that delegates (or is about to delegate) auth - while (true) { - if (dir->is_subtree_root()) - return dir; - dir = dir->get_parent_dir(); - if (!dir) - return 0; // none - } -} - -void MDCache::remove_subtree(CDir *dir) -{ - dout(10) << "remove_subtree " << *dir << endl; - assert(subtrees.count(dir)); - assert(subtrees[dir].empty()); - subtrees.erase(dir); - if (dir->get_parent_dir()) { - CDir *p = get_subtree_root(dir->get_parent_dir()); - assert(subtrees[p].count(dir)); - subtrees[p].erase(dir); - } -} - -void MDCache::get_subtree_bounds(CDir *dir, set& bounds) -{ - assert(subtrees.count(dir)); - bounds = subtrees[dir]; -} - -void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set& bounds) -{ - if (subtrees.count(dir)) { - // just copy them, dir is a subtree. - get_subtree_bounds(dir, bounds); - } else { - // find them - CDir *root = get_subtree_root(dir); - for (set::iterator p = subtrees[root].begin(); - p != subtrees[root].end(); - ++p) { - CDir *t = *p; - while (t != root) { - t = t->get_parent_dir(); - assert(t); - if (t == dir) { - bounds.insert(*p); - continue; - } - } - } - } -} - -void MDCache::verify_subtree_bounds(CDir *dir, const set& bounds) -{ - // for debugging only. 
- assert(subtrees.count(dir)); - if (bounds != subtrees[dir]) { - dout(0) << "verify_subtree_bounds failed" << endl; - set b = bounds; - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - if (bounds.count(*p)) { - b.erase(*p); - continue; - } - dout(0) << " missing bound " << **p << endl; - } - for (set::iterator p = b.begin(); - p != b.end(); - ++p) - dout(0) << " extra bound " << **p << endl; - } - assert(bounds == subtrees[dir]); -} - -void MDCache::verify_subtree_bounds(CDir *dir, const list& bounds) -{ - // for debugging only. - assert(subtrees.count(dir)); - - // make sure that any bounds i do have are properly noted as such. - int failed = 0; - for (list::const_iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = get_dirfrag(*p); - if (!bd) continue; - if (subtrees[dir].count(bd) == 0) { - dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << endl; - failed++; - } - } - assert(failed == 0); -} - -void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir) -{ - dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << endl; - - //show_subtrees(); - - list dfls; - diri->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - - dout(10) << "dirfrag " << *dir << endl; - CDir *oldparent = get_subtree_root(olddir); - dout(10) << " old parent " << *oldparent << endl; - CDir *newparent = get_subtree_root(diri->get_parent_dir()); - dout(10) << " new parent " << *newparent << endl; - - if (oldparent == newparent) { - dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << endl; - continue; - } - - if (dir->is_subtree_root()) { - // children are fine. change parent. - dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << endl; - assert(subtrees[oldparent].count(dir)); - subtrees[oldparent].erase(dir); - assert(subtrees.count(newparent)); - subtrees[newparent].insert(dir); - } else { - // mid-subtree. - - // see if any old bounds move to the new parent. - list tomove; - for (set::iterator p = subtrees[oldparent].begin(); - p != subtrees[oldparent].end(); - ++p) { - CDir *bound = *p; - CDir *broot = get_subtree_root(bound->get_parent_dir()); - if (broot != oldparent) { - assert(broot == newparent); - tomove.push_back(bound); - } - } - for (list::iterator p = tomove.begin(); p != tomove.end(); ++p) { - CDir *bound = *p; - dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << endl; - subtrees[oldparent].erase(bound); - subtrees[newparent].insert(bound); - } - - // did auth change? - if (oldparent->authority() != newparent->authority()) - adjust_subtree_auth(dir, oldparent->authority()); // caller is responsible for *diri. - } - } - - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - - // un-force dir to subtree root - if (dir->dir_auth == pair(dir->dir_auth.first, dir->dir_auth.first)) - adjust_subtree_auth(dir, dir->dir_auth.first); - } - - show_subtrees(); -} - - -void MDCache::get_fullauth_subtrees(set& s) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_auth()) - s.insert(root); - } -} -void MDCache::get_auth_subtrees(set& s) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_auth()) - s.insert(root); - } -} - - -// count. 
- -int MDCache::num_subtrees() -{ - return subtrees.size(); -} - -int MDCache::num_subtrees_fullauth() -{ - int n = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_auth()) - n++; - } - return n; -} - -int MDCache::num_subtrees_fullnonauth() -{ - int n = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_nonauth()) - n++; - } - return n; -} - - - - - - - -// ==================================================================== -// import map, recovery - -/* - * take note of where we write import_maps in the log, as we need - * to take care not to expire them until an updated map is safely flushed. - */ -class C_MDS_WroteSubtreeMap : public Context { - MDCache *mdcache; - off_t end_off; -public: - C_MDS_WroteSubtreeMap(MDCache *mc, off_t eo) : mdcache(mc), end_off(eo) { } - void finish(int r) { - mdcache->_logged_subtree_map(end_off); - } -}; - - -void MDCache::log_subtree_map(Context *onsync) -{ - dout(10) << "log_subtree_map " << num_subtrees() << " subtrees, " - << num_subtrees_fullauth() << " fullauth" - << endl; - - ESubtreeMap *le = new ESubtreeMap; - - // include all auth subtrees, and their bounds. - // and a spanning tree to tie it to the root. - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (!dir->is_auth()) continue; - - dout(15) << " subtree " << *dir << endl; - le->subtrees[dir->dirfrag()].clear(); - le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT); - le->metablob.add_dir(dir, false); - - // bounds - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDir *bound = *q; - dout(15) << " subtree bound " << *bound << endl; - le->subtrees[dir->dirfrag()].push_back(bound->dirfrag()); - le->metablob.add_dir_context(bound, EMetaBlob::TO_ROOT); - le->metablob.add_dir(bound, false); - } - } - - //le->metablob.print(cout); - - Context *fin = new C_MDS_WroteSubtreeMap(this, mds->mdlog->get_write_pos()); - mds->mdlog->writing_subtree_map = true; - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(fin); - if (onsync) - mds->mdlog->wait_for_sync(onsync); -} - -void MDCache::_logged_subtree_map(off_t off) -{ - dout(10) << "_logged_subtree_map at " << off << endl; - mds->mdlog->last_subtree_map = off; - mds->mdlog->writing_subtree_map = false; - - list ls; - mds->mdlog->take_subtree_map_expire_waiters(ls); - mds->queue_waiters(ls); -} - - -void MDCache::send_resolve(int who) -{ - if (migrator->is_exporting()) - send_resolve_later(who); - else - send_resolve_now(who); -} - -void MDCache::send_resolve_later(int who) -{ - dout(10) << "send_resolve_later to mds" << who << endl; - wants_resolve.insert(who); -} - -void MDCache::maybe_send_pending_resolves() -{ - if (wants_resolve.empty()) - return; // nothing to send. - - // only if it's appropriate! - if (migrator->is_exporting() || - migrator->is_importing()) { - dout(7) << "maybe_send_pending_resolves waiting, imports/exports still in progress" << endl; - return; // not now - } - - // ok, send them. 
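The defer/flush shape of send_resolve_later() and maybe_send_pending_resolves() above — park the peer while the migrator is busy, send everything once it goes idle — looks like the following in isolation. PendingSet, busy and send_now are invented names for this standalone sketch; the real code keys off migrator->is_importing()/is_exporting().

#include <cstdio>
#include <functional>
#include <set>

// Defer a per-peer action while some background activity is in progress,
// then flush everything once it quiesces.
struct PendingSet {
  std::set<int> wants;
  std::function<bool()> busy;            // e.g. "migrator is importing/exporting"
  std::function<void(int)> send_now;

  void send(int who) {
    if (busy()) wants.insert(who);       // not now; remember the peer
    else send_now(who);
  }
  void maybe_flush() {
    if (wants.empty() || busy()) return;
    for (int who : wants) send_now(who);
    wants.clear();
  }
};

int main() {
  bool exporting = true;
  PendingSet p;
  p.busy = [&] { return exporting; };
  p.send_now = [](int who) { std::printf("resolve -> mds%d\n", who); };
  p.send(1);              // deferred: export still in progress
  exporting = false;
  p.maybe_flush();        // now it goes out
  return 0;
}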
- for (set::iterator p = wants_resolve.begin(); - p != wants_resolve.end(); - p++) - send_resolve_now(*p); - wants_resolve.clear(); -} - - -class C_MDC_SendResolve : public Context { - MDCache *mdc; - int who; -public: - C_MDC_SendResolve(MDCache *c, int w) : mdc(c), who(w) { } - void finish(int r) { - mdc->send_resolve_now(who); - } -}; - -void MDCache::send_resolve_now(int who) -{ - dout(10) << "send_resolve_now to mds" << who << endl; - MMDSResolve *m = new MMDSResolve; - - show_subtrees(); - - // known - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) { - CDir *dir = p->first; - - // only our subtrees - if (dir->authority().first != mds->get_nodeid()) - continue; - - if (migrator->is_importing(dir->dirfrag())) { - // ambiguous (mid-import) - m->add_ambiguous_import(dir->dirfrag(), - migrator->get_import_bound_inos(dir->dirfrag())); - } else { - // not ambiguous. - m->add_subtree(dir->dirfrag()); - - // bounds too - for (set::iterator q = subtrees[dir].begin(); - q != subtrees[dir].end(); - ++q) { - CDir *bound = *q; - m->add_subtree_bound(dir->dirfrag(), bound->dirfrag()); - } - } - } - - // ambiguous - for (map >::iterator p = my_ambiguous_imports.begin(); - p != my_ambiguous_imports.end(); - ++p) - m->add_ambiguous_import(p->first, p->second); - - - // list prepare requests lacking a commit - // [active survivor] - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - if (p->second->is_slave() && p->second->slave_to_mds == who) { - dout(10) << " including uncommitted " << *p->second << endl; - m->add_slave_request(p->first); - } - } - // [resolving] - if (uncommitted_slave_updates.count(who)) { - for (map::iterator p = uncommitted_slave_updates[who].begin(); - p != uncommitted_slave_updates[who].end(); - ++p) { - dout(10) << " including uncommitted " << p->first << endl; - m->add_slave_request(p->first); - } - need_resolve_ack.insert(who); - } - - - // send - mds->send_message_mds(m, who, MDS_PORT_CACHE); -} - - -void MDCache::handle_mds_failure(int who) -{ - dout(7) << "handle_mds_failure mds" << who << endl; - - // make note of recovery set - mds->mdsmap->get_recovery_mds_set(recovery_set); - recovery_set.erase(mds->get_nodeid()); - dout(1) << "my recovery peers will be " << recovery_set << endl; - - // adjust my recovery lists - wants_resolve.erase(who); // MDS will ask again - got_resolve.erase(who); // i'll get another. - rejoin_ack_gather.erase(who); // i'll need/get another. - - // adjust subtree auth - list subs; - list_subtrees(subs); - for (list::iterator p = subs.begin(); - p != subs.end(); - ++p) { - CDir *dir = *p; - // only if we are a _bystander_. - if (dir->dir_auth.first == who && - dir->dir_auth.second >= 0 && - dir->dir_auth.second != mds->get_nodeid()) { - dout(7) << "disambiguating auth for " << *dir << endl; - adjust_subtree_auth(dir, dir->dir_auth.second); - try_subtree_merge(dir); - } - else if (dir->dir_auth.second == who && - dir->dir_auth.first != mds->get_nodeid()) { - dout(7) << "disambiguating auth for " << *dir << endl; - adjust_subtree_auth(dir, dir->dir_auth.first); - try_subtree_merge(dir); - } - } - - // tell the migrator too. - migrator->handle_mds_failure_or_stop(who); - - // kick any dir discovers that are waiting - hash_map >::iterator p = dir_discovers.begin(); - while (p != dir_discovers.end()) { - hash_map >::iterator n = p; - n++; - - // waiting on this mds? 
- if (p->second.count(who)) { - CInode *in = get_inode(p->first); - assert(in); - - // take waiters - list waiters; - in->take_waiting(CInode::WAIT_DIR, waiters); - mds->queue_waiters(waiters); - dout(10) << "kicking WAIT_DIR on " << *in << endl; - - // remove from mds list - p->second.erase(who); - if (p->second.empty()) - dir_discovers.erase(p); - } - p = n; - } - - // clean up any requests slave to/from this node - list finish; - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - // slave to the failed node? - if (p->second->slave_to_mds == who) { - if (p->second->slave_did_prepare()) { - dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << endl; - } else { - dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << endl; - if (p->second->slave_request) - p->second->aborted = true; - else - finish.push_back(p->second); - } - } - - // failed node is slave? - if (!p->second->committing) { - if (p->second->witnessed.count(who)) { - dout(10) << " master request " << *p->second << " no longer witnessed by slave mds" << who - << endl; - // discard this peer's prepare (if any) - p->second->witnessed.erase(who); - } - - if (p->second->waiting_on_slave.count(who)) { - dout(10) << " master request " << *p->second << " waiting for slave mds" << who - << " to recover" << endl; - // retry request when peer recovers - p->second->waiting_on_slave.erase(who); - mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); - } - } - } - - while (!finish.empty()) { - dout(10) << "cleaning up slave request " << *finish.front() << endl; - request_finish(finish.front()); - finish.pop_front(); - } - - show_subtrees(); -} - -/* - * handle_mds_recovery - called on another node's transition - * from resolve -> active. - */ -void MDCache::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << endl; - - list waiters; - - // wake up any waiters in their subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - - if (dir->authority().first != who) continue; - assert(!dir->is_auth()); - - // wake any waiters - list q; - q.push_back(dir); - - while (!q.empty()) { - CDir *d = q.front(); - q.pop_front(); - d->take_waiting(CDir::WAIT_ANY, waiters); - - // inode waiters too - for (CDir_map_t::iterator p = d->items.begin(); - p != d->items.end(); - ++p) { - CDentry *dn = p->second; - if (dn->is_primary()) { - dn->get_inode()->take_waiting(CInode::WAIT_ANY, waiters); - - // recurse? - list ls; - dn->get_inode()->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - CDir *subdir = *p; - if (!subdir->is_subtree_root()) - q.push_back(subdir); - } - } - } - } - } - - // queue them up. - mds->queue_waiters(waiters); -} - -void MDCache::set_recovery_set(set& s) -{ - dout(7) << "set_recovery_set " << s << endl; - recovery_set = s; -} - - -/* - * during resolve state, we share resolves to determine who - * is authoritative for which trees. we expect to get an resolve - * from _everyone_ in the recovery_set (the mds cluster at the time of - * the first failure). - */ -void MDCache::handle_resolve(MMDSResolve *m) -{ - dout(7) << "handle_resolve from " << m->get_source() << endl; - int from = m->get_source().num(); - - // ambiguous slave requests? 
- if (!m->slave_requests.empty()) { - MMDSResolveAck *ack = new MMDSResolveAck; - - for (list::iterator p = m->slave_requests.begin(); - p != m->slave_requests.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - // COMMIT - dout(10) << " ambiguous slave request " << *p << " will COMMIT" << endl; - ack->add_commit(*p); - } else { - // ABORT - dout(10) << " ambiguous slave request " << *p << " will ABORT" << endl; - ack->add_abort(*p); - } - } - - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - } - - // update my dir_auth values - for (map >::iterator pi = m->subtrees.begin(); - pi != m->subtrees.end(); - ++pi) { - CDir *im = get_dirfrag(pi->first); - if (im) { - adjust_bounded_subtree_auth(im, pi->second, from); - try_subtree_merge(im); - } - } - - // am i a surviving ambiguous importer? - /* - * note: it would be cleaner to do this check before updating our own - * subtree map.. then the import_finish or _reverse could operate on an - * un-munged subtree map. but... checking for import completion against - * the provided resolve isn't easy. so, we skip audit checks in these - * functions. - */ - if (mds->is_active() || mds->is_stopping()) { - // check for any import success/failure (from this node) - map >::iterator p = my_ambiguous_imports.begin(); - while (p != my_ambiguous_imports.end()) { - map >::iterator n = p; - n++; - CDir *dir = get_dirfrag(p->first); - assert(dir); - dout(10) << "checking ambiguous import " << *dir << endl; - if (migrator->is_importing(dir->dirfrag())) { - assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); - if (migrator->get_import_peer(dir->dirfrag()) == from) { - if (dir->is_ambiguous_dir_auth()) { - dout(7) << "ambiguous import succeeded on " << *dir << endl; - migrator->import_finish(dir, true); // don't wait for log flush - } else { - dout(7) << "ambiguous import failed on " << *dir << endl; - migrator->import_reverse(dir, false); // don't adjust dir_auth. - } - my_ambiguous_imports.erase(p); - } - } - p = n; - } - } - - show_subtrees(); - - - // resolving? - if (mds->is_resolve()) { - // note ambiguous imports too - for (map >::iterator pi = m->ambiguous_imports.begin(); - pi != m->ambiguous_imports.end(); - ++pi) { - dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << endl; - other_ambiguous_imports[from][pi->first].swap( pi->second ); - } - - // did i get them all? 
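
The start of handle_resolve() above answers a recovering peer's uncommitted slave requests: anything the surviving master already recorded as completed gets a COMMIT, everything else an ABORT. A compact sketch of that decision, with a plain std::set and made-up request ids standing in for the client request map:

#include <iostream>
#include <list>
#include <set>

int main() {
  std::set<long> completed;        // requests the survivor's client map says finished
  completed.insert(1001);          // hypothetical request ids

  std::list<long> slave_requests;  // ids reported in the recovering peer's resolve
  slave_requests.push_back(1001);
  slave_requests.push_back(1002);

  for (std::list<long>::iterator p = slave_requests.begin();
       p != slave_requests.end(); ++p) {
    if (completed.count(*p))
      std::cout << "slave request " << *p << " -> COMMIT" << std::endl;
    else
      std::cout << "slave request " << *p << " -> ABORT" << std::endl;
  }
  return 0;
}
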
- got_resolve.insert(from); - - maybe_resolve_finish(); - } - - delete m; -} - -void MDCache::maybe_resolve_finish() -{ - if (got_resolve != recovery_set) { - dout(10) << "still waiting for more resolves, got (" << got_resolve - << "), need (" << recovery_set << ")" << endl; - } - else if (!need_resolve_ack.empty()) { - dout(10) << "still waiting for resolve_ack from (" << need_resolve_ack << ")" << endl; - } - else { - dout(10) << "got all import maps, resolve_acks, done resolving subtrees" << endl; - disambiguate_imports(); - recalc_auth_bits(); - trim_non_auth(); - - mds->resolve_done(); - } -} - -void MDCache::handle_resolve_ack(MMDSResolveAck *ack) -{ - dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << endl; - int from = ack->get_source().num(); - - for (list::iterator p = ack->commit.begin(); - p != ack->commit.end(); - ++p) { - dout(10) << " commit on slave " << *p << endl; - - if (mds->is_resolve()) { - // replay - assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from][*p].replay(mds); - uncommitted_slave_updates[from].erase(*p); - // log commit - mds->mdlog->submit_entry(new ESlaveUpdate("unknown", *p, from, ESlaveUpdate::OP_COMMIT)); - } else { - MDRequest *mdr = request_get(*p); - assert(mdr->slave_request == 0); // shouldn't be doing anything! - request_finish(mdr); - } - } - - for (list::iterator p = ack->abort.begin(); - p != ack->abort.end(); - ++p) { - dout(10) << " abort on slave " << *p << endl; - - if (mds->is_resolve()) { - assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from].erase(*p); - mds->mdlog->submit_entry(new ESlaveUpdate("unknown", *p, from, ESlaveUpdate::OP_ROLLBACK)); - } else { - MDRequest *mdr = request_get(*p); - if (mdr->slave_commit) { - mdr->slave_commit->finish(-1); - delete mdr->slave_commit; - mdr->slave_commit = 0; - } - if (mdr->slave_request) - mdr->aborted = true; - else - request_finish(mdr); - } - } - - need_resolve_ack.erase(from); - - if (mds->is_resolve()) - maybe_resolve_finish(); - - delete ack; -} - - - -void MDCache::disambiguate_imports() -{ - dout(10) << "disambiguate_imports" << endl; - - // FIXME what about surviving bystanders - - // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.end(); - ++p) { - int who = p->first; - dout(10) << "ambiguous imports for mds" << who << endl; - - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " ambiguous import " << q->first << " bounds " << q->second << endl; - CDir *dir = get_dirfrag(q->first); - if (!dir) continue; - - if (dir->authority().first == CDIR_AUTH_UNKNOWN) { - dout(10) << "mds" << who << " did import " << *dir << endl; - adjust_bounded_subtree_auth(dir, q->second, who); - try_subtree_merge(dir); - } else { - dout(10) << "mds" << who << " did not import " << *dir << endl; - } - } - } - other_ambiguous_imports.clear(); - - // my ambiguous imports - while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); - - CDir *dir = get_dirfrag(q->first); - if (!dir) continue; - - if (dir->authority().first != CDIR_AUTH_UNKNOWN) { - dout(10) << "ambiguous import auth known, must not be me " << *dir << endl; - cancel_ambiguous_import(q->first); - mds->mdlog->submit_entry(new EImportFinish(dir, false)); - } else { - dout(10) << "ambiguous import auth unknown, must be me " << *dir << endl; - finish_ambiguous_import(q->first); - mds->mdlog->submit_entry(new 
EImportFinish(dir, true)); - } - } - assert(my_ambiguous_imports.empty()); - - show_subtrees(); -} - - -void MDCache::add_ambiguous_import(dirfrag_t base, list& bounds) -{ - assert(my_ambiguous_imports.count(base) == 0); - my_ambiguous_imports[base].swap( bounds ); -} - - -void MDCache::add_ambiguous_import(CDir *base, const set& bounds) -{ - // make a list - list binos; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) - binos.push_back((*p)->dirfrag()); - - // note: this can get called twice if the exporter fails during recovery - if (my_ambiguous_imports.count(base->dirfrag())) - my_ambiguous_imports.erase(base->dirfrag()); - - add_ambiguous_import(base->dirfrag(), binos); -} - -void MDCache::cancel_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - dout(10) << "cancel_ambiguous_import " << df - << " bounds " << my_ambiguous_imports[df] - << endl; - my_ambiguous_imports.erase(df); -} - -void MDCache::finish_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - list bound_inos; - bound_inos.swap(my_ambiguous_imports[df]); - my_ambiguous_imports.erase(df); - - dout(10) << "finish_ambiguous_import " << df - << " bounds " << bound_inos - << endl; - CDir *dir = get_dirfrag(df); - assert(dir); - - // adjust dir_auth, import maps - adjust_bounded_subtree_auth(dir, bound_inos, mds->get_nodeid()); - try_subtree_merge(dir); -} - - -/** recalc_auth_bits() - * once subtree auth is disambiguated, we need to adjust all the - * auth and dirty bits in our cache before moving on. - */ -void MDCache::recalc_auth_bits() -{ - dout(7) << "recalc_auth_bits" << endl; - - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - list dfq; // dirfrag queue - dfq.push_back(p->first); - - bool auth = p->first->authority().first == mds->get_nodeid(); - dout(10) << " subtree auth=" << auth << " for " << *p->first << endl; - - while (!dfq.empty()) { - CDir *dir = dfq.front(); - dfq.pop_front(); - - // dir - if (auth) - dir->state_set(CDir::STATE_AUTH); - else { - dir->state_set(CDir::STATE_REJOINING); - dir->state_clear(CDir::STATE_AUTH); - if (dir->is_dirty()) - dir->mark_clean(); - } - - // dentries in this dir - for (map::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - // dn - CDentry *dn = q->second; - if (auth) - dn->state_set(CDentry::STATE_AUTH); - else { - dn->state_set(CDentry::STATE_REJOINING); - dn->state_clear(CDentry::STATE_AUTH); - if (dn->is_dirty()) - dn->mark_clean(); - } - - if (dn->is_primary()) { - // inode - if (auth) - dn->inode->state_set(CInode::STATE_AUTH); - else { - dn->inode->state_set(CInode::STATE_REJOINING); - dn->inode->state_clear(CInode::STATE_AUTH); - if (dn->inode->is_dirty()) - dn->inode->mark_clean(); - } - - // recurse? - if (dn->inode->is_dir()) - dn->inode->get_nested_dirfrags(dfq); - } - } - } - } - - show_subtrees(); - show_cache(); -} - - - -// =========================================================================== -// REJOIN - - -/* - * rejoin phase! - * we start out by sending rejoins to everyone in the recovery set. - * - * if we are rejoin, send for all regions in our cache. - * if we are active|stopping, send only to nodes that are are rejoining. - */ -void MDCache::rejoin_send_rejoins() -{ - dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << endl; - - map rejoins; - - // encode cap list once. 
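
recalc_auth_bits() in the hunk above applies one rule per cached object once subtree authority is settled: objects in subtrees we own keep the AUTH bit, everything else is flagged REJOINING, loses AUTH, and is marked clean so it can no longer be journaled. A toy sketch of that per-object rule (Obj and its flags are simplified stand-ins for CInode/CDir/CDentry state bits):

#include <iostream>

struct Obj {                        // toy cache object: flags only
  bool auth, rejoining, dirty;
};

static void mark(Obj &o, bool subtree_is_mine) {
  if (subtree_is_mine) {
    o.auth = true;                  // keep/gain the AUTH bit
  } else {
    o.rejoining = true;             // flag for the rejoin stage
    o.auth = false;
    if (o.dirty) o.dirty = false;   // a non-auth replica must not stay dirty
  }
}

int main() {
  Obj mine   = { false, false, true };
  Obj theirs = { true,  false, true };
  mark(mine, true);                 // lives in a subtree I own
  mark(theirs, false);              // authority resolved to another mds
  std::cout << "mine:   auth=" << mine.auth   << " dirty=" << mine.dirty << std::endl;
  std::cout << "theirs: auth=" << theirs.auth << " rejoining=" << theirs.rejoining
            << " dirty=" << theirs.dirty << std::endl;
  return 0;
}
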
- bufferlist cap_export_bl; - if (mds->is_rejoin()) { - ::_encode(cap_exports, cap_export_bl); - ::_encode(cap_export_paths, cap_export_bl); - } - - // if i am rejoining, send a rejoin to everyone. - // otherwise, just send to others who are rejoining. - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) { - if (*p == mds->get_nodeid()) continue; // nothing to myself! - if (mds->is_rejoin()) { - rejoin_gather.insert(*p); - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); - rejoins[*p]->copy_cap_exports(cap_export_bl); - } else if (mds->mdsmap->is_rejoin(*p)) - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG); - } - - assert(!migrator->is_importing()); - assert(!migrator->is_exporting()); - - // check all subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - assert(dir->is_subtree_root()); - assert(!dir->is_ambiguous_dir_auth()); - - int auth = dir->get_dir_auth().first; - assert(auth >= 0); - - if (auth == mds->get_nodeid()) continue; // skip my own regions! - if (rejoins.count(auth) == 0) continue; // don't care about this node's regions - - rejoin_walk(dir, rejoins[auth]); - } - - if (!mds->is_rejoin()) { - // strong. - // note request authpins, xlocks - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - // auth pins - for (set::iterator q = p->second->auth_pins.begin(); - q != p->second->auth_pins.end(); - ++q) { - if (!(*q)->is_auth()) { - int who = (*q)->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *p->second << " authpin on " << **q << endl; - MDSCacheObjectInfo i; - (*q)->set_object_info(i); - if (i.ino) - rejoin->add_inode_authpin(i.ino, p->second->reqid); - else - rejoin->add_dentry_authpin(i.dirfrag, i.dname, p->second->reqid); - } - } - // xlocks - for (set::iterator q = p->second->xlocks.begin(); - q != p->second->xlocks.end(); - ++q) { - if (!(*q)->get_parent()->is_auth()) { - int who = (*q)->get_parent()->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *p->second << " xlock on " << **q << " " << *(*q)->get_parent() << endl; - MDSCacheObjectInfo i; - (*q)->get_parent()->set_object_info(i); - if (i.ino) - rejoin->add_inode_xlock(i.ino, (*q)->get_type(), p->second->reqid); - else - rejoin->add_dentry_xlock(i.dirfrag, i.dname, p->second->reqid); - } - } - } - } - - // send the messages - assert(rejoin_ack_gather.empty()); - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); - rejoin_ack_gather.insert(p->first); - } - - // nothing? - if (mds->is_rejoin() && rejoins.empty()) { - dout(10) << "nothing left to rejoin" << endl; - mds->rejoin_done(); - } -} - - -/** - * rejoin_walk - build rejoin declarations for a subtree - * - * @dir subtree root - * @rejoin rejoin message - * - * from a rejoining node: - * weak dirfrag - * weak dentries (w/ connectivity) - * - * from a surviving node: - * strong dirfrag - * strong dentries (no connectivity!) 
- * strong inodes - */ -void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) -{ - dout(10) << "rejoin_walk " << *dir << endl; - - list nested; // finish this dir, then do nested items - - if (mds->is_rejoin()) { - // WEAK - rejoin->add_weak_dirfrag(dir->dirfrag()); - - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - if (dn->is_primary()) { - rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); - dn->get_inode()->get_nested_dirfrags(nested); - } else if (dn->is_remote()) - rejoin->add_weak_remote_dentry(dir->dirfrag(), p->first, dn->get_remote_ino()); - else - assert(0); // i shouldn't have a non-auth null dentry after replay + trim_non_auth() - } - } else { - // STRONG - rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce()); - - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - rejoin->add_strong_dentry(dir->dirfrag(), p->first, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->get_replica_nonce(), - dn->lock.get_state()); - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - rejoin->add_strong_inode(in->ino(), in->get_replica_nonce(), - in->get_caps_wanted(), - in->authlock.get_state(), - in->linklock.get_state(), - in->dirfragtreelock.get_state(), - in->filelock.get_state(), - in->dirlock.get_state()); - in->get_nested_dirfrags(nested); - } - } - } - - // recurse into nested dirs - for (list::iterator p = nested.begin(); - p != nested.end(); - ++p) - rejoin_walk(*p, rejoin); -} - - -/* - * i got a rejoin. - * - reply with the lockstate - * - * if i am active|stopping, - * - remove source from replica list for everything not referenced here. - */ -void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) -{ - dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() << endl; - - switch (m->op) { - case MMDSCacheRejoin::OP_WEAK: - handle_cache_rejoin_weak(m); - break; - case MMDSCacheRejoin::OP_STRONG: - handle_cache_rejoin_strong(m); - break; - - case MMDSCacheRejoin::OP_ACK: - handle_cache_rejoin_ack(m); - break; - case MMDSCacheRejoin::OP_PURGE: - handle_cache_rejoin_purge(m); - break; - case MMDSCacheRejoin::OP_MISSING: - handle_cache_rejoin_missing(m); - break; - - case MMDSCacheRejoin::OP_FULL: - handle_cache_rejoin_full(m); - break; - - default: - assert(0); - } - delete m; -} - - -/* - * handle_cache_rejoin_weak - * - * the sender - * - is recovering from their journal. - * - may have incorrect (out of date) inode contents - * - * if the sender didn't trim_non_auth(), they - * - may have incorrect (out of date) dentry/inode linkage - * - may have deleted/purged inodes - * and i may have to go to disk to get accurate inode contents. yuck. - */ -void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) -{ - int from = weak->get_source().num(); - - // possible response(s) - MMDSCacheRejoin *ack = 0; // if survivor - MMDSCacheRejoin *purge = 0; // if i'm missing something, purge it from the (recovering) sender. - bool survivor = false; // am i a survivor? 
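
rejoin_walk() above builds two rather different payloads: a recovering node only declares linkage (weak), while a survivor also declares replica nonces and lock states (strong). A rough sketch of that split with toy WeakDentry/StrongDentry records (not the MMDSCacheRejoin types):

#include <iostream>
#include <string>
#include <vector>

struct WeakDentry   { std::string name; long ino; };                       // toy records
struct StrongDentry { std::string name; long ino; int nonce; int lock_state; };

int main() {
  bool rejoining = true;                 // am I recovering, or a survivor?
  std::vector<WeakDentry> weak;
  std::vector<StrongDentry> strong;

  // declare one primary dentry
  if (rejoining) {
    WeakDentry w = { "foo", 1001 };
    weak.push_back(w);                   // connectivity only
  } else {
    StrongDentry s = { "foo", 1001, 3, 1 };
    strong.push_back(s);                 // plus replica nonce and lock state
  }

  std::cout << "weak=" << weak.size() << " strong=" << strong.size() << std::endl;
  return 0;
}
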
- - if (mds->is_active() || mds->is_stopping()) { - survivor = true; - dout(10) << "i am a surivivor, and will ack immediately" << endl; - ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - - // check cap exports - for (map >::iterator p = weak->cap_exports.begin(); - p != weak->cap_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in || !in->is_auth()) continue; - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " claiming cap import " << p->first << " client" << q->first << " on " << *in << endl; - rejoin_import_cap(in, q->first, q->second, from); - } - } - } else { - assert(mds->is_rejoin()); - - // check cap exports. - for (map >::iterator p = weak->cap_exports.begin(); - p != weak->cap_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - if (in && !in->is_auth()) continue; - if (!in) { - if (!path_is_mine(weak->cap_export_paths[p->first])) - continue; - cap_import_paths[p->first] = weak->cap_export_paths[p->first]; - dout(10) << " noting cap import " << p->first << " path " << weak->cap_export_paths[p->first] << endl; - } - - // note - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " claiming cap import " << p->first << " client" << q->first << endl; - cap_imports[p->first][q->first][from] = q->second; - } - } - } - - // walk weak map - for (map >::iterator p = weak->weak.begin(); - p != weak->weak.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) { - dout(10) << " purge " << p->first << endl; - if (!purge) purge = new MMDSCacheRejoin(MMDSCacheRejoin::OP_PURGE); - purge->add_weak_dirfrag(p->first, p->second); - continue; - } - - int nonce = dir->add_replica(from); - dout(10) << " have " << *dir << endl; - if (ack) - ack->add_strong_dirfrag(p->first, nonce); - - // weak dentries - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn || - (dn->is_primary() && !q->second.is_primary())) { // make sure dn type matches, or purge - dout(10) << " purge " << p->first << " " << q->first << endl; - if (!purge) purge = new MMDSCacheRejoin(MMDSCacheRejoin::OP_PURGE); - purge->add_weak_null_dentry(p->first, q->first); - continue; - } - - if (survivor) dentry_remove_replica(dn, from); - int nonce = dn->add_replica(from); - dout(10) << " have " << *dn << endl; - if (ack) - ack->add_strong_dentry(p->first, q->first, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - nonce, dn->lock.get_replica_state()); - - // inode? - if (dn->is_primary()) { - assert(q->second.is_primary()); // or we would have purged, above - CInode *in = dn->get_inode(); - assert(in); - - if (survivor) inode_remove_replica(in, from); - int nonce = in->add_replica(from); - dout(10) << " have " << *in << endl; - - // scatter the dirlock, just in case? - if (!survivor && in->is_dir()) - in->dirlock.set_state(LOCK_SCATTER); - - if (ack) { - ack->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack->add_strong_inode(in->ino(), - nonce, - 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - } - } - } - - if (survivor) - rejoin_scour_survivor_replicas(from, ack); - - // send purge? - // (before ack) - if (purge) { - assert(0); // not if sender did trim_non_auth(). 
- mds->send_message_mds(purge, from, MDS_PORT_CACHE); - } - - if (survivor) { - // send ack - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - } else { - // done? - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << endl; - } - } -} - - -/** - * parallel_fetch -- make a pass at fetching a bunch of paths in parallel - * - * @pathmap - map of inodeno to full pathnames. we remove items from this map - * as we discover we have them. - * @retry - non-completion callback context. called when a pass of fetches - * completes. deleted if we are done (i.e. pathmap is empty). - */ -bool MDCache::parallel_fetch(map& pathmap, - Context *retry) -{ - dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << endl; - - // scan list - set fetch_queue; - map::iterator p = pathmap.begin(); - while (p != pathmap.end()) { - CInode *in = get_inode(p->first); - if (in) { - dout(15) << " have " << *in << endl; - pathmap.erase(p++); - continue; - } - - // traverse - dout(17) << " missing " << p->first << " at " << p->second << endl; - filepath path(p->second); - CDir *dir = path_traverse_to_dir(path); - assert(dir); - fetch_queue.insert(dir); - p++; - } - - if (pathmap.empty()) { - dout(10) << "parallel_fetch done" << endl; - assert(fetch_queue.empty()); - delete retry; - return true; - } - - // do a parallel fetch - C_Gather *gather = new C_Gather(retry); - for (set::iterator p = fetch_queue.begin(); - p != fetch_queue.end(); - ++p) { - dout(10) << "parallel_fetch fetching " << **p << endl; - (*p)->fetch(gather->new_sub()); - } - - return false; -} - - - -/* - * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects - * - * all validated replicas are acked with a strong nonce, etc. if that isn't in the - * ack, the replica dne, and we can remove it from our replica maps. - */ -void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack) -{ - dout(10) << "rejoin_scour_survivor_replicas from mds" << from << endl; - - // FIXME: what about root and stray inodes. - - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - - // inode? 
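
parallel_fetch() above arms a single retry context through a C_Gather and hands each queued dirfrag fetch a sub-completion, so the retry fires only after the last fetch lands. A minimal sketch of that gather idea with a toy counter (Gather below is not the real Context/C_Gather API):

#include <iostream>

struct Gather {                       // toy counter, not the real C_Gather
  int remaining;
  void (*on_finish)();
  explicit Gather(void (*fin)()) : remaining(0), on_finish(fin) {}
  void new_sub()  { ++remaining; }                       // one per parallel fetch
  void sub_done() { if (--remaining == 0) on_finish(); } // last one fires the callback
};

static void retry_pass() { std::cout << "all fetches done: retry the pass" << std::endl; }

int main() {
  Gather gather(&retry_pass);
  const int fetches = 3;
  for (int i = 0; i < fetches; ++i) gather.new_sub();    // queue three dirfrag fetches
  for (int i = 0; i < fetches; ++i) gather.sub_done();   // completions; last triggers retry
  return 0;
}
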
- if (in->is_auth() && - in->is_replica(from) && - ack->strong_inodes.count(p->second->ino()) == 0) { - inode_remove_replica(in, from); - dout(10) << " rem " << *in << endl; - } - - if (!in->is_dir()) continue; - - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); - p != dfs.end(); - ++p) { - CDir *dir = *p; - - if (dir->is_auth() && - dir->is_replica(from) && - ack->strong_dirfrags.count(dir->dirfrag())) { - dir->remove_replica(from); - dout(10) << " rem " << *dir << endl; - } - - // dentries - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - - if (dn->is_replica(from) && - (ack->strong_dentries.count(dir->dirfrag()) == 0 || - ack->strong_dentries[dir->dirfrag()].count(dn->get_name()) == 0)) { - dentry_remove_replica(dn, from); - dout(10) << " rem " << *dn << endl; - } - } - } - } -} - - -CInode *MDCache::rejoin_invent_inode(inodeno_t ino) -{ - CInode *in = new CInode(this); - memset(&in->inode, 0, sizeof(inode_t)); - in->inode.ino = ino; - in->state_set(CInode::STATE_REJOINUNDEF); - add_inode(in); - rejoin_undef_inodes.insert(in); - dout(10) << " invented " << *in << endl; - return in; -} - - -void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) -{ - int from = strong->get_source().num(); - - // only a recovering node will get a strong rejoin. - assert(mds->is_rejoin()); - - MMDSCacheRejoin *missing = 0; // if i'm missing something.. - - // strong dirfrags/dentries. - // also process auth_pins, xlocks. - for (map::iterator p = strong->strong_dirfrags.begin(); - p != strong->strong_dirfrags.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) { - CInode *in = get_inode(p->first.ino); - if (!in) in = rejoin_invent_inode(p->first.ino); - if (!in->is_dir()) { - assert(in->state_test(CInode::STATE_REJOINUNDEF)); - in->inode.mode = INODE_MODE_DIR; - } - dir = in->get_or_open_dirfrag(this, p->first.frag); - } else { - dir->add_replica(from); - dout(10) << " have " << *dir << endl; - } - - for (map::iterator q = strong->strong_dentries[p->first].begin(); - q != strong->strong_dentries[p->first].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) { - if (q->second.is_remote()) { - dn = dir->add_dentry(q->first, q->second.remote_ino); - } else if (q->second.is_null()) { - dn = dir->add_dentry(q->first); - } else { - CInode *in = get_inode(q->second.ino); - if (!in) in = rejoin_invent_inode(q->second.ino); - dn = dir->add_dentry(q->first, in); - - dout(10) << " missing " << q->second.ino << endl; - if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); - missing->add_weak_inode(q->second.ino); // we want it back! - } - dout(10) << " invented " << *dn << endl; - } - - // dn auth_pin? - if (strong->authpinned_dentries.count(p->first) && - strong->authpinned_dentries[p->first].count(q->first)) { - metareqid_t ri = strong->authpinned_dentries[p->first][q->first]; - dout(10) << " dn authpin by " << ri << " on " << *dn << endl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(dn); - } - - // dn xlock? - if (strong->xlocked_dentries.count(p->first) && - strong->xlocked_dentries[p->first].count(q->first)) { - metareqid_t ri = strong->xlocked_dentries[p->first][q->first]; - dout(10) << " dn xlock by " << ri << " on " << *dn << endl; - MDRequest *mdr = request_get(ri); // should have this from auth_pin above. 
- assert(mdr->is_auth_pinned(dn)); - dn->lock.set_state(LOCK_LOCK); - dn->lock.get_xlock(mdr); - mdr->xlocks.insert(&dn->lock); - mdr->locks.insert(&dn->lock); - } - - dn->add_replica(from); - dout(10) << " have " << *dn << endl; - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - assert(in); - assert(strong->strong_inodes.count(in->ino())); - MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->ino()]; - - // caps_wanted - if (is.caps_wanted) { - in->mds_caps_wanted[from] = is.caps_wanted; - dout(15) << " inode caps_wanted " << cap_string(is.caps_wanted) - << " on " << *in << endl; - } - - // scatterlock? - if (is.dirlock == LOCK_SCATTER || - is.dirlock == LOCK_GLOCKC) // replica still has wrlocks - in->dirlock.set_state(LOCK_SCATTER); - - // auth pin? - if (strong->authpinned_inodes.count(in->ino())) { - metareqid_t ri = strong->authpinned_inodes[in->ino()]; - dout(10) << " inode authpin by " << ri << " on " << *in << endl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(in); - } - - // xlock(s)? - if (strong->xlocked_inodes.count(in->ino())) { - for (map::iterator r = strong->xlocked_inodes[in->ino()].begin(); - r != strong->xlocked_inodes[in->ino()].end(); - ++r) { - SimpleLock *lock = in->get_lock(r->first); - dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << endl; - MDRequest *mdr = request_get(r->second); // should have this from auth_pin above. - assert(mdr->is_auth_pinned(in)); - lock->set_state(LOCK_LOCK); - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - } - } - - in->add_replica(from); - dout(10) << " have " << *in << endl; - } - } - } - - // send missing? - if (missing) { - mds->send_message_mds(missing, from, MDS_PORT_CACHE); - } else { - // done? 
- rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << endl; - } - } -} - - -void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) -{ - dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << endl; - int from = ack->get_source().num(); - - bool rejoin = mds->is_rejoin(); - - list waiters; - - // dirs - for (map::iterator p = ack->strong_dirfrags.begin(); - p != ack->strong_dirfrags.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) continue; - - dir->set_replica_nonce(p->second.nonce); - dir->state_clear(CDir::STATE_REJOINING); - dout(10) << " got " << *dir << endl; - - // dentries - for (map::iterator q = ack->strong_dentries[p->first].begin(); - q != ack->strong_dentries[p->first].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) continue; - - dn->set_replica_nonce(q->second.nonce); - mds->locker->rejoin_set_state(&dn->lock, q->second.lock, waiters); - dn->state_clear(CDentry::STATE_REJOINING); - dout(10) << " got " << *dn << endl; - } - } - - // full inodes - if (rejoin) { - for (list::iterator p = ack->full_inodes.begin(); - p != ack->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - dout(10) << " got inode content " << *in << endl; - } - } - - // inodes - for (map::iterator p = ack->strong_inodes.begin(); - p != ack->strong_inodes.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - in->set_replica_nonce(p->second.nonce); - mds->locker->rejoin_set_state(&in->authlock, p->second.authlock, waiters); - mds->locker->rejoin_set_state(&in->linklock, p->second.linklock, waiters); - mds->locker->rejoin_set_state(&in->dirfragtreelock, p->second.dirfragtreelock, waiters); - mds->locker->rejoin_set_state(&in->filelock, p->second.filelock, waiters); - mds->locker->rejoin_set_state(&in->dirlock, p->second.dirlock, waiters); - in->state_clear(CInode::STATE_REJOINING); - dout(10) << " got " << *in << endl; - } - - // done? - rejoin_ack_gather.erase(from); - if (mds->is_rejoin() && - rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. - rejoin_ack_gather.empty()) { - mds->rejoin_done(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" - << ", rejoin_ack from (" << rejoin_ack_gather << ")" << endl; - } -} - - -void MDCache::handle_cache_rejoin_purge(MMDSCacheRejoin *purge) -{ - dout(7) << "handle_cache_rejoin_purge from " << purge->get_source() << endl; - assert(mds->is_rejoin()); - - /* - * this is tricky, because we have to trim our cache - * in a particular order, and our input (purge->weak) is sorted - * by dirfrag_t. - * - * so, we carelessly trim, and assuming disconnected inodes will be - * clean in the end... 
- */ - set disconnected; - - for (map >::iterator p = purge->weak.begin(); - p != purge->weak.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - assert(dir); - - // dentries - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - assert(dn); - - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - dir->unlink_inode(dn); - - if (in->has_dirfrags()) { - dout(10) << " disconnecting inode with dirfrags " << *in << endl; - disconnected.insert(in); - } else { - dout(10) << " removing " << *in << endl; - remove_inode(in); - } - } - - dout(10) << " removing " << *dn << endl; - dir->remove_dentry(dn); - } - - if (dir->items.empty()) { - // purge the dir, too. - CInode *diri = dir->get_inode(); - - dout(10) << " closing dirfrag " << *dir << endl; - diri->close_dirfrag(dir->dirfrag().frag); - - // FIXME: what about root, stray. - - if (!diri->get_parent_dn() && - !diri->has_dirfrags()) { - dout(10) << " removing " << *diri << endl; - remove_inode(diri); - disconnected.erase(diri); - } - } - } - - for (set::iterator p = disconnected.begin(); - p != disconnected.end(); - ++p) - dout(0) << " PROBLEM: still have disconnected dir inode " << **p << endl; - assert(disconnected.empty()); -} - - -void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *missing) -{ - dout(7) << "handle_cache_rejoin_missing from " << missing->get_source() << endl; - - MMDSCacheRejoin *full = new MMDSCacheRejoin(MMDSCacheRejoin::OP_FULL); - - // inodes - for (set::iterator p = missing->weak_inodes.begin(); - p != missing->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - if (!in) { - dout(10) << " don't have inode " << *p << endl; - continue; // we must have trimmed it after the originalo rejoin - } - - dout(10) << " sending " << *in << endl; - full->add_full_inode(in->inode, in->symlink, in->dirfragtree); - } - - mds->send_message_mds(full, missing->get_source().num(), MDS_PORT_CACHE); -} - -void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *full) -{ - dout(7) << "handle_cache_rejoin_full from " << full->get_source() << endl; - int from = full->get_source().num(); - - // integrate full inodes - for (list::iterator p = full->full_inodes.begin(); - p != full->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - assert(in); - - set::iterator q = rejoin_undef_inodes.find(in); - if (q != rejoin_undef_inodes.end()) { - CInode *in = *q; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - in->state_clear(CInode::STATE_REJOINUNDEF); - dout(10) << " got full " << *in << endl; - rejoin_undef_inodes.erase(q); - } else { - dout(10) << " had full " << *in << endl; - } - } - - // done? - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << endl; - } -} - - - -/** - * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes - * - * FIXME: wait, can this actually happen? a survivor should generate cache trim - * messages that clean these guys up... 
- */ -void MDCache::rejoin_trim_undef_inodes() -{ - dout(10) << "rejoin_trim_undef_inodes" << endl; - - set::iterator p = rejoin_undef_inodes.begin(); - while (p != rejoin_undef_inodes.end()) { - CInode *in = *p; - in->clear_replicas(); - - // close out dirfrags - if (in->is_dir()) { - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *dir = *p; - dir->clear_replicas(); - - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dn->clear_replicas(); - - dout(10) << " trimming " << *dn << endl; - dir->remove_dentry(dn); - } - - dout(10) << " trimming " << *dir << endl; - in->close_dirfrag(dir->dirfrag().frag); - } - } - - CDentry *dn = in->get_parent_dn(); - if (dn) { - dn->clear_replicas(); - dout(10) << " trimming " << *dn << endl; - dn->dir->remove_dentry(dn); - } else { - dout(10) << " trimming " << *in << endl; - remove_inode(in); - } - } - - assert(rejoin_undef_inodes.empty()); // hmm: this shouldn't ever happen, actually! - rejoin_undef_inodes.clear(); -} - -class C_MDC_RejoinGatherFinish : public Context { - MDCache *cache; -public: - C_MDC_RejoinGatherFinish(MDCache *c) : cache(c) {} - void finish(int r) { - cache->rejoin_gather_finish(); - } -}; - -void MDCache::rejoin_gather_finish() -{ - dout(10) << "rejoin_gather_finish" << endl; - assert(mds->is_rejoin()); - - rejoin_trim_undef_inodes(); - - // fetch paths? - if (!cap_import_paths.empty() && - !parallel_fetch(cap_import_paths, new C_MDC_RejoinGatherFinish(this))) - return; - - // process cap imports - // ino -> client -> frommds -> capex - for (map > >::iterator p = cap_imports.begin(); - p != cap_imports.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - mds->server->add_reconnected_cap_inode(in); - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - for (map::iterator r = q->second.begin(); - r != q->second.end(); - ++r) - if (r->first >= 0) - rejoin_import_cap(in, q->first, r->second, r->first); - } - - mds->server->process_reconnected_caps(); - - rejoin_send_acks(); - - // did we already get our acks too? - // this happens when the rejoin_gather has to wait on a MISSING/FULL exchange. - if (rejoin_ack_gather.empty()) - mds->rejoin_done(); -} - -void MDCache::rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds) -{ - dout(10) << "rejoin_import_cap for client" << client << " from mds" << frommds - << " on " << *in << endl; - - // add cap - in->reconnect_cap(client, icr); - - // send REAP - // FIXME client session weirdness. - MClientFileCaps *reap = new MClientFileCaps(MClientFileCaps::OP_REAP, - in->inode, - in->client_caps[client].get_last_seq(), - in->client_caps[client].pending(), - in->client_caps[client].wanted()); - - reap->set_mds( frommds ); // reap from whom? 
- mds->messenger->send_message(reap, - mds->clientmap.get_inst(client), - 0, MDS_PORT_CACHE); -} - -void MDCache::rejoin_send_acks() -{ - dout(7) << "rejoin_send_acks" << endl; - - // send acks to everyone in the recovery set - map ack; - set weak; - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) { - ack[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - if (mds->mdsmap->is_rejoin(*p)) weak.insert(*p); - } - - // walk subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) { - CDir *dir = p->first; - if (!dir->is_auth()) continue; - dout(10) << "subtree " << *dir << endl; - - // auth items in this subtree - list dq; - dq.push_back(dir); - - while (!dq.empty()) { - CDir *dir = dq.front(); - dq.pop_front(); - - // dir - for (map::iterator r = dir->replicas_begin(); - r != dir->replicas_end(); - ++r) - ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second); - - for (map::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - CDentry *dn = q->second; - - // dentry - for (map::iterator r = dn->replicas_begin(); - r != dn->replicas_end(); - ++r) - ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - r->second, - dn->lock.get_replica_state()); - - if (!dn->is_primary()) continue; - - // inode - CInode *in = dn->inode; - - for (map::iterator r = in->replicas_begin(); - r != in->replicas_end(); - ++r) { - if (weak.count(r->first)) - ack[r->first]->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack[r->first]->add_strong_inode(in->ino(), r->second, 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - - // subdirs in this subtree? - in->get_nested_dirfrags(dq); - } - } - } - - // send acks - for (map::iterator p = ack.begin(); - p != ack.end(); - ++p) - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); - -} - - - -// =============================================================================== - -/* -void MDCache::rename_file(CDentry *srcdn, - CDentry *destdn) -{ - CInode *in = srcdn->inode; - - // unlink src - srcdn->dir->unlink_inode(srcdn); - - // unlink old inode? 
- if (destdn->inode) destdn->dir->unlink_inode(destdn); - - // link inode w/ dentry - destdn->dir->link_inode( destdn, in ); -} -*/ - - -void MDCache::set_root(CInode *in) -{ - assert(root == 0); - root = in; - root->state_set(CInode::STATE_ROOT); -} - - - - - - -// ************** -// Inode purging -- reliably removing deleted file's objects - -class C_MDC_PurgeFinish : public Context { - MDCache *mdc; - inodeno_t ino; - off_t newsize; -public: - C_MDC_PurgeFinish(MDCache *c, inodeno_t i, off_t s) : mdc(c), ino(i), newsize(s) {} - void finish(int r) { - mdc->purge_inode_finish(ino, newsize); - } -}; -class C_MDC_PurgeFinish2 : public Context { - MDCache *mdc; - inodeno_t ino; - off_t newsize; -public: - C_MDC_PurgeFinish2(MDCache *c, inodeno_t i, off_t s) : mdc(c), ino(i), newsize(s) {} - void finish(int r) { - mdc->purge_inode_finish_2(ino, newsize); - } -}; - -/* purge_inode in - * will be called by on unlink or rmdir or truncate - * caller responsible for journaling an appropriate EUpdate - */ -void MDCache::purge_inode(inode_t *inode, off_t newsize) -{ - dout(10) << "purge_inode " << inode->ino << " size " << inode->size - << " -> " << newsize - << endl; - - // take note - assert(purging[inode->ino].count(newsize) == 0); - purging[inode->ino][newsize] = *inode; - - assert(inode->size > newsize); - _do_purge_inode(inode, newsize); -} - -void MDCache::_do_purge_inode(inode_t *inode, off_t newsize) -{ - // remove - if (inode->size > 0) { - mds->filer->remove(*inode, newsize, inode->size, - 0, new C_MDC_PurgeFinish(this, inode->ino, newsize)); - } else { - // no need, empty file, just log it - purge_inode_finish(inode->ino, newsize); - } -} - -void MDCache::purge_inode_finish(inodeno_t ino, off_t newsize) -{ - dout(10) << "purge_inode_finish " << ino << " to " << newsize - << " - logging our completion" << endl; - - // log completion - mds->mdlog->submit_entry(new EPurgeFinish(ino, newsize), - new C_MDC_PurgeFinish2(this, ino, newsize)); -} - -void MDCache::purge_inode_finish_2(inodeno_t ino, off_t newsize) -{ - dout(10) << "purge_inode_finish_2 " << ino << " to " << newsize << endl; - - // remove from purging list - purging[ino].erase(newsize); - if (purging[ino].empty()) - purging.erase(ino); - - // tell anyone who cares (log flusher?) 
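
The purge machinery above keys in-flight removals by inode number and then by target size, so one inode can have several outstanding purges and all of them can be re-driven after recovery. A small sketch of that bookkeeping with plain std::map stand-ins for the purging member:

#include <iostream>
#include <map>

typedef long      ino_t_;      // toy inode number
typedef long long size_t_;     // toy file size

int main() {
  // ino -> target size -> old size (the real map stores the whole inode_t)
  std::map<ino_t_, std::map<size_t_, size_t_> > purging;

  // purge_inode: note the purge before asking the filer to remove the objects
  purging[42][0] = 4096;

  // purge_inode_finish_2: drop the entry, and the inode if it was the last one
  purging[42].erase(0);
  if (purging[42].empty())
    purging.erase(42);

  std::cout << "outstanding purges: " << purging.size() << std::endl;   // 0
  return 0;
}
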
- list ls; - ls.swap(waiting_for_purge[ino][newsize]); - waiting_for_purge[ino].erase(newsize); - if (waiting_for_purge[ino].empty()) - waiting_for_purge.erase(ino); - finish_contexts(ls, 0); -} - -void MDCache::add_recovered_purge(const inode_t& inode, off_t newsize) -{ - assert(purging[inode.ino].count(newsize) == 0); - purging[inode.ino][newsize] = inode; -} - -void MDCache::remove_recovered_purge(inodeno_t ino, off_t newsize) -{ - purging[ino].erase(newsize); -} - -void MDCache::start_recovered_purges() -{ - dout(10) << "start_recovered_purges (" << purging.size() << " purges)" << endl; - - for (map >::iterator p = purging.begin(); - p != purging.end(); - ++p) { - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << "start_recovered_purges " << p->first - << " size " << q->second.size - << " to " << q->first << endl; - _do_purge_inode(&q->second, q->first); - } - } -} - - - -// ================================================================================ -// cache trimming - - -bool MDCache::trim(int max) -{ - // trim LRU - if (max < 0) { - max = lru.lru_get_max(); - if (!max) return false; - } - dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl; - - map expiremap; - - // DENTRIES from the LRU - - while (lru.lru_get_size() > (unsigned)max) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - trim_dentry(dn, expiremap); - } - - // trim root inode+dir? - if (max == 0 && // only if we're trimming everything! - lru.lru_get_size() == 0) { - hash_map::iterator p = inode_map.begin(); - while (p != inode_map.end()) { - hash_map::iterator n = p; - n++; - - CInode *in = p->second; - - list ls; - in->get_dirfrags(ls); - for (list::iterator q = ls.begin(); - q != ls.end(); - ++q) - if ((*q)->get_num_ref() == 0) - trim_dirfrag(*q, *q, expiremap); - - // root inode? - if (in->get_num_ref() == 0) - trim_inode(0, in, 0, expiremap); // hrm, FIXME - - p = n; - } - } - - // send! - send_expire_messages(expiremap); - - return true; -} - -void MDCache::send_expire_messages(map& expiremap) -{ - // send expires - for (map::iterator it = expiremap.begin(); - it != expiremap.end(); - it++) { - dout(7) << "sending cache_expire to " << it->first << endl; - mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); - } -} - - -void MDCache::trim_dentry(CDentry *dn, map& expiremap) -{ - dout(12) << "trim_dentry " << *dn << endl; - - CDir *dir = dn->get_dir(); - assert(dir); - - CDir *con = get_subtree_root(dir); - assert(con); - - dout(12) << " in container " << *con << endl; - - // notify dentry authority? - if (!dn->is_auth()) { - pair auth = dn->authority(); - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *dn << endl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->get_replica_nonce()); - } - } - - // unlink the dentry - if (dn->is_remote()) { - // just unlink. - dir->unlink_inode(dn); - } - else if (dn->is_primary()) { - // expire the inode, too. 
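
trim() above pops dentries off the bottom of the LRU until the cache is back under its target size and batches one expire message per peer. A toy sketch of that loop, with a std::list as the LRU and a map of string lists standing in for the per-mds MCacheExpire batches:

#include <iostream>
#include <list>
#include <map>
#include <string>

int main() {
  std::list<std::string> lru;          // front = least recently used dentry
  lru.push_back("dn-a");
  lru.push_back("dn-b");
  lru.push_back("dn-c");

  const unsigned max = 1;
  std::map<int, std::list<std::string> > expiremap;   // peer mds -> expired items

  while (lru.size() > max) {
    std::string dn = lru.front();
    lru.pop_front();
    expiremap[1].push_back(dn);        // pretend mds1 is the authority for these
  }

  for (std::map<int, std::list<std::string> >::iterator p = expiremap.begin();
       p != expiremap.end(); ++p)
    std::cout << "send " << p->second.size()
              << " expire(s) to mds" << p->first << std::endl;   // 2 to mds1
  return 0;
}
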
- CInode *in = dn->get_inode(); - assert(in); - trim_inode(dn, in, con, expiremap); - } - else { - assert(dn->is_null()); - } - - // adjust the dir state - // NOTE: we can safely remove a clean, null dentry without effecting - // directory completeness. - if (!(dn->is_null() && dn->is_clean())) - dir->state_clear(CDir::STATE_COMPLETE); - - // remove dentry - dir->remove_dentry(dn); - - // reexport? - if (dir->get_size() == 0 && dir->is_subtree_root()) - migrator->export_empty_import(dir); - - if (mds->logger) mds->logger->inc("cex"); -} - - -void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expiremap) -{ - assert(dir->get_num_ref() == 0); - - dout(15) << "trim_dirfrag " << *dir << endl; - - CInode *in = dir->get_inode(); - - if (!dir->is_auth()) { - pair auth = dir->authority(); - - // was this an auth delegation? (if so, slightly modified container) - dirfrag_t condf; - if (dir->is_subtree_root()) { - dout(12) << " subtree root, container is " << *dir << endl; - con = dir; - condf = dir->dirfrag(); - } else { - condf = con->dirfrag(); - } - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *dir << endl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce); - } - } - - if (dir->is_subtree_root()) - remove_subtree(dir); // remove from subtree map - in->close_dirfrag(dir->dirfrag().frag); -} - -void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap) -{ - dout(15) << "trim_inode " << *in << endl; - assert(in->get_num_ref() == 0); - - // DIR - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) - trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. root dirfrag), use *p - - // INODE - if (!in->is_auth()) { - pair auth = in->authority(); - - dirfrag_t df; - if (con) - df = con->dirfrag(); - else - df = dirfrag_t(0,frag_t()); // must be a root or stray inode. - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (con && mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *in << endl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_inode(df, in->ino(), in->get_replica_nonce()); - } - } - - // unlink - if (dn) - dn->get_dir()->unlink_inode(dn); - remove_inode(in); -} - - -/** - * trim_non_auth - remove any non-auth items from our cache - * - * this reduces the amount of non-auth metadata in our cache, reducing the - * load incurred by the rejoin phase. - * - * the only non-auth items that remain are those that are needed to - * attach our own subtrees to the root. - * - * why we have to do this: - * we may not have accurate linkage for non-auth items. which means we will - * know which subtree it falls into, and can not be sure to declare it to the - * correct authority. 
- */ -void MDCache::trim_non_auth() -{ - dout(7) << "trim_non_auth" << endl; - - // note first auth item we see. - // when we see it the second time, stop. - CDentry *first_auth = 0; - - // trim non-auth items from the lru - while (lru.lru_get_size() > 0) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - - if (dn->is_auth()) { - // add back into lru (at the top) - lru.lru_insert_top(dn); - - if (!first_auth) { - first_auth = dn; - } else { - if (first_auth == dn) - break; - } - } else { - // non-auth. expire. - CDir *dir = dn->get_dir(); - assert(dir); - - // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << endl; - if (dn->is_remote()) { - dir->unlink_inode(dn); - } - else if (dn->is_primary()) { - CInode *in = dn->get_inode(); - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *subdir = *p; - if (subdir->is_subtree_root()) - remove_subtree(subdir); - in->close_dirfrag(subdir->dirfrag().frag); - } - dir->unlink_inode(dn); - remove_inode(in); - } - else { - assert(dn->is_null()); - } - dir->remove_dentry(dn); - - // adjust the dir state - dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! - } - } - - if (lru.lru_get_size() == 0) { - // root, stray, etc.? - hash_map::iterator p = inode_map.begin(); - while (p != inode_map.end()) { - hash_map::iterator next = p; - ++next; - CInode *in = p->second; - if (!in->is_auth()) { - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - assert((*p)->get_num_ref() == 0); - remove_subtree((*p)); - in->close_dirfrag((*p)->dirfrag().frag); - } - assert(in->get_num_ref() == 0); - remove_inode(in); - } - p = next; - } - } - - show_subtrees(); -} - -void MDCache::handle_cache_expire(MCacheExpire *m) -{ - int from = m->get_from(); - - dout(7) << "cache_expire from mds" << from << endl; - - // loop over realms - for (map::iterator p = m->realms.begin(); - p != m->realms.end(); - ++p) { - // check container? - if (p->first.ino > 0) { - CDir *con = get_dirfrag(p->first); - assert(con); // we had better have this. - - if (!con->is_auth() || - (con->is_auth() && con->is_exporting() && - migrator->get_export_state(con) == Migrator::EXPORT_WARNING && - migrator->export_has_warned(con,from))) { - // not auth. - dout(7) << "delaying nonauth|warned expires for " << *con << endl; - assert(con->is_frozen_tree_root()); - - // make a message container - if (delayed_expire[con].count(from) == 0) - delayed_expire[con][from] = new MCacheExpire(from); - - // merge these expires into it - delayed_expire[con][from]->add_realm(p->first, p->second); - continue; - } - dout(7) << "expires for " << *con << endl; - } else { - dout(7) << "containerless expires (root, stray inodes)" << endl; - } - - // INODES - for (map::iterator it = p->second.inodes.begin(); - it != p->second.inodes.end(); - it++) { - CInode *in = get_inode(it->first); - int nonce = it->second; - - if (!in) { - dout(0) << " inode expire on " << it->first << " from " << from << ", don't have it" << endl; - assert(in); - } - assert(in->is_auth()); - - // check nonce - if (nonce == in->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << endl; - inode_remove_replica(in, from); - } - else { - // this is an old nonce, ignore expire. 
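
trim_non_auth() above uses a rotation trick: keep expiring from the bottom of the LRU, push auth dentries back on top, and stop when the first auth dentry comes around again, at which point only auth items remain. A sketch of the same trick on a toy list (names are assumed unique here; the real code compares dentry pointers):

#include <iostream>
#include <list>
#include <string>

struct DN { std::string name; bool auth; };    // toy dentry

int main() {
  std::list<DN> lru;                           // front = next expire candidate
  DN a = { "auth1", true };
  DN b = { "nonauth", false };
  DN c = { "auth2", true };
  lru.push_back(a); lru.push_back(b); lru.push_back(c);

  bool have_first = false;
  std::string first_auth;
  while (!lru.empty()) {
    DN dn = lru.front();
    lru.pop_front();
    if (dn.auth) {
      lru.push_back(dn);                       // keep it: rotate back to the top
      if (!have_first) { first_auth = dn.name; have_first = true; }
      else if (dn.name == first_auth) break;   // wrapped all the way around: done
    } else {
      std::cout << "trimming " << dn.name << std::endl;
    }
  }
  std::cout << lru.size() << " auth entries kept" << std::endl;   // 2
  return 0;
}
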
- dout(7) << " inode expire on " << *in << " from mds" << from - << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping" - << endl; - assert(in->get_replica_nonce(from) > nonce); - } - } - - // DIRS - for (map::iterator it = p->second.dirs.begin(); - it != p->second.dirs.end(); - it++) { - CDir *dir = get_dirfrag(it->first); - int nonce = it->second; - - if (!dir) { - dout(0) << " dir expire on " << it->first << " from " << from << ", don't have it" << endl; - assert(dir); - } - assert(dir->is_auth()); - - // check nonce - if (nonce == dir->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " dir expire on " << *dir << " from mds" << from - << " replicas was " << dir->replicas << endl; - dir->remove_replica(from); - } - else { - // this is an old nonce, ignore expire. - dout(7) << " dir expire on " << *dir << " from mds" << from - << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) - << "), dropping" << endl; - assert(dir->get_replica_nonce(from) > nonce); - } - } - - // DENTRIES - for (map >::iterator pd = p->second.dentries.begin(); - pd != p->second.dentries.end(); - ++pd) { - dout(0) << " dn expires in dir " << pd->first << endl; - CDir *dir = get_dirfrag(pd->first); - - if (!dir) { - dout(0) << " dn expires on " << pd->first << " from " << from << ", don't have it" << endl; - assert(dir); - } - assert(dir->is_auth()); - - for (map::iterator p = pd->second.begin(); - p != pd->second.end(); - ++p) { - int nonce = p->second; - - CDentry *dn = dir->lookup(p->first); - if (!dn) - dout(0) << " missing dentry for " << p->first << " in " << *dir << endl; - assert(dn); - - if (nonce == dn->get_replica_nonce(from)) { - dout(7) << " dentry_expire on " << *dn << " from mds" << from << endl; - dentry_remove_replica(dn, from); - } - else { - dout(7) << " dentry_expire on " << *dn << " from mds" << from - << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) - << "), dropping" << endl; - assert(dn->get_replica_nonce(from) > nonce); - } - } - } - } - - - // done - delete m; -} - -void MDCache::process_delayed_expire(CDir *dir) -{ - dout(7) << "process_delayed_expire on " << *dir << endl; - for (map::iterator p = delayed_expire[dir].begin(); - p != delayed_expire[dir].end(); - ++p) - handle_cache_expire(p->second); - delayed_expire.erase(dir); -} - -void MDCache::discard_delayed_expire(CDir *dir) -{ - dout(7) << "discard_delayed_expire on " << *dir << endl; - for (map::iterator p = delayed_expire[dir].begin(); - p != delayed_expire[dir].end(); - ++p) - delete p->second; - delayed_expire.erase(dir); -} - -void MDCache::inode_remove_replica(CInode *in, int from) -{ - in->remove_replica(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! - // fix lock - if (in->authlock.remove_replica(from)) mds->locker->simple_eval_gather(&in->authlock); - if (in->linklock.remove_replica(from)) mds->locker->simple_eval_gather(&in->linklock); - if (in->dirfragtreelock.remove_replica(from)) mds->locker->simple_eval_gather(&in->dirfragtreelock); - if (in->filelock.remove_replica(from)) mds->locker->file_eval_gather(&in->filelock); - if (in->dirlock.remove_replica(from)) mds->locker->scatter_eval_gather(&in->dirlock); - - // alone now? 
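
handle_cache_expire() above accepts an expire only if its nonce matches the current replica nonce; a smaller nonce means the object was re-replicated to the peer after it decided to expire, so the stale expire is dropped. A tiny sketch of that check:

#include <cassert>
#include <iostream>

// toy version of the per-object check; nonces are plain ints here
static bool apply_expire(int current_nonce, int expire_nonce) {
  if (expire_nonce == current_nonce)
    return true;                         // replica really gone: remove it from the map
  assert(current_nonce > expire_nonce);  // anything else would be a protocol bug
  return false;                          // stale expire: ignore
}

int main() {
  std::cout << apply_expire(3, 3) << std::endl;   // 1: remove the replica
  std::cout << apply_expire(4, 3) << std::endl;   // 0: raced a newer replication
  return 0;
}
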
- /* - if (!in->is_replicated()) { - mds->locker->simple_eval_gather(&in->authlock); - mds->locker->simple_eval_gather(&in->linklock); - mds->locker->simple_eval_gather(&in->dirfragtreelock); - mds->locker->file_eval_gather(&in->filelock); - mds->locker->scatter_eval_gather(&in->dirlock); - } - */ -} - -void MDCache::dentry_remove_replica(CDentry *dn, int from) -{ - dn->remove_replica(from); - - // fix lock - if (dn->lock.remove_replica(from) || - !dn->is_replicated()) - mds->locker->simple_eval_gather(&dn->lock); -} - - - -// ========================================================================================= -// shutdown - -class C_MDC_ShutdownCommit : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCommit(MDCache *mdc) { - this->mdc = mdc; - } - void finish(int r) { - mdc->shutdown_commits--; - } -}; - -class C_MDC_ShutdownCheck : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {} - void finish(int) { - mdc->shutdown_check(); - } -}; - -void MDCache::shutdown_check() -{ - dout(0) << "shutdown_check at " << g_clock.now() << endl; - - // cache - int o = g_conf.debug_mds; - g_conf.debug_mds = 10; - show_cache(); - g_conf.debug_mds = o; - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // this - dout(0) << "lru size now " << lru.lru_get_size() << endl; - dout(0) << "log len " << mds->mdlog->get_num_events() << endl; - - - if (mds->filer->is_active()) - dout(0) << "filer still active" << endl; -} - -void MDCache::shutdown_start() -{ - dout(1) << "shutdown_start" << endl; - - if (g_conf.mds_shutdown_check) - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); -} - - - -bool MDCache::shutdown_pass() -{ - dout(7) << "shutdown_pass" << endl; - - if (mds->is_stopped()) { - dout(7) << " already shut down" << endl; - show_cache(); - show_subtrees(); - return true; - } - - // commit dirs? - if (g_conf.mds_commit_on_shutdown) { - - if (shutdown_commits < 0) { - dout(1) << "shutdown_pass committing all dirty dirs" << endl; - shutdown_commits = 0; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - CInode *in = it->second; - if (!in->is_dir()) continue; - - // commit any dirty dirfrag that's ours - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - if (dir->is_auth() && dir->is_dirty()) { - dir->commit(0, new C_MDC_ShutdownCommit(this)); - shutdown_commits++; - } - } - } - } - - // commits? - if (shutdown_commits > 0) { - dout(7) << "shutdown_commits still waiting for " << shutdown_commits << endl; - return false; - } - } - - // flush anything we can from the cache - trim(0); - dout(5) << "lru size now " << lru.lru_get_size() << endl; - - // flush batching eopens, so that we can properly expire them. - mds->server->journal_opens(); // hrm, this is sort of a hack. - - // flush what we can from the log - mds->mdlog->trim(0); - - // SUBTREES - // send all imports back to 0. 
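
shutdown_pass() above is written to be called repeatedly: each pass commits, trims, or flushes whatever it can and returns false until nothing is left. A toy sketch of that multi-pass shape (State and its counters are invented for illustration, not real MDCache state):

#include <iostream>

struct State {                    // invented counters standing in for dirty dirs, cache, log
  int dirty_dirs, cache_items, log_events;
  bool pass() {
    if (dirty_dirs)  { --dirty_dirs;  return false; }   // commit a dirty dir
    if (cache_items) { --cache_items; return false; }   // trim the cache a bit
    if (log_events)  { --log_events;  return false; }   // let the log flush
    return true;                                        // nothing left: shut down
  }
};

int main() {
  State s = { 1, 2, 1 };
  int passes = 1;
  while (!s.pass()) ++passes;
  std::cout << "shut down after " << passes << " passes" << std::endl;   // 5
  return 0;
}
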
- if (!subtrees.empty() && - mds->get_nodeid() != 0 && - !migrator->is_exporting() //&& - //!migrator->is_importing() - ) { - // export to root - dout(7) << "looking for subtrees to export to mds0" << endl; - list ls; - for (map >::iterator it = subtrees.begin(); - it != subtrees.end(); - it++) { - CDir *dir = it->first; - if (dir->get_inode()->is_stray()) continue; - if (dir->is_frozen() || dir->is_freezing()) continue; - if (!dir->is_full_dir_auth()) continue; - ls.push_back(dir); - } - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - dout(7) << "sending " << *dir << " back to mds0" << endl; - migrator->export_dir(dir, 0); - } - } - - // subtrees map not empty yet? - if (!subtrees.empty()) { - dout(7) << "still have " << num_subtrees() << " subtrees" << endl; - show_subtrees(); - migrator->show_importing(); - migrator->show_exporting(); - //show_cache(); - return false; - } - assert(subtrees.empty()); - assert(!migrator->is_exporting()); - assert(!migrator->is_importing()); - - - // empty out stray contents - // FIXME - dout(7) << "FIXME: i need to empty out stray dir contents..." << endl; - - // (wait for) flush log? - if (g_conf.mds_log_flush_on_shutdown) { - if (mds->mdlog->get_non_subtreemap_events()) { - dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_subtreemap_events() << ")" << endl; - return false; - } - } - - // cap log? - if (g_conf.mds_log_flush_on_shutdown) { - - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << endl; - mds->mdlog->cap(); - // note that this won't flush right away, so we'll make at least one more pass - } - - if (mds->mdlog->get_num_events()) { - dout(7) << "waiting for log to flush (including subtree_map, now) .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_subtreemap_events() << ")" << endl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << endl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } - } - - // filer active? - if (mds->filer->is_active()) { - dout(7) << "filer still active" << endl; - return false; - } - - - // done? - if (lru.lru_get_size() > 0) { - dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl; - show_cache(); - //dump(); - return false; - } - - // done! - dout(1) << "shutdown done." 
<< endl; - return true; -} - - - - - - - - - -// ========= messaging ============== - - -void MDCache::dispatch(Message *m) -{ - switch (m->get_type()) { - - // RESOLVE - case MSG_MDS_RESOLVE: - handle_resolve((MMDSResolve*)m); - break; - case MSG_MDS_RESOLVEACK: - handle_resolve_ack((MMDSResolveAck*)m); - break; - - // REJOIN - case MSG_MDS_CACHEREJOIN: - handle_cache_rejoin((MMDSCacheRejoin*)m); - break; - - case MSG_MDS_DISCOVER: - handle_discover((MDiscover*)m); - break; - case MSG_MDS_DISCOVERREPLY: - handle_discover_reply((MDiscoverReply*)m); - break; - - /* - case MSG_MDS_INODEUPDATE: - handle_inode_update((MInodeUpdate*)m); - break; - */ - - case MSG_MDS_DIRUPDATE: - handle_dir_update((MDirUpdate*)m); - break; - - case MSG_MDS_CACHEEXPIRE: - handle_cache_expire((MCacheExpire*)m); - break; - - - - case MSG_MDS_DENTRYUNLINK: - handle_dentry_unlink((MDentryUnlink*)m); - break; - - - - - - default: - dout(7) << "cache unknown message " << m->get_type() << endl; - assert(0); - break; - } -} - - -/* path_traverse - * - * return values: - * <0 : traverse error (ENOTDIR, ENOENT, etc.) - * 0 : success - * >0 : delayed or forwarded - * - * onfail values: - * - * MDS_TRAVERSE_FORWARD - forward to auth (or best guess) - * MDS_TRAVERSE_DISCOVER - discover missing items. skip permission checks. - * MDS_TRAVERSE_DISCOVERXLOCK - discover XLOCKED items too (be careful!). - * MDS_TRAVERSE_FAIL - return an error - */ - -Context *MDCache::_get_waiter(MDRequest *mdr, Message *req) -{ - if (mdr) - return new C_MDS_RetryRequest(this, mdr); - else - return new C_MDS_RetryMessage(mds, req); -} - -int MDCache::path_traverse(MDRequest *mdr, Message *req, // who - CInode *base, filepath& origpath, // what - vector& trace, // result - bool follow_trailing_symlink, // how - int onfail) -{ - assert(mdr || req); - bool null_okay = onfail == MDS_TRAVERSE_DISCOVERXLOCK; - bool noperm = false; - if (onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK) - noperm = true; - - // keep a list of symlinks we touch to avoid loops - set< pair > symlinks_resolved; - - // root - CInode *cur = base; - if (!cur) cur = get_root(); - if (cur == NULL) { - dout(7) << "traverse: i don't have root" << endl; - open_root(_get_waiter(mdr, req)); - return 1; - } - - // start trace - trace.clear(); - - // make our own copy, since we'll modify when we hit symlinks - filepath path = origpath; - - unsigned depth = 0; - while (depth < path.depth()) { - dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << endl; - - // ENOTDIR? - if (!cur->is_dir()) { - dout(7) << "traverse: " << *cur << " not a dir " << endl; - return -ENOTDIR; - } - - // open dir - frag_t fg = cur->pick_dirfrag(path[depth]); - CDir *curdir = cur->get_dirfrag(fg); - if (!curdir) { - if (cur->is_auth()) { - // parent dir frozen_dir? - if (cur->is_frozen_dir()) { - dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl; - cur->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); - return 1; - } - - curdir = cur->get_or_open_dirfrag(this, fg); - } else { - // discover? 
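
path_traverse, as documented above, returns a negative errno on failure, 0 on success with the dentry trace filled in, and a positive value when the request has been parked on a waiter (discover, fetch, freeze) or forwarded to another MDS, in which case the caller simply returns and is re-driven later. A standalone sketch of that calling convention; path_traverse_stub, Dentry and handle_request are illustrative stand-ins, not the MDCache interface:

#include <cerrno>
#include <cstdio>
#include <string>
#include <vector>

struct Dentry { std::string name; };

int path_traverse_stub(const std::string& path, std::vector<Dentry>& trace) {
  if (path.empty())   return -ENOENT;   // traverse error
  if (path == "busy") return 1;         // delayed: a waiter was queued to retry us
  trace.push_back(Dentry{path});        // success: hand back the dentry trace
  return 0;
}

void handle_request(const std::string& path) {
  std::vector<Dentry> trace;
  int r = path_traverse_stub(path, trace);
  if (r > 0) return;                                // delayed or forwarded; retried later
  if (r < 0) { std::printf("reply error %d\n", r); return; }
  std::printf("resolved %zu dentries\n", trace.size());
}

int main() {
  handle_request("busy");
  handle_request("");
  handle_request("etc");
}
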
- assert(!cur->is_auth()); - if (cur->is_ambiguous_auth()) { - dout(10) << "traverse: need dir, waiting for single auth on " << *cur << endl; - cur->add_waiter(CInode::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); - return 1; - } else if (dir_discovers.count(cur->ino())) { - dout(10) << "traverse: need dir, already doing discover for " << *cur << endl; - assert(cur->is_waiter_for(CInode::WAIT_DIR)); - } else { - filepath want = path.postfixpath(depth); - dout(10) << "traverse: need dir, doing discover, want " << want.get_path() - << " from " << *cur << endl; - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - true, // need this dir! - onfail == MDS_TRAVERSE_DISCOVERXLOCK), - cur->authority().first, MDS_PORT_CACHE); - dir_discovers[cur->ino()].insert(cur->authority().first); - } - cur->add_waiter(CInode::WAIT_DIR, _get_waiter(mdr, req)); - return 1; - } - } - assert(curdir); - - // frozen? - /* - if (curdir->is_frozen()) { - // doh! - // FIXME: traverse is allowed? - dout(7) << "traverse: " << *curdir << " is frozen, waiting" << endl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); - if (onfinish) delete onfinish; - return 1; - } - */ - - // must read directory hard data (permissions, x bit) to traverse - if (!noperm && - !mds->locker->simple_rdlock_try(&cur->authlock, _get_waiter(mdr, req))) - return 1; - - // check permissions? - // XXX - - // ..? - if (path[depth] == "..") { - trace.pop_back(); - depth++; - cur = cur->get_parent_inode(); - dout(10) << "traverse: following .. back to " << *cur << endl; - continue; - } - - - // dentry - CDentry *dn = curdir->lookup(path[depth]); - - // null and last_bit and xlocked by me? - if (dn && dn->is_null() && null_okay) { - dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << endl; - trace.push_back(dn); - break; // done! - } - - if (dn && !dn->is_null()) { - // dentry exists. xlocked? - if (!noperm && dn->lock.is_xlocked() && dn->lock.get_xlocked_by() != mdr) { - dout(10) << "traverse: xlocked dentry at " << *dn << endl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req)); - return 1; - } - - // do we have inode? - if (!dn->inode) { - assert(dn->is_remote()); - // do i have it? - CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "linking in remote in " << *in << endl; - dn->link_remote(in); - } else { - dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl; - assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! - open_remote_ino(dn->get_remote_ino(), mdr, _get_waiter(mdr, req)); - return 1; - } - } - - // symlink? - if (dn->inode->is_symlink() && - (follow_trailing_symlink || depth < path.depth()-1)) { - // symlink, resolve! - filepath sym = dn->inode->symlink; - dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << endl; - - // break up path components - // /head/symlink/tail - filepath head = path.prefixpath(depth); - filepath tail = path.postfixpath(depth+1); - dout(10) << "traverse: path head = " << head << endl; - dout(10) << "traverse: path tail = " << tail << endl; - - if (symlinks_resolved.count(pair(dn->inode, tail.get_path()))) { - dout(10) << "already hit this symlink, bailing to avoid the loop" << endl; - return -ELOOP; - } - symlinks_resolved.insert(pair(dn->inode, tail.get_path())); - - // start at root? 
- if (dn->inode->symlink[0] == '/') { - // absolute - trace.clear(); - depth = 0; - path = tail; - dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << endl; - } else { - // relative - path = head; - path.append(sym); - path.append(tail); - dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << endl; - } - continue; - } - - // forwarder wants replicas? - if (mdr && mdr->client_request && - mdr->client_request->get_mds_wants_replica_in_dirino()) { - dout(30) << "traverse: REP is here, " - << mdr->client_request->get_mds_wants_replica_in_dirino() - << " vs " << curdir->dirfrag() << endl; - - if (mdr->client_request->get_mds_wants_replica_in_dirino() == curdir->ino() && - curdir->is_auth() && - curdir->is_rep() && - curdir->is_replica(req->get_source().num()) && - dn->is_auth() - ) { - assert(req->get_source().is_mds()); - int from = req->get_source().num(); - - if (dn->is_replica(from)) { - dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " - << req->get_source() << " dn " << *dn << endl; - } else { - dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl; - MDiscoverReply *reply = new MDiscoverReply(curdir->ino()); - reply->add_dentry( dn->replicate_to( from ) ); - if (dn->is_primary()) - reply->add_inode( dn->inode->replicate_to( from ) ); - mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); - } - } - } - - // add to trace, continue. - trace.push_back(dn); - cur = dn->inode; - touch_inode(cur); - depth++; - continue; - } - - // MISS. dentry doesn't exist. - dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << endl; - - if (curdir->is_auth()) { - // dentry is mine. - if (curdir->is_complete()) { - // file not found - return -ENOENT; - } else { - // directory isn't complete; reload - dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl; - touch_inode(cur); - curdir->fetch(_get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("cmiss"); - return 1; - } - } else { - // dirfrag/dentry is not mine. - pair dauth = curdir->authority(); - - if ((onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - // discover? - filepath want = path.postfixpath(depth); - - if (curdir->is_waiting_for_dentry(path[depth])) { - dout(7) << "traverse: already waiting for discover " << want.get_path() - << " from " << *curdir << endl; - } - else if (curdir->is_ambiguous_auth()) { - dout(7) << "traverse: waiting for single auth on " << *curdir << endl; - curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); - return 1; - } - else { - dout(7) << "traverse: discover " << want << " from " << *curdir << endl; - touch_inode(cur); - - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - false, - onfail == MDS_TRAVERSE_DISCOVERXLOCK), - dauth.first, MDS_PORT_CACHE); - if (mds->logger) mds->logger->inc("dis"); - } - - // delay processing of current request. 
- curdir->add_dentry_waiter(path[depth], _get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("cmiss"); - return 1; - } - if (onfail == MDS_TRAVERSE_FORWARD) { - // forward - dout(7) << "traverse: not auth for " << path << " in " << *curdir << endl; - - if (curdir->is_ambiguous_auth()) { - // wait - dout(7) << "traverse: waiting for single auth in " << *curdir << endl; - curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); - return 1; - } else { - dout(7) << "traverse: forwarding, not auth for " << *curdir << endl; - - // request replication? - if (mdr && mdr->client_request && curdir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " - << *curdir << " req " << *(MClientRequest*)req << endl; - mdr->client_request->set_mds_wants_replica_in_dirino(curdir->ino()); - req->clear_payload(); // reencode! - } - - if (mdr) - request_forward(mdr, dauth.first, req->get_dest_port()); - else - mds->forward_message_mds(req, dauth.first, req->get_dest_port()); - - if (mds->logger) mds->logger->inc("cfw"); - return 2; - } - } - if (onfail == MDS_TRAVERSE_FAIL) { - return -ENOENT; // not necessarily exactly true.... - } - } - - assert(0); // i shouldn't get here - } - - // success. - return 0; -} - -bool MDCache::path_is_mine(filepath& path) -{ - dout(15) << "path_is_mine " << path << endl; - - // start at root. FIXME. - CInode *cur = root; - assert(cur); - - for (unsigned i=0; ipick_dirfrag(path[i]); - CDir *dir = cur->get_dirfrag(fg); - if (!dir) return cur->is_auth(); - CDentry *dn = dir->lookup(path[i]); - if (!dn) return dir->is_auth(); - assert(dn->is_primary()); - cur = dn->get_inode(); - } - - return cur->is_auth(); -} - -/** - * path_traverse_to_dir -- traverse to deepest dir we have - * - * @path - path to traverse (as far as we can) - * - * assumes we _don't_ have the full path. (if we do, we return NULL.) - */ -CDir *MDCache::path_traverse_to_dir(filepath& path) -{ - CInode *cur = root; - assert(cur); - for (unsigned i=0; ipick_dirfrag(path[i]); - CDir *dir = cur->get_or_open_dirfrag(this, fg); - CDentry *dn = dir->lookup(path[i]); - if (!dn) return dir; - assert(dn->is_primary()); - cur = dn->get_inode(); - } - - return NULL; // oh, we have the full path. -} - - - -void MDCache::open_remote_dir(CInode *diri, frag_t fg, Context *fin) -{ - dout(10) << "open_remote_dir on " << *diri << endl; - - assert(diri->is_dir()); - assert(!diri->is_auth()); - assert(diri->get_dirfrag(fg) == 0); - - int auth = diri->authority().first; - - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - // discover it - filepath want; // no dentries, i just want the dir open - MDiscover *dis = new MDiscover(mds->get_nodeid(), - diri->ino(), - want, - true); // need the base dir open - dis->set_base_dir_frag(fg); - mds->send_message_mds(dis, auth, MDS_PORT_CACHE); - dir_discovers[diri->ino()].insert(auth); - diri->add_waiter(CInode::WAIT_DIR, fin); - } else { - // mds is down or recovering. forge a replica! - forge_replica_dir(diri, fg, auth); - } -} - - -/** - * get_dentry_inode - get or open inode - * - * @dn the dentry - * @mdr current request - * - * will return inode for primary, or link up/open up remote link's inode as necessary. 
- */ -CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequest *mdr) -{ - assert(!dn->is_null()); - - if (dn->is_primary()) - return dn->inode; - - assert(dn->is_remote()); - CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "get_dentry_inode linking in remote in " << *in << endl; - dn->link_remote(in); - return in; - } else { - dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << endl; - open_remote_ino(dn->get_remote_ino(), mdr, new C_MDS_RetryRequest(this, mdr)); - return 0; - } -} - - -class C_MDC_OpenRemoteIno : public Context { - MDCache *mdcache; - inodeno_t ino; - MDRequest *mdr; - Context *onfinish; -public: - vector anchortrace; - - C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c) {} - C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, vector& at, - MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c), anchortrace(at) {} - - void finish(int r) { - assert(r == 0); - if (r == 0) - mdcache->open_remote_ino_2(ino, mdr, anchortrace, onfinish); - else { - onfinish->finish(r); - delete onfinish; - } - } -}; - -void MDCache::open_remote_ino(inodeno_t ino, - MDRequest *mdr, - Context *onfinish) -{ - dout(7) << "open_remote_ino on " << ino << endl; - - C_MDC_OpenRemoteIno *c = new C_MDC_OpenRemoteIno(this, ino, mdr, onfinish); - mds->anchorclient->lookup(ino, c->anchortrace, c); -} - -void MDCache::open_remote_ino_2(inodeno_t ino, - MDRequest *mdr, - vector& anchortrace, - Context *onfinish) -{ - dout(7) << "open_remote_ino_2 on " << ino - << ", trace depth is " << anchortrace.size() << endl; - - // find deepest cached inode in prefix - unsigned i = anchortrace.size(); // i := array index + 1 - CInode *in = 0; - while (1) { - // inode? - dout(10) << " " << i << ": " << anchortrace[i-1] << endl; - in = get_inode(anchortrace[i-1].ino); - if (in) break; - i--; - if (!i) { - in = get_inode(anchortrace[i].dirfrag.ino); - assert(in); // actually, we may need to open the root or a foreign stray inode, here. - break; - } - } - dout(10) << "deepest cached inode at " << i << " is " << *in << endl; - - if (in->ino() == ino) { - // success - dout(10) << "open_remote_ino_2 have " << *in << endl; - onfinish->finish(0); - delete onfinish; - return; - } - - // open dirfrag beneath *in - frag_t frag = anchortrace[i].dirfrag.frag; - - if (!in->dirfragtree.contains(frag)) { - dout(10) << "frag " << frag << " not valid, requerying anchortable" << endl; - open_remote_ino(ino, mdr, onfinish); - return; - } - - if (!in->is_auth()) { - dout(10) << "opening remote dirfrag " << frag << " under " << *in << endl; - open_remote_dir(in, frag, - new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); - return; - } - - CDir *dir = in->get_or_open_dirfrag(this, frag); - assert(dir); - if (dir->is_auth()) { - if (dir->is_complete()) { - // hrm. requery anchor table. - dout(10) << "expected ino " << anchortrace[i].ino - << " in complete dir " << *dir - << ", requerying anchortable" - << endl; - open_remote_ino(ino, mdr, onfinish); - } else { - dout(10) << "need ino " << anchortrace[i].ino - << ", fetching incomplete dir " << *dir - << endl; - dir->fetch(new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); - } - } else { - // hmm, discover. - dout(10) << "have remote dirfrag " << *dir << ", discovering " - << anchortrace[i].ino << endl; - - MDiscover *dis = new MDiscover(mds->get_nodeid(), - dir->dirfrag(), - anchortrace[i].ino, - true); // being conservative here. 
- mds->send_message_mds(dis, dir->authority().first, MDS_PORT_CACHE); - } -} - - - - -void MDCache::make_trace(vector& trace, CInode *in) -{ - CInode *parent = in->get_parent_inode(); - if (parent) { - make_trace(trace, parent); - - CDentry *dn = in->get_parent_dn(); - dout(15) << "make_trace adding " << *dn << endl; - trace.push_back(dn); - } -} - - -MDRequest *MDCache::request_start(MClientRequest *req) -{ - // did we win a forward race against a slave? - if (active_requests.count(req->get_reqid())) { - MDRequest *mdr = active_requests[req->get_reqid()]; - dout(10) << "request_start already had " << *mdr << ", cleaning up" << endl; - assert(mdr->is_slave()); - request_cleanup(mdr); - delete mdr; - } - - // register new client request - MDRequest *mdr = new MDRequest(req->get_reqid(), req); - active_requests[req->get_reqid()] = mdr; - dout(7) << "request_start " << *mdr << endl; - return mdr; -} - -MDRequest *MDCache::request_start_slave(metareqid_t ri, int by) -{ - MDRequest *mdr = new MDRequest(ri, by); - assert(active_requests.count(mdr->reqid) == 0); - active_requests[mdr->reqid] = mdr; - dout(7) << "request_start_slave " << *mdr << " by mds" << by << endl; - return mdr; -} - - -MDRequest *MDCache::request_get(metareqid_t rid) -{ - assert(active_requests.count(rid)); - dout(7) << "request_get " << rid << " " << *active_requests[rid] << endl; - return active_requests[rid]; -} - -void MDCache::request_finish(MDRequest *mdr) -{ - dout(7) << "request_finish " << *mdr << endl; - - // slave finisher? - if (mdr->slave_commit) { - mdr->slave_commit->finish(0); - delete mdr->slave_commit; - mdr->slave_commit = 0; - } - - delete mdr->client_request; - delete mdr->slave_request; - request_cleanup(mdr); - - if (mds->logger) mds->logger->inc("reply"); -} - - -void MDCache::request_forward(MDRequest *mdr, int who, int port) -{ - if (!port) port = MDS_PORT_SERVER; - dout(7) << "request_forward " << *mdr << " to mds" << who << " req " << *mdr << endl; - - mds->forward_message_mds(mdr->client_request, who, port); - request_cleanup(mdr); - - if (mds->logger) mds->logger->inc("fw"); -} - - -void MDCache::dispatch_request(MDRequest *mdr) -{ - if (mdr->client_request) { - mds->server->dispatch_client_request(mdr); - } else if (mdr->slave_request) { - mds->server->dispatch_slave_request(mdr); - } else - assert(0); -} - - - -void MDCache::request_forget_foreign_locks(MDRequest *mdr) -{ - // xlocks - set::iterator p = mdr->xlocks.begin(); - while (p != mdr->xlocks.end()) { - if ((*p)->get_parent()->is_auth()) - p++; - else { - dout(10) << "request_forget_foreign_locks " << **p - << " on " << *(*p)->get_parent() << endl; - (*p)->put_xlock(); - mdr->locks.erase(*p); - mdr->xlocks.erase(p++); - } - } -} - -void MDCache::request_cleanup(MDRequest *mdr) -{ - dout(15) << "request_cleanup " << *mdr << endl; - metareqid_t ri = mdr->reqid; - - // clear ref, trace - mdr->ref = 0; - mdr->trace.clear(); - - // clean up slaves - // (will implicitly drop remote dn pins) - for (set::iterator p = mdr->slaves.begin(); - p != mdr->slaves.end(); - ++p) { - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_FINISH); - mds->send_message_mds(r, *p, MDS_PORT_SERVER); - } - // strip foreign xlocks out of lock lists, since the OP_FINISH drops them implicitly. 
- request_forget_foreign_locks(mdr); - - - // drop locks - mds->locker->drop_locks(mdr); - - // drop (local) auth pins - mdr->drop_local_auth_pins(); - - // drop cache pins - for (set::iterator it = mdr->pins.begin(); - it != mdr->pins.end(); - it++) - (*it)->put(MDSCacheObject::PIN_REQUEST); - mdr->pins.clear(); - - // remove from map - active_requests.erase(mdr->reqid); - delete mdr; - - - - - // log some stats ***** - if (mds->logger) { - mds->logger->set("c", lru.lru_get_size()); - mds->logger->set("cpin", lru.lru_get_num_pinned()); - mds->logger->set("ctop", lru.lru_get_top()); - mds->logger->set("cbot", lru.lru_get_bot()); - mds->logger->set("cptail", lru.lru_get_pintail()); - //mds->logger->set("buf",buffer_total_alloc); - } - - if (g_conf.log_pins) { - // pin - /* -for (int i=0; ilogger2) mds->logger2->set(cinode_pin_names[i], - cinode_pins[i]); - } - */ - /* - for (map::iterator it = cdir_pins.begin(); - it != cdir_pins.end(); - it++) { - //string s = "D"; - //s += cdir_pin_names[it->first]; - if (mds->logger2) mds->logger2->set(//s, - cdir_pin_names[it->first], - it->second); - } - */ - } - -} - - -// -------------------------------------------------------------------- -// ANCHORS - -// CREATE - -class C_MDC_AnchorCreatePrepared : public Context { - MDCache *cache; - CInode *in; -public: - version_t atid; - C_MDC_AnchorCreatePrepared(MDCache *c, CInode *i) : cache(c), in(i) {} - void finish(int r) { - cache->_anchor_create_prepared(in, atid); - } -}; - -void MDCache::anchor_create(MDRequest *mdr, CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // auth pin - if (!in->can_auth_pin() && - !mdr->is_auth_pinned(in)) { - dout(7) << "anchor_create not authpinnable, waiting on " << *in << endl; - in->add_waiter(CInode::WAIT_AUTHPINNABLE, onfinish); - return; - } - - // wait - in->add_waiter(CInode::WAIT_ANCHORED, onfinish); - - // already anchoring? 
- if (in->state_test(CInode::STATE_ANCHORING)) { - dout(7) << "anchor_create already anchoring " << *in << endl; - return; - } - - dout(7) << "anchor_create " << *in << endl; - - // auth: do it - in->state_set(CInode::STATE_ANCHORING); - in->get(CInode::PIN_ANCHORING); - in->auth_pin(); - - // make trace - vector trace; - in->make_anchor_trace(trace); - - // do it - C_MDC_AnchorCreatePrepared *fin = new C_MDC_AnchorCreatePrepared(this, in); - mds->anchorclient->prepare_create(in->ino(), trace, &fin->atid, fin); -} - -class C_MDC_AnchorCreateLogged : public Context { - MDCache *cache; - CInode *in; - version_t atid; - version_t pdv; -public: - C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, version_t v) : - cache(c), in(i), atid(t), pdv(v) {} - void finish(int r) { - cache->_anchor_create_logged(in, atid, pdv); - } -}; - -void MDCache::_anchor_create_prepared(CInode *in, version_t atid) -{ - dout(10) << "_anchor_create_prepared " << *in << " atid " << atid << endl; - assert(in->inode.anchored == false); - - // predirty, prepare log entry - version_t pdv = in->pre_dirty(); - - EUpdate *le = new EUpdate("anchor_create"); - le->metablob.add_dir_context(in->get_parent_dir()); - - // update the logged inode copy - inode_t *pi = le->metablob.add_dentry(in->parent, true); - pi->anchored = true; - pi->version = pdv; - - // note anchor transaction - le->metablob.add_anchor_transaction(atid); - - // log + wait - mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid, pdv)); -} - - -void MDCache::_anchor_create_logged(CInode *in, version_t atid, version_t pdv) -{ - dout(10) << "_anchor_create_logged pdv " << pdv << " on " << *in << endl; - - // unpin - assert(in->state_test(CInode::STATE_ANCHORING)); - in->state_clear(CInode::STATE_ANCHORING); - in->put(CInode::PIN_ANCHORING); - in->auth_unpin(); - - // apply update to cache - in->inode.anchored = true; - in->mark_dirty(pdv); - - // tell the anchortable we've committed - mds->anchorclient->commit(atid); - - // trigger waiters - in->finish_waiting(CInode::WAIT_ANCHORED, 0); -} - - -// DESTROY - -class C_MDC_AnchorDestroyPrepared : public Context { - MDCache *cache; - CInode *in; -public: - version_t atid; - C_MDC_AnchorDestroyPrepared(MDCache *c, CInode *i) : cache(c), in(i) {} - void finish(int r) { - cache->_anchor_destroy_prepared(in, atid); - } -}; - -void MDCache::anchor_destroy(CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // auth pin - if (!in->can_auth_pin()/* && - !mdr->is_auth_pinned(in)*/) { - dout(7) << "anchor_destroy not authpinnable, waiting on " << *in << endl; - in->add_waiter(CInode::WAIT_AUTHPINNABLE, onfinish); - return; - } - - // wait - if (onfinish) - in->add_waiter(CInode::WAIT_UNANCHORED, onfinish); - - // already anchoring? 
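
The anchor_create path above runs in three steps: prepare a transaction against the anchor table, journal the inode update together with the resulting transaction id, and only once the journal entry is safe apply the change to the cached inode and commit the transaction. A compact sketch of that prepare/journal/commit chaining; AnchorTableStub, Journal and the callbacks are stand-ins, not the real AnchorClient or MDLog interfaces:

#include <functional>
#include <iostream>

struct AnchorTableStub {
  unsigned long next_atid = 1;
  // prepare: reserve a transaction id and hand it back via the callback
  void prepare_create(std::function<void(unsigned long)> onprepared) {
    onprepared(next_atid++);
  }
  void commit(unsigned long atid) { std::cout << "anchor txn " << atid << " committed\n"; }
};

struct Journal {
  // pretend the entry is durable immediately and run the completion
  void submit_entry(const char* what, std::function<void()> onsafe) {
    std::cout << "journaled " << what << "\n";
    onsafe();
  }
};

int main() {
  AnchorTableStub anchortable;
  Journal mdlog;
  bool anchored = false;

  // 1) prepare against the anchor table and get a transaction id
  anchortable.prepare_create([&](unsigned long atid) {
    // 2) journal the inode update together with the anchor transaction id
    mdlog.submit_entry("anchor_create", [&, atid]() {
      // 3) once the journal entry is safe: apply to the cache, then commit the txn
      anchored = true;
      anchortable.commit(atid);
    });
  });
  std::cout << "anchored=" << anchored << "\n";
}
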
- if (in->state_test(CInode::STATE_UNANCHORING)) { - dout(7) << "anchor_destroy already unanchoring " << *in << endl; - return; - } - - dout(7) << "anchor_destroy " << *in << endl; - - // auth: do it - in->state_set(CInode::STATE_UNANCHORING); - in->get(CInode::PIN_UNANCHORING); - in->auth_pin(); - - // do it - C_MDC_AnchorDestroyPrepared *fin = new C_MDC_AnchorDestroyPrepared(this, in); - mds->anchorclient->prepare_destroy(in->ino(), &fin->atid, fin); -} - -class C_MDC_AnchorDestroyLogged : public Context { - MDCache *cache; - CInode *in; - version_t atid; - version_t pdv; -public: - C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, version_t v) : - cache(c), in(i), atid(t), pdv(v) {} - void finish(int r) { - cache->_anchor_destroy_logged(in, atid, pdv); - } -}; - -void MDCache::_anchor_destroy_prepared(CInode *in, version_t atid) -{ - dout(10) << "_anchor_destroy_prepared " << *in << " atid " << atid << endl; - - assert(in->inode.anchored == true); - - // predirty, prepare log entry - version_t pdv = in->pre_dirty(); - - EUpdate *le = new EUpdate("anchor_destroy"); - le->metablob.add_dir_context(in->get_parent_dir()); - - // update the logged inode copy - inode_t *pi = le->metablob.add_dentry(in->parent, true); - pi->anchored = true; - pi->version = pdv; - - // note anchor transaction - le->metablob.add_anchor_transaction(atid); - - // log + wait - mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, pdv)); -} - - -void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, version_t pdv) -{ - dout(10) << "_anchor_destroy_logged pdv " << pdv << " on " << *in << endl; - - // unpin - assert(in->state_test(CInode::STATE_UNANCHORING)); - in->state_clear(CInode::STATE_UNANCHORING); - in->put(CInode::PIN_UNANCHORING); - in->auth_unpin(); - - // apply update to cache - in->inode.anchored = false; - in->inode.version = pdv; - - // tell the anchortable we've committed - mds->anchorclient->commit(atid); - - // trigger waiters - in->finish_waiting(CInode::WAIT_UNANCHORED, 0); -} - - -// ------------------------------------------------------------------------------- -// STRAYS - -void MDCache::eval_stray(CDentry *dn) -{ - dout(10) << "eval_stray " << *dn << endl; - assert(dn->is_primary()); - CInode *in = dn->inode; - assert(in); - - // purge? - if (in->inode.nlink == 0) { - if (!dn->is_replicated() && !in->is_any_caps()) - _purge_stray(dn); - return; - } - else if (in->inode.nlink == 1) { - // trivial reintegrate? - if (!in->remote_parents.empty()) { - CDentry *rlink = *in->remote_parents.begin(); - if (rlink->is_auth() && - rlink->dir->can_auth_pin()) - reintegrate_stray(dn, rlink); - - if (!rlink->is_auth() && - !in->is_ambiguous_auth()) - migrate_stray(dn, rlink->authority().first); - } - } else { - // wait for next use. 
- } -} - - -class C_MDC_PurgeStray : public Context { - MDCache *cache; - CDentry *dn; - version_t pdv; -public: - C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v) : cache(c), dn(d), pdv(v) { } - void finish(int r) { - cache->_purge_stray_logged(dn, pdv); - } -}; - -void MDCache::_purge_stray(CDentry *dn) -{ - dout(10) << "_purge_stray " << *dn << " " << *dn->inode << endl; - assert(!dn->is_replicated()); - - // log removal - version_t pdv = dn->pre_dirty(); - - EUpdate *le = new EUpdate("purge_stray"); - le->metablob.add_dir_context(dn->dir); - le->metablob.add_null_dentry(dn, true); - le->metablob.add_inode_truncate(dn->inode->inode, 0); - mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv)); -} - -void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv) -{ - dout(10) << "_purge_stray_logged " << *dn << " " << *dn->inode << endl; - CInode *in = dn->inode; - - // dirty+unlink dentry - dn->dir->mark_dirty(pdv); - dn->dir->unlink_inode(dn); - dn->dir->remove_dentry(dn); - - // purge+remove inode - purge_inode(&in->inode, 0); - remove_inode(in); -} - - - -void MDCache::reintegrate_stray(CDentry *dn, CDentry *rlink) -{ - dout(10) << "reintegrate_stray " << *dn << " into " << *rlink << endl; - -} - - -void MDCache::migrate_stray(CDentry *dn, int dest) -{ - dout(10) << "migrate_stray to mds" << dest << " " << *dn << endl; - -} - - - - -// REPLICAS - - -void MDCache::handle_discover(MDiscover *dis) -{ - int whoami = mds->get_nodeid(); - - assert(dis->get_asker() != whoami); - - CInode *cur = 0; - MDiscoverReply *reply = new MDiscoverReply(dis->get_base_ino()); - - // get started. - if (dis->get_base_ino() == MDS_INO_ROOT) { - // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() - << " wants root + " << dis->get_want().get_path() << endl; - - assert(mds->get_nodeid() == 0); - assert(root->is_auth()); - - // add root - reply->add_inode( root->replicate_to( dis->get_asker() ) ); - dout(10) << "added root " << *root << endl; - - cur = root; - } - else if (dis->get_base_ino() == MDS_INO_STRAY(whoami)) { - // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() - << " wants stray + " << dis->get_want().get_path() << endl; - - reply->add_inode( stray->replicate_to( dis->get_asker() ) ); - dout(10) << "added stray " << *stray << endl; - - cur = stray; - } - else { - // there's a base inode - cur = get_inode(dis->get_base_ino()); - - if (!cur) { - dout(7) << "handle_discover mds" << dis->get_asker() - << " don't have base ino " << dis->get_base_ino() - << ", dropping" << endl; - delete reply; - return; - } - - if (dis->wants_base_dir()) { - dout(7) << "handle_discover mds" << dis->get_asker() - << " has " << *cur - << " wants basedir+" << dis->get_want().get_path() - << endl; - } else { - dout(7) << "handle_discover mds" << dis->get_asker() - << " has " << *cur - << " wants " << dis->get_want().get_path() - << endl; - } - } - - assert(reply); - assert(cur); - - // add content - // do some fidgeting to include a dir if they asked for the base dir, or just root. - for (unsigned i = 0; - i < dis->get_want().depth() || dis->get_want().depth() == 0; - i++) { - - // -- figure out the dir - - // is *cur even a dir at all? 
- if (!cur->is_dir()) { - dout(7) << *cur << " not a dir" << endl; - reply->set_flag_error_dir(); - break; - } - - // pick frag - frag_t fg; - if (dis->get_want().depth()) { - // dentry specifies - fg = cur->pick_dirfrag(dis->get_dentry(i)); - } else { - // requester explicity specified the frag - fg = dis->get_base_dir_frag(); - assert(dis->wants_base_dir() || dis->get_base_ino() < MDS_INO_BASE); - } - CDir *curdir = cur->get_dirfrag(fg); - - // am i dir auth (or if no dir, at least the inode auth) - if ((!curdir && !cur->is_auth()) || - (curdir && !curdir->is_auth())) { - if (curdir) { - dout(7) << *curdir << " not dirfrag auth, setting dir_auth_hint" << endl; - reply->set_dir_auth_hint(curdir->authority().first); - } else { - dout(7) << *cur << " dirfrag not open, not inode auth, setting dir_auth_hint" << endl; - reply->set_dir_auth_hint(cur->authority().first); - } - reply->set_wanted_xlocks_hint(dis->wants_xlocked()); - - // set hint (+ dentry, if there is one) - if (dis->get_want().depth() > i) - reply->set_error_dentry(dis->get_dentry(i)); - break; - } - - // open dir? - if (!curdir) - curdir = cur->get_or_open_dirfrag(this, fg); - assert(curdir); - assert(curdir->is_auth()); - - // is dir frozen? - if (curdir->is_frozen()) { - if (reply->is_empty()) { - dout(7) << *curdir << " is frozen, empty reply, waiting" << endl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } else { - dout(7) << *curdir << " is frozen, non-empty reply, stopping" << endl; - break; - } - } - - // add dir - if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "handle_discover not adding unwanted base dir " << *curdir << endl; - } else { - assert(!curdir->is_ambiguous_auth()); // would be frozen. - reply->add_dir( curdir->replicate_to(dis->get_asker()) ); - dout(7) << "handle_discover added dir " << *curdir << endl; - } - if (dis->get_want().depth() == 0) break; - - // lookup inode? - CDentry *dn = 0; - if (dis->get_want_ino()) { - CInode *in = get_inode(dis->get_want_ino()); - if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir) - dn = in->get_parent_dn(); - } else { - // lookup dentry - dn = curdir->lookup( dis->get_dentry(i) ); - } - - // incomplete dir? - if (!dn) { - if (!curdir->is_complete()) { - // readdir - dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << endl; - if (reply->is_empty()) { - // fetch and wait - curdir->fetch(new C_MDS_RetryMessage(mds, dis)); - return; - } else { - // initiate fetch, but send what we have so far - curdir->fetch(0); - break; - } - } - - // don't have wanted ino in this dir? - if (dis->get_want_ino()) { - // set error flag in reply - dout(7) << "ino " << dis->get_want_ino() << " in this dir, flagging error in " - << *curdir << endl; - reply->set_flag_error_ino(); - break; - } - - // send null dentry - dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " - << *curdir << endl; - dn = curdir->add_dentry(dis->get_dentry(i), 0); - } - assert(dn); - - // xlocked dentry? - // ...always block on non-tail items (they are unrelated) - // ...allow xlocked tail disocvery _only_ if explicitly requested - if (dn->lock.is_xlocked()) { - // is this the last (tail) item in the discover traversal? 
- bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1); - if (tailitem && dis->wants_xlocked()) { - dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << endl; - } else { - dout(7) << "handle_discover blocking on xlocked " << *dn << endl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } - } - - // add dentry - reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "handle_discover added dentry " << *dn << endl; - - if (!dn->is_primary()) break; // stop on null or remote link. - - // add inode - CInode *next = dn->inode; - assert(next->is_auth()); - - reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "handle_discover added inode " << *next << endl; - - // descend, keep going. - cur = next; - continue; - } - - // how did we do? - if (reply->is_empty()) { - dout(7) << "handle_discover dropping this empty reply)." << endl; - delete reply; - } else { - dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << endl; - mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); - } - - // done. - delete dis; -} - - -void MDCache::handle_discover_reply(MDiscoverReply *m) -{ - // starting point - list finished, error; - - // grab base inode - CInode *cur = get_inode(m->get_base_ino()); - - if (cur) { - dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl; - } - else if (m->get_base_ino() == MDS_INO_ROOT) { - // it's the root inode. - assert(!root); - assert(!m->has_base_dentry()); - assert(!m->has_base_dir()); - - dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl; - - // add in root - cur = add_replica_inode(m->get_inode(0), NULL); - cur->force_auth = pair(m->get_source().num(), CDIR_AUTH_UNKNOWN); - set_root(cur); - dout(7) << "discover_reply got root " << *cur << endl; - - // take root waiters - finished.swap(waiting_for_root); - } - else if (MDS_INO_IS_STRAY(m->get_base_ino())) { - dout(7) << "discover_reply stray + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl; - - // add - cur = add_replica_inode(m->get_inode(0), NULL); - cur->force_auth = pair(m->get_source().num(), CDIR_AUTH_UNKNOWN); - - dout(7) << "discover_reply got stray " << *cur << endl; - - // take waiters - finished.swap(waiting_for_stray[cur->ino()]); - waiting_for_stray.erase(cur->ino()); - } - - // fyi - if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl; - if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl; - dout(10) << "depth = " << m->get_depth() - << ", has base_dir/base_dn/root = " - << m->has_base_dir() << " / " << m->has_base_dentry() << " / " << m->has_base_inode() - << ", num dirs/dentries/inodes = " - << m->get_num_dirs() << " / " << m->get_num_dentries() << " / " << m->get_num_inodes() - << endl; - - // loop over discover results. - // indexese follow each ([[dir] dentry] inode) - // can start, end with any type. 
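
The loop that follows decodes the reply in the layout noted above: results repeat as [[dir] dentry] inode, and whether the leading dir and dentry are present depends on what the asker already had cached. A standalone sketch of walking that layout; ReplyStub and walk are illustrative stand-ins, not MDiscoverReply:

#include <cstdio>
#include <string>
#include <vector>

struct ReplyStub {
  bool has_base_inode = false, has_base_dir = false, has_base_dentry = false;
  // parallel lists; slot 0 may be unused depending on the has_base_* flags
  std::vector<std::string> dirs, dentries, inodes;
  int depth() const { return (int)inodes.size(); }
};

void walk(const ReplyStub& m) {
  for (int i = m.has_base_inode ? 1 : 0; i < m.depth(); i++) {
    if (i > 0 || m.has_base_dir)    std::printf("dir     %s\n", m.dirs[i].c_str());
    if (i > 0 || m.has_base_dentry) std::printf("dentry  %s\n", m.dentries[i].c_str());
    std::printf("inode   %s\n", m.inodes[i].c_str());
  }
}

int main() {
  // e.g. the asker already had the base inode, so the reply starts with a dir
  ReplyStub m;
  m.has_base_inode = true;
  m.dirs     = {"", "dir(/)",    "dir(/usr)"};
  m.dentries = {"", "dn(usr)",   "dn(bin)"};
  m.inodes   = {"", "ino(/usr)", "ino(/usr/bin)"};
  walk(m);
}
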
- - for (int i=m->has_base_inode(); iget_depth(); i++) { - dout(10) << "discover_reply i=" << i << " cur " << *cur << endl; - - // dir - frag_t fg; - CDir *curdir = 0; - if (i > 0 || m->has_base_dir()) { - assert(m->get_dir(i).get_dirfrag().ino == cur->ino()); - fg = m->get_dir(i).get_dirfrag().frag; - - // add/update the dir replica - curdir = add_replica_dir(cur, fg, m->get_dir(i), - m->get_source().num(), - finished); - } - if (!curdir) { - fg = cur->pick_dirfrag(m->get_dentry(i).get_dname()); - curdir = cur->get_dirfrag(fg); - } - - // dentry error? - if (i == m->get_depth()-1 && - m->is_flag_error_dn()) { - // error! - assert(cur->is_dir()); - if (curdir) { - dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << endl; - curdir->take_dentry_waiting(m->get_error_dentry(), - error); - } else { - dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << endl; - cur->take_waiting(CInode::WAIT_DIR, error); - dir_discovers.erase(cur->ino()); - } - break; - } - - assert(curdir); - - // dentry - CDentry *dn = 0; - if (i >= m->get_last_dentry()) break; - if (i > 0 || m->has_base_dentry()) { - dn = add_replica_dentry(curdir, m->get_dentry(i), finished); - } - - // inode - if (i >= m->get_last_inode()) break; - cur = add_replica_inode(m->get_inode(i), dn); - } - - // dir_auth hint? - if (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN && - m->get_dir_auth_hint() != mds->get_nodeid()) { - dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << endl; - - // try again. include dentry _and_ dirfrag, just in case. - int hint = m->get_dir_auth_hint(); - filepath want; - want.push_dentry(m->get_error_dentry()); - MDiscover *dis = new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - true, - m->get_wanted_xlocks_hint()); - frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); - dis->set_base_dir_frag(fg); - mds->send_message_mds(dis, hint, MDS_PORT_CACHE); - - // note the dangling discover... but only if it's already noted in dir_discovers (i.e. someone is waiting) - if (dir_discovers.count(cur->ino())) { - dir_discovers[cur->ino()].insert(hint); - assert(cur->is_waiter_for(CInode::WAIT_DIR)); - } - } - else if (m->is_flag_error_dir()) { - // dir error at the end there? - dout(7) << " flag_error on dir " << *cur << endl; - assert(!cur->is_dir()); - cur->take_waiting(CInode::WAIT_DIR, error); - dir_discovers.erase(cur->ino()); - } - - // finish errors directly - finish_contexts(error, -ENOENT); - mds->queue_waiters(finished); - - // done - delete m; -} - - -CDir *MDCache::add_replica_dir(CInode *diri, - frag_t fg, CDirDiscover &dis, int from, - list& finished) -{ - // add it (_replica_) - CDir *dir = diri->get_dirfrag(fg); - - if (dir) { - // had replica. update w/ new nonce. - dis.update_dir(dir); - dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << endl; - } else { - // add replica. - dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); - dis.update_dir(dir); - - // is this a dir_auth delegation boundary? 
- if (from != diri->authority().first || - diri->is_ambiguous_auth() || - diri->ino() < MDS_INO_BASE) - adjust_subtree_auth(dir, from); - - dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << endl; - - // get waiters - diri->take_waiting(CInode::WAIT_DIR, finished); - dir_discovers.erase(diri->ino()); - } - - return dir; -} - -CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, int from) -{ - assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN); - - // forge a replica. - CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); - - // i'm assuming this is a subtree root. - adjust_subtree_auth(dir, from); - - dout(7) << "forge_replica_dir added " << *dir << " while mds" << from << " is down" << endl; - - return dir; -} - -CDentry *MDCache::add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished) -{ - CDentry *dn = dir->lookup( dis.get_dname() ); - - // have it? - if (dn) { - dis.update_dentry(dn); - dout(7) << "add_replica_dentry had " << *dn << endl; - } else { - dn = dir->add_dentry( dis.get_dname(), 0 ); - dis.update_dentry(dn); - dis.init_dentry_lock(dn); - dout(7) << "add_replica_dentry added " << *dn << endl; - } - - // remote_ino linkage? - if (dis.get_remote_ino()) { - if (dn->is_null()) - dir->link_inode(dn, dis.get_remote_ino()); - - // hrm. yeah. - assert(dn->is_remote() && dn->get_remote_ino() == dis.get_remote_ino()); - } - - dir->take_dentry_waiting(dis.get_dname(), finished); - - return dn; -} - -CInode *MDCache::add_replica_inode(CInodeDiscover& dis, CDentry *dn) -{ - CInode *in = get_inode(dis.get_ino()); - if (!in) { - in = new CInode(this, false); - dis.update_inode(in); - dis.init_inode_locks(in); - add_inode(in); - dout(10) << "add_replica_inode had " << *in << endl; - if (dn && dn->is_null()) - dn->dir->link_inode(dn, in); - } else { - dis.update_inode(in); - dout(10) << "add_replica_inode added " << *in << endl; - } - - if (dn) { - assert(dn->is_primary()); - assert(dn->inode == in); - } - - return in; -} - - -CDentry *MDCache::add_replica_stray(bufferlist &bl, CInode *in, int from) -{ - list finished; - int off = 0; - - // inode - CInodeDiscover indis; - indis._decode(bl, off); - CInode *strayin = add_replica_inode(indis, NULL); - strayin->force_auth = pair(from, CDIR_AUTH_UNKNOWN); - dout(15) << "strayin " << *strayin << endl; - - // dir - CDirDiscover dirdis; - dirdis._decode(bl, off); - CDir *straydir = add_replica_dir(strayin, dirdis.get_dirfrag().frag, dirdis, - from, finished); - dout(15) << "straydir " << *straydir << endl; - - // dentry - CDentryDiscover dndis; - dndis._decode(bl, off); - - string straydname; - in->name_stray_dentry(straydname); - CDentry *straydn = add_replica_dentry(straydir, dndis, finished); - - mds->queue_waiters(finished); - - return straydn; -} - - - -/* -int MDCache::send_inode_updates(CInode *in) -{ - assert(in->is_auth()); - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - dout(7) << "sending inode_update on " << *in << " to " << *it << endl; - assert(*it != mds->get_nodeid()); - mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_inode_update(MInodeUpdate *m) -{ - inodeno_t ino = m->get_ino(); - CInode *in = get_inode(m->get_ino()); - if (!in) { - //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << endl; - dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << endl; - MCacheExpire 
*expire = new MCacheExpire(mds->get_nodeid()); - expire->add_inode(m->get_ino(), m->get_nonce()); - mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE); - goto out; - } - - if (in->is_auth()) { - dout(7) << "inode_update on " << *in << ", but i'm the authority!" << endl; - assert(0); // this should never happen - } - - dout(7) << "inode_update on " << *in << endl; - - // update! NOTE dir_auth is unaffected by this. - in->decode_basic_state(m->get_payload()); - - out: - // done - delete m; -} -*/ - - - - - - -int MDCache::send_dir_updates(CDir *dir, bool bcast) -{ - // this is an FYI, re: replication - - set who; - if (bcast) { - mds->get_mds_map()->get_active_mds_set(who); - } else { - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) - who.insert(p->first); - } - - dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << endl; - - string path; - dir->inode->make_path(path); - - int whoami = mds->get_nodeid(); - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - if (*it == whoami) continue; - //if (*it == except) continue; - dout(7) << "sending dir_update on " << *dir << " to " << *it << endl; - - mds->send_message_mds(new MDirUpdate(dir->dirfrag(), - dir->dir_rep, - dir->dir_rep_by, - path, - bcast), - *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_dir_update(MDirUpdate *m) -{ - CDir *dir = get_dirfrag(m->get_dirfrag()); - if (!dir) { - dout(5) << "dir_update on " << m->get_dirfrag() << ", don't have it" << endl; - - // discover it? - if (m->should_discover()) { - m->tried_discover(); // only once! - vector trace; - filepath path = m->get_path(); - - dout(5) << "trying discover on dir_update for " << path << endl; - - int r = path_traverse(0, m, - 0, path, trace, true, - MDS_TRAVERSE_DISCOVER); - if (r > 0) - return; - assert(r == 0); - - CInode *in = get_inode(m->get_dirfrag().ino); - assert(in); - open_remote_dir(in, m->get_dirfrag().frag, - new C_MDS_RetryMessage(mds, m)); - return; - } - - delete m; - return; - } - - // update - dout(5) << "dir_update on " << *dir << endl; - dir->dir_rep = m->get_dir_rep(); - dir->dir_rep_by = m->get_dir_rep_by(); - - // done - delete m; -} - - - - - - -// UNLINK - -void MDCache::handle_dentry_unlink(MDentryUnlink *m) -{ - CDir *dir = get_dirfrag(m->get_dirfrag()); - - if (!dir) { - dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << endl; - } - else { - CDentry *dn = dir->lookup(m->get_dn()); - if (!dn) { - dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl; - } else { - dout(7) << "handle_dentry_unlink on " << *dn << endl; - - // move to stray? - CDentry *straydn = 0; - if (m->strayin) { - list finished; - CInode *in = add_replica_inode(*m->strayin, NULL); - CDir *dir = add_replica_dir(in, m->straydir->get_dirfrag().frag, *m->straydir, - m->get_source().num(), finished); - straydn = add_replica_dentry(dir, *m->straydn, finished); - if (!finished.empty()) mds->queue_waiters(finished); - } - - // open inode? 
- if (dn->is_primary()) { - CInode *in = dn->inode; - dn->dir->unlink_inode(dn); - assert(straydn); - straydn->dir->link_inode(straydn, in); - } else { - assert(dn->is_remote()); - dn->dir->unlink_inode(dn); - } - assert(dn->is_null()); - - // move to bottom of lru - lru.lru_bottouch(dn); - } - } - - delete m; - return; -} - - - - - - - - - -// ============================================================== -// debug crap - -void MDCache::show_subtrees(int dbl) -{ - //dout(10) << "show_subtrees" << endl; - - if (dbl > g_conf.debug && dbl > g_conf.debug_mds) - return; // i won't print anything. - - if (subtrees.empty()) { - dout(dbl) << "no subtrees" << endl; - return; - } - - // root frags - list rootfrags; - if (root) root->get_dirfrags(rootfrags); - if (stray) stray->get_dirfrags(rootfrags); - dout(15) << "rootfrags " << rootfrags << endl; - - // queue stuff - list > q; - string indent; - set seen; - - // calc max depth - for (list::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p) - q.push_back(pair(*p, 0)); - - int depth = 0; - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - if (d > depth) depth = d; - - // sanity check - //dout(25) << "saw depth " << d << " " << *dir << endl; - if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << endl; - assert(seen.count(dir) == 0); - seen.insert(dir); - - // nested items? - if (!subtrees[dir].empty()) { - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - //dout(25) << " saw sub " << **p << endl; - q.push_front(pair(*p, d+1)); - } - } - } - - - // print tree - for (list::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p) - q.push_back(pair(*p, 0)); - - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - // adjust indenter - while ((unsigned)d < indent.size()) - indent.resize(d); - - // pad - string pad = "______________________________________"; - pad.resize(depth*2+1-indent.size()); - if (!subtrees[dir].empty()) - pad[0] = '.'; // parent - - - string auth; - if (dir->is_auth()) - auth = "auth "; - else - auth = " rep "; - - char s[10]; - if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN) - sprintf(s, "%2d ", dir->get_dir_auth().first); - else - sprintf(s, "%2d,%2d", dir->get_dir_auth().first, dir->get_dir_auth().second); - - // print - dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << endl; - - if (dir->ino() == MDS_INO_ROOT) - assert(dir->inode == root); - if (dir->ino() == MDS_INO_STRAY(mds->get_nodeid())) - assert(dir->inode == stray); - - // nested items? - if (!subtrees[dir].empty()) { - // more at my level? - if (!q.empty() && q.front().second == d) - indent += "| "; - else - indent += " "; - - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - q.push_front(pair(*p, d+2)); - } - } -} - - -void MDCache::show_cache() -{ - dout(7) << "show_cache" << endl; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - // unlinked? - if (!it->second->parent) - dout(7) << " unlinked " << *it->second << endl; - - // dirfrags? 
- list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - dout(7) << " dirfrag " << *dir << endl; - - for (CDir_map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(7) << " dentry " << *dn << endl; - if (dn->is_primary() && dn->inode) - dout(7) << " inode " << *dn->inode << endl; - } - } - } -} - - -void MDCache::dump_cache() -{ - char fn[20]; - sprintf(fn, "cachedump.%d.mds%d", mds->mdsmap->get_epoch(), mds->get_nodeid()); - - dout(1) << "dump_cache to " << fn << endl; - - ofstream myfile; - myfile.open(fn); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - myfile << *dir->inode << endl; - myfile << *dir << endl; - - for (CDir_map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - myfile << *dn << endl; - } - } - } - - myfile.close(); -} diff --git a/branches/sage/pgs/mds/MDCache.h b/branches/sage/pgs/mds/MDCache.h deleted file mode 100644 index fcff21976645e..0000000000000 --- a/branches/sage/pgs/mds/MDCache.h +++ /dev/null @@ -1,625 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDCACHE_H -#define __MDCACHE_H - -#include -#include -#include -#include -#include - -#include "include/types.h" -#include "include/filepath.h" - -#include "CInode.h" -#include "CDentry.h" -#include "CDir.h" -#include "include/Context.h" -#include "events/EMetaBlob.h" - -class MDS; -class Migrator; -class Renamer; - -class Logger; - -class Message; - -class MMDSResolve; -class MMDSResolveAck; -class MMDSCacheRejoin; -class MMDSCacheRejoinAck; -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class Message; -class MClientRequest; -class MMDSSlaveRequest; - -// MDCache - -//typedef const char* pchar; - - -struct PVList { - map ls; - - version_t add(MDSCacheObject* o, version_t v) { - return ls[o] = v; - } -}; - -/** active_request_t - * state we track for requests we are currently processing. - * mostly information about locks held, so that we can drop them all - * the request is finished or forwarded. see request_*(). - */ -struct MDRequest { - metareqid_t reqid; - - // -- i am a client (master) request - MClientRequest *client_request; // client request (if any) - set slaves; // mds nodes that have slave requests to me (implies client_request) - set waiting_on_slave; // peers i'm waiting for slavereq replies from. - - vector trace; // original path traversal. - CInode *ref; // reference inode. if there is only one, and its path is pinned. - - // -- i am a slave request - MMDSSlaveRequest *slave_request; // slave request (if one is pending; implies slave == true) - int slave_to_mds; // this is a slave request if >= 0. 
- - // -- my pins and locks -- - // cache pins (so things don't expire) - set< MDSCacheObject* > pins; - - // auth pins - set< MDSCacheObject* > auth_pins; - - // held locks - set< SimpleLock* > rdlocks; // always local. - set< SimpleLock* > wrlocks; // always local. - set< SimpleLock* > xlocks; // local or remote. - set< SimpleLock*, SimpleLock::ptr_lt > locks; // full ordering - - // if this flag is set, do not attempt to acquire further locks. - // (useful for wrlock, which may be a moving auth target) - bool done_locking; - bool committing; - bool aborted; - - // for rename/link/unlink - utime_t now; - set witnessed; // nodes who have journaled a RenamePrepare - map pvmap; - - // for rename - set extra_witnesses; // replica list from srcdn auth (rename) - version_t src_reanchor_atid; // src->dst - version_t dst_reanchor_atid; // dst->stray - bufferlist inode_import; - version_t inode_import_v; - CDentry *srcdn; // srcdn, if auth, on slave - - // called when slave commits - Context *slave_commit; - - - // --------------------------------------------------- - MDRequest() : - client_request(0), ref(0), - slave_request(0), slave_to_mds(-1), - done_locking(false), committing(false), aborted(false), - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - slave_commit(0) { } - MDRequest(metareqid_t ri, MClientRequest *req) : - reqid(ri), client_request(req), ref(0), - slave_request(0), slave_to_mds(-1), - done_locking(false), committing(false), aborted(false), - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - slave_commit(0) { } - MDRequest(metareqid_t ri, int by) : - reqid(ri), client_request(0), ref(0), - slave_request(0), slave_to_mds(by), - done_locking(false), committing(false), aborted(false), - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - slave_commit(0) { } - - bool is_master() { return slave_to_mds < 0; } - bool is_slave() { return slave_to_mds >= 0; } - - bool slave_did_prepare() { return slave_commit; } - - // pin items in cache - void pin(MDSCacheObject *o) { - if (pins.count(o) == 0) { - o->get(MDSCacheObject::PIN_REQUEST); - pins.insert(o); - } - } - - // auth pins - bool is_auth_pinned(MDSCacheObject *object) { - return auth_pins.count(object); - } - void auth_pin(MDSCacheObject *object) { - if (!is_auth_pinned(object)) { - object->auth_pin(); - auth_pins.insert(object); - } - } - void drop_local_auth_pins() { - set::iterator it = auth_pins.begin(); - while (it != auth_pins.end()) { - if ((*it)->is_auth()) { - (*it)->auth_unpin(); - auth_pins.erase(it++); - } else { - it++; - } - } - auth_pins.clear(); - } -}; - -inline ostream& operator<<(ostream& out, MDRequest &mdr) -{ - out << "request(" << mdr.reqid; - //if (mdr.request) out << " " << *mdr.request; - if (mdr.is_slave()) out << " slave_to mds" << mdr.slave_to_mds; - if (mdr.client_request) out << " cr=" << mdr.client_request; - if (mdr.slave_request) out << " sr=" << mdr.slave_request; - out << ")"; - return out; -} - -class MDCache { - public: - // my master - MDS *mds; - - LRU lru; // dentry lru for expiring items from cache - - protected: - // the cache - CInode *root; // root inode - hash_map inode_map; // map of inodes by ino - CInode *stray; // my stray dir - - // root - list waiting_for_root; - map > waiting_for_stray; - -public: - int get_num_inodes() { return inode_map.size(); } - int get_num_dentries() { return lru.lru_get_size(); } - - - // -- subtrees -- -protected: - map > subtrees; // nested bounds on subtrees. 
- - // adjust subtree auth specification - // dir->dir_auth - // imports/exports/nested_exports - // join/split subtrees as appropriate -public: - bool is_subtrees() { return !subtrees.empty(); } - void list_subtrees(list& ls); - void adjust_subtree_auth(CDir *root, pair auth); - void adjust_subtree_auth(CDir *root, int a, int b=CDIR_AUTH_UNKNOWN) { - adjust_subtree_auth(root, pair(a,b)); - } - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, int a) { - adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); - } - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, int a) { - adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); - } - void adjust_export_state(CDir *dir); - void try_subtree_merge(CDir *root); - void try_subtree_merge_at(CDir *root); - void eval_subtree_root(CDir *dir); - CDir *get_subtree_root(CDir *dir); - void remove_subtree(CDir *dir); - void get_subtree_bounds(CDir *root, set& bounds); - void get_wouldbe_subtree_bounds(CDir *root, set& bounds); - void verify_subtree_bounds(CDir *root, const set& bounds); - void verify_subtree_bounds(CDir *root, const list& bounds); - - void adjust_subtree_after_rename(CInode *diri, CDir *olddir); - - void get_auth_subtrees(set& s); - void get_fullauth_subtrees(set& s); - - int num_subtrees(); - int num_subtrees_fullauth(); - int num_subtrees_fullnonauth(); - - -protected: - // delayed cache expire - map > delayed_expire; // subtree root -> expire msg - - // -- discover -- - hash_map > dir_discovers; // dirino -> mds set i'm trying to discover. - - - // -- requests -- -public: - - -protected: - hash_map active_requests; - -public: - MDRequest* request_start(MClientRequest *req); - MDRequest* request_start_slave(metareqid_t rid, int by); - bool have_request(metareqid_t rid) { - return active_requests.count(rid); - } - MDRequest* request_get(metareqid_t rid); - void request_pin_ref(MDRequest *r, CInode *ref, vector& trace); - void request_finish(MDRequest *mdr); - void request_forward(MDRequest *mdr, int mds, int port=0); - void dispatch_request(MDRequest *mdr); - void request_forget_foreign_locks(MDRequest *mdr); - void request_cleanup(MDRequest *r); - - - // inode purging - map > purging; - map > > waiting_for_purge; - - // shutdown crap - int shutdown_commits; - bool did_shutdown_log_cap; - friend class C_MDC_ShutdownCommit; - - // -- recovery -- -protected: - set recovery_set; - -public: - void set_recovery_set(set& s); - void handle_mds_failure(int who); - void handle_mds_recovery(int who); - -protected: - // [resolve] - // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; - // from MMDSResolves - map > > other_ambiguous_imports; - - map > uncommitted_slave_updates; // for replay. - map ambiguous_slave_updates; // for log trimming. 
- map waiting_for_slave_update_commit; - friend class ESlaveUpdate; - - set wants_resolve; // nodes i need to send my resolve to - set got_resolve; // nodes i got resolves from - set need_resolve_ack; // nodes i need a resolve_ack from - - void handle_resolve(MMDSResolve *m); - void handle_resolve_ack(MMDSResolveAck *m); - void maybe_resolve_finish(); - void disambiguate_imports(); - void recalc_auth_bits(); -public: - // ambiguous imports - void add_ambiguous_import(dirfrag_t base, list& bounds); - void add_ambiguous_import(CDir *base, const set& bounds); - bool have_ambiguous_import(dirfrag_t base) { - return my_ambiguous_imports.count(base); - } - void cancel_ambiguous_import(dirfrag_t dirino); - void finish_ambiguous_import(dirfrag_t dirino); - void send_resolve(int who); - void send_resolve_now(int who); - void send_resolve_later(int who); - void maybe_send_pending_resolves(); - void log_subtree_map(Context *onsync=0); - void _logged_subtree_map(off_t off); - -protected: - // [rejoin] - set rejoin_gather; // nodes from whom i need a rejoin - set rejoin_ack_gather; // nodes from whom i need a rejoin ack - - map > cap_exports; // ino -> client -> capex - map cap_export_paths; - - map > > cap_imports; // ino -> client -> frommds -> capex - map cap_import_paths; - - set rejoin_undef_inodes; - - void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); - void handle_cache_rejoin(MMDSCacheRejoin *m); - void handle_cache_rejoin_weak(MMDSCacheRejoin *m); - CInode* rejoin_invent_inode(inodeno_t ino); - void handle_cache_rejoin_strong(MMDSCacheRejoin *m); - void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack); - void handle_cache_rejoin_ack(MMDSCacheRejoin *m); - void handle_cache_rejoin_purge(MMDSCacheRejoin *m); - void handle_cache_rejoin_missing(MMDSCacheRejoin *m); - void handle_cache_rejoin_full(MMDSCacheRejoin *m); - void rejoin_send_acks(); - void rejoin_trim_undef_inodes(); -public: - void rejoin_gather_finish(); - void rejoin_send_rejoins(); - void rejoin_export_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr) { - cap_exports[ino][client] = icr; - cap_export_paths[ino] = path; - } - void rejoin_recovered_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr, - int frommds=-1) { - cap_imports[ino][client][frommds] = icr; - cap_import_paths[ino] = path; - } - void rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds); - - - friend class Locker; - friend class Migrator; - friend class Renamer; - friend class MDBalancer; - - - public: - - // subsystems - Migrator *migrator; - Renamer *renamer; - - public: - MDCache(MDS *m); - ~MDCache(); - - // debug - void log_stat(Logger *logger); - - // root inode - CInode *get_root() { return root; } - void set_root(CInode *r); - CInode *get_stray() { return stray; } - - // cache - void set_cache_size(size_t max) { lru.lru_set_max(max); } - size_t get_cache_size() { return lru.lru_get_size(); } - - // trimming - bool trim(int max = -1); // trim cache - void trim_dentry(CDentry *dn, map& expiremap); - void trim_dirfrag(CDir *dir, CDir *con, - map& expiremap); - void trim_inode(CDentry *dn, CInode *in, CDir *con, - map& expiremap); - void send_expire_messages(map& expiremap); - void trim_non_auth(); // trim out trimmable non-auth items - - // shutdown - void shutdown_start(); - void shutdown_check(); - bool shutdown_pass(); - bool shutdown(); // clear cache (ie at shutodwn) - - // inode_map - bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? 
true:false; } - CInode* get_inode( inodeno_t ino ) { - if (have_inode(ino)) - return inode_map[ino]; - return NULL; - } - CDir* get_dir(inodeno_t dirino) { // deprecated - return get_dirfrag(dirfrag_t(dirino, frag_t())); - } - CDir* get_dirfrag(dirfrag_t df) { - if (!have_inode(df.ino)) return NULL; - return inode_map[df.ino]->get_dirfrag(df.frag); - } - - MDSCacheObject *get_object(MDSCacheObjectInfo &info); - - - - public: - CInode *create_inode(); - void add_inode(CInode *in); - - void remove_inode(CInode *in); - protected: - void touch_inode(CInode *in) { - if (in->get_parent_dn()) - touch_dentry(in->get_parent_dn()); - } - void touch_dentry(CDentry *dn) { - // touch ancestors - if (dn->get_dir()->get_inode()->get_parent_dn()) - touch_dentry(dn->get_dir()->get_inode()->get_parent_dn()); - - // touch me - if (dn->is_auth()) - lru.lru_touch(dn); - else - lru.lru_midtouch(dn); - } - - void inode_remove_replica(CInode *in, int rep); - void dentry_remove_replica(CDentry *dn, int rep); - - void rename_file(CDentry *srcdn, CDentry *destdn); - - public: - // inode purging - void purge_inode(inode_t *inode, off_t newsize); - void _do_purge_inode(inode_t *inode, off_t newsize); - void purge_inode_finish(inodeno_t ino, off_t newsize); - void purge_inode_finish_2(inodeno_t ino, off_t newsize); - bool is_purging(inodeno_t ino, off_t newsize) { - return purging.count(ino) && purging[ino].count(newsize); - } - void wait_for_purge(inodeno_t ino, off_t newsize, Context *c) { - waiting_for_purge[ino][newsize].push_back(c); - } - - void add_recovered_purge(const inode_t& inode, off_t newsize); - void remove_recovered_purge(inodeno_t ino, off_t newsize); - void start_recovered_purges(); - - - public: - CDir *get_auth_container(CDir *in); - CDir *get_export_container(CDir *dir); - void find_nested_exports(CDir *dir, set& s); - void find_nested_exports_under(CDir *import, CDir *dir, set& s); - - public: - CInode *create_root_inode(); - void open_root(Context *c); - CInode *create_stray_inode(int whose=-1); - void open_local_stray(); - void open_foreign_stray(int who, Context *c); - CDentry *get_or_create_stray_dentry(CInode *in); - - Context *_get_waiter(MDRequest *mdr, Message *req); - int path_traverse(MDRequest *mdr, Message *req, - CInode *base, filepath& path, - vector& trace, bool follow_trailing_sym, - int onfail); - bool path_is_mine(filepath& path); - bool path_is_mine(string& p) { - filepath path(p); - return path_is_mine(path); - } - CDir *path_traverse_to_dir(filepath& path); - - void open_remote_dir(CInode *diri, frag_t fg, Context *fin); - CInode *get_dentry_inode(CDentry *dn, MDRequest *mdr); - void open_remote_ino(inodeno_t ino, MDRequest *mdr, Context *fin); - void open_remote_ino_2(inodeno_t ino, MDRequest *mdr, - vector& anchortrace, - Context *onfinish); - - bool parallel_fetch(map& pathmap, - Context *c); - - void make_trace(vector& trace, CInode *in); - - // -- anchors -- -public: - void anchor_create(MDRequest *mdr, CInode *in, Context *onfinish); - void anchor_destroy(CInode *in, Context *onfinish); -protected: - void _anchor_create_prepared(CInode *in, version_t atid); - void _anchor_create_logged(CInode *in, version_t atid, version_t pdv); - void _anchor_destroy_prepared(CInode *in, version_t atid); - void _anchor_destroy_logged(CInode *in, version_t atid, version_t pdv); - - friend class C_MDC_AnchorCreatePrepared; - friend class C_MDC_AnchorCreateLogged; - friend class C_MDC_AnchorDestroyPrepared; - friend class C_MDC_AnchorDestroyLogged; - - // -- stray -- -public: - void 
eval_stray(CDentry *dn); -protected: - void _purge_stray(CDentry *dn); - void _purge_stray_logged(CDentry *dn, version_t pdv); - friend class C_MDC_PurgeStray; - void reintegrate_stray(CDentry *dn, CDentry *rlink); - void migrate_stray(CDentry *dn, int dest); - - - // == messages == - public: - void dispatch(Message *m); - - protected: - // -- replicas -- - void handle_discover(MDiscover *dis); - void handle_discover_reply(MDiscoverReply *m); - - CDir* add_replica_dir(CInode *diri, - frag_t fg, CDirDiscover& dis, - int from, - list& finished); - CDir* forge_replica_dir(CInode *diri, frag_t fg, int from); - - CDentry *add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished); - CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn); - -public: - CDentry *add_replica_stray(bufferlist &bl, CInode *strayin, int from); -protected: - - - - // -- namespace -- - void handle_dentry_unlink(MDentryUnlink *m); - - - // -- updates -- - //int send_inode_updates(CInode *in); - //void handle_inode_update(MInodeUpdate *m); - - int send_dir_updates(CDir *in, bool bcast=false); - void handle_dir_update(MDirUpdate *m); - - // -- cache expiration -- - void handle_cache_expire(MCacheExpire *m); - void process_delayed_expire(CDir *dir); - void discard_delayed_expire(CDir *dir); - - - // == crap fns == - public: - void show_cache(); - void dump_cache(); - void show_subtrees(int dbl=10); - - CInode *hack_pick_random_inode() { - assert(!inode_map.empty()); - int n = rand() % inode_map.size(); - hash_map::iterator p = inode_map.begin(); - while (n--) p++; - return p->second; - } - -}; - -class C_MDS_RetryRequest : public Context { - MDCache *cache; - MDRequest *mdr; - public: - C_MDS_RetryRequest(MDCache *c, MDRequest *r) : cache(c), mdr(r) {} - virtual void finish(int r) { - cache->dispatch_request(mdr); - } -}; - -#endif diff --git a/branches/sage/pgs/mds/MDLog.cc b/branches/sage/pgs/mds/MDLog.cc deleted file mode 100644 index f16047612fc7b..0000000000000 --- a/branches/sage/pgs/mds/MDLog.cc +++ /dev/null @@ -1,476 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDLog.h" -#include "MDS.h" -#include "MDCache.h" -#include "LogEvent.h" - -#include "osdc/Journaler.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " - -// cons/des - -LogType mdlog_logtype; - -/* -MDLog::MDLog(MDS *m) : replay_thread(this) -{ - mds = m; - num_events = 0; - waiting_for_read = false; - - last_import_map = 0; - writing_import_map = false; - seen_import_map = false; - - max_events = g_conf.mds_log_max_len; - - capped = false; - - unflushed = 0; - - journaler = 0; - logger = 0; -} -*/ - - -MDLog::~MDLog() -{ - if (journaler) { delete journaler; journaler = 0; } - if (logger) { delete logger; logger = 0; } -} - - -void MDLog::init_journaler() -{ - // logger - char name[80]; - sprintf(name, "mds%d.log", mds->get_nodeid()); - logger = new Logger(name, &mdlog_logtype); - - static bool didit = false; - if (!didit) { - mdlog_logtype.add_inc("add"); - mdlog_logtype.add_inc("expire"); - mdlog_logtype.add_inc("obs"); - mdlog_logtype.add_inc("trim"); - mdlog_logtype.add_set("size"); - mdlog_logtype.add_set("read"); - mdlog_logtype.add_set("append"); - mdlog_logtype.add_inc("lsum"); - mdlog_logtype.add_inc("lnum"); - } - - // inode - memset(&log_inode, 0, sizeof(log_inode)); - log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); - log_inode.layout = g_OSD_MDLogLayout; - - if (g_conf.mds_local_osd) { - log_inode.layout.preferred = mds->get_nodeid() + 10000; // hack - } - - // log streamer - if (journaler) delete journaler; - journaler = new Journaler(log_inode, mds->objecter, logger); -} - -void MDLog::flush_logger() -{ - if (logger) - logger->flush(true); -} - - - -void MDLog::reset() -{ - dout(5) << "reset to empty log" << endl; - init_journaler(); - journaler->reset(); -} - -void MDLog::open(Context *c) -{ - dout(5) << "open discovering log bounds" << endl; - init_journaler(); - journaler->recover(c); -} - -void MDLog::write_head(Context *c) -{ - journaler->write_head(c); -} - - -off_t MDLog::get_read_pos() -{ - return journaler->get_read_pos(); -} - -off_t MDLog::get_write_pos() -{ - return journaler->get_write_pos(); -} - - - -void MDLog::submit_entry( LogEvent *le, Context *c ) -{ - if (g_conf.mds_log) { - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl; - - // encode it, with event type - bufferlist bl; - bl.append((char*)&le->_type, sizeof(le->_type)); - le->encode_payload(bl); - - // journal it. - journaler->append_entry(bl); - - assert(!capped); - - delete le; - num_events++; - - logger->inc("add"); - logger->set("size", num_events); - logger->set("append", journaler->get_write_pos()); - - if (c) { - unflushed = 0; - journaler->flush(c); - } - else - unflushed++; - - // should we log a new import_map? - // FIXME: should this go elsewhere? - if (last_subtree_map && !writing_subtree_map && - journaler->get_write_pos() - last_subtree_map >= g_conf.mds_log_subtree_map_interval) { - // log import map - mds->mdcache->log_subtree_map(); - } - - } else { - // hack: log is disabled. - if (c) { - c->finish(0); - delete c; - } - } -} - -void MDLog::wait_for_sync( Context *c ) -{ - if (g_conf.mds_log) { - // wait - journaler->flush(c); - } else { - // hack: bypass. 
- c->finish(0); - delete c; - } -} - -void MDLog::flush() -{ - if (unflushed) - journaler->flush(); - unflushed = 0; - - // trim - trim(NULL); -} - - - - -// trim - -class C_MDL_Trimmed : public Context { -public: - MDLog *mdl; - LogEvent *le; - - C_MDL_Trimmed(MDLog *mdl, LogEvent *le) { - this->mdl = mdl; - this->le = le; - } - void finish(int res) { - mdl->_trimmed(le); - } -}; - -class C_MDL_Reading : public Context { -public: - MDLog *mdl; - C_MDL_Reading(MDLog *m) { - mdl = m; - } - void finish(int res) { - mdl->_did_read(); - } -}; - - -void MDLog::_did_read() -{ - dout(5) << "_did_read()" << endl; - waiting_for_read = false; - trim(0); -} - -void MDLog::_trimmed(LogEvent *le) -{ - // successful trim? - if (!le->has_expired(mds)) { - dout(7) << "retrimming : " << le->get_start_off() << " : " << *le << endl; - le->expire(mds, new C_MDL_Trimmed(this, le)); - return; - } - - dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << endl; - - if (trimming.begin()->first == le->_end_off) { - // we trimmed off the front! - // we can expire the log a bit. - journaler->set_expire_pos(le->_end_off); - journaler->trim(); - } - - trimming.erase(le->_end_off); - delete le; - - logger->set("trim", trimming.size()); - logger->set("read", journaler->get_read_pos()); - - trim(0); -} - - - -void MDLog::trim(Context *c) -{ - // add waiter - if (c) - trim_waiters.push_back(c); - - // trim! - dout(10) << "trim " << num_events << " events / " << max_events << " max" << endl; - - // hack: only trim for a few seconds at a time - utime_t stop = g_clock.now(); - stop += 2.0; - - while (num_events > max_events && - stop > g_clock.now()) { - - off_t gap = journaler->get_write_pos() - journaler->get_read_pos(); - dout(5) << "trim num_events " << num_events << " > max " << max_events - << ", trimming " << trimming.size() - << ", byte gap " << gap - << endl; - - if ((int)trimming.size() >= g_conf.mds_log_max_trimming) { - dout(7) << "trim already trimming max, waiting" << endl; - return; - } - - bufferlist bl; - off_t so = journaler->get_read_pos(); - if (journaler->try_read_entry(bl)) { - // decode logevent - LogEvent *le = LogEvent::decode(bl); - le->_start_off = so; - le->_end_off = journaler->get_read_pos(); - num_events--; - - // we just read an event. - if (le->has_expired(mds)) { - // obsolete - dout(7) << "trim obsolete : " << le->get_start_off() << " : " << *le << endl; - delete le; - logger->inc("obs"); - } else { - assert ((int)trimming.size() < g_conf.mds_log_max_trimming); - - // trim! - dout(7) << "trim expiring : " << le->get_start_off() << " : " << *le << endl; - trimming[le->_end_off] = le; - le->expire(mds, new C_MDL_Trimmed(this, le)); - logger->inc("expire"); - logger->set("trim", trimming.size()); - } - logger->set("read", journaler->get_read_pos()); - logger->set("size", num_events); - } else { - // need to read! - if (!waiting_for_read) { - waiting_for_read = true; - dout(7) << "trim waiting for read" << endl; - journaler->wait_for_readable(new C_MDL_Reading(this)); - } else { - dout(7) << "trim already waiting for read" << endl; - } - return; - } - } - - dout(5) << "trim num_events " << num_events << " <= max " << max_events - << ", trimming " << trimming.size() - << ", done for now." - << endl; - - // trimmed! - std::list finished; - finished.swap(trim_waiters); - finish_contexts(finished, 0); - - // hmm, are we at the end? 
- /* - if (journaler->get_read_pos() == journaler->get_write_pos() && - trimming.size() == import_map_expire_waiters.size()) { - dout(5) << "trim log is empty, allowing import_map to expire" << endl; - list ls; - ls.swap(import_map_expire_waiters); - finish_contexts(ls); - } - */ -} - - - - -void MDLog::replay(Context *c) -{ - assert(journaler->is_active()); - - // start reading at the last known expire point. - journaler->set_read_pos( journaler->get_expire_pos() ); - - // empty? - if (journaler->get_read_pos() == journaler->get_write_pos()) { - dout(10) << "replay - journal empty, done." << endl; - if (c) { - c->finish(0); - delete c; - } - return; - } - - // add waiter - if (c) - waitfor_replay.push_back(c); - - // go! - dout(10) << "replay start, from " << journaler->get_read_pos() - << " to " << journaler->get_write_pos() << endl; - - assert(num_events == 0); - - replay_thread.create(); - //_replay(); -} - -class C_MDL_Replay : public Context { - MDLog *mdlog; -public: - C_MDL_Replay(MDLog *l) : mdlog(l) {} - void finish(int r) { - mdlog->replay_cond.Signal(); - //mdlog->_replay(); - } -}; - - - -// i am a separate thread -void MDLog::_replay_thread() -{ - mds->mds_lock.Lock(); - dout(10) << "_replay_thread start" << endl; - - // loop - while (1) { - // wait for read? - while (!journaler->is_readable() && - journaler->get_read_pos() < journaler->get_write_pos()) { - journaler->wait_for_readable(new C_MDL_Replay(this)); - replay_cond.Wait(mds->mds_lock); - } - - if (!journaler->is_readable() && - journaler->get_read_pos() == journaler->get_write_pos()) - break; - - assert(journaler->is_readable()); - - // read it - off_t pos = journaler->get_read_pos(); - bufferlist bl; - bool r = journaler->try_read_entry(bl); - assert(r); - - // unpack event - LogEvent *le = LogEvent::decode(bl); - num_events++; - - // have we seen an import map yet? - if (!seen_subtree_map && - le->get_type() != EVENT_SUBTREEMAP) { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " -- waiting for subtree_map. (skipping " << *le << ")" << endl; - } else { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " : " << *le << endl; - le->replay(mds); - - if (le->get_type() == EVENT_SUBTREEMAP) - seen_subtree_map = true; - } - delete le; - - // drop lock for a second, so other events/messages (e.g. beacon timer!) can go off - mds->mds_lock.Unlock(); - mds->mds_lock.Lock(); - } - - // done! - assert(journaler->get_read_pos() == journaler->get_write_pos()); - dout(10) << "_replay - complete" << endl; - - // move read pointer _back_ to expire pos, for eventual trimming - journaler->set_read_pos(journaler->get_expire_pos()); - - // kick waiter(s) - list ls; - ls.swap(waitfor_replay); - finish_contexts(ls,0); - - dout(10) << "_replay_thread finish" << endl; - mds->mds_lock.Unlock(); -} - - - diff --git a/branches/sage/pgs/mds/MDLog.h b/branches/sage/pgs/mds/MDLog.h deleted file mode 100644 index 75464df26e304..0000000000000 --- a/branches/sage/pgs/mds/MDLog.h +++ /dev/null @@ -1,172 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDLOG_H -#define __MDLOG_H - -#include "include/types.h" -#include "include/Context.h" - -#include "common/Thread.h" -#include "common/Cond.h" - -#include - -//#include -//using __gnu_cxx::hash_mapset; - -class Journaler; -class LogEvent; -class MDS; - -class Logger; - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const LogEvent *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} -*/ - -class MDLog { - protected: - MDS *mds; - size_t num_events; // in events - size_t max_events; - - int unflushed; - - bool capped; - - inode_t log_inode; - Journaler *journaler; - - Logger *logger; - - // -- trimming -- - map trimming; - std::list trim_waiters; // contexts waiting for trim - bool trim_reading; - - bool waiting_for_read; - friend class C_MDL_Reading; - - - - // -- replay -- - Cond replay_cond; - - class ReplayThread : public Thread { - MDLog *log; - public: - ReplayThread(MDLog *l) : log(l) {} - void* entry() { - log->_replay_thread(); - return 0; - } - } replay_thread; - - friend class ReplayThread; - friend class C_MDL_Replay; - - list waitfor_replay; - - void _replay(); // old way - void _replay_thread(); // new way - - - - // -- subtreemaps -- - off_t last_subtree_map; // offsets of last committed subtreemap. constrains trimming. - list subtree_map_expire_waiters; - bool writing_subtree_map; // one is being written now - bool seen_subtree_map; // for recovery - - friend class C_MDS_WroteImportMap; - friend class MDCache; - - void init_journaler(); - public: - off_t get_last_subtree_map_offset() { return last_subtree_map; } - void add_subtree_map_expire_waiter(Context *c) { - subtree_map_expire_waiters.push_back(c); - } - void take_subtree_map_expire_waiters(list& ls) { - ls.splice(ls.end(), subtree_map_expire_waiters); - } - - - - // replay state - map > pending_exports; - - - - public: - MDLog(MDS *m) : mds(m), - num_events(0), max_events(g_conf.mds_log_max_len), - unflushed(0), - capped(false), - journaler(0), - logger(0), - trim_reading(false), waiting_for_read(false), - replay_thread(this), - last_subtree_map(0), - writing_subtree_map(false), seen_subtree_map(false) { - } - ~MDLog(); - - - void flush_logger(); - - void set_max_events(size_t max) { max_events = max; } - size_t get_max_events() { return max_events; } - size_t get_num_events() { return num_events + trimming.size(); } - size_t get_non_subtreemap_events() { return num_events + trimming.size() - subtree_map_expire_waiters.size(); } - - off_t get_read_pos(); - off_t get_write_pos(); - bool empty() { - return get_read_pos() == get_write_pos(); - } - - bool is_capped() { return capped; } - void cap() { - capped = true; - list ls; - ls.swap(subtree_map_expire_waiters); - finish_contexts(ls); - } - - void submit_entry( LogEvent *e, Context *c = 0 ); - void wait_for_sync( Context *c ); - void flush(); - - void trim(Context *c); - void _did_read(); - void _trimmed(LogEvent *le); - - void reset(); // fresh, empty log! 
- void open(Context *onopen); - void write_head(Context *onfinish); - - void replay(Context *onfinish); -}; - -#endif diff --git a/branches/sage/pgs/mds/MDS.cc b/branches/sage/pgs/mds/MDS.cc deleted file mode 100644 index df0f0b88fa092..0000000000000 --- a/branches/sage/pgs/mds/MDS.cc +++ /dev/null @@ -1,1320 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "include/types.h" -#include "common/Clock.h" - -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" -#include "osdc/Objecter.h" -#include "osdc/Filer.h" - -#include "MDSMap.h" - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "IdAllocator.h" -#include "Migrator.h" -//#include "Renamer.h" - -#include "AnchorTable.h" -#include "AnchorClient.h" - -#include "common/Logger.h" -#include "common/LogType.h" - -#include "common/Timer.h" - -#include "events/ESession.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" - - -LogType mds_logtype, mds_cache_logtype; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " - - - - - -// cons/des -MDS::MDS(int whoami, Messenger *m, MonMap *mm) : - timer(mds_lock), - clientmap(this) { - this->whoami = whoami; - - monmap = mm; - messenger = m; - - mdsmap = new MDSMap; - osdmap = new OSDMap; - - objecter = new Objecter(messenger, monmap, osdmap); - filer = new Filer(objecter); - - mdcache = new MDCache(this); - mdlog = new MDLog(this); - balancer = new MDBalancer(this); - - anchorclient = new AnchorClient(this); - idalloc = new IdAllocator(this); - - anchortable = new AnchorTable(this); - - server = new Server(this); - locker = new Locker(this, mdcache); - - - // clients - last_client_mdsmap_bcast = 0; - - // beacon - beacon_last_seq = 0; - beacon_sender = 0; - beacon_killer = 0; - - // tick - tick_event = 0; - - req_rate = 0; - - want_state = state = MDSMap::STATE_DNE; - - logger = logger2 = 0; - - // i'm ready! 
- messenger->set_dispatcher(this); -} - -MDS::~MDS() { - if (mdcache) { delete mdcache; mdcache = NULL; } - if (mdlog) { delete mdlog; mdlog = NULL; } - if (balancer) { delete balancer; balancer = NULL; } - if (idalloc) { delete idalloc; idalloc = NULL; } - if (anchortable) { delete anchortable; anchortable = NULL; } - if (anchorclient) { delete anchorclient; anchorclient = NULL; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (server) { delete server; server = 0; } - if (locker) { delete locker; locker = 0; } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (messenger) { delete messenger; messenger = NULL; } - - if (logger) { delete logger; logger = 0; } - if (logger2) { delete logger2; logger2 = 0; } - -} - - -void MDS::reopen_logger() -{ - // flush+close old log - if (logger) { - logger->flush(true); - delete logger; - } - if (logger2) { - logger2->flush(true); - delete logger2; - } - - - // log - string name; - name = "mds"; - int w = whoami; - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - logger = new Logger(name, (LogType*)&mds_logtype); - - mds_logtype.add_inc("req"); - mds_logtype.add_inc("reply"); - mds_logtype.add_inc("fw"); - mds_logtype.add_inc("cfw"); - - mds_logtype.add_set("l"); - mds_logtype.add_set("q"); - mds_logtype.add_set("popanyd"); - mds_logtype.add_set("popnest"); - - mds_logtype.add_inc("lih"); - mds_logtype.add_inc("lif"); - - mds_logtype.add_set("c"); - mds_logtype.add_set("ctop"); - mds_logtype.add_set("cbot"); - mds_logtype.add_set("cptail"); - mds_logtype.add_set("cpin"); - mds_logtype.add_inc("cex"); - mds_logtype.add_inc("dis"); - mds_logtype.add_inc("cmiss"); - - mds_logtype.add_set("buf"); - mds_logtype.add_inc("cdir"); - mds_logtype.add_inc("fdir"); - - mds_logtype.add_inc("iex"); - mds_logtype.add_inc("iim"); - mds_logtype.add_inc("ex"); - mds_logtype.add_inc("im"); - mds_logtype.add_inc("imex"); - mds_logtype.add_set("nex"); - mds_logtype.add_set("nim"); - - - char n[80]; - sprintf(n, "mds%d.cache", whoami); - logger2 = new Logger(n, (LogType*)&mds_cache_logtype); -} - -void MDS::send_message_mds(Message *m, int mds, int port, int fromport) -{ - // send mdsmap first? - if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) { - messenger->send_message(new MMDSMap(mdsmap), - mdsmap->get_inst(mds)); - peer_mdsmap_epoch[mds] = mdsmap->get_epoch(); - } - - // send message - if (port && !fromport) - fromport = port; - messenger->send_message(m, mdsmap->get_inst(mds), port, fromport); -} - -void MDS::forward_message_mds(Message *req, int mds, int port) -{ - // client request? - if (req->get_type() == MSG_CLIENT_REQUEST) { - MClientRequest *creq = (MClientRequest*)req; - creq->inc_num_fwd(); // inc forward counter - - // tell the client where it should go - messenger->send_message(new MClientRequestForward(creq->get_tid(), mds, creq->get_num_fwd()), - creq->get_client_inst()); - - if (!creq->is_idempotent()) { - delete req; - return; // don't actually forward if non-idempotent! client has to do it. 
- } - } - - // forward - send_message_mds(req, mds, port); -} - - - -void MDS::send_message_client(Message *m, int client) -{ - version_t seq = clientmap.inc_push_seq(client); - dout(10) << "send_message_client client" << client << " seq " << seq << " " << *m << endl; - messenger->send_message(m, clientmap.get_inst(client)); -} - -void MDS::send_message_client(Message *m, entity_inst_t clientinst) -{ - version_t seq = clientmap.inc_push_seq(clientinst.name.num()); - dout(10) << "send_message_client client" << clientinst.name.num() << " seq " << seq << " " << *m << endl; - messenger->send_message(m, clientinst); -} - - -class C_MDS_SendMessageClientSession : public Context { - MDS *mds; - Message *msg; - entity_inst_t clientinst; -public: - C_MDS_SendMessageClientSession(MDS *md, Message *ms, entity_inst_t& ci) : - mds(md), msg(ms), clientinst(ci) {} - void finish(int r) { - mds->clientmap.open_session(clientinst); - mds->send_message_client(msg, clientinst.name.num()); - } -}; - -void MDS::send_message_client_maybe_open(Message *m, entity_inst_t clientinst) -{ - int client = clientinst.name.num(); - if (!clientmap.have_session(client)) { - // no session! - dout(10) << "send_message_client opening session with " << clientinst << endl; - clientmap.add_opening(client); - mdlog->submit_entry(new ESession(clientinst, true, clientmap.inc_projected()), - new C_MDS_SendMessageClientSession(this, m, clientinst)); - } else { - // we have a session. - send_message_client(m, clientinst); - } -} - - - -int MDS::init(bool standby) -{ - mds_lock.Lock(); - - want_state = MDSMap::STATE_BOOT; - - // starting beacon. this will induce an MDSMap from the monitor - beacon_start(); - - // schedule tick - reset_tick(); - - // init logger - reopen_logger(); - - mds_lock.Unlock(); - return 0; -} - -void MDS::reset_tick() -{ - // cancel old - if (tick_event) timer.cancel_event(tick_event); - - // schedule - tick_event = new C_MDS_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_event); -} - -void MDS::tick() -{ - tick_event = 0; - - // reschedule - reset_tick(); - - // log - mds_load_t load = balancer->get_load(); - - if (logger) { - req_rate = logger->get("req"); - - logger->set("l", (int)load.mds_load()); - logger->set("q", messenger->get_dispatch_queue_len()); - logger->set("buf", buffer_total_alloc); - - mdcache->log_stat(logger); - } - - // booted? 
- if (is_active()) { - - // balancer - balancer->tick(); - - // HACK to test hashing stuff - if (false) { - /* - static map didhash; - if (elapsed.sec() > 15 && !didhash[whoami]) { - CInode *in = mdcache->get_inode(100000010); - if (in && in->dir) { - if (in->dir->is_auth()) - mdcache->migrator->hash_dir(in->dir); - didhash[whoami] = 1; - } - } - if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) { - CInode *in = mdcache->get_inode(100000010); - if (in && in->dir) { - if (in->dir->is_auth() && in->dir->is_hashed()) - mdcache->migrator->unhash_dir(in->dir); - didhash[whoami] = 2; - } - } - */ - } - } -} - - - - -// ----------------------- -// beacons - -void MDS::beacon_start() -{ - beacon_send(); // send first beacon - - //reset_beacon_killer(); // schedule killer -} - - - -void MDS::beacon_send() -{ - ++beacon_last_seq; - dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) - << " seq " << beacon_last_seq - << " (currently " << MDSMap::get_state_name(state) << ")" - << endl; - - beacon_seq_stamp[beacon_last_seq] = g_clock.now(); - - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBeacon(messenger->get_myinst(), want_state, beacon_last_seq), - monmap->get_inst(mon)); - - // schedule next sender - if (beacon_sender) timer.cancel_event(beacon_sender); - beacon_sender = new C_MDS_BeaconSender(this); - timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender); -} - -void MDS::handle_mds_beacon(MMDSBeacon *m) -{ - dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) - << " seq " << m->get_seq() << endl; - version_t seq = m->get_seq(); - - // update lab - if (beacon_seq_stamp.count(seq)) { - assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp); - beacon_last_acked_stamp = beacon_seq_stamp[seq]; - - // clean up seq_stamp map - while (!beacon_seq_stamp.empty() && - beacon_seq_stamp.begin()->first <= seq) - beacon_seq_stamp.erase(beacon_seq_stamp.begin()); - - reset_beacon_killer(); - } - - delete m; -} - -void MDS::reset_beacon_killer() -{ - utime_t when = beacon_last_acked_stamp; - when += g_conf.mds_beacon_grace; - - dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp - << ", will die at " << when << endl; - - if (beacon_killer) timer.cancel_event(beacon_killer); - - beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp); - timer.add_event_at(when, beacon_killer); -} - -void MDS::beacon_kill(utime_t lab) -{ - if (lab == beacon_last_acked_stamp) { - dout(0) << "beacon_kill last_acked_stamp " << lab - << ", killing myself." - << endl; - messenger->suicide(); - //exit(0); - } else { - dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp - << " != my " << lab - << ", doing nothing." - << endl; - } -} - - - -void MDS::handle_mds_map(MMDSMap *m) -{ - version_t hadepoch = mdsmap->get_epoch(); - version_t epoch = m->get_epoch(); - dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl; - - // note source's map version - if (m->get_source().is_mds() && - peer_mdsmap_epoch[m->get_source().num()] < epoch) { - dout(15) << " peer " << m->get_source() - << " has mdsmap epoch >= " << epoch - << endl; - peer_mdsmap_epoch[m->get_source().num()] = epoch; - } - - // is it new? 
- if (epoch <= mdsmap->get_epoch()) { - dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() - << ", discarding" << endl; - delete m; - return; - } - - // note some old state - int oldwhoami = whoami; - int oldstate = state; - set oldresolve; - mdsmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - bool wasrejoining = mdsmap->is_rejoining(); - set oldfailed; - mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); - set oldactive; - mdsmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); - set oldcreating; - mdsmap->get_mds_set(oldcreating, MDSMap::STATE_CREATING); - set oldstopped; - mdsmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED); - - // decode and process - mdsmap->decode(m->get_encoded()); - - // see who i am - whoami = mdsmap->get_addr_rank(messenger->get_myaddr()); - if (oldwhoami != whoami) { - // update messenger. - messenger->reset_myname(MSG_ADDR_MDS(whoami)); - - reopen_logger(); - dout(1) << "handle_mds_map i am now mds" << whoami - << " incarnation " << mdsmap->get_inc(whoami) - << endl; - - // do i need an osdmap? - if (oldwhoami < 0) { - // we need an osdmap too. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(0), - monmap->get_inst(mon)); - } - } - - // tell objecter my incarnation - if (objecter->get_client_incarnation() < 0 && - mdsmap->have_inst(whoami)) { - assert(mdsmap->get_inc(whoami) > 0); - objecter->set_client_incarnation(mdsmap->get_inc(whoami)); - } - - // for debug - if (g_conf.mds_dump_cache_on_map) - mdcache->dump_cache(); - - // update my state - state = mdsmap->get_state(whoami); - - // did it change? - if (oldstate != state) { - if (state == want_state) { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) << endl; - } else { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) - << ", although i wanted " << mdsmap->get_state_name(want_state) - << endl; - want_state = state; - } - - // contemplate suicide - if (mdsmap->get_inst(whoami) != messenger->get_myinst()) { - dout(1) << "apparently i've been replaced by " << mdsmap->get_inst(whoami) << ", committing suicide." << endl; - messenger->suicide(); - return; - } - if (mdsmap->is_down(whoami)) { - dout(1) << "apparently i'm down, committing suicide." << endl; - messenger->suicide(); - return; - } - - // now active? - if (is_active()) { - // did i just recover? - if (oldstate == MDSMap::STATE_REJOIN || - oldstate == MDSMap::STATE_RECONNECT) - recovery_done(); - - dout(1) << "now active" << endl; - finish_contexts(waiting_for_active); // kick waiters - } else if (is_replay()) { - replay_start(); - } else if (is_resolve()) { - resolve_start(); - } else if (is_reconnect()) { - reconnect_start(); - } else if (is_stopping()) { - assert(oldstate == MDSMap::STATE_ACTIVE); - stopping_start(); - } else if (is_stopped()) { - assert(oldstate == MDSMap::STATE_STOPPING); - dout(1) << "now stopped, sending down:out and exiting" << endl; - shutdown_final(); - } - } - - - // RESOLVE - // is someone else newly resolving? - if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { - set resolve; - mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); - if (oldresolve != resolve) { - dout(10) << "resolve set is " << resolve << ", was " << oldresolve << endl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) { - if (*p == whoami) continue; - if (oldresolve.count(*p)) continue; - mdcache->send_resolve(*p); // now or later. - } - } - } - - // REJOIN - // is everybody finally rejoining? 
- if (is_rejoin() || is_active() || is_stopping()) { - // did we start? - if (!wasrejoining && mdsmap->is_rejoining()) - rejoin_joint_start(); - - // did we finish? - if (g_conf.mds_dump_cache_after_rejoin && - wasrejoining && !mdsmap->is_rejoining()) - mdcache->dump_cache(); // for DEBUG only - } - - // did someone go active? - if (is_active() || is_stopping()) { - set active; - mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE); - for (set::iterator p = active.begin(); p != active.end(); ++p) { - if (*p == whoami) continue; // not me - if (oldactive.count(*p)) continue; // newly so? - handle_mds_recovery(*p); - } - } - - if (is_active() || is_stopping()) { - // did anyone go down? - set failed; - mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); - for (set::iterator p = failed.begin(); p != failed.end(); ++p) { - if (oldfailed.count(*p)) continue; // newly so? - mdcache->handle_mds_failure(*p); - } - - // did anyone stop? - set stopped; - mdsmap->get_mds_set(stopped, MDSMap::STATE_STOPPED); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) { - if (oldstopped.count(*p)) continue; // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); - } - } - - - // inst set changed? - /* - if (state >= MDSMap::STATE_ACTIVE && // only if i'm active+. otherwise they'll get map during reconnect. - mdsmap->get_same_inst_since() > last_client_mdsmap_bcast) { - bcast_mds_map(); - } - */ - - // just got mdsmap+osdmap? - if (hadepoch == 0 && - mdsmap->get_epoch() > 0 && - osdmap->get_epoch() > 0) - boot(); - - delete m; -} - -void MDS::bcast_mds_map() -{ - dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << endl; - - // share the map with mounted clients - for (set::const_iterator p = clientmap.get_session_set().begin(); - p != clientmap.get_session_set().end(); - ++p) { - messenger->send_message(new MMDSMap(mdsmap), - clientmap.get_inst(*p)); - } - last_client_mdsmap_bcast = mdsmap->get_epoch(); -} - - -void MDS::handle_osd_map(MOSDMap *m) -{ - version_t hadepoch = osdmap->get_epoch(); - dout(10) << "handle_osd_map had " << hadepoch << endl; - - // process - objecter->handle_osd_map(m); - - // just got mdsmap+osdmap? - if (hadepoch == 0 && - osdmap->get_epoch() > 0 && - mdsmap->get_epoch() > 0) - boot(); -} - - -void MDS::set_want_state(int s) -{ - dout(3) << "set_want_state " << MDSMap::get_state_name(s) << endl; - want_state = s; - beacon_send(); -} - -void MDS::boot() -{ - if (is_creating()) - boot_create(); // new tables, journal - else if (is_starting()) - boot_start(); // old tables, empty journal - else if (is_replay()) - boot_replay(); // replay, join - else - assert(is_standby()); -} - - -class C_MDS_BootFinish : public Context { - MDS *mds; -public: - C_MDS_BootFinish(MDS *m) : mds(m) {} - void finish(int r) { mds->boot_finish(); } -}; - -void MDS::boot_create() -{ - dout(3) << "boot_create" << endl; - - C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this)); - - if (whoami == 0) { - dout(3) << "boot_create since i am also mds0, creating root inode and dir" << endl; - - // create root inode. 
- mdcache->open_root(0); - CInode *root = mdcache->get_root(); - assert(root); - - // force empty root dir - CDir *dir = root->get_dirfrag(frag_t()); - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); - - // save it - dir->commit(0, fin->new_sub()); - } - - // create my stray dir - { - dout(10) << "boot_create creating local stray dir" << endl; - mdcache->open_local_stray(); - CInode *stray = mdcache->get_stray(); - CDir *dir = stray->get_dirfrag(frag_t()); - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); - dir->commit(0, fin->new_sub()); - } - - // start with a fresh journal - dout(10) << "boot_create creating fresh journal" << endl; - mdlog->reset(); - mdlog->write_head(fin->new_sub()); - - // write our first subtreemap - mdcache->log_subtree_map(fin->new_sub()); - - // fixme: fake out idalloc (reset, pretend loaded) - dout(10) << "boot_create creating fresh idalloc table" << endl; - idalloc->reset(); - idalloc->save(fin->new_sub()); - - // write empty clientmap - clientmap.save(fin->new_sub()); - - // fixme: fake out anchortable - if (mdsmap->get_anchortable() == whoami) { - dout(10) << "boot_create creating fresh anchortable" << endl; - anchortable->create_fresh(); - anchortable->save(fin->new_sub()); - } -} - -void MDS::boot_start() -{ - dout(2) << "boot_start" << endl; - - C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this)); - - dout(2) << "boot_start opening idalloc" << endl; - idalloc->load(fin->new_sub()); - - dout(2) << "boot_start opening clientmap" << endl; - clientmap.load(fin->new_sub()); - - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_start opening anchor table" << endl; - anchortable->load(fin->new_sub()); - } else { - dout(2) << "boot_start i have no anchor table" << endl; - } - - dout(2) << "boot_start opening mds log" << endl; - mdlog->open(fin->new_sub()); - - if (mdsmap->get_root() == whoami) { - dout(2) << "boot_start opening root directory" << endl; - mdcache->open_root(fin->new_sub()); - } - - dout(2) << "boot_start opening local stray directory" << endl; - mdcache->open_local_stray(); -} - -void MDS::boot_finish() -{ - dout(3) << "boot_finish" << endl; - - if (is_starting()) { - // make sure mdslog is empty - assert(mdlog->get_read_pos() == mdlog->get_write_pos()); - } - - set_want_state(MDSMap::STATE_ACTIVE); -} - - -class C_MDS_BootRecover : public Context { - MDS *mds; - int nextstep; -public: - C_MDS_BootRecover(MDS *m, int n) : mds(m), nextstep(n) {} - void finish(int r) { mds->boot_replay(nextstep); } -}; - -void MDS::boot_replay(int step) -{ - switch (step) { - case 0: - step = 1; // fall-thru. 
- - case 1: - { - C_Gather *gather = new C_Gather(new C_MDS_BootRecover(this, 2)); - dout(2) << "boot_replay " << step << ": opening idalloc" << endl; - idalloc->load(gather->new_sub()); - - dout(2) << "boot_replay " << step << ": opening clientmap" << endl; - clientmap.load(gather->new_sub()); - - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_replay " << step << ": opening anchor table" << endl; - anchortable->load(gather->new_sub()); - } - } - break; - - case 2: - dout(2) << "boot_replay " << step << ": opening mds log" << endl; - mdlog->open(new C_MDS_BootRecover(this, 3)); - break; - - case 3: - dout(2) << "boot_replay " << step << ": replaying mds log" << endl; - mdlog->replay(new C_MDS_BootRecover(this, 4)); - break; - - case 4: - replay_done(); - break; - - } -} - - -void MDS::replay_start() -{ - dout(1) << "replay_start" << endl; - - // initialize gather sets - set rs; - mdsmap->get_recovery_mds_set(rs); - rs.erase(whoami); - dout(1) << "now replay. my recovery peers are " << rs << endl; - mdcache->set_recovery_set(rs); - - // note: don't actually start yet. boot() will get called once we have - // an mdsmap AND osdmap. -} - -void MDS::replay_done() -{ - dout(1) << "replay_done" << endl; - - if (mdsmap->get_num_in_mds() == 1 && - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! - dout(2) << "i am alone, moving to state reconnect" << endl; - set_want_state(MDSMap::STATE_RECONNECT); - } else { - dout(2) << "i am not alone, moving to state resolve" << endl; - set_want_state(MDSMap::STATE_RESOLVE); - } -} - - -void MDS::resolve_start() -{ - dout(1) << "resolve_start" << endl; - - set who; - mdsmap->get_mds_set(who, MDSMap::STATE_RESOLVE); - mdsmap->get_mds_set(who, MDSMap::STATE_REJOIN); - mdsmap->get_mds_set(who, MDSMap::STATE_ACTIVE); - mdsmap->get_mds_set(who, MDSMap::STATE_STOPPING); - for (set::iterator p = who.begin(); p != who.end(); ++p) { - if (*p == whoami) continue; - mdcache->send_resolve(*p); // now. - } -} -void MDS::resolve_done() -{ - dout(1) << "resolve_done" << endl; - set_want_state(MDSMap::STATE_RECONNECT); -} - -void MDS::reconnect_start() -{ - dout(1) << "reconnect_start" << endl; - server->reconnect_clients(); -} -void MDS::reconnect_done() -{ - dout(1) << "reconnect_done" << endl; - set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state - - /* - if (mdsmap->get_num_in_mds() == 1 && - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! - - // finish processing caps (normally, this happens during rejoin, but we're skipping that...) - mdcache->rejoin_gather_finish(); - - set_want_state(MDSMap::STATE_ACTIVE); // go active - } else { - set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state - } - */ -} - -void MDS::rejoin_joint_start() -{ - dout(1) << "rejoin_joint_start" << endl; - mdcache->rejoin_send_rejoins(); -} -void MDS::rejoin_done() -{ - dout(1) << "rejoin_done" << endl; - mdcache->show_subtrees(); - mdcache->show_cache(); - set_want_state(MDSMap::STATE_ACTIVE); -} - - -void MDS::recovery_done() -{ - dout(1) << "recovery_done -- successful recovery!" 
<< endl; - assert(is_active()); - - // kick anchortable (resent AGREEs) - if (mdsmap->get_anchortable() == whoami) - anchortable->finish_recovery(); - - // kick anchorclient (resent COMMITs) - anchorclient->finish_recovery(); - - mdcache->start_recovered_purges(); - - // tell connected clients - bcast_mds_map(); -} - -void MDS::handle_mds_recovery(int who) -{ - dout(5) << "handle_mds_recovery mds" << who << endl; - - mdcache->handle_mds_recovery(who); - - if (anchortable) - anchortable->handle_mds_recovery(who); - anchorclient->handle_mds_recovery(who); - - queue_waiters(waiting_for_active_peer[who]); - waiting_for_active_peer.erase(who); -} - - -void MDS::shutdown_start() -{ - dout(1) << "shutdown_start" << endl; - derr(0) << "mds shutdown start" << endl; - - // tell everyone to stop. - set active; - mdsmap->get_in_mds_set(active); - for (set::iterator p = active.begin(); - p != active.end(); - p++) { - if (mdsmap->is_up(*p)) { - dout(1) << "sending MShutdownStart to mds" << *p << endl; - send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART), - *p, MDS_PORT_MAIN); - } - } - - // go - set_want_state(MDSMap::STATE_STOPPING); -} - -void MDS::handle_shutdown_start(Message *m) -{ - dout(1) << " handle_shutdown_start" << endl; - - set_want_state(MDSMap::STATE_STOPPING); - delete m; -} - - - -void MDS::stopping_start() -{ - dout(1) << "stopping_start" << endl; - - // start cache shutdown - mdcache->shutdown_start(); - - // terminate client sessions - server->terminate_sessions(); - - // flush log - mdlog->set_max_events(0); - mdlog->trim(NULL); -} -void MDS::stopping_done() -{ - dout(1) << "stopping_done" << endl; - - // tell monitor we shut down cleanly. - set_want_state(MDSMap::STATE_STOPPED); -} - - - -int MDS::shutdown_final() -{ - dout(1) << "shutdown_final" << endl; - - // flush loggers - if (logger) logger->flush(true); - if (logger2) logger2->flush(true); - mdlog->flush_logger(); - - // stop timers - if (beacon_killer) { - timer.cancel_event(beacon_killer); - beacon_killer = 0; - } - if (beacon_sender) { - timer.cancel_event(beacon_sender); - beacon_sender = 0; - } - if (tick_event) { - timer.cancel_event(tick_event); - tick_event = 0; - } - timer.cancel_all(); - timer.join(); - - // shut down cache - mdcache->shutdown(); - - // shut down messenger - messenger->shutdown(); - - return 0; -} - - - - - -void MDS::dispatch(Message *m) -{ - mds_lock.Lock(); - my_dispatch(m); - mds_lock.Unlock(); -} - - - -void MDS::my_dispatch(Message *m) -{ - // from bad mds? - if (m->get_source().is_mds()) { - int from = m->get_source().num(); - if (!mdsmap->have_inst(from) || - mdsmap->get_inst(from) != m->get_source_inst() || - mdsmap->is_down(from)) { - // bogus mds? 
- if (m->get_type() != MSG_MDS_MAP) { - dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source() - << ", dropping" << endl; - delete m; - return; - } else { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", but it's an mdsmap, looking at it" << endl; - } - } - } - - - switch (m->get_dest_port()) { - - case MDS_PORT_ANCHORTABLE: - anchortable->dispatch(m); - break; - case MDS_PORT_ANCHORCLIENT: - anchorclient->dispatch(m); - break; - - case MDS_PORT_CACHE: - mdcache->dispatch(m); - break; - case MDS_PORT_LOCKER: - locker->dispatch(m); - break; - - case MDS_PORT_MIGRATOR: - mdcache->migrator->dispatch(m); - break; - case MDS_PORT_RENAMER: - //mdcache->renamer->dispatch(m); - break; - - case MDS_PORT_BALANCER: - balancer->proc_message(m); - break; - - case MDS_PORT_MAIN: - proc_message(m); - break; - - case MDS_PORT_SERVER: - server->dispatch(m); - break; - - default: - dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << endl; - assert(0); - } - - // finish any triggered contexts - if (finished_queue.size()) { - dout(7) << "mds has " << finished_queue.size() << " queued contexts" << endl; - dout(10) << finished_queue << endl; - list ls; - ls.splice(ls.begin(), finished_queue); - assert(finished_queue.empty()); - finish_contexts(ls); - } - - - // HACK FOR NOW - if (is_active()) { - // flush log to disk after every op. for now. - mdlog->flush(); - - // trim cache - mdcache->trim(); - } - - - // hack: thrash exports - for (int i=0; i s; - if (!is_active()) break; - mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE); - if (s.size() < 2 || mdcache->get_num_inodes() < 10) - break; // need peers for this to work. - - dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf.mds_thrash_exports << endl; - - // pick a random dir inode - CInode *in = mdcache->hack_pick_random_inode(); - - list ls; - in->get_dirfrags(ls); - if (ls.empty()) continue; // must be an open dir. - CDir *dir = ls.front(); - if (!dir->get_parent_dir()) continue; // must be linked. - if (!dir->is_auth()) continue; // must be auth. - - int dest; - do { - int k = rand() % s.size(); - set::iterator p = s.begin(); - while (k--) p++; - dest = *p; - } while (dest == whoami); - mdcache->migrator->export_dir(dir,dest); - } - - - // hack: force hash root? - /* - if (false && - mdcache->get_root() && - mdcache->get_root()->dir && - !(mdcache->get_root()->dir->is_hashed() || - mdcache->get_root()->dir->is_hashing())) { - dout(0) << "hashing root" << endl; - mdcache->migrator->hash_dir(mdcache->get_root()->dir); - } - */ - - - - // HACK to force export to test foreign renames - if (false && whoami == 0) { - /* - static bool didit = false; - - // 7 to 1 - CInode *in = mdcache->get_inode(1001); - if (in && in->is_dir() && !didit) { - CDir *dir = in->get_or_open_dir(mdcache); - if (dir->is_auth()) { - dout(1) << "FORCING EXPORT" << endl; - mdcache->migrator->export_dir(dir,1); - didit = true; - } - } - */ - } - - - - // shut down? 
- if (is_stopping()) { - if (mdcache->shutdown_pass()) { - dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to down:stopped" << endl; - stopping_done(); - } - } - -} - - -void MDS::proc_message(Message *m) -{ - switch (m->get_type()) { - // OSD =============== - /* - case MSG_OSD_MKFS_ACK: - handle_osd_mkfs_ack(m); - return; - */ - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((class MOSDOpReply*)m); - return; - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - return; - - - // MDS - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - return; - - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - return; - - case MSG_MDS_SHUTDOWNSTART: // mds0 -> mds1+ - handle_shutdown_start(m); - return; - - case MSG_PING: - handle_ping((MPing*)m); - return; - - default: - assert(0); - } - -} - - - -void MDS::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - mds_lock.Lock(); - dout(10) << "handle_ms_failure to " << inst << " on " << *m << endl; - - if (m->get_type() == MSG_CLIENT_RECONNECT) - server->client_reconnect_failure(m->get_dest().num()); - - delete m; - mds_lock.Unlock(); -} - - - - - -void MDS::handle_ping(MPing *m) -{ - dout(10) << " received ping from " << m->get_source() << " with seq " << m->seq << endl; - - messenger->send_message(new MPingAck(m), - m->get_source_inst()); - - delete m; -} - - - diff --git a/branches/sage/pgs/mds/MDS.h b/branches/sage/pgs/mds/MDS.h deleted file mode 100644 index ae8fb4d618d4e..0000000000000 --- a/branches/sage/pgs/mds/MDS.h +++ /dev/null @@ -1,296 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDS_H -#define __MDS_H - -#include -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "mdstypes.h" - -#include "msg/Dispatcher.h" -#include "include/types.h" -#include "include/Context.h" -#include "common/DecayCounter.h" -#include "common/Logger.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" -#include "MDSMap.h" - -#include "ClientMap.h" - - - - -class filepath; - -class OSDMap; -class Objecter; -class Filer; - -class Server; -class Locker; -class AnchorTable; -class AnchorClient; -class MDCache; -class MDLog; -class MDBalancer; -class IdAllocator; - -class CInode; -class CDir; -class CDentry; - -class Messenger; -class Message; - -class MClientRequest; -class MClientReply; -class MHashReaddir; -class MHashReaddirReply; - -class MMDSBeacon; - - -class MDS : public Dispatcher { - public: - Mutex mds_lock; - - SafeTimer timer; - - protected: - int whoami; - - public: - Messenger *messenger; - MDSMap *mdsmap; - MonMap *monmap; - OSDMap *osdmap; - Objecter *objecter; - Filer *filer; // for reading/writing to/from osds - - // sub systems - Server *server; - MDCache *mdcache; - Locker *locker; - MDLog *mdlog; - MDBalancer *balancer; - - IdAllocator *idalloc; - - AnchorTable *anchortable; - AnchorClient *anchorclient; - - Logger *logger, *logger2; - - - protected: - // -- MDS state -- - int state; // my confirmed state - int want_state; // the state i want - - list waiting_for_active; - map > waiting_for_active_peer; - - map peer_mdsmap_epoch; - - public: - void wait_for_active(Context *c) { - waiting_for_active.push_back(c); - } - void wait_for_active_peer(int who, Context *c) { - waiting_for_active_peer[who].push_back(c); - } - - bool is_dne() { return state == MDSMap::STATE_DNE; } - bool is_failed() { return state == MDSMap::STATE_FAILED; } - bool is_creating() { return state == MDSMap::STATE_CREATING; } - bool is_starting() { return state == MDSMap::STATE_STARTING; } - bool is_standby() { return state == MDSMap::STATE_STANDBY; } - bool is_replay() { return state == MDSMap::STATE_REPLAY; } - bool is_resolve() { return state == MDSMap::STATE_RESOLVE; } - bool is_reconnect() { return state == MDSMap::STATE_RECONNECT; } - bool is_rejoin() { return state == MDSMap::STATE_REJOIN; } - bool is_active() { return state == MDSMap::STATE_ACTIVE; } - bool is_stopping() { return state == MDSMap::STATE_STOPPING; } - bool is_stopped() { return state == MDSMap::STATE_STOPPED; } - - void set_want_state(int s); - - - // -- waiters -- - list finished_queue; - - void queue_waiter(Context *c) { - finished_queue.push_back(c); - } - void queue_waiters(list& ls) { - finished_queue.splice( finished_queue.end(), ls ); - } - - // -- keepalive beacon -- - version_t beacon_last_seq; // last seq sent to monitor - map beacon_seq_stamp; // seq # -> time sent - utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked - - class C_MDS_BeaconSender : public Context { - MDS *mds; - public: - C_MDS_BeaconSender(MDS *m) : mds(m) {} - void finish(int r) { - mds->beacon_sender = 0; - mds->beacon_send(); - } - } *beacon_sender; - class C_MDS_BeaconKiller : public Context { - MDS *mds; - utime_t lab; - public: - C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {} - void finish(int r) { - mds->beacon_killer = 0; - mds->beacon_kill(lab); - } - } *beacon_killer; - - // tick and other timer fun - class C_MDS_Tick : public Context { - MDS *mds; - public: - 
C_MDS_Tick(MDS *m) : mds(m) {} - void finish(int r) { - mds->tick_event = 0; - mds->tick(); - } - } *tick_event; - void reset_tick(); - - // -- client map -- - ClientMap clientmap; - epoch_t last_client_mdsmap_bcast; - //void log_clientmap(Context *c); - - - // shutdown crap - int req_rate; - - // ino's and fh's - public: - - int get_req_rate() { return req_rate; } - - - public: - MDS(int whoami, Messenger *m, MonMap *mm); - ~MDS(); - - // who am i etc - int get_nodeid() { return whoami; } - MDSMap *get_mds_map() { return mdsmap; } - OSDMap *get_osd_map() { return osdmap; } - - void send_message_mds(Message *m, int mds, int port=0, int fromport=0); - void forward_message_mds(Message *req, int mds, int port=0); - - void send_message_client(Message *m, int client); - void send_message_client(Message *m, entity_inst_t clientinst); - void send_message_client_maybe_open(Message *m, entity_inst_t clientinst); - - - // start up, shutdown - int init(bool standby=false); - void reopen_logger(); - - void bcast_mds_map(); // to mounted clients - - void boot(); - void boot_create(); // i am new mds. - void boot_start(); // i am old but empty (was down:out) mds. - void boot_replay(int step=0); // i am recovering existing (down:failed) mds. - void boot_finish(); - - void replay_start(); - void replay_done(); - void resolve_start(); - void resolve_done(); - void reconnect_start(); - void reconnect_done(); - void rejoin_joint_start(); - void rejoin_done(); - void recovery_done(); - void handle_mds_recovery(int who); - - void shutdown_start(); - void stopping_start(); - void stopping_done(); - int shutdown_final(); - - void tick(); - - void beacon_start(); - void beacon_send(); - void beacon_kill(utime_t lab); - void handle_mds_beacon(MMDSBeacon *m); - void reset_beacon_killer(); - - // messages - void proc_message(Message *m); - virtual void dispatch(Message *m); - void my_dispatch(Message *m); - - void ms_handle_failure(Message *m, const entity_inst_t& inst); - - // special message types - void handle_ping(class MPing *m); - void handle_mds_map(class MMDSMap *m); - void handle_shutdown_start(Message *m); - - // osds - void handle_osd_getmap(Message *m); - void handle_osd_map(class MOSDMap *m); - -}; - - - -class C_MDS_RetryMessage : public Context { - Message *m; - MDS *mds; -public: - C_MDS_RetryMessage(MDS *mds, Message *m) { - assert(m); - this->m = m; - this->mds = mds; - } - virtual void finish(int r) { - mds->my_dispatch(m); - } -}; - - - -#endif diff --git a/branches/sage/pgs/mds/MDSMap.h b/branches/sage/pgs/mds/MDSMap.h deleted file mode 100644 index d72e6a1f21cca..0000000000000 --- a/branches/sage/pgs/mds/MDSMap.h +++ /dev/null @@ -1,343 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
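C_MDS_RetryMessage above captures a message that cannot be handled yet so that, once some prerequisite completes, finish() simply pushes the message back through dispatch. A small stand-alone sketch of that retry pattern, with a toy Message and Dispatcher standing in for the real classes (all names here are assumed, not the originals):

#include <iostream>
#include <list>
#include <string>

struct Message { std::string payload; };

// Generic completion callback, mirroring the Context interface above.
struct Context {
  virtual ~Context() {}
  virtual void finish(int r) = 0;
};

struct Dispatcher {
  bool ready = false;            // e.g. the daemon is not active yet
  std::list<Context*> waiting;   // parked work, like waiting_for_active

  void dispatch(Message *m);     // defined below (needs RetryMessage)

  void become_ready() {
    ready = true;
    std::list<Context*> ls;
    ls.splice(ls.begin(), waiting);   // drain, as in the dispatch loop
    for (Context *c : ls) { c->finish(0); delete c; }
  }
};

// Holds a message and re-injects it into dispatch() when finished,
// the same shape as C_MDS_RetryMessage above.
struct RetryMessage : Context {
  Dispatcher *d;
  Message *m;
  RetryMessage(Dispatcher *d_, Message *m_) : d(d_), m(m_) {}
  void finish(int) override { d->dispatch(m); }
};

void Dispatcher::dispatch(Message *m) {
  if (!ready) {
    // Not ready: park a retry context instead of dropping the message.
    waiting.push_back(new RetryMessage(this, m));
    return;
  }
  std::cout << "handled: " << m->payload << std::endl;
  delete m;
}

int main() {
  Dispatcher d;
  d.dispatch(new Message{"client request"});  // parked
  d.become_ready();                           // replays the parked message
  return 0;
}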
- * - */ - - -#ifndef __MDSMAP_H -#define __MDSMAP_H - -#include "common/Clock.h" -#include "msg/Message.h" - -#include "include/types.h" - -#include -#include -#include -using namespace std; - - -/* - - beautiful state diagram: - - STOPPED DNE FAILED - / | \ / | | - / | \________ _______/ | | -| v v v v v -| STARTING <--> STANDBY <--> CREATING REPLAY -> RECONNECT -> REJOIN -| \ / / -| \____ ____________/ / - \ v v / - \ ACTIVE <----------------------------------------/ - \ | - \ | - \ v - \-- STOPPING - - - - -*/ - - -class MDSMap { - public: - // mds states - static const int STATE_DNE = 0; // down, never existed. - static const int STATE_STOPPED = -1; // down, once existed, but no subtrees. empty log. - static const int STATE_FAILED = 2; // down, active subtrees; needs to be recovered. - - static const int STATE_BOOT = -3; // up, boot announcement. destiny unknown. - static const int STATE_STANDBY = -4; // up, idle. waiting for assignment by monitor. - static const int STATE_CREATING = -5; // up, creating MDS instance (new journal, idalloc..). - static const int STATE_STARTING = -6; // up, starting prior stopped MDS instance. - - static const int STATE_REPLAY = 7; // up, starting prior failed instance. scanning journal. - static const int STATE_RESOLVE = 8; // up, disambiguating distributed operations (import, rename, etc.) - static const int STATE_RECONNECT = 9; // up, reconnect to clients - static const int STATE_REJOIN = 10; // up, replayed journal, rejoining distributed cache - static const int STATE_ACTIVE = 11; // up, active - static const int STATE_STOPPING = 12; // up, exporting metadata (-> standby or out) - - static const char *get_state_name(int s) { - switch (s) { - // down and out - case STATE_DNE: return "down:dne"; - case STATE_STOPPED: return "down:stopped"; - // down and in - case STATE_FAILED: return "down:failed"; - // up and out - case STATE_BOOT: return "up:boot"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - case STATE_STANDBY: return "up:standby"; - // up and in - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - case STATE_RECONNECT: return "up:reconnect"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; - default: assert(0); - } - return 0; - } - - protected: - epoch_t epoch; - utime_t created; - epoch_t same_inst_since; - - int target_num; - int anchortable; // which MDS has anchortable (fixme someday) - int root; // which MDS has root directory - - set mds_created; // which mds ids have initialized journals and id tables. 
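The state constants above fold two axes into a single integer: whether the daemon is up or down (the comments mark DNE, STOPPED and FAILED as the "down" states) and what role it plays in the cluster. A compact sketch of a name/classification helper over those same numeric values (a stripped-down rewrite for illustration, not the original class):

#include <initializer_list>
#include <iostream>

// Same numeric values as the MDSMap constants above.
static const int STATE_DNE       = 0;
static const int STATE_STOPPED   = -1;
static const int STATE_FAILED    = 2;
static const int STATE_BOOT      = -3;
static const int STATE_STANDBY   = -4;
static const int STATE_CREATING  = -5;
static const int STATE_STARTING  = -6;
static const int STATE_REPLAY    = 7;
static const int STATE_RESOLVE   = 8;
static const int STATE_RECONNECT = 9;
static const int STATE_REJOIN    = 10;
static const int STATE_ACTIVE    = 11;
static const int STATE_STOPPING  = 12;

const char *state_name(int s) {
  switch (s) {
    case STATE_DNE:       return "down:dne";
    case STATE_STOPPED:   return "down:stopped";
    case STATE_FAILED:    return "down:failed";
    case STATE_BOOT:      return "up:boot";
    case STATE_STANDBY:   return "up:standby";
    case STATE_CREATING:  return "up:creating";
    case STATE_STARTING:  return "up:starting";
    case STATE_REPLAY:    return "up:replay";
    case STATE_RESOLVE:   return "up:resolve";
    case STATE_RECONNECT: return "up:reconnect";
    case STATE_REJOIN:    return "up:rejoin";
    case STATE_ACTIVE:    return "up:active";
    case STATE_STOPPING:  return "up:stopping";
    default:              return "unknown";
  }
}

// "down" covers exactly the three states whose comments above say so.
bool is_down(int s) {
  return s == STATE_DNE || s == STATE_STOPPED || s == STATE_FAILED;
}

int main() {
  for (int s : {STATE_STANDBY, STATE_FAILED, STATE_ACTIVE})
    std::cout << state_name(s) << (is_down(s) ? " (down)" : " (up)") << "\n";
  return 0;
}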
- map mds_state; // MDS state - map mds_state_seq; - map mds_inst; // up instances - map mds_inc; // incarnation count (monotonically increases) - - friend class MDSMonitor; - - public: - MDSMap() : epoch(0), same_inst_since(0), anchortable(0), root(0) {} - - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - const utime_t& get_create() const { return created; } - epoch_t get_same_inst_since() const { return same_inst_since; } - - int get_anchortable() const { return anchortable; } - int get_root() const { return root; } - - // counts - int get_num_mds() { - return get_num_in_mds(); - } - int get_num_mds(int state) { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) ++n; - return n; - } - - int get_num_in_mds() { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second > 0) ++n; - return n; - } - - // sets - void get_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - s.insert(p->first); - } - void get_mds_set(set& s, int state) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) - s.insert(p->first); - } - void get_up_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first)) s.insert(p->first); - } - void get_in_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_in(p->first)) s.insert(p->first); - } - void get_active_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_ACTIVE); - } - void get_failed_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_FAILED); - } - void get_recovery_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_failed(p->first) || - (p->second >= STATE_REPLAY && p->second <= STATE_STOPPING)) - s.insert(p->first); - } - - int get_random_in_mds() { - vector v; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second > 0) v.push_back(p->first); - if (v.empty()) - return -1; - else - return v[rand() % v.size()]; - } - - - // mds states - bool is_down(int m) { return is_dne(m) || is_stopped(m) || is_failed(m); } - bool is_up(int m) { return !is_down(m); } - bool is_in(int m) { return mds_state.count(m) && mds_state[m] > 0; } - bool is_out(int m) { return !mds_state.count(m) || mds_state[m] <= 0; } - - bool is_dne(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_DNE; } - bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } - - bool is_boot(int m) { return mds_state.count(m) && mds_state[m] == STATE_BOOT; } - bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; } - bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } - bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } - bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } - bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; } - bool is_reconnect(int m) { return mds_state.count(m) && mds_state[m] == STATE_RECONNECT; } - bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; } - bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } - bool is_stopping(int m) { return mds_state.count(m) && 
mds_state[m] == STATE_STOPPING; } - bool is_active_or_stopping(int m) { return is_active(m) || is_stopping(m); } - bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } - - bool has_created(int m) { return mds_created.count(m); } - - // cluster states - bool is_full() { - return get_num_in_mds() >= target_num; - } - bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. - return - get_num_mds(STATE_REPLAY) + - get_num_mds(STATE_RESOLVE) + - get_num_mds(STATE_RECONNECT) + - get_num_mds(STATE_REJOIN) + - get_num_mds(STATE_FAILED); - } - bool is_rejoining() { - // nodes are rejoining cache state - return - get_num_mds(STATE_REJOIN) > 0 && - get_num_mds(STATE_REPLAY) == 0 && - get_num_mds(STATE_RECONNECT) == 0 && - get_num_mds(STATE_RESOLVE) == 0 && - get_num_mds(STATE_FAILED) == 0; - } - bool is_stopped() { - return - get_num_in_mds() == 0 && - get_num_mds(STATE_CREATING) == 0 && - get_num_mds(STATE_STARTING) == 0 && - get_num_mds(STATE_STANDBY) == 0; - } - - - int get_state(int m) { - if (mds_state.count(m)) - return mds_state[m]; - else - return STATE_DNE; - } - - // inst - bool have_inst(int m) { - return mds_inst.count(m); - } - const entity_inst_t& get_inst(int m) { - assert(mds_inst.count(m)); - return mds_inst[m]; - } - bool get_inst(int m, entity_inst_t& inst) { - if (mds_inst.count(m)) { - inst = mds_inst[m]; - return true; - } - return false; - } - - int get_addr_rank(const entity_addr_t& addr) { - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (p->second.addr == addr) return p->first; - } - /*else - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first; - } - */ - - return -1; - } - - int get_inc(int m) { - assert(mds_inc.count(m)); - return mds_inc[m]; - } - - - void remove_mds(int m) { - mds_inst.erase(m); - mds_state.erase(m); - mds_state_seq.erase(m); - } - - - // serialize, unserialize - void encode(bufferlist& bl) { - ::_encode(epoch, bl); - ::_encode(target_num, bl); - ::_encode(created, bl); - ::_encode(same_inst_since, bl); - ::_encode(anchortable, bl); - ::_encode(root, bl); - ::_encode(mds_state, bl); - ::_encode(mds_state_seq, bl); - ::_encode(mds_inst, bl); - ::_encode(mds_inc, bl); - } - - void decode(bufferlist& bl) { - int off = 0; - ::_decode(epoch, bl, off); - ::_decode(target_num, bl, off); - ::_decode(created, bl, off); - ::_decode(same_inst_since, bl, off); - ::_decode(anchortable, bl, off); - ::_decode(root, bl, off); - ::_decode(mds_state, bl, off); - ::_decode(mds_state_seq, bl, off); - ::_decode(mds_inst, bl, off); - ::_decode(mds_inc, bl, off); - } - - - /*** mapping functions ***/ - - int hash_dentry( inodeno_t dirino, const string& dn ); -}; - -#endif diff --git a/branches/sage/pgs/mds/Migrator.cc b/branches/sage/pgs/mds/Migrator.cc deleted file mode 100644 index 09a3bcc205352..0000000000000 --- a/branches/sage/pgs/mds/Migrator.cc +++ /dev/null @@ -1,1988 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
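MDSMap::encode() and decode() above serialize the map's fields in one fixed order into a bufferlist, and decode walks the same order with an integer offset cursor. The following sketch shows only that fixed-order/offset idea over a plain byte vector; bufferlist and ::_encode/_decode are the project's own helpers, and the stand-ins here handle just PODs and a map<int,int>:

#include <cassert>
#include <cstring>
#include <iostream>
#include <map>
#include <vector>

typedef std::vector<char> buffer;   // stand-in for the real bufferlist

template <typename T>
void encode_raw(const T &v, buffer &bl) {
  const char *p = reinterpret_cast<const char *>(&v);
  bl.insert(bl.end(), p, p + sizeof(T));
}

template <typename T>
void decode_raw(T &v, const buffer &bl, int &off) {
  std::memcpy(&v, bl.data() + off, sizeof(T));
  off += sizeof(T);
}

void encode_map(const std::map<int, int> &m, buffer &bl) {
  int n = (int)m.size();
  encode_raw(n, bl);
  for (std::map<int, int>::const_iterator p = m.begin(); p != m.end(); ++p) {
    encode_raw(p->first, bl);
    encode_raw(p->second, bl);
  }
}

void decode_map(std::map<int, int> &m, const buffer &bl, int &off) {
  int n; decode_raw(n, bl, off);
  while (n--) {
    int k, v;
    decode_raw(k, bl, off);
    decode_raw(v, bl, off);
    m[k] = v;
  }
}

int main() {
  // Fields must be decoded in exactly the order they were encoded.
  int epoch = 42;
  std::map<int, int> mds_state; mds_state[0] = 11; mds_state[1] = 7;

  buffer bl;
  encode_raw(epoch, bl);
  encode_map(mds_state, bl);

  int off = 0, epoch2;
  std::map<int, int> state2;
  decode_raw(epoch2, bl, off);
  decode_map(state2, bl, off);

  assert(epoch2 == epoch && state2 == mds_state);
  std::cout << "decoded epoch " << epoch2 << ", " << state2.size() << " mds entries\n";
  return 0;
}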
- * - */ - -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "Migrator.h" -#include "Locker.h" -#include "Migrator.h" - -#include "MDBalancer.h" -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "msg/Messenger.h" - -#include "messages/MClientFileCaps.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirCancel.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirAck.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " - - - -void Migrator::dispatch(Message *m) -{ - switch (m->get_type()) { - // import - case MSG_MDS_EXPORTDIRDISCOVER: - handle_export_discover((MExportDirDiscover*)m); - break; - case MSG_MDS_EXPORTDIRPREP: - handle_export_prep((MExportDirPrep*)m); - break; - case MSG_MDS_EXPORTDIR: - handle_export_dir((MExportDir*)m); - break; - case MSG_MDS_EXPORTDIRFINISH: - handle_export_finish((MExportDirFinish*)m); - break; - case MSG_MDS_EXPORTDIRCANCEL: - handle_export_cancel((MExportDirCancel*)m); - break; - - // export - case MSG_MDS_EXPORTDIRDISCOVERACK: - handle_export_discover_ack((MExportDirDiscoverAck*)m); - break; - case MSG_MDS_EXPORTDIRPREPACK: - handle_export_prep_ack((MExportDirPrepAck*)m); - break; - case MSG_MDS_EXPORTDIRACK: - handle_export_ack((MExportDirAck*)m); - break; - case MSG_MDS_EXPORTDIRNOTIFYACK: - handle_export_notify_ack((MExportDirNotifyAck*)m); - break; - - // export 3rd party (dir_auth adjustments) - case MSG_MDS_EXPORTDIRNOTIFY: - handle_export_notify((MExportDirNotify*)m); - break; - - default: - assert(0); - } -} - - -class C_MDC_EmptyImport : public Context { - Migrator *mig; - CDir *dir; -public: - C_MDC_EmptyImport(Migrator *m, CDir *d) : mig(m), dir(d) {} - void finish(int r) { - mig->export_empty_import(dir); - } -}; - - -void Migrator::export_empty_import(CDir *dir) -{ - dout(7) << "export_empty_import " << *dir << endl; - - if (dir->inode->is_auth()) return; - if (!dir->is_auth()) return; - - if (dir->inode->is_freezing() || dir->inode->is_frozen()) return; - if (dir->is_freezing() || dir->is_frozen()) return; - - if (dir->get_size() > 0) { - dout(7) << "not actually empty" << endl; - return; - } - - if (dir->inode->is_root()) { - dout(7) << "root" << endl; - return; - } - - // is it really empty? - if (!dir->is_complete()) { - dout(7) << "not complete, fetching." << endl; - dir->fetch(new C_MDC_EmptyImport(this,dir)); - return; - } - - int dest = dir->inode->authority().first; - - // comment this out ot wreak havoc? - //if (mds->is_shutting_down()) dest = 0; // this is more efficient. 
- - dout(7) << "really empty, exporting to " << dest << endl; - assert (dest != mds->get_nodeid()); - - dout(-7) << "exporting to mds" << dest - << " empty import " << *dir << endl; - export_dir( dir, dest ); -} - - - - -// ========================================================== -// mds failure handling - -void Migrator::handle_mds_failure_or_stop(int who) -{ - dout(5) << "handle_mds_failure_or_stop mds" << who << endl; - - // check my exports - map::iterator p = export_state.begin(); - while (p != export_state.end()) { - map::iterator next = p; - next++; - CDir *dir = p->first; - - // abort exports: - // - that are going to the failed node - // - that aren't frozen yet (to avoid auth_pin deadlock) - if (export_peer[dir] == who || - p->second == EXPORT_DISCOVERING || p->second == EXPORT_FREEZING) { - // the guy i'm exporting to failed, or we're just freezing. - dout(10) << "cleaning up export state " << p->second << " of " << *dir << endl; - - switch (p->second) { - case EXPORT_DISCOVERING: - dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << endl; - dir->unfreeze_tree(); // cancel the freeze - dir->auth_unpin(); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); - if (export_peer[dir] != who) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); - break; - - case EXPORT_FREEZING: - dout(10) << "export state=freezing : canceling freeze" << endl; - dir->unfreeze_tree(); // cancel the freeze - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); - if (export_peer[dir] != who) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); - break; - - // NOTE: state order reversal, warning comes after loggingstart+prepping - case EXPORT_WARNING: - dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << endl; - // fall-thru - - //case EXPORT_LOGGINGSTART: - case EXPORT_PREPPING: - if (p->second != EXPORT_WARNING) - dout(10) << "export state=loggingstart|prepping : unpinning bounds, unfreezing" << endl; - // unpin bounds - for (set::iterator p = export_bounds[dir].begin(); - p != export_bounds[dir].end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - dir->unfreeze_tree(); - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); - break; - - case EXPORT_EXPORTING: - dout(10) << "export state=exporting : reversing, and unfreezing" << endl; - export_reverse(dir); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); - break; - - case EXPORT_LOGGINGFINISH: - case EXPORT_NOTIFYING: - dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << endl; - // leave export_state, don't clean up now. - break; - - default: - assert(0); - } - - // finish clean-up? 
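handle_mds_failure_or_stop above walks export_state while several branches erase the current entry, so each iteration saves the successor iterator before doing any work and resumes from it afterwards. A minimal sketch of that erase-while-iterating idiom over a std::map (the directory names and the abort condition are made up for illustration):

#include <iostream>
#include <map>
#include <string>

int main() {
  // dir name -> export state; contents are illustrative only.
  std::map<std::string, int> export_state;
  export_state["/a"] = 1;   // say: DISCOVERING
  export_state["/b"] = 5;   // say: EXPORTING
  export_state["/c"] = 1;

  int aborted_state = 1;    // pretend these entries must be cleaned up

  std::map<std::string, int>::iterator p = export_state.begin();
  while (p != export_state.end()) {
    // Grab the successor first: erasing *p must not invalidate our cursor.
    std::map<std::string, int>::iterator next = p;
    ++next;

    if (p->second == aborted_state) {
      std::cout << "aborting export of " << p->first << "\n";
      export_state.erase(p);   // safe: 'next' is unaffected
    }

    p = next;   // continue from the saved successor
  }

  std::cout << export_state.size() << " exports still in flight\n";
  return 0;
}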
- if (export_state.count(dir) == 0) { - export_peer.erase(dir); - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - // unpin the path - vector trace; - cache->make_trace(trace, dir->inode); - mds->locker->dentry_anon_rdlock_trace_finish(trace); - - // wake up any waiters - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // send pending import_maps? (these need to go out when all exports have finished.) - cache->maybe_send_pending_resolves(); - - cache->show_subtrees(); - } - } else { - // bystander failed. - if (export_warning_ack_waiting.count(dir) && - export_warning_ack_waiting[dir].count(who)) { - export_warning_ack_waiting[dir].erase(who); - export_notify_ack_waiting[dir].erase(who); // they won't get a notify either. - if (p->second == EXPORT_WARNING) { - // exporter waiting for warning acks, let's fake theirs. - dout(10) << "faking export_warning_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << endl; - if (export_warning_ack_waiting[dir].empty()) - export_go(dir); - } - } - if (export_notify_ack_waiting.count(dir) && - export_notify_ack_waiting[dir].count(who)) { - export_notify_ack_waiting[dir].erase(who); - if (p->second == EXPORT_NOTIFYING) { - // exporter is waiting for notify acks, fake it - dout(10) << "faking export_notify_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << endl; - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); - } - } - } - - // next! - p = next; - } - - - // check my imports - map::iterator q = import_state.begin(); - while (q != import_state.end()) { - map::iterator next = q; - next++; - dirfrag_t df = q->first; - CInode *diri = mds->mdcache->get_inode(df.ino); - CDir *dir = mds->mdcache->get_dirfrag(df); - - if (import_peer[df] == who) { - switch (import_state[df]) { - case IMPORT_DISCOVERING: - dout(10) << "import state=discovering : clearing state" << endl; - import_state.erase(df); - import_peer.erase(df); - break; - - case IMPORT_DISCOVERED: - dout(10) << "import state=discovered : unpinning inode " << *diri << endl; - assert(diri); - // unpin base - diri->put(CInode::PIN_IMPORTING); - import_state.erase(df); - import_peer.erase(df); - break; - - case IMPORT_PREPPING: - if (import_state[df] == IMPORT_PREPPING) { - dout(10) << "import state=prepping : unpinning base+bounds " << *dir << endl; - } - assert(dir); - import_reverse_unpin(dir); // unpin - break; - - case IMPORT_PREPPED: - dout(10) << "import state=prepping : unpinning base+bounds, unfreezing " << *dir << endl; - assert(dir); - - // adjust auth back to me - cache->adjust_subtree_auth(dir, import_peer[df]); - cache->try_subtree_merge(dir); - - // bystanders? - if (import_bystanders[dir].empty()) { - import_reverse_unfreeze(dir); - } else { - // notify them; wait in aborting state - import_notify_abort(dir); - import_state[df] = IMPORT_ABORTING; - } - break; - - case IMPORT_LOGGINGSTART: - dout(10) << "import state=loggingstart : reversing import on " << *dir << endl; - import_reverse(dir); - break; - - case IMPORT_ACKING: - // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate - dout(10) << "import state=acking : noting ambiguous import " << *dir << endl; - cache->add_ambiguous_import(dir, import_bounds[dir]); - break; - - case IMPORT_ABORTING: - dout(10) << "import state=aborting : ignoring repeat failure " << *dir << endl; - break; - } - } - - // next! 
- q = next; - } -} - - - -void Migrator::show_importing() -{ - dout(10) << "show_importing" << endl; - for (map::iterator p = import_state.begin(); - p != import_state.end(); - p++) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - if (dir) { - dout(10) << " importing to " << import_peer[p->first] - << ": (" << p->second << ") " << get_import_statename(p->second) - << " " << p->first - << " " << *dir - << endl; - } else { - dout(10) << " importing to " << import_peer[p->first] - << ": (" << p->second << ") " << get_import_statename(p->second) - << " " << p->first - << endl; - } - } -} - -void Migrator::show_exporting() -{ - dout(10) << "show_exporting" << endl; - for (map::iterator p = export_state.begin(); - p != export_state.end(); - p++) - dout(10) << " exporting to " << export_peer[p->first] - << ": (" << p->second << ") " << get_export_statename(p->second) - << " " << p->first->dirfrag() - << " " << *p->first - << endl; -} - - - -void Migrator::audit() -{ - if (g_conf.debug_mds < 5) return; // hrm. - - // import_state - show_importing(); - for (map::iterator p = import_state.begin(); - p != import_state.end(); - p++) { - if (p->second == IMPORT_DISCOVERING) - continue; - if (p->second == IMPORT_DISCOVERED) { - CInode *in = cache->get_inode(p->first.ino); - assert(in); - continue; - } - CDir *dir = cache->get_dirfrag(p->first); - assert(dir); - if (p->second == IMPORT_PREPPING) - continue; - assert(dir->is_ambiguous_dir_auth()); - assert(dir->authority().first == mds->get_nodeid() || - dir->authority().second == mds->get_nodeid()); - } - - // export_state - show_exporting(); - for (map::iterator p = export_state.begin(); - p != export_state.end(); - p++) { - CDir *dir = p->first; - if (p->second == EXPORT_DISCOVERING || - p->second == EXPORT_FREEZING) continue; - assert(dir->is_ambiguous_dir_auth()); - assert(dir->authority().first == mds->get_nodeid() || - dir->authority().second == mds->get_nodeid()); - } - - // ambiguous+me subtrees should be importing|exporting - - // write me -} - - - - - -// ========================================================== -// EXPORT - - -class C_MDC_ExportFreeze : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - -public: - C_MDC_ExportFreeze(Migrator *m, CDir *e) : - mig(m), ex(e) {} - virtual void finish(int r) { - if (r >= 0) - mig->export_frozen(ex); - } -}; - - -/** export_dir(dir, dest) - * public method to initiate an export. - * will fail if the directory is freezing, frozen, unpinnable, or root. - */ -void Migrator::export_dir(CDir *dir, int dest) -{ - dout(7) << "export_dir " << *dir << " to " << dest << endl; - assert(dir->is_auth()); - assert(dest != mds->get_nodeid()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no exports for now" << endl; - return; - } - - if (dir->inode->is_root()) { - dout(7) << "i won't export root" << endl; - //assert(0); - return; - } - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl; - return; - } - if (dir->state_test(CDir::STATE_EXPORTING)) { - dout(7) << "already exporting" << endl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!mds->locker->dentry_can_rdlock_trace(trace)) { - dout(7) << "export_dir couldn't pin path, failing." << endl; - return; - } - - // ok. 
- mds->locker->dentry_anon_rdlock_trace_start(trace); - assert(export_state.count(dir) == 0); - export_state[dir] = EXPORT_DISCOVERING; - export_peer[dir] = dest; - - dir->state_set(CDir::STATE_EXPORTING); - dir->get(CDir::PIN_EXPORTING); - - // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir), export_peer[dir], MDS_PORT_MIGRATOR); - - // start the freeze, but hold it up with an auth_pin. - dir->auth_pin(); - dir->freeze_tree(new C_MDC_ExportFreeze(this, dir)); -} - - -/* - * called on receipt of MExportDirDiscoverAck - * the importer now has the directory's _inode_ in memory, and pinned. - */ -void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - dout(7) << "export_discover_ack from " << m->get_source() - << " on " << *dir << endl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_DISCOVERING || - export_peer[dir] != m->get_source().num()) { - dout(7) << "must have aborted" << endl; - } else { - // freeze the subtree - export_state[dir] = EXPORT_FREEZING; - dir->auth_unpin(); - } - - delete m; // done -} - -void Migrator::export_frozen(CDir *dir) -{ - dout(7) << "export_frozen on " << *dir << endl; - assert(dir->is_frozen()); - int dest = export_peer[dir]; - - // ok! - cache->show_subtrees(); - - // note the bounds. - // force it into a subtree by listing auth as . - cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid()); - cache->get_subtree_bounds(dir, export_bounds[dir]); - set &bounds = export_bounds[dir]; - - // generate prep message, log entry. - MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag()); - - // include list of bystanders - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - p++) { - if (p->first != dest) { - dout(10) << "bystander mds" << p->first << endl; - prep->add_bystander(p->first); - } - } - - // include spanning tree for all nested exports. - // these need to be on the destination _before_ the final export so that - // dir_auth updates on any nested exports are properly absorbed. - // this includes inodes and dirfrags included in the subtree, but - // only the inodes at the bounds. - set inodes_added; - - // include base dirfrag - prep->add_dirfrag( new CDirDiscover(dir, dir->add_replica(dest)) ); - - // check bounds - for (set::iterator it = bounds.begin(); - it != bounds.end(); - it++) { - CDir *bound = *it; - - // pin it. - bound->get(CDir::PIN_EXPORTBOUND); - bound->state_set(CDir::STATE_EXPORTBOUND); - - dout(7) << " export bound " << *bound << endl; - - prep->add_export( bound->dirfrag() ); - - /* first assemble each trace, in trace order, and put in message */ - list inode_trace; - - // trace to dir - CDir *cur = bound; - while (cur != dir) { - // don't repeat ourselves - if (inodes_added.count(cur->ino())) break; // did already! - inodes_added.insert(cur->ino()); - - // inode - assert(cur->inode->is_auth()); - inode_trace.push_front(cur->inode); - dout(7) << " will add " << *cur->inode << endl; - - // include the dirfrag? only if it's not the bounding subtree root. - if (cur != bound) { - assert(cur->is_auth()); - prep->add_dirfrag( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay! 
- dout(7) << " added " << *cur << endl; - } - - cur = cur->get_parent_dir(); - } - - for (list::iterator it = inode_trace.begin(); - it != inode_trace.end(); - it++) { - CInode *in = *it; - dout(7) << " added " << *in << endl; - prep->add_inode( in->parent->get_dir()->dirfrag(), - in->parent->get_name(), - in->replicate_to(dest) ); - } - - } - - // send. - export_state[dir] = EXPORT_PREPPING; - mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_prep_ack(MExportDirPrepAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - dout(7) << "export_prep_ack " << *dir << endl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_PREPPING) { - // export must have aborted. - dout(7) << "export must have aborted" << endl; - delete m; - return; - } - - // send warnings - assert(export_peer.count(dir)); - int dest = export_peer[dir]; - assert(export_warning_ack_waiting.count(dir) == 0); - assert(export_notify_ack_waiting.count(dir) == 0); - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) { - if (p->first == dest) continue; - if (!mds->mdsmap->is_active_or_stopping(p->first)) - continue; // only if active - export_warning_ack_waiting[dir].insert(p->first); - export_notify_ack_waiting[dir].insert(p->first); // we'll eventually get a notifyack, too! - - MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(),CDIR_AUTH_UNKNOWN), - pair(mds->get_nodeid(),export_peer[dir])); - notify->copy_bounds(export_bounds[dir]); - mds->send_message_mds(notify, p->first, MDS_PORT_MIGRATOR); - - } - export_state[dir] = EXPORT_WARNING; - - // nobody to warn? - if (export_warning_ack_waiting.count(dir) == 0) - export_go(dir); // start export. - - // done. - delete m; -} - - -void Migrator::export_go(CDir *dir) -{ - assert(export_peer.count(dir)); - int dest = export_peer[dir]; - dout(7) << "export_go " << *dir << " to " << dest << endl; - - cache->show_subtrees(); - - export_warning_ack_waiting.erase(dir); - export_state[dir] = EXPORT_EXPORTING; - - assert(export_bounds.count(dir) == 1); - assert(export_data.count(dir) == 0); - - assert(dir->get_cum_auth_pins() == 0); - - // set ambiguous auth - cache->adjust_subtree_auth(dir, dest, mds->get_nodeid()); - cache->verify_subtree_bounds(dir, export_bounds[dir]); - - // fill export message with cache data - C_Contexts *fin = new C_Contexts; // collect all the waiters - map exported_client_map; - int num_exported_inodes = encode_export_dir( export_data[dir], - fin, - dir, // base - dir, // recur start point - dest, - exported_client_map ); - bufferlist bl; - ::_encode(exported_client_map, bl); - export_data[dir].push_front(bl); - - // send the export data! - MExportDir *req = new MExportDir(dir->dirfrag()); - - // export state - req->set_dirstate( export_data[dir] ); - - // add bounds to message - for (set::iterator p = export_bounds[dir].begin(); - p != export_bounds[dir].end(); - ++p) - req->add_export((*p)->dirfrag()); - - //s end - mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); - - // queue up the finisher - dir->add_waiter( CDir::WAIT_UNFREEZE, fin ); - - // take away the popularity we're sending. FIXME: do this later? - mds->balancer->subtract_export(dir); - - // stats - if (mds->logger) mds->logger->inc("ex"); - if (mds->logger) mds->logger->inc("iex", num_exported_inodes); - - cache->show_subtrees(); -} - - -/** encode_export_inode - * update our local state for this inode to export. 
- * encode relevant state to be sent over the wire. - * used by: encode_export_dir, file_rename (if foreign) - */ -void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth, - map& exported_client_map) -{ - // tell (all) clients about migrating caps.. mark STALE - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl; - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_STALE, - in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()); - entity_inst_t inst = mds->clientmap.get_inst(it->first); - exported_client_map[it->first] = inst; - mds->send_message_client(m, inst); - } - - // relax locks? - if (!in->is_replicated()) - in->replicate_relax_locks(); - - // add inode - assert(!in->is_replica(mds->get_nodeid())); - CInodeExport istate( in ); - istate._encode( enc_state ); - - // we're export this inode; fix inode state - dout(7) << "encode_export_inode " << *in << endl; - - if (in->is_dirty()) in->mark_clean(); - - // clear/unpin cached_by (we're no longer the authority) - in->clear_replicas(); - - // twiddle lock states for auth -> replica transition - in->authlock.export_twiddle(); - in->linklock.export_twiddle(); - in->dirfragtreelock.export_twiddle(); - in->filelock.export_twiddle(); - in->dirlock.export_twiddle(); - - // mark auth - assert(in->is_auth()); - in->state_clear(CInode::STATE_AUTH); - in->replica_nonce = CInode::EXPORT_NONCE; - - // *** other state too? - - // move to end of LRU so we drop out of cache quickly! - if (in->get_parent_dn()) - cache->lru.lru_bottouch(in->get_parent_dn()); -} - - -int Migrator::encode_export_dir(list& dirstatelist, - C_Contexts *fin, - CDir *basedir, - CDir *dir, - int newauth, - map& exported_client_map) -{ - int num_exported = 0; - - dout(7) << "encode_export_dir " << *dir << " " << dir->nitems << " items" << endl; - - assert(dir->get_projected_version() == dir->get_version()); - - // dir - bufferlist enc_dir; - - CDirExport dstate(dir); - dstate._encode( enc_dir ); - - // release open_by - dir->clear_replicas(); - - // mark - assert(dir->is_auth()); - dir->state_clear(CDir::STATE_AUTH); - dir->replica_nonce = CDir::NONCE_EXPORT; - - list subdirs; - - if (dir->is_dirty()) - dir->mark_clean(); - - // discard most dir state - dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. - - // suck up all waiters - list waiting; - dir->take_waiting(CDir::WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // dentries - CDir_map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - num_exported++; - - // -- dentry - dout(7) << "encode_export_dir exporting " << *dn << endl; - - // name - _encode(it->first, enc_dir); - - // state - it->second->encode_export_state(enc_dir); - - // points to... - - // null dentry? - if (dn->is_null()) { - enc_dir.append("N", 1); // null dentry - continue; - } - - if (dn->is_remote()) { - // remote link - enc_dir.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - enc_dir.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - enc_dir.append("I", 1); // inode dentry - - encode_export_inode(in, enc_dir, newauth, exported_client_map); // encode, and (update state for) export - - // directory? 
- list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - if (!dir->state_test(CDir::STATE_EXPORTBOUND)) { - // include nested dirfrag - assert(dir->get_dir_auth().first == CDIR_AUTH_PARENT); - subdirs.push_back(dir); // it's ours, recurse (later) - } - } - - // waiters - list waiters; - in->take_waiting(CInode::WAIT_ANY, waiters); - fin->take(waiters); - } - - // add to dirstatelist - bufferlist bl; - dirstatelist.push_back( bl ); - dirstatelist.back().claim( enc_dir ); - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth, - exported_client_map); - - return num_exported; -} - - -class C_MDS_ExportFinishLogged : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {} - void finish(int r) { - migrator->export_logged_finish(dir); - } -}; - - -/* - * i should get an export_ack from the export target. - */ -void Migrator::handle_export_ack(MExportDirAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - assert(dir->is_frozen_tree_root()); // i'm exporting! - - // yay! - dout(7) << "handle_export_ack " << *dir << endl; - - export_warning_ack_waiting.erase(dir); - - export_state[dir] = EXPORT_LOGGINGFINISH; - export_data.erase(dir); - - // log completion - EExport *le = new EExport(dir); - le->metablob.add_dir( dir, false ); - for (set::iterator p = export_bounds[dir].begin(); - p != export_bounds[dir].end(); - ++p) { - CDir *bound = *p; - le->get_bounds().insert(bound->dirfrag()); - le->metablob.add_dir_context(bound); - le->metablob.add_dir(bound, false); - } - - // log export completion, then finish (unfreeze, trigger finish context, etc.) - dir->get(CDir::PIN_LOGGINGEXPORTFINISH); - mds->mdlog->submit_entry(le, - new C_MDS_ExportFinishLogged(this, dir)); - - delete m; -} - - - -/* - * this happens if hte dest failes after i send teh export data but before it is acked - * that is, we don't know they safely received and logged it, so we reverse our changes - * and go on. - */ -void Migrator::export_reverse(CDir *dir) -{ - dout(7) << "export_reverse " << *dir << endl; - - assert(export_state[dir] == EXPORT_EXPORTING); - assert(export_bounds.count(dir)); - assert(export_data.count(dir)); - - // adjust auth, with possible subtree merge. 
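encode_export_dir above writes one record per dentry, each prefixed with a single tag byte: "N" for a null dentry, "L" followed by the remote ino for a remote link, and "I" followed by the full exported inode state. A sketch of that tag-byte framing, with a bare ino standing in for the real CInodeExport payload (the buffer type and helper names are assumptions):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

typedef std::vector<char> buffer;   // stand-in for the real bufferlist
typedef uint64_t inodeno_t;         // stand-in for the real ino type

void encode_null(buffer &bl) { bl.push_back('N'); }

void encode_remote(buffer &bl, inodeno_t ino) {
  bl.push_back('L');
  const char *p = reinterpret_cast<const char *>(&ino);
  bl.insert(bl.end(), p, p + sizeof(ino));
}

void encode_primary(buffer &bl, inodeno_t ino) {
  // The real code appends a full CInodeExport here; an ino is enough for the sketch.
  bl.push_back('I');
  const char *p = reinterpret_cast<const char *>(&ino);
  bl.insert(bl.end(), p, p + sizeof(ino));
}

void decode_all(const buffer &bl) {
  size_t off = 0;
  while (off < bl.size()) {
    char tag = bl[off++];
    if (tag == 'N') {
      std::cout << "null dentry\n";
      continue;
    }
    inodeno_t ino;
    std::memcpy(&ino, bl.data() + off, sizeof(ino));
    off += sizeof(ino);
    std::cout << (tag == 'L' ? "remote link to " : "primary inode ") << ino << "\n";
  }
}

int main() {
  buffer bl;
  encode_null(bl);
  encode_remote(bl, 1001);
  encode_primary(bl, 1002);
  decode_all(bl);
  return 0;
}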
- cache->verify_subtree_bounds(dir, export_bounds[dir]); - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - - // unpin bounds - for (set::iterator p = export_bounds[dir].begin(); - p != export_bounds[dir].end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - - // re-import the metadata - map imported_client_map; - int off = 0; - ::_decode(imported_client_map, export_data[dir].front(), off); - export_data[dir].pop_front(); - - while (!export_data[dir].empty()) { - decode_import_dir(export_data[dir].front(), - export_peer[dir], - dir, // import root - 0, - imported_client_map); - export_data[dir].pop_front(); - } - - // process delayed expires - cache->process_delayed_expire(dir); - - // unfreeze - dir->unfreeze_tree(); - - // some clean up - export_data.erase(dir); - export_bounds.erase(dir); - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - cache->show_cache(); -} - - -/* - * once i get the ack, and logged the EExportFinish(true), - * send notifies (if any), otherwise go straight to finish. - * - */ -void Migrator::export_logged_finish(CDir *dir) -{ - dout(7) << "export_logged_finish " << *dir << endl; - dir->put(CDir::PIN_LOGGINGEXPORTFINISH); - - cache->verify_subtree_bounds(dir, export_bounds[dir]); - - // send notifies - int dest = export_peer[dir]; - - for (set::iterator p = export_notify_ack_waiting[dir].begin(); - p != export_notify_ack_waiting[dir].end(); - ++p) { - MExportDirNotify *notify; - if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) - // dest is still alive. - notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), dest), - pair(dest, CDIR_AUTH_UNKNOWN)); - else - // dest is dead. bystanders will think i am only auth, as per mdcache->handle_mds_failure() - notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(dest, CDIR_AUTH_UNKNOWN)); - - notify->copy_bounds(export_bounds[dir]); - - mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); - } - - // wait for notifyacks - export_state[dir] = EXPORT_NOTIFYING; - - // no notifies to wait for? - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); // skip notify/notify_ack stage. -} - -/* - * warning: - * i'll get an ack from each bystander. - * when i get them all, do the export. - * notify: - * i'll get an ack from each bystander. - * when i get them all, unfreeze and send the finish. - */ -void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - int from = m->get_source().num(); - - if (export_state.count(dir) && export_state[dir] == EXPORT_WARNING) { - // exporting. process warning. - dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": exporting, processing warning on " - << *dir << endl; - assert(export_warning_ack_waiting.count(dir)); - export_warning_ack_waiting[dir].erase(from); - - if (export_warning_ack_waiting[dir].empty()) - export_go(dir); // start export. - } - else if (export_state.count(dir) && export_state[dir] == EXPORT_NOTIFYING) { - // exporting. process notify. 
- dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": exporting, processing notify on " - << *dir << endl; - assert(export_notify_ack_waiting.count(dir)); - export_notify_ack_waiting[dir].erase(from); - - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); - } - else if (import_state.count(dir->dirfrag()) && import_state[dir->dirfrag()] == IMPORT_ABORTING) { - // reversing import - dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": aborting import on " - << *dir << endl; - assert(import_bystanders[dir].count(from)); - import_bystanders[dir].erase(from); - if (import_bystanders[dir].empty()) { - import_bystanders.erase(dir); - import_reverse_unfreeze(dir); - } - } - - delete m; -} - - -void Migrator::export_finish(CDir *dir) -{ - dout(7) << "export_finish " << *dir << endl; - - if (export_state.count(dir) == 0) { - dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << endl; - return; - } - - // send finish/commit to new auth - if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) { - mds->send_message_mds(new MExportDirFinish(dir->dirfrag()), - export_peer[dir], MDS_PORT_MIGRATOR); - } else { - dout(7) << "not sending MExportDirFinish, dest has failed" << endl; - } - - // unfreeze - dout(7) << "export_finish unfreezing" << endl; - dir->unfreeze_tree(); - - // unpin bounds - for (set::iterator p = export_bounds[dir].begin(); - p != export_bounds[dir].end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - - // adjust auth, with possible subtree merge. - // (we do this _after_ removing EXPORTBOUND pins, to allow merges) - cache->adjust_subtree_auth(dir, export_peer[dir]); - cache->try_subtree_merge(dir); - - // unpin path - dout(7) << "export_finish unpinning path" << endl; - vector trace; - cache->make_trace(trace, dir->inode); - mds->locker->dentry_anon_rdlock_trace_finish(trace); - - // discard delayed expires - cache->discard_delayed_expire(dir); - - // remove from exporting list, clean up state - dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); - export_state.erase(dir); - export_peer.erase(dir); - export_bounds.erase(dir); - export_notify_ack_waiting.erase(dir); - - // queue finishers - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // stats - //if (mds->logger) mds->logger->set("nex", cache->exports.size()); - - cache->show_subtrees(); - audit(); - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); -} - - - - - - - - -// ========================================================== -// IMPORT - -void Migrator::handle_export_discover(MExportDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_export_discover on " << m->get_path() << endl; - - // note import state - dirfrag_t df = m->get_dirfrag(); - - // only start discovering on this message once. - if (!m->started) { - m->started = true; - import_state[df] = IMPORT_DISCOVERING; - import_peer[df] = m->get_source().num(); - } - - // am i retrying after ancient path_traverse results? - if (import_state.count(df) == 0 && - import_state[df] != IMPORT_DISCOVERING) { - dout(7) << "hmm import_state is off, i must be obsolete lookup" << endl; - delete m; - return; - } - - // do we have it? - CInode *in = cache->get_inode(m->get_dirfrag().ino); - if (!in) { - // must discover it! 
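The warning/notify handshake above is a simple barrier: the exporter records every bystander it notified in a per-directory set, erases the sender as each ack arrives, and only moves on (export_go after the warnings, export_finish after the notifies) once the set is empty. A stand-alone sketch of that ack-gathering pattern (the peer ranks and the log lines are illustrative):

#include <initializer_list>
#include <iostream>
#include <set>

struct AckGather {
  std::set<int> waiting_for;   // ranks we still expect an ack from

  void expect(int rank) { waiting_for.insert(rank); }

  // Returns true when the last outstanding ack has arrived.
  bool got_ack(int rank) {
    waiting_for.erase(rank);
    return waiting_for.empty();
  }
};

int main() {
  AckGather warning_acks;
  warning_acks.expect(2);      // bystander mds2
  warning_acks.expect(3);      // bystander mds3

  for (int from : {3, 2}) {    // acks may arrive in any order
    if (warning_acks.got_ack(from))
      std::cout << "all warnings acked; proceeding with the export\n";
    else
      std::cout << "still waiting after ack from mds" << from << "\n";
  }
  return 0;
}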
- filepath fpath(m->get_path()); - vector trace; - int r = cache->path_traverse(0, m, - 0, fpath, trace, true, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; // wait - if (r < 0) { - dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - - CInode *in; - if (trace.empty()) { - in = cache->get_root(); - if (!in) { - cache->open_root(new C_MDS_RetryMessage(mds, m)); - return; - } - } else { - in = trace[trace.size()-1]->inode; - } - } - - // yay - import_discovered(in, df); - delete m; -} - -void Migrator::import_discovered(CInode *in, dirfrag_t df) -{ - dout(7) << "import_discovered " << df << " inode " << *in << endl; - - // pin inode in the cache (for now) - assert(in->is_dir()); - in->get(CInode::PIN_IMPORTING); - - // reply - dout(7) << " sending export_discover_ack on " << *in << endl; - mds->send_message_mds(new MExportDirDiscoverAck(df), - import_peer[df], MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_cancel(MExportDirCancel *m) -{ - dout(7) << "handle_export_cancel on " << m->get_dirfrag() << endl; - - if (import_state[m->get_dirfrag()] == IMPORT_DISCOVERED) { - CInode *in = cache->get_inode(m->get_dirfrag().ino); - assert(in); - in->put(CInode::PIN_IMPORTING); - } else { - assert(import_state[m->get_dirfrag()] == IMPORT_DISCOVERING); - } - - import_state.erase(m->get_dirfrag()); - import_peer.erase(m->get_dirfrag()); - - delete m; -} - - -void Migrator::handle_export_prep(MExportDirPrep *m) -{ - CInode *diri = cache->get_inode(m->get_dirfrag().ino); - assert(diri); - - int oldauth = m->get_source().num(); - assert(oldauth != mds->get_nodeid()); - - list finished; - - // assimilate root dir. - CDir *dir; - - if (!m->did_assim()) { - dir = cache->add_replica_dir(diri, - m->get_dirfrag().frag, *m->get_dirfrag_discover(m->get_dirfrag()), - oldauth, finished); - dout(7) << "handle_export_prep on " << *dir << " (first pass)" << endl; - } else { - dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << endl; - } - assert(dir->is_auth() == false); - - cache->show_subtrees(); - - // assimilate contents? - if (!m->did_assim()) { - dout(7) << "doing assim on " << *dir << endl; - m->mark_assim(); // only do this the first time! 
- - // move pin to dir - diri->put(CInode::PIN_IMPORTING); - dir->get(CDir::PIN_IMPORTING); - dir->state_set(CDir::STATE_IMPORTING); - - // change import state - import_state[dir->dirfrag()] = IMPORT_PREPPING; - - // bystander list - import_bystanders[dir] = m->get_bystanders(); - dout(7) << "bystanders are " << import_bystanders[dir] << endl; - - // assimilate traces to exports - for (list::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - // inode - CInode *in = cache->get_inode( (*it)->get_ino() ); - if (in) { - (*it)->update_inode(in); - dout(7) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - (*it)->update_inode(in); - - // link to the containing dir - CDir *condir = cache->get_dirfrag( m->get_containing_dirfrag(in->ino()) ); - assert(condir); - cache->add_inode( in ); - condir->add_dentry( m->get_dentry(in->ino()), in ); - - dout(7) << " added " << *in << endl; - } - - assert( in->get_parent_dir()->dirfrag() == m->get_containing_dirfrag(in->ino()) ); - - // dirs - for (list::iterator pf = m->get_inode_dirfrags(in->ino()).begin(); - pf != m->get_inode_dirfrags(in->ino()).end(); - ++pf) { - // add/update - cache->add_replica_dir(in, *pf, *m->get_dirfrag_discover(dirfrag_t(in->ino(), *pf)), - oldauth, finished); - } - } - - // open export dirs/bounds? - assert(import_bound_inos.count(dir->dirfrag()) == 0); - import_bound_inos[dir->dirfrag()].clear(); - for (list::iterator it = m->get_bounds().begin(); - it != m->get_bounds().end(); - it++) { - dout(7) << " checking bound " << hex << *it << dec << endl; - CInode *in = cache->get_inode(it->ino); - assert(in); - - // note bound. - import_bound_inos[dir->dirfrag()].push_back(*it); - - CDir *dir = cache->get_dirfrag(*it); - if (!dir) { - dout(7) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, it->frag, - new C_MDS_RetryMessage(mds, m)); - } - } - } else { - dout(7) << " not doing assim on " << *dir << endl; - } - - - // verify we have all bounds - int waiting_for = 0; - for (list::iterator it = m->get_bounds().begin(); - it != m->get_bounds().end(); - it++) { - dirfrag_t df = *it; - CDir *bound = cache->get_dirfrag(df); - if (bound) { - if (!bound->state_test(CDir::STATE_IMPORTBOUND)) { - dout(7) << " pinning import bound " << *bound << endl; - bound->get(CDir::PIN_IMPORTBOUND); - bound->state_set(CDir::STATE_IMPORTBOUND); - import_bounds[dir].insert(bound); - } else { - dout(7) << " already pinned import bound " << *bound << endl; - } - } else { - dout(7) << " waiting for nested export dir on " << *cache->get_inode(df.ino) << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl; - } else { - dout(7) << " all ready, noting auth and freezing import region" << endl; - - // note that i am an ambiguous auth for this subtree. - // specify bounds, since the exporter explicitly defines the region. - cache->adjust_bounded_subtree_auth(dir, import_bounds[dir], - pair(oldauth, mds->get_nodeid())); - cache->verify_subtree_bounds(dir, import_bounds[dir]); - - // freeze. - dir->_freeze_tree(); - - // ok! 
- dout(7) << " sending export_prep_ack on " << *dir << endl; - mds->send_message_mds(new MExportDirPrepAck(dir->dirfrag()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // note new state - import_state[dir->dirfrag()] = IMPORT_PREPPED; - - // done - delete m; - } - - // finish waiters - finish_contexts(finished, 0); -} - - - - -class C_MDS_ImportDirLoggedStart : public Context { - Migrator *migrator; - CDir *dir; - int from; -public: - C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f) : - migrator(m), dir(d), from(f) { - } - void finish(int r) { - migrator->import_logged_start(dir, from); - } -}; - -void Migrator::handle_export_dir(MExportDir *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - int oldauth = m->get_source().num(); - dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << endl; - assert(dir->is_auth() == false); - - cache->show_subtrees(); - - // start the journal entry - EImportStart *le = new EImportStart(dir->dirfrag(), m->get_bounds()); - le->metablob.add_dir_context(dir); - - // adjust auth (list us _first_) - cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth); - cache->verify_subtree_bounds(dir, import_bounds[dir]); - - // add this crap to my cache - map imported_client_map; - int off = 0; - ::_decode(imported_client_map, m->get_dirstate().front(), off); - m->get_dirstate().pop_front(); - - int num_imported_inodes = 0; - while (!m->get_dirstate().empty()) { - num_imported_inodes += - decode_import_dir(m->get_dirstate().front(), - oldauth, - dir, // import root - le, - imported_client_map); - m->get_dirstate().pop_front(); - } - dout(10) << " " << m->get_bounds().size() << " imported bounds" << endl; - - // include bounds in EImportStart - for (set::iterator it = import_bounds[dir].begin(); - it != import_bounds[dir].end(); - it++) { - CDir *bd = *it; - - // include bounding dirs in EImportStart - // (now that the interior metadata is already in the event) - le->metablob.add_dir(bd, false); - } - - // adjust popularity - mds->balancer->add_import(dir); - - dout(7) << "handle_export_dir did " << *dir << endl; - - // log it - mds->mdlog->submit_entry(le, - new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num())); - - // note state - import_state[dir->dirfrag()] = IMPORT_LOGGINGSTART; - - // some stats - if (mds->logger) { - mds->logger->inc("im"); - mds->logger->inc("iim", num_imported_inodes); - //mds->logger->set("nim", cache->imports.size()); - } - - delete m; -} - - -/* - * note: this does teh full work of reversing and import and cleaning up - * state. - * called by both handle_mds_failure and by handle_import_map (if we are - * a survivor coping with an exporter failure+recovery). - */ -void Migrator::import_reverse(CDir *dir, bool fix_dir_auth) -{ - dout(7) << "import_reverse " << *dir << endl; - - // update auth, with possible subtree merge. - if (fix_dir_auth) { - assert(dir->is_subtree_root()); - cache->adjust_subtree_auth(dir, import_peer[dir->dirfrag()]); - cache->try_subtree_merge(dir); - } - - // adjust auth bits. 
- list q; - q.push_back(dir); - while (!q.empty()) { - CDir *cur = q.front(); - q.pop_front(); - - // dir - assert(cur->is_auth()); - cur->state_clear(CDir::STATE_AUTH); - cur->clear_replicas(); - if (cur->is_dirty()) - cur->mark_clean(); - - CDir_map_t::iterator it; - for (it = cur->begin(); it != cur->end(); it++) { - CDentry *dn = it->second; - - // dentry - dn->state_clear(CDentry::STATE_AUTH); - dn->clear_replicas(); - if (dn->is_dirty()) - dn->mark_clean(); - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - in->state_clear(CDentry::STATE_AUTH); - in->clear_replicas(); - if (in->is_dirty()) - in->mark_clean(); - in->authlock.clear_gather(); - in->linklock.clear_gather(); - in->dirfragtreelock.clear_gather(); - in->filelock.clear_gather(); - - // non-bounding dir? - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) - if (!(*p)->state_test(CDir::STATE_IMPORTBOUND)) - q.push_back(*p); - } - } - } - - // log our failure - mds->mdlog->submit_entry(new EImportFinish(dir,false)); // log failure - - // bystanders? - if (import_bystanders[dir].empty()) { - dout(7) << "no bystanders, finishing reverse now" << endl; - import_reverse_unfreeze(dir); - } else { - // notify them; wait in aborting state - dout(7) << "notifying bystanders of abort" << endl; - import_notify_abort(dir); - import_state[dir->dirfrag()] = IMPORT_ABORTING; - } -} - -void Migrator::import_notify_abort(CDir *dir) -{ - dout(7) << "import_notify_abort " << *dir << endl; - - for (set::iterator p = import_bystanders[dir].begin(); - p != import_bystanders[dir].end(); - ++p) { - // NOTE: the bystander will think i am _only_ auth, because they will have seen - // the exporter's failure and updated the subtree auth. see mdcache->handle_mds_failure(). - MExportDirNotify *notify = - new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(import_peer[dir->dirfrag()], CDIR_AUTH_UNKNOWN)); - notify->copy_bounds(import_bounds[dir]); - mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); - } -} - -void Migrator::import_reverse_unfreeze(CDir *dir) -{ - dout(7) << "import_reverse_unfreeze " << *dir << endl; - - // unfreeze - dir->unfreeze_tree(); - - // discard expire crap - cache->discard_delayed_expire(dir); - - import_reverse_unpin(dir); -} - -void Migrator::import_reverse_unpin(CDir *dir) -{ - dout(7) << "import_reverse_unpin " << *dir << endl; - - // remove importing pin - dir->put(CDir::PIN_IMPORTING); - dir->state_clear(CDir::STATE_IMPORTING); - - // remove bound pins - for (set::iterator it = import_bounds[dir].begin(); - it != import_bounds[dir].end(); - it++) { - CDir *bd = *it; - bd->put(CDir::PIN_IMPORTBOUND); - bd->state_clear(CDir::STATE_IMPORTBOUND); - } - - // clean up - import_state.erase(dir->dirfrag()); - import_peer.erase(dir->dirfrag()); - import_bound_inos.erase(dir->dirfrag()); - import_bounds.erase(dir); - import_bystanders.erase(dir); - - cache->show_subtrees(); - //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) -} - - -void Migrator::import_logged_start(CDir *dir, int from) -{ - dout(7) << "import_logged " << *dir << endl; - - // note state - import_state[dir->dirfrag()] = IMPORT_ACKING; - - // send notify's etc. 
- dout(7) << "sending ack for " << *dir << " to old auth mds" << from << endl; - mds->send_message_mds(new MExportDirAck(dir->dirfrag()), - from, MDS_PORT_MIGRATOR); - - cache->show_subtrees(); -} - - -void Migrator::handle_export_finish(MExportDirFinish *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - dout(7) << "handle_export_finish on " << *dir << endl; - import_finish(dir); - delete m; -} - -void Migrator::import_finish(CDir *dir, bool now) -{ - dout(7) << "import_finish on " << *dir << endl; - - // log finish - mds->mdlog->submit_entry(new EImportFinish(dir, true)); - - // remove pins - dir->put(CDir::PIN_IMPORTING); - dir->state_clear(CDir::STATE_IMPORTING); - - for (set::iterator it = import_bounds[dir].begin(); - it != import_bounds[dir].end(); - it++) { - CDir *bd = *it; - - // remove bound pin - bd->put(CDir::PIN_IMPORTBOUND); - bd->state_clear(CDir::STATE_IMPORTBOUND); - } - - // unfreeze - dir->unfreeze_tree(); - - // adjust auth, with possible subtree merge. - cache->verify_subtree_bounds(dir, import_bounds[dir]); - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - - // clear import state (we're done!) - import_state.erase(dir->dirfrag()); - import_peer.erase(dir->dirfrag()); - import_bound_inos.erase(dir->dirfrag()); - import_bounds.erase(dir); - import_bystanders.erase(dir); - - // process delayed expires - cache->process_delayed_expire(dir); - - // ok now finish contexts - dout(5) << "finishing any waiters on imported data" << endl; - dir->finish_waiting(CDir::WAIT_IMPORTED); - - cache->show_subtrees(); - //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) - - - // is it empty? - if (dir->get_size() == 0 && - !dir->inode->is_auth()) { - // reexport! - export_empty_import(dir); - } -} - - -void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth, - map& imported_client_map) -{ - dout(15) << "decode_import_inode on " << *dn << endl; - - CInodeExport istate; - off = istate._decode(bl, off); - - bool added = false; - CInode *in = cache->get_inode(istate.get_ino()); - if (!in) { - in = new CInode(mds->mdcache); - added = true; - } else { - in->state_set(CInode::STATE_AUTH); - } - - // state after link -- or not! -sage - set merged_client_caps; - istate.update_inode(in, merged_client_caps); - - // link before state -- or not! -sage - if (dn->inode != in) { - assert(!dn->inode); - dn->dir->link_inode(dn, in); - } - - // add inode? 
- if (added) { - cache->add_inode(in); - dout(10) << "added " << *in << endl; - } else { - dout(10) << " had " << *in << endl; - } - - - // adjust replica list - //assert(!in->is_replica(oldauth)); // not true on failed export - in->add_replica( oldauth, CInode::EXPORT_NONCE ); - if (in->is_replica(mds->get_nodeid())) - in->remove_replica(mds->get_nodeid()); - - // twiddle locks - /* - if (in->authlock.do_import(oldauth, mds->get_nodeid())) - mds->locker->simple_eval(&in->authlock); - if (in->linklock.do_import(oldauth, mds->get_nodeid())) - mds->locker->simple_eval(&in->linklock); - if (in->dirfragtreelock.do_import(oldauth, mds->get_nodeid())) - mds->locker->simple_eval(&in->dirfragtreelock); - if (in->dirlock.do_import(oldauth, mds->get_nodeid())) - mds->locker->scatter_eval(&in->dirlock); - */ - - // caps - for (set::iterator it = merged_client_caps.begin(); - it != merged_client_caps.end(); - it++) { - MClientFileCaps *caps = new MClientFileCaps(MClientFileCaps::OP_REAP, - in->inode, - in->client_caps[*it].get_last_seq(), - in->client_caps[*it].pending(), - in->client_caps[*it].wanted()); - caps->set_mds( oldauth ); // reap from whom? - mds->send_message_client_maybe_open(caps, imported_client_map[*it]); - } - - // filelock - /* - if (in->filelock.do_import(oldauth, mds->get_nodeid())) - mds->locker->simple_eval(&in->filelock); - */ -} - - -int Migrator::decode_import_dir(bufferlist& bl, - int oldauth, - CDir *import_root, - EImportStart *le, - map& imported_client_map) -{ - int off = 0; - - // set up dir - CDirExport dstate; - off = dstate._decode(bl, off); - - CInode *diri = cache->get_inode(dstate.get_dirfrag().ino); - assert(diri); - CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, dstate.get_dirfrag().frag); - assert(dir); - - dout(7) << "decode_import_dir " << *dir << endl; - - // assimilate state - dstate.update_dir( dir ); - - // mark (may already be marked from get_or_open_dir() above) - if (!dir->is_auth()) - dir->state_set(CDir::STATE_AUTH); - - // adjust replica list - //assert(!dir->is_replica(oldauth)); // not true on failed export - dir->add_replica(oldauth); - if (dir->is_replica(mds->get_nodeid())) - dir->remove_replica(mds->get_nodeid()); - - // add to journal entry - if (le) - le->metablob.add_dir(dir, - true, // Hmm: dirty=false would be okay in some cases - dir->is_complete()); - - int num_imported = 0; - - // take all waiters on this dir - // NOTE: a pass of imported data is guaranteed to get all of my waiters because - // a replica's presense in my cache implies/forces it's presense in authority's. - list waiters; - - dir->take_waiting(CDir::WAIT_ANY, waiters); - for (list::iterator it = waiters.begin(); - it != waiters.end(); - it++) - import_root->add_waiter(CDir::WAIT_IMPORTED, *it); - - dout(15) << "doing contents" << endl; - - // contents - long nden = dstate.get_nden(); - - for (; nden>0; nden--) { - - num_imported++; - - // dentry - string dname; - _decode(dname, bl, off); - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // decode state - dn->decode_import_state(bl, off, oldauth, mds->get_nodeid()); - dout(15) << "decode_import_dir got " << *dn << endl; - - // points to... 
- char icode; - bl.copy(off, 1, &icode); - off++; - - if (icode == 'N') { - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - if (dn->is_remote()) { - assert(dn->get_remote_ino() == ino); - } else { - dir->link_inode(dn, ino); - } - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, bl, off, oldauth, imported_client_map); - } - - // add dentry to journal entry - if (le) - le->metablob.add_dentry(dn, dn->is_dirty()); - } - - dout(7) << "decode_import_dir done " << *dir << endl; - return num_imported; -} - - - - - -// authority bystander - -void Migrator::handle_export_notify(MExportDirNotify *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - - int from = m->get_source().num(); - pair old_auth = m->get_old_auth(); - pair new_auth = m->get_new_auth(); - - if (!dir) { - dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on missing dir " << m->get_dirfrag() << endl; - } else if (dir->authority() != old_auth) { - dout(7) << "handle_export_notify old_auth was " << dir->authority() - << " != " << old_auth << " -> " << new_auth - << " on " << *dir << endl; - } else { - dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on " << *dir << endl; - // adjust auth - cache->adjust_bounded_subtree_auth(dir, m->get_bounds(), new_auth); - - // induce a merge? - cache->try_subtree_merge(dir); - } - - // send ack - if (m->wants_ack()) { - mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag()), - from, MDS_PORT_MIGRATOR); - } else { - // aborted. no ack. - dout(7) << "handle_export_notify no ack requested" << endl; - } - - delete m; -} - - - diff --git a/branches/sage/pgs/mds/Migrator.h b/branches/sage/pgs/mds/Migrator.h deleted file mode 100644 index a1cb169d642b5..0000000000000 --- a/branches/sage/pgs/mds/Migrator.h +++ /dev/null @@ -1,259 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_MIGRATOR_H -#define __MDS_MIGRATOR_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - - -class MDS; -class CDir; -class CInode; -class CDentry; - -class MExportDirDiscover; -class MExportDirDiscoverAck; -class MExportDirCancel; -class MExportDirPrep; -class MExportDirPrepAck; -class MExportDir; -class MExportDirAck; -class MExportDirNotify; -class MExportDirNotifyAck; -class MExportDirFinish; - -class EImportStart; - -class Migrator { -private: - MDS *mds; - MDCache *cache; - - // -- exports -- -public: - // export stages. used to clean up intelligently if there's a failure. 
- const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir - const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - //const static int EXPORT_LOGGINGSTART = 3; // we're logging EExportStart - const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds - const static int EXPORT_WARNING = 5; // warning bystanders of dir_auth_pending - const static int EXPORT_EXPORTING = 6; // sent actual export, waiting for ack - const static int EXPORT_LOGGINGFINISH = 7; // logging EExportFinish - const static int EXPORT_NOTIFYING = 8; // waiting for notifyacks - const static int EXPORT_ABORTING = 9; // notifying bystanders of abort - static const char *get_export_statename(int s) { - switch (s) { - case EXPORT_DISCOVERING: return "discovering"; - case EXPORT_FREEZING: return "freezing"; - case EXPORT_PREPPING: return "prepping"; - case EXPORT_WARNING: return "warning"; - case EXPORT_EXPORTING: return "exporting"; - case EXPORT_LOGGINGFINISH: return "loggingfinish"; - case EXPORT_NOTIFYING: return "notifying"; - case EXPORT_ABORTING: return "aborting"; - default: assert(0); return 0; - } - } - -protected: - // export fun - map export_state; - map export_peer; - map > export_bounds; - map > export_data; // only during EXPORTING state - map > export_warning_ack_waiting; - map > export_notify_ack_waiting; - - map > export_finish_waiters; - - - // -- imports -- -public: - const static int IMPORT_DISCOVERING = 1; // waiting for prep - const static int IMPORT_DISCOVERED = 2; // waiting for prep - const static int IMPORT_PREPPING = 3; // opening dirs on bounds - const static int IMPORT_PREPPED = 4; // opened bounds, waiting for import - const static int IMPORT_LOGGINGSTART = 5; // got import, logging EImportStart - const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish - //const static int IMPORT_LOGGINGFINISH = 7; // logging EImportFinish - const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing - static const char *get_import_statename(int s) { - switch (s) { - case IMPORT_DISCOVERING: return "discovering"; - case IMPORT_DISCOVERED: return "discovered"; - case IMPORT_PREPPING: return "prepping"; - case IMPORT_PREPPED: return "prepped"; - case IMPORT_LOGGINGSTART: return "loggingstart"; - case IMPORT_ACKING: return "acking"; - case IMPORT_ABORTING: return "aborting"; - default: assert(0); return 0; - } - } - -protected: - map import_state; // FIXME make these dirfrags - map import_peer; - map > import_bound_inos; - map > import_bounds; - map > import_bystanders; - - - /* - // -- hashing madness -- - multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from - multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir - // maps frozen_dir_ino's to waiting-for-discover ino's. 
- multimap import_hashed_frozen_waiting; // dirs i froze (for the above) - */ - - -public: - // -- cons -- - Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message*); - - void show_importing(); - void show_exporting(); - - // -- status -- - int is_exporting(CDir *dir) { - if (export_state.count(dir)) return export_state[dir]; - return 0; - } - bool is_exporting() { return !export_state.empty(); } - int is_importing(dirfrag_t df) { - if (import_state.count(df)) return import_state[df]; - return 0; - } - bool is_importing() { return !import_state.empty(); } - const list& get_import_bound_inos(dirfrag_t base) { - assert(import_bound_inos.count(base)); - return import_bound_inos[base]; - } - const set& get_import_bounds(CDir *base) { - assert(import_bounds.count(base)); - return import_bounds[base]; - } - - int get_import_state(dirfrag_t df) { - assert(import_state.count(df)); - return import_state[df]; - } - int get_import_peer(dirfrag_t df) { - assert(import_peer.count(df)); - return import_peer[df]; - } - - int get_export_state(CDir *dir) { - assert(export_state.count(dir)); - return export_state[dir]; - } - // this returns true if we are export @dir, - // and are not waiting for @who to be - // be warned of ambiguous auth. - // only returns meaningful results during EXPORT_WARNING state. - bool export_has_warned(CDir *dir, int who) { - assert(is_exporting(dir)); - assert(export_state[dir] == EXPORT_WARNING); - return (export_warning_ack_waiting[dir].count(who) == 0); - } - - - // -- misc -- - void handle_mds_failure_or_stop(int who); - - void audit(); - - // -- import/export -- - // exporter - public: - void export_dir(CDir *dir, int dest); - void export_empty_import(CDir *dir); - - void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth, - map& exported_client_map); - int encode_export_dir(list& dirstatelist, - class C_Contexts *fin, - CDir *basedir, - CDir *dir, - int newauth, - map& exported_client_map); - - void add_export_finish_waiter(CDir *dir, Context *c) { - export_finish_waiters[dir].push_back(c); - } - void clear_export_proxy_pins(CDir *dir); - - protected: - void handle_export_discover_ack(MExportDirDiscoverAck *m); - void export_frozen(CDir *dir); - void handle_export_prep_ack(MExportDirPrepAck *m); - void export_go(CDir *dir); - void export_reverse(CDir *dir); - void handle_export_ack(MExportDirAck *m); - void export_logged_finish(CDir *dir); - void handle_export_notify_ack(MExportDirNotifyAck *m); - void export_finish(CDir *dir); - - friend class C_MDC_ExportFreeze; - friend class C_MDS_ExportFinishLogged; - - - // importer - void handle_export_discover(MExportDirDiscover *m); - void handle_export_cancel(MExportDirCancel *m); - void import_discovered(CInode *in, dirfrag_t df); - void handle_export_prep(MExportDirPrep *m); - void handle_export_dir(MExportDir *m); - -public: - void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth, - map& imported_client_map); - int decode_import_dir(bufferlist& bl, - int oldauth, - CDir *import_root, - EImportStart *le, - map& imported_client_map); - -public: - void import_reverse(CDir *dir, bool fix_dir_auth=true); -protected: - void import_reverse_unfreeze(CDir *dir); - void import_reverse_unpin(CDir *dir); - void import_notify_abort(CDir *dir); - void import_logged_start(CDir *dir, int from); - void handle_export_finish(MExportDirFinish *m); -public: - void import_finish(CDir *dir, bool now=false); -protected: - - friend class C_MDS_ImportDirLoggedStart; - friend class 
C_MDS_ImportDirLoggedFinish; - - // bystander - void handle_export_notify(MExportDirNotify *m); - -}; - - -#endif diff --git a/branches/sage/pgs/mds/Renamer.cc b/branches/sage/pgs/mds/Renamer.cc deleted file mode 100644 index 534a608b8e6bd..0000000000000 --- a/branches/sage/pgs/mds/Renamer.cc +++ /dev/null @@ -1,905 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "MDS.h" -#include "MDSMap.h" -#include "MDLog.h" -#include "AnchorClient.h" -#include "Migrator.h" -#include "Renamer.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "events/EString.h" -#include "events/EUnlink.h" - -#include "messages/MRenameWarning.h" -#include "messages/MRenameNotify.h" -#include "messages/MRenameNotifyAck.h" -#include "messages/MRename.h" -#include "messages/MRenameAck.h" -#include "messages/MRenameReq.h" -#include "messages/MRenamePrep.h" - - - -void Renamer::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_RENAMEWARNING: - handle_rename_warning((MRenameWarning*)m); - break; - case MSG_MDS_RENAMENOTIFY: - handle_rename_notify((MRenameNotify*)m); - break; - case MSG_MDS_RENAMENOTIFYACK: - handle_rename_notify_ack((MRenameNotifyAck*)m); - break; - case MSG_MDS_RENAME: - handle_rename((MRename*)m); - break; - case MSG_MDS_RENAMEREQ: - handle_rename_req((MRenameReq*)m); - break; - case MSG_MDS_RENAMEPREP: - handle_rename_prep((MRenamePrep*)m); - break; - case MSG_MDS_RENAMEACK: - handle_rename_ack((MRenameAck*)m); - break; - - default: - assert(0); - } -} - - -// renaming! - - -/* - fix_renamed_dir(): - - caller has already: - - relinked inode in new location - - fixed in->is_auth() - - set dir_auth, if appropriate - - caller has not: - - touched in->dir - - updated import/export tables -*/ -void Renamer::fix_renamed_dir(CDir *srcdir, - CInode *in, - CDir *destdir, - bool authchanged, // _inode_ auth - int dir_auth) // dir auth (for certain cases) -{ - dout(7) << "fix_renamed_dir on " << *in << endl; - dout(7) << "fix_renamed_dir on " << *in->dir << endl; - - - assert(0); // rewrite . - - // 1- fix subtree tree. - // 2- adjust subtree auth. - - /* - if (in->dir->is_auth()) { - // dir ours - dout(7) << "dir is auth" << endl; - assert(!in->dir->is_export()); - - if (in->is_auth()) { - // inode now ours - if (authchanged) { - // inode _was_ replica, now ours - dout(7) << "inode was replica, now ours." << endl; - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - } else { - // inode was ours, still ours. - dout(7) << "inode was ours, still ours." << endl; - - assert(!in->dir->is_import()); - assert(in->dir->get_dir_auth().first == CDIR_AUTH_PARENT); - - // move any exports nested beneath me? 
- CDir *newcon = cache->get_auth_container(in->dir); - assert(newcon); - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - if (newcon != oldcon) { - dout(7) << "moving nested exports under new container" << endl; - set nested; - cache->find_nested_exports_under(oldcon, in->dir, nested); - for (set::iterator it = nested.begin(); - it != nested.end(); - it++) { - dout(7) << "moving nested export " << *it << " under new container" << endl; - cache->nested_exports[oldcon].erase(*it); - cache->nested_exports[newcon].insert(*it); - } - } - } - - } else { - // inode now replica - - if (authchanged) { - // inode was ours, but now replica - dout(7) << "inode was ours, now replica. adding to import list." << endl; - - // i am now an import - cache->imports.insert(in->dir); - in->dir->state_set(CDir::STATE_IMPORT); - in->dir->get(CDir::PIN_IMPORT); - - in->dir->set_dir_auth( mds->get_nodeid() ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - // find old import - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - dout(7) << " oldcon is " << *oldcon << endl; - - // move nested exports under me - set nested; - cache->find_nested_exports_under(oldcon, in->dir, nested); - for (set::iterator it = nested.begin(); - it != nested.end(); - it++) { - dout(7) << "moving nested export " << *it << " under me" << endl; - cache->nested_exports[oldcon].erase(*it); - cache->nested_exports[in->dir].insert(*it); - } - - } else { - // inode was replica, still replica - dout(7) << "inode was replica, still replica. doing nothing." << endl; - assert(in->dir->is_import()); - - // verify dir_auth - assert(in->dir->get_dir_auth().first == mds->get_nodeid()); // me, because i'm auth for dir. - assert(in->authority() != in->dir->get_dir_auth()); // inode not me. - } - - assert(in->dir->is_import()); - } - - } else { - // dir is not ours - dout(7) << "dir is not auth" << endl; - - if (in->is_auth()) { - // inode now ours - - if (authchanged) { - // inode was replica, now ours - dout(7) << "inode was replica, now ours. now an export." << endl; - assert(!in->dir->is_export()); - - // now export - cache->exports.insert(in->dir); - in->dir->state_set(CDir::STATE_EXPORT); - in->dir->get(CDir::PIN_EXPORT); - - assert(dir_auth >= 0); // better be defined - in->dir->set_dir_auth( dir_auth ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - CDir *newcon = cache->get_auth_container(in->dir); - assert(newcon); - cache->nested_exports[newcon].insert(in->dir); - - } else { - // inode was ours, still ours - dout(7) << "inode was ours, still ours. did my import change?" << endl; - - // sanity - assert(in->dir->is_export()); - assert(in->dir->get_dir_auth().first >= 0); - assert(in->dir->get_dir_auth() != in->authority()); - - // moved under new import? - CDir *oldcon = cache->get_auth_container(srcdir); - CDir *newcon = cache->get_auth_container(in->dir); - if (oldcon != newcon) { - dout(7) << "moving myself under new import " << *newcon << endl; - cache->nested_exports[oldcon].erase(in->dir); - cache->nested_exports[newcon].insert(in->dir); - } - } - - assert(in->dir->is_export()); - } else { - // inode now replica - - if (authchanged) { - // inode was ours, now replica - dout(7) << "inode was ours, now replica. removing from export list." 
<< endl; - assert(in->dir->is_export()); - - // remove from export list - cache->exports.erase(in->dir); - in->dir->state_clear(CDir::STATE_EXPORT); - in->dir->put(CDir::PIN_EXPORT); - - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - assert(cache->nested_exports[oldcon].count(in->dir) == 1); - cache->nested_exports[oldcon].erase(in->dir); - - // simplify dir_auth - if (in->authority() == in->dir->authority()) { - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl; - } else { - assert(in->dir->get_dir_auth().first >= 0); // someone else's export, - } - - } else { - // inode was replica, still replica - dout(7) << "inode was replica, still replica. do nothing." << endl; - - // fix dir_auth? - if (in->authority().first == dir_auth) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( dir_auth ); - dout(7) << " fixing dir_auth to be " << dir_auth << endl; - - // do nothing. - } - - assert(!in->dir->is_export()); - } - } - */ - cache->show_subtrees(); -} - -/* - * when initiator gets an ack back for a foreign rename - */ - -class C_MDC_RenameNotifyAck : public Context { - Renamer *rn; - CInode *in; - int initiator; - -public: - C_MDC_RenameNotifyAck(Renamer *r, - CInode *i, int init) : rn(r), in(i), initiator(init) {} - void finish(int r) { - rn->file_rename_ack(in, initiator); - } -}; - - - -/************** initiator ****************/ - -/* - * when we get MRenameAck (and rename is done, notifies gone out+acked, etc.) - */ -class C_MDC_RenameAck : public Context { - Renamer *mdc; - CDir *srcdir; - CInode *in; - Context *c; -public: - C_MDC_RenameAck(Renamer *mdc, CDir *srcdir, CInode *in, Context *c) { - this->mdc = mdc; - this->srcdir = srcdir; - this->in = in; - this->c = c; - } - void finish(int r) { - mdc->file_rename_finish(srcdir, in, c); - } -}; - - -void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) -{ - assert(srcdn->is_xlocked()); // by me - assert(destdn->is_xlocked()); // by me - - CDir *srcdir = srcdn->dir; - string srcname = srcdn->name; - - CDir *destdir = destdn->dir; - string destname = destdn->name; - - CInode *in = srcdn->inode; - //Message *req = srcdn->xlockedby; - - - // determine the players - int srcauth = srcdir->dentry_authority(srcdn->name).first; - int destauth = destdir->dentry_authority(destname).first; - - - // FOREIGN rename? - if (srcauth != mds->get_nodeid() || - destauth != mds->get_nodeid()) { - dout(7) << "foreign rename. srcauth " << srcauth << ", destauth " << destauth << ", isdir " << srcdn->inode->is_dir() << endl; - - string destpath; - destdn->make_path(destpath); - - if (destauth != mds->get_nodeid()) { - // make sure dest has dir open. - dout(7) << "file_rename i'm not dest auth. sending MRenamePrep to " << destauth << endl; - - // prep dest first, they must have the dir open! rest will follow. - string srcpath; - srcdn->make_path(srcpath); - - MRenamePrep *m = new MRenamePrep(mds->get_nodeid(), // i'm the initiator - srcdir->ino(), srcname, srcpath, - destdir->ino(), destname, destpath, - srcauth); // tell dest who src is (maybe even me) - mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - - cache->show_subtrees(); - - } - - else if (srcauth != mds->get_nodeid()) { - if (destauth == mds->get_nodeid()) { - dout(7) << "file_rename dest auth, not src auth. sending MRenameReq" << endl; - } else { - dout(7) << "file_rename neither src auth nor dest auth. 
sending MRenameReq" << endl; - } - - // srcdn not important on destauth, just request - MRenameReq *m = new MRenameReq(mds->get_nodeid(), // i'm the initiator - srcdir->ino(), srcname, - destdir->ino(), destname, destpath, destauth); // tell src who dest is (they may not know) - mds->send_message_mds(m, srcauth, MDS_PORT_CACHE); - } - - else - assert(0); - - // set waiter on the inode (is this the best place?) - in->add_waiter(CInode::WAIT_RENAMEACK, - new C_MDC_RenameAck(this, - srcdir, in, onfinish)); - return; - } - - // LOCAL rename! - assert(srcauth == mds->get_nodeid() && destauth == mds->get_nodeid()); - dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl; - - // update our cache - if (destdn->inode && destdn->inode->is_dirty()) - destdn->inode->mark_clean(); - - cache->rename_file(srcdn, destdn); - - // update imports/exports? - if (in->is_dir() && in->dir) - fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change - - // mark dentries dirty - srcdn->_mark_dirty(); // fixme - destdn->_mark_dirty(); // fixme - in->_mark_dirty(); // fixme - - - // local, restrict notify to ppl with open dirs - set notify; - for (map::iterator it = srcdir->replicas_begin(); - it != srcdir->replicas_end(); - ++it) - notify.insert(it->first); - for (map::iterator it = destdir->replicas_begin(); - it != destdir->replicas_end(); - it++) - if (notify.count(it->first) == 0) notify.insert(it->first); - - if (notify.size()) { - // warn + notify - file_rename_warn(in, notify); - file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid()); - - // wait for MRenameNotifyAck's - in->add_waiter(CInode::WAIT_RENAMENOTIFYACK, - new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid())); // i am initiator - - // wait for finish - in->add_waiter(CInode::WAIT_RENAMEACK, - new C_MDC_RenameAck(this, srcdir, in, onfinish)); - } else { - // sweet, no notify necessary, we're done! - file_rename_finish(srcdir, in, onfinish); - } -} - -void Renamer::handle_rename_ack(MRenameAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_rename_ack on " << *in << endl; - - // all done! - in->finish_waiting(CInode::WAIT_RENAMEACK); - - delete m; -} - -void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c) -{ - dout(10) << "file_rename_finish on " << *in << endl; - - // did i empty out an imported dir? FIXME this check should go somewhere else??? - if (srcdir->is_auth() && !srcdir->inode->is_auth() && srcdir->get_size() == 0) - cache->migrator->export_empty_import(srcdir); - - // finish our caller - if (c) { - c->finish(0); - delete c; - } -} - - -/************* src **************/ - - -/** handle_rename_req - * received by auth of src dentry (from init, or destauth if dir). - * src may not have dest dir open. - * src will export inode, unlink|rename, and send MRename to dest. - */ -void Renamer::handle_rename_req(MRenameReq *m) -{ - // i am auth, i will have it. 
- CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = srcdiri->dir; - CDentry *srcdn = srcdir->lookup(m->get_srcname()); - assert(srcdn); - - // do it - file_rename_foreign_src(srcdn, - m->get_destdirino(), m->get_destname(), m->get_destpath(), m->get_destauth(), - m->get_initiator()); - delete m; -} - - -void Renamer::file_rename_foreign_src(CDentry *srcdn, - inodeno_t destdirino, string& destname, string& destpath, int destauth, - int initiator) -{ - dout(7) << "file_rename_foreign_src " << *srcdn << endl; - - CDir *srcdir = srcdn->dir; - string srcname = srcdn->name; - - // (we're basically exporting this inode) - CInode *in = srcdn->inode; - assert(in); - assert(in->is_auth()); - - if (in->is_dir()) cache->show_subtrees(); - - // encode and export inode state - bufferlist inode_state; - cache->migrator->encode_export_inode(in, inode_state, destauth); - - // send - MRename *m = new MRename(initiator, - srcdir->ino(), srcdn->name, destdirino, destname, - inode_state); - mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - - // have dest? - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = 0; - if (destdiri) destdir = destdiri->dir; - CDentry *destdn = 0; - if (destdir) destdn = destdir->lookup(m->get_destname()); - - // discover src - if (!destdn) { - dout(7) << "file_rename_foreign_src doesn't have destdn, discovering " << destpath << endl; - - filepath destfilepath = destpath; - vector trace; - int r = cache->path_traverse(destfilepath, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - return; - } - - assert(destdn); - - // update our cache - cache->rename_file(srcdn, destdn); - - // update imports/exports? - if (in->is_dir() && in->dir) - fix_renamed_dir(srcdir, in, destdir, true); // auth changed - - srcdn->_mark_dirty(); // fixme - - // proxy! - //in->state_set(CInode::STATE_PROXY); - //in->get(CInode::PIN_PROXY); - - // generate notify list (everybody but src|dst) and send warnings - set notify; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i != mds->get_nodeid() && // except the source - i != destauth) // and the dest - notify.insert(i); - } - file_rename_warn(in, notify); - - - // wait for MRenameNotifyAck's - in->add_waiter(CInode::WAIT_RENAMENOTIFYACK, - new C_MDC_RenameNotifyAck(this, in, initiator)); -} - -void Renamer::file_rename_warn(CInode *in, - set& notify) -{ - // note gather list - rename_waiting_for_ack[in->ino()] = notify; - - // send - for (set::iterator it = notify.begin(); - it != notify.end(); - it++) { - dout(10) << "file_rename_warn to " << *it << " for " << *in << endl; - mds->send_message_mds(new MRenameWarning(in->ino()), *it, MDS_PORT_CACHE); - } -} - - -void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - dout(7) << "handle_rename_notify_ack on " << *in << endl; - - int source = m->get_source().num(); - rename_waiting_for_ack[in->ino()].erase(source); - if (rename_waiting_for_ack[in->ino()].empty()) { - // last one! - rename_waiting_for_ack.erase(in->ino()); - in->finish_waiting(CInode::WAIT_RENAMENOTIFYACK, 0); - } else { - dout(7) << "still waiting for " << rename_waiting_for_ack[in->ino()] << endl; - } -} - - -void Renamer::file_rename_ack(CInode *in, int initiator) -{ - // we got all our MNotifyAck's. 
- - // was i proxy (if not, it's cuz this was a local rename) - /*if (in->state_test(CInode::STATE_PROXY)) { - dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl; - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - }*/ - - // done! - if (initiator == mds->get_nodeid()) { - // it's me, finish - dout(7) << "file_rename_ack i am initiator, finishing" << endl; - in->finish_waiting(CInode::WAIT_RENAMEACK); - } else { - // send ack - dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl; - mds->send_message_mds(new MRenameAck(in->ino()), initiator, MDS_PORT_CACHE); - } -} - - - - -/************ dest *************/ - -/** handle_rename_prep - * received by auth of dest dentry to make sure they have src + dir open. - * this is so that when they get the inode and dir, they can update exports etc properly. - * will send MRenameReq to src. - */ -void Renamer::handle_rename_prep(MRenamePrep *m) -{ - // open src - filepath srcpath = m->get_srcpath(); - vector trace; - int r = cache->path_traverse(srcpath, trace, false, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - - if (r>0) return; - - // ok! - CInode *srcin = trace[trace.size()-1]->inode; - assert(srcin); - - dout(7) << "handle_rename_prep have srcin " << *srcin << endl; - - if (srcin->is_dir()) { - if (!srcin->dir) { - dout(7) << "handle_rename_prep need to open dir" << endl; - cache->open_remote_dir(srcin, frag_t(), // FIXME dirfrag - new C_MDS_RetryMessage(mds,m)); - return; - } - - dout(7) << "handle_rename_prep have dir " << *srcin->dir << endl; - } - - // pin - srcin->get(CInode::PIN_RENAMESRC); - - // send rename request - MRenameReq *req = new MRenameReq(m->get_initiator(), // i'm the initiator - m->get_srcdirino(), m->get_srcname(), - m->get_destdirino(), m->get_destname(), m->get_destpath(), - mds->get_nodeid()); // i am dest - mds->send_message_mds(req, m->get_srcauth(), MDS_PORT_CACHE); - delete m; - return; -} - - - -/** handle_rename - * received by auth of dest dentry. includes exported inode info. - * dest may not have srcdir open. - */ -void Renamer::handle_rename(MRename *m) -{ - // srcdn (required) - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = srcdiri->dir; - CDentry *srcdn = srcdir->lookup(m->get_srcname()); - string srcname = srcdn->name; - assert(srcdn && srcdn->inode); - - dout(7) << "handle_rename srcdn " << *srcdn << endl; - - // destdn (required). i am auth, so i will have it. - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = destdiri->dir; - CDentry *destdn = destdir->lookup(m->get_destname()); - string destname = destdn->name; - assert(destdn); - - dout(7) << "handle_rename destdn " << *destdn << endl; - - // note old dir auth - int old_dir_auth = -1; - if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority().first; - - // rename replica into position - if (destdn->inode && destdn->inode->is_dirty()) - destdn->inode->mark_clean(); - - cache->rename_file(srcdn, destdn); - - // decode + import inode (into new location start) - int off = 0; - // HACK - bufferlist bufstate; - bufstate.claim_append(m->get_inode_state()); - cache->migrator->decode_import_inode(destdn, bufstate, off, m->get_source().num()); - - CInode *in = destdn->inode; - assert(in); - - // update imports/exports? - if (in->is_dir()) { - assert(in->dir); // i had better already ahve it open.. 
see MRenamePrep - fix_renamed_dir(srcdir, in, destdir, true, // auth changed - old_dir_auth); // src is possibly new dir auth. - } - - // mark dirty - destdn->_mark_dirty(); // fixme - in->_mark_dirty(); // fixme - - // unpin - in->put(CInode::PIN_RENAMESRC); - - // ok, send notifies. - set notify; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i != m->get_source().num() && // except the source - i != mds->get_nodeid()) // and the dest - notify.insert(i); - } - file_rename_notify(in, srcdir, srcname, destdir, destname, notify, m->get_source().num()); - - delete m; -} - - -void Renamer::file_rename_notify(CInode *in, - CDir *srcdir, string& srcname, CDir *destdir, string& destname, - set& notify, - int srcauth) -{ - /* NOTE: notify list might include myself */ - - // tell - string destdirpath; - destdir->inode->make_path(destdirpath); - - for (set::iterator it = notify.begin(); - it != notify.end(); - it++) { - dout(10) << "file_rename_notify to " << *it << " for " << *in << endl; - mds->send_message_mds(new MRenameNotify(in->ino(), - srcdir->ino(), - srcname, - destdir->ino(), - destdirpath, - destname, - srcauth), - *it, MDS_PORT_CACHE); - } -} - - - -/************** bystanders ****************/ - -void Renamer::handle_rename_warning(MRenameWarning *m) -{ - // add to warning list - stray_rename_warnings.insert( m->get_ino() ); - - // did i already see the notify? - if (stray_rename_notifies.count(m->get_ino())) { - // i did, we're good. - dout(7) << "handle_rename_warning on " << m->get_ino() << ". already got notify." << endl; - - handle_rename_notify(stray_rename_notifies[m->get_ino()]); - stray_rename_notifies.erase(m->get_ino()); - } else { - dout(7) << "handle_rename_warning on " << m->get_ino() << ". waiting for notify." << endl; - } - - // done - delete m; -} - - -void Renamer::handle_rename_notify(MRenameNotify *m) -{ - // FIXME: when we do hard links, i think we need to - // have srcdn and destdn both, or neither, always! - - // did i see the warning yet? - if (!stray_rename_warnings.count(m->get_ino())) { - // wait for it. - dout(7) << "handle_rename_notify on " << m->get_ino() << ", waiting for warning." << endl; - stray_rename_notifies[m->get_ino()] = m; - return; - } - - dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl; - - // src - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = 0; - if (srcdiri) srcdir = srcdiri->dir; - CDentry *srcdn = 0; - if (srcdir) srcdn = srcdir->lookup(m->get_srcname()); - - // dest - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = 0; - if (destdiri) destdir = destdiri->dir; - CDentry *destdn = 0; - if (destdir) destdn = destdir->lookup(m->get_destname()); - - // have both? - list finished; - if (srcdn && destdir) { - CInode *in = srcdn->inode; - - int old_dir_auth = -1; - if (in && in->dir) old_dir_auth = in->dir->authority().first; - - if (!destdn) { - destdn = destdir->add_dentry(m->get_destname()); // create null dentry - destdn->lockstate = DN_LOCK_XLOCK; // that's xlocked! - } - - dout(7) << "handle_rename_notify renaming " << *srcdn << " to " << *destdn << endl; - - if (in) { - cache->rename_file(srcdn, destdn); - - // update imports/exports? 
- if (in && in->is_dir() && in->dir) { - fix_renamed_dir(srcdir, in, destdir, false, old_dir_auth); // auth didnt change - } - } else { - dout(7) << " i don't have the inode (just null dentries)" << endl; - } - - } - - else if (srcdn) { - dout(7) << "handle_rename_notify no dest, but have src" << endl; - dout(7) << "srcdn is " << *srcdn << endl; - - if (destdiri) { - dout(7) << "have destdiri, opening dir " << *destdiri << endl; - cache->open_remote_dir(destdiri, frag_t(), // FIXME dirfrag - new C_MDS_RetryMessage(mds,m)); - } else { - filepath destdirpath = m->get_destdirpath(); - dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl; - - vector trace; - int r = cache->path_traverse(destdirpath, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - } - return; - } - - else if (destdn) { - dout(7) << "handle_rename_notify unlinking dst only " << *destdn << endl; - if (destdn->inode) { - destdir->unlink_inode(destdn); - } - } - - else { - dout(7) << "handle_rename_notify didn't have srcdn or destdn" << endl; - assert(srcdn == 0 && destdn == 0); - } - - mds->queue_finished(finished); - - - // ack - dout(10) << "sending RenameNotifyAck back to srcauth " << m->get_srcauth() << endl; - MRenameNotifyAck *ack = new MRenameNotifyAck(m->get_ino()); - mds->send_message_mds(ack, m->get_srcauth(), MDS_PORT_CACHE); - - - stray_rename_warnings.erase( m->get_ino() ); - delete m; -} - - - - diff --git a/branches/sage/pgs/mds/Renamer.h b/branches/sage/pgs/mds/Renamer.h deleted file mode 100644 index f6f82c31ba9fc..0000000000000 --- a/branches/sage/pgs/mds/Renamer.h +++ /dev/null @@ -1,99 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_RENAMER_H -#define __MDS_RENAMER_H - -#include "include/types.h" - -#include -#include -using std::map; -using std::set; - -class MDS; -class MDCache; -class CDentry; -class CInode; -class CDir; - -class Message; -class MRenameWarning; -class MRenameNotify; -class MRenameNotifyAck; -class MRename; -class MRenamePrep; -class MRenameReq; -class MRenameAck; - -class Renamer { - MDS *mds; - MDCache *cache; - - // rename fun - set stray_rename_warnings; // notifies i haven't seen - map stray_rename_notifies; - - map > rename_waiting_for_ack; - - - - void fix_renamed_dir(CDir *srcdir, - CInode *in, - CDir *destdir, - bool authchanged, // _inode_ auth changed - int dirauth=-1); // dirauth (for certain cases) - - -public: - Renamer(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message *m); - - // RENAME - // initiator - public: - void file_rename(CDentry *srcdn, CDentry *destdn, Context *c); - protected: - void handle_rename_ack(MRenameAck *m); // dest -> init (almost always) - void file_rename_finish(CDir *srcdir, CInode *in, Context *c); - friend class C_MDC_RenameAck; - - // src - void handle_rename_req(MRenameReq *m); // dest -> src - void file_rename_foreign_src(CDentry *srcdn, - inodeno_t destdirino, string& destname, string& destpath, int destauth, - int initiator); - void file_rename_warn(CInode *in, set& notify); - void handle_rename_notify_ack(MRenameNotifyAck *m); // bystanders -> src - void file_rename_ack(CInode *in, int initiator); - friend class C_MDC_RenameNotifyAck; - - // dest - void handle_rename_prep(MRenamePrep *m); // init -> dest - void handle_rename(MRename *m); // src -> dest - void file_rename_notify(CInode *in, - CDir *srcdir, string& srcname, CDir *destdir, string& destname, - set& notify, int srcauth); - - // bystander - void handle_rename_warning(MRenameWarning *m); // src -> bystanders - void handle_rename_notify(MRenameNotify *m); // dest -> bystanders - - -}; - -#endif - - diff --git a/branches/sage/pgs/mds/ScatterLock.h b/branches/sage/pgs/mds/ScatterLock.h deleted file mode 100644 index 56153ebef8409..0000000000000 --- a/branches/sage/pgs/mds/ScatterLock.h +++ /dev/null @@ -1,174 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SCATTERLOCK_H -#define __SCATTERLOCK_H - -#include "SimpleLock.h" - - -// lock state machine states: -// Sync -- Lock -- sCatter -// Tempsync _/ -// auth repl -#define LOCK_SYNC__ // R . R . rdlocks allowed on auth and replicas -#define LOCK_GLOCKS -20 // r . r . waiting for replicas+rdlocks (auth), or rdlocks to release (replica) -#define LOCK_GSCATTERS -28 // r . r . - -#define LOCK_GSYNCL__ // . w LOCK on replica. -#define LOCK_LOCK__ // . W . . -#define LOCK_GTEMPSYNCL -21 // . w LOCK on replica. - -#define LOCK_GLOCKC -22 // . wp . wp waiting for replicas+wrlocks (auth), or wrlocks to release (replica) -#define LOCK_SCATTER 23 // . Wp . WP mtime updates on replicas allowed, no reads. stable here. -#define LOCK_GTEMPSYNCC -24 // . wp . wp GLOCKC|LOCK on replica - -#define LOCK_GSCATTERT -25 // r . LOCK on replica. -#define LOCK_GLOCKT -26 // r . LOCK on replica. -#define LOCK_TEMPSYNC 27 // R . LOCK on replica. 
- - -inline const char *get_scatterlock_state_name(int s) { - switch(s) { - case LOCK_SYNC: return "Sync"; - case LOCK_GLOCKS: return "gLockS"; - case LOCK_GSCATTERS: return "gScatterS"; - - case LOCK_GSYNCL: return "gSyncL"; - case LOCK_LOCK: return "Lock"; - case LOCK_GTEMPSYNCL: return "gTempsyncL"; - - case LOCK_GLOCKC: return "gLockC"; - case LOCK_SCATTER: return "sCatter"; - case LOCK_GTEMPSYNCC: return "gTempsyncC"; - - case LOCK_GSCATTERT: return "gsCatterT"; - case LOCK_GLOCKT: return "gLockT"; - case LOCK_TEMPSYNC: return "Tempsync"; - - default: assert(0); - } -} - -class ScatterLock : public SimpleLock { - int num_wrlock; - bool updated; - -public: - ScatterLock(MDSCacheObject *o, int t, int wo) : - SimpleLock(o, t, wo), - num_wrlock(0), - updated(false) {} - - int get_replica_state() { - switch (state) { - case LOCK_SYNC: - return LOCK_SYNC; - - case LOCK_GSCATTERS: // hrm. - case LOCK_GLOCKS: - case LOCK_GSYNCL: - case LOCK_LOCK: - case LOCK_GTEMPSYNCL: - case LOCK_GLOCKC: - return LOCK_LOCK; - - case LOCK_SCATTER: - return LOCK_SCATTER; - - case LOCK_GTEMPSYNCC: - case LOCK_GSCATTERT: - case LOCK_GLOCKT: - case LOCK_TEMPSYNC: - return LOCK_LOCK; - default: - assert(0); - } - } - - void set_updated() { - if (!updated) { - parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = true; - } - } - void clear_updated() { - if (updated) { - parent->put(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = false; - } - } - bool is_updated() { return updated; } - - void replicate_relax() { - //if (state == LOCK_SYNC && !is_rdlocked()) - //state = LOCK_SCATTER; - } - - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - // rdlock - bool can_rdlock(MDRequest *mdr) { - return state == LOCK_SYNC || state == LOCK_TEMPSYNC; - } - bool can_rdlock_soon() { - return state == LOCK_GTEMPSYNCC; - } - - // xlock - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKC || - state == LOCK_GLOCKS); - else - return false; - } - - // wrlock - bool can_wrlock() { - return state == LOCK_SCATTER || state == LOCK_LOCK; - } - void get_wrlock() { - assert(can_wrlock()); - if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); - ++num_wrlock; - } - void put_wrlock() { - --num_wrlock; - if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - } - bool is_wrlocked() { return num_wrlock > 0; } - int get_num_wrlocks() { return num_wrlock; } - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_scatterlock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - if (is_wrlocked()) - out << " wr=" << get_num_wrlocks(); - out << ")"; - } - -}; - -#endif diff --git a/branches/sage/pgs/mds/Server.cc b/branches/sage/pgs/mds/Server.cc deleted file mode 100644 index 808d1337e53cd..0000000000000 --- a/branches/sage/pgs/mds/Server.cc +++ /dev/null @@ -1,3762 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "Migrator.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "IdAllocator.h" - -#include "msg/Messenger.h" - -#include "messages/MClientSession.h" -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MLock.h" - -#include "messages/MDentryUnlink.h" - -#include "events/EString.h" -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/ESession.h" -#include "events/EOpen.h" - -#include "include/filepath.h" -#include "common/Timer.h" -#include "common/Logger.h" -#include "common/LogType.h" - -#include -#include - -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server " - - -void Server::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_CLIENT_RECONNECT: - handle_client_reconnect((MClientReconnect*)m); - return; - } - - // active? - if (!mds->is_active()) { - dout(3) << "not active yet, waiting" << endl; - mds->wait_for_active(new C_MDS_RetryMessage(mds, m)); - return; - } - - switch (m->get_type()) { - case MSG_CLIENT_SESSION: - handle_client_session((MClientSession*)m); - return; - case MSG_CLIENT_REQUEST: - handle_client_request((MClientRequest*)m); - return; - case MSG_MDS_SLAVE_REQUEST: - handle_slave_request((MMDSSlaveRequest*)m); - return; - } - - dout(1) << "server unknown message " << m->get_type() << endl; - assert(0); -} - - - -// ---------------------------------------------------------- -// SESSION management - - -class C_MDS_session_finish : public Context { - MDS *mds; - entity_inst_t client_inst; - bool open; - version_t cmapv; -public: - C_MDS_session_finish(MDS *m, entity_inst_t ci, bool s, version_t mv) : - mds(m), client_inst(ci), open(s), cmapv(mv) { } - void finish(int r) { - assert(r == 0); - mds->server->_session_logged(client_inst, open, cmapv); - } -}; - - -void Server::handle_client_session(MClientSession *m) -{ - dout(3) << "handle_client_session " << *m << " from " << m->get_source() << endl; - int from = m->get_source().num(); - bool open = m->op == MClientSession::OP_REQUEST_OPEN; - - if (open) { - if (mds->clientmap.have_session(from)) { - dout(10) << "already open, dropping this req" << endl; - delete m; - return; - } - if (mds->clientmap.is_opening(from)) { - dout(10) << "already opening, dropping this req" << endl; - delete m; - return; - } - mds->clientmap.add_opening(from); - } else { - if (mds->clientmap.is_closing(from)) { - dout(10) << "already closing, dropping this req" << endl; - delete m; - return; - } - if (m->seq < mds->clientmap.get_push_seq(from)) { - dout(10) << "old push seq " << m->seq << " < " << mds->clientmap.get_push_seq(from) - << ", dropping" << endl; - delete m; - return; - } - assert(m->seq == mds->clientmap.get_push_seq(from)); - - mds->clientmap.add_closing(from); - } - - // journal it - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv), - new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv)); - delete m; -} - -void 
Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv) -{ - dout(10) << "_session_logged " << client_inst << " " << (open ? "open":"close") - << " " << cmapv - << endl; - - // apply - int from = client_inst.name.num(); - if (open) { - assert(mds->clientmap.is_opening(from)); - mds->clientmap.open_session(client_inst); - } else { - assert(mds->clientmap.is_closing(from)); - mds->clientmap.close_session(from); - - // purge completed requests from clientmap - mds->clientmap.trim_completed_requests(from, 0); - } - - assert(cmapv == mds->clientmap.get_version()); - - // reply - if (open) - mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), client_inst); - else - mds->messenger->send_message(new MClientSession(MClientSession::OP_CLOSE), client_inst); -} - - -void Server::terminate_sessions() -{ - dout(2) << "terminate_sessions" << endl; - - // kill them off. clients will retry etc. - for (set::const_iterator p = mds->clientmap.get_session_set().begin(); - p != mds->clientmap.get_session_set().end(); - ++p) { - if (mds->clientmap.is_closing(*p)) - continue; - mds->clientmap.add_closing(*p); - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(mds->clientmap.get_inst(*p), false, cmapv), - new C_MDS_session_finish(mds, mds->clientmap.get_inst(*p), false, cmapv)); - } -} - - -void Server::reconnect_clients() -{ - // reconnect with clients - if (mds->clientmap.get_session_set().empty()) { - dout(7) << "reconnect_clients -- no sessions, doing nothing." << endl; - reconnect_gather_finish(); - return; - } - - dout(7) << "reconnect_clients -- sending mdsmap to clients with sessions" << endl; - - mds->bcast_mds_map(); // send mdsmap to all client sessions - - // init gather list - reconnect_start = g_clock.now(); - client_reconnect_gather = mds->clientmap.get_session_set(); -} - -void Server::handle_client_reconnect(MClientReconnect *m) -{ - dout(7) << "handle_client_reconnect " << m->get_source() << endl; - int from = m->get_source().num(); - - if (m->closed) { - dout(7) << " client had no session, removing from clientmap" << endl; - - mds->clientmap.add_closing(from); - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(mds->clientmap.get_inst(from), false, cmapv), - new C_MDS_session_finish(mds, mds->clientmap.get_inst(from), false, cmapv)); - - } else { - - // caps - for (map::iterator p = m->inode_caps.begin(); - p != m->inode_caps.end(); - ++p) { - CInode *in = mdcache->get_inode(p->first); - if (in && in->is_auth()) { - // we recovered it, and it's ours. take note. - dout(15) << "open caps on " << *in << endl; - in->reconnect_cap(from, p->second); - reconnected_caps.insert(in); - continue; - } - - filepath path = m->inode_path[p->first]; - if ((in && !in->is_auth()) || - !mds->mdcache->path_is_mine(path)) { - // not mine. - dout(0) << "non-auth " << p->first << " " << m->inode_path[p->first] - << ", will pass off to authority" << endl; - - // mark client caps stale. - inode_t fake_inode; - fake_inode.ino = p->first; - MClientFileCaps *stale = new MClientFileCaps(MClientFileCaps::OP_STALE, - fake_inode, - 0, - 0, // doesn't matter. - p->second.wanted); // doesn't matter. - mds->send_message_client(stale, m->get_source_inst()); - - // add to cap export list. - mdcache->rejoin_export_caps(p->first, m->inode_path[p->first], from, p->second); - } else { - // mine. fetch later. 
- dout(0) << "missing " << p->first << " " << m->inode_path[p->first] - << " (mine), will load later" << endl; - mdcache->rejoin_recovered_caps(p->first, m->inode_path[p->first], from, p->second, - -1); // "from" me. - } - } - } - - // remove from gather set - client_reconnect_gather.erase(from); - if (client_reconnect_gather.empty()) reconnect_gather_finish(); - - delete m; -} - -/* - * called by mdcache, late in rejoin (right before acks are sent) - */ -void Server::process_reconnected_caps() -{ - dout(10) << "process_reconnected_caps" << endl; - - // adjust filelock state appropriately - for (set::iterator p = reconnected_caps.begin(); - p != reconnected_caps.end(); - ++p) { - CInode *in = *p; - int issued = in->get_caps_issued(); - if (in->is_auth()) { - // wr? - if (issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) { - if (issued & (CAP_FILE_RDCACHE|CAP_FILE_WRBUFFER)) { - in->filelock.set_state(LOCK_LONER); - } else { - in->filelock.set_state(LOCK_MIXED); - } - } - } else { - // note that client should perform stale/reap cleanup during reconnect. - assert(issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0); // ???? - if (in->filelock.is_xlocked()) - in->filelock.set_state(LOCK_LOCK); - else - in->filelock.set_state(LOCK_SYNC); // might have been lock, previously - } - dout(15) << " issued " << cap_string(issued) - << " chose " << in->filelock - << " on " << *in << endl; - } - reconnected_caps.clear(); // clean up -} - - -void Server::client_reconnect_failure(int from) -{ - dout(5) << "client_reconnect_failure on client" << from << endl; - client_reconnect_gather.erase(from); - if (client_reconnect_gather.empty()) - reconnect_gather_finish(); -} - -void Server::reconnect_gather_finish() -{ - dout(7) << "reconnect_gather_finish" << endl; - mds->reconnect_done(); -} - - - -/******* - * some generic stuff for finishing off requests - */ - - -/* - * send generic response (just and error code) - */ -void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) -{ - reply_request(mdr, new MClientReply(mdr->client_request, r), tracei); -} - - -/* - * send given reply - * include a trace to tracei - */ -void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei) -{ - MClientRequest *req = mdr->client_request; - - dout(10) << "reply_request " << reply->get_result() - << " (" << strerror(-reply->get_result()) - << ") " << *req << endl; - - // note result code in clientmap? - if (!req->is_idempotent()) - mds->clientmap.add_completed_request(mdr->reqid); - - // include trace - if (tracei) { - reply->set_trace_dist( tracei, mds->get_nodeid() ); - } - - // send reply - messenger->send_message(reply, req->get_client_inst()); - - // finish request - mdcache->request_finish(mdr); -} - - - - - -/*** - * process a client request - */ -void Server::handle_client_request(MClientRequest *req) -{ - dout(4) << "handle_client_request " << *req << endl; - int client = req->get_client(); - - if (!mds->is_active()) { - dout(5) << " not active, discarding client request." << endl; - delete req; - return; - } - - if (!mdcache->get_root()) { - dout(5) << "need to open root" << endl; - mdcache->open_root(new C_MDS_RetryMessage(mds, req)); - return; - } - - // active session? - if (!mds->clientmap.have_session(client)) { - dout(1) << "no session for client" << client << ", dropping" << endl; - delete req; - return; - } - - - // okay, i want - CInode *ref = 0; - - // retry? 
- if (req->get_retry_attempt()) { - if (mds->clientmap.have_completed_request(req->get_reqid())) { - dout(5) << "already completed " << req->get_reqid() << endl; - mds->messenger->send_message(new MClientReply(req, 0), - req->get_client_inst()); - delete req; - return; - } - } - // trim completed_request list - if (req->get_oldest_client_tid() > 0) { - dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << endl; - mds->clientmap.trim_completed_requests(client, - req->get_oldest_client_tid()); - } - - - // ----- - // some ops are on ino's - switch (req->get_op()) { - case MDS_OP_FSTAT: - ref = mdcache->get_inode(req->args.fstat.ino); - assert(ref); - break; - - case MDS_OP_TRUNCATE: - if (!req->args.truncate.ino) - break; // can be called w/ either fh OR path - ref = mdcache->get_inode(req->args.truncate.ino); - assert(ref); - break; - - case MDS_OP_FSYNC: - ref = mdcache->get_inode(req->args.fsync.ino); // fixme someday no ino needed? - assert(ref); - break; - } - - // register + dispatch - MDRequest *mdr = mdcache->request_start(req); - - if (ref) { - dout(10) << "inode op on ref " << *ref << endl; - mdr->ref = ref; - mdr->pin(ref); - } - - dispatch_client_request(mdr); - return; -} - - -void Server::dispatch_client_request(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - if (mdr->ref) { - dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << endl; - } else { - dout(7) << "dispatch_client_request " << *req << endl; - } - - // we shouldn't be waiting on anyone. - assert(mdr->waiting_on_slave.empty()); - - switch (req->get_op()) { - - // inodes ops. - case MDS_OP_STAT: - case MDS_OP_LSTAT: - handle_client_stat(mdr); - break; - case MDS_OP_UTIME: - handle_client_utime(mdr); - break; - case MDS_OP_CHMOD: - handle_client_chmod(mdr); - break; - case MDS_OP_CHOWN: - handle_client_chown(mdr); - break; - case MDS_OP_TRUNCATE: - handle_client_truncate(mdr); - break; - case MDS_OP_READDIR: - handle_client_readdir(mdr); - break; - case MDS_OP_FSYNC: - //handle_client_fsync(req, ref); - break; - - // funky. - case MDS_OP_OPEN: - if ((req->args.open.flags & O_CREAT) && - !mdr->ref) - handle_client_openc(mdr); - else - handle_client_open(mdr); - break; - - // namespace. - // no prior locks. - case MDS_OP_MKNOD: - handle_client_mknod(mdr); - break; - case MDS_OP_LINK: - handle_client_link(mdr); - break; - case MDS_OP_UNLINK: - case MDS_OP_RMDIR: - handle_client_unlink(mdr); - break; - case MDS_OP_RENAME: - handle_client_rename(mdr); - break; - case MDS_OP_MKDIR: - handle_client_mkdir(mdr); - break; - case MDS_OP_SYMLINK: - handle_client_symlink(mdr); - break; - - - default: - dout(1) << " unknown client op " << req->get_op() << endl; - assert(0); - } -} - - -// --------------------------------------- -// SLAVE REQUESTS - -void Server::handle_slave_request(MMDSSlaveRequest *m) -{ - dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << endl; - int from = m->get_source().num(); - - // reply? 
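// A small sketch of how a registry keyed by request id could route an incoming
// slave message, as the handler below does: a reply must match an existing
// master request, while a new request either attaches to an existing slave
// transaction or starts one.  Txn/TxnTable/route are hypothetical simplified
// types, not the real MDRequest / metareqid machinery.
#include <map>
struct Txn {
  int  slave_to_mds;                                   // master mds, or -1 if we are the master
  Txn() : slave_to_mds(-1) {}
};
struct TxnTable {
  std::map<long, Txn> txns;                            // reqid -> transaction state
  bool have(long reqid)        { return txns.count(reqid) != 0; }
  Txn *get(long reqid)         { return &txns[reqid]; }
  Txn *start_slave(long reqid, int master) {
    Txn *t = &txns[reqid];
    t->slave_to_mds = master;
    return t;
  }
};
static Txn *route(TxnTable &table, long reqid, bool is_reply, int from)
{
  if (is_reply)
    return table.have(reqid) ? table.get(reqid) : 0;   // a reply must match a pending request
  if (table.have(reqid))
    return table.get(reqid);                           // more work for an existing slave txn
  return table.start_slave(reqid, from);               // first contact: become a slave for 'from'
}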
- if (m->is_reply()) { - - switch (m->get_op()) { - case MMDSSlaveRequest::OP_XLOCKACK: - { - // identify lock, master request - SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(), - m->get_object_info()); - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - mdr->slaves.insert(from); - dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << endl; - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - lock->get_xlock(mdr); - lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); - } - break; - - case MMDSSlaveRequest::OP_AUTHPINACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_auth_pin_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_LINKPREPACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_link_prep_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_RENAMEPREPACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_rename_prep_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_RENAMEGETINODEACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_rename_get_inode_ack(mdr, m); - } - break; - - default: - assert(0); - } - - // done with reply. - delete m; - return; - - } else { - // am i a new slave? - MDRequest *mdr; - if (mdcache->have_request(m->get_reqid())) { - // existing? - mdr = mdcache->request_get(m->get_reqid()); - if (mdr->slave_to_mds != from) { // may not even be a slave! (e.g. forward race) - dout(10) << "local request " << *mdr << " not slave to mds" << from - << ", ignoring " << *m << endl; - delete m; - return; - } - } else { - // new? - if (m->get_op() == MMDSSlaveRequest::OP_FINISH) { - dout(10) << "missing slave request for " << m->get_reqid() - << " OP_FINISH, must have lost race with a forward" << endl; - delete m; - return; - } - mdr = mdcache->request_start_slave(m->get_reqid(), m->get_source().num()); - } - assert(mdr->slave_request == 0); // only one at a time, please! - mdr->slave_request = m; - - dispatch_slave_request(mdr); - } -} - -void Server::dispatch_slave_request(MDRequest *mdr) -{ - dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << endl; - - if (mdr->aborted) { - dout(7) << " abort flag set, finishing" << endl; - mdcache->request_finish(mdr); - return; - } - - switch (mdr->slave_request->get_op()) { - case MMDSSlaveRequest::OP_XLOCK: - { - // identify object - SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), - mdr->slave_request->get_object_info()); - - if (lock && lock->get_parent()->is_auth()) { - // xlock. - // use acquire_locks so that we get auth_pinning. - set rdlocks; - set wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(lock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // ack - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCKACK); - r->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(r->get_object_info()); - mds->send_message_mds(r, mdr->slave_request->get_source().num(), MDS_PORT_SERVER); - } else { - if (lock) { - dout(10) << "not auth for remote xlock attempt, dropping on " - << *lock << " on " << *lock->get_parent() << endl; - } else { - dout(10) << "don't have object, dropping" << endl; - assert(0); // can this happen, if we auth pinned properly. - } - } - - // done. 
- delete mdr->slave_request; - mdr->slave_request = 0; - } - break; - - case MMDSSlaveRequest::OP_UNXLOCK: - { - SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), - mdr->slave_request->get_object_info()); - assert(lock); - mds->locker->xlock_finish(lock, mdr); - - // done. no ack necessary. - delete mdr->slave_request; - mdr->slave_request = 0; - } - break; - - case MMDSSlaveRequest::OP_AUTHPIN: - handle_slave_auth_pin(mdr); - break; - - case MMDSSlaveRequest::OP_LINKPREP: - case MMDSSlaveRequest::OP_UNLINKPREP: - handle_slave_link_prep(mdr); - break; - - case MMDSSlaveRequest::OP_RENAMEPREP: - handle_slave_rename_prep(mdr); - break; - - case MMDSSlaveRequest::OP_RENAMEGETINODE: - handle_slave_rename_get_inode(mdr); - break; - - case MMDSSlaveRequest::OP_FINISH: - // finish off request. - mdcache->request_finish(mdr); - break; - - default: - assert(0); - } -} - - -void Server::handle_slave_auth_pin(MDRequest *mdr) -{ - dout(10) << "handle_slave_auth_pin " << *mdr << endl; - - // build list of objects - list objects; - bool fail = false; - - for (list::iterator p = mdr->slave_request->get_authpins().begin(); - p != mdr->slave_request->get_authpins().end(); - ++p) { - MDSCacheObject *object = mdcache->get_object(*p); - if (!object) { - dout(10) << " don't have " << *p << endl; - fail = true; - break; - } - - objects.push_back(object); - } - - // can we auth pin them? - if (!fail) { - for (list::iterator p = objects.begin(); - p != objects.end(); - ++p) { - if (!(*p)->is_auth()) { - dout(10) << " not auth for " << **p << endl; - fail = true; - break; - } - if (!mdr->is_auth_pinned(*p) && - !(*p)->can_auth_pin()) { - // wait - dout(10) << " waiting for authpinnable on " << **p << endl; - (*p)->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); - mdr->drop_local_auth_pins(); - return; - } - } - } - - // auth pin! - if (fail) { - mdr->drop_local_auth_pins(); // just in case - } else { - for (list::iterator p = objects.begin(); - p != objects.end(); - ++p) { - dout(10) << "auth_pinning " << **p << endl; - mdr->auth_pin(*p); - } - } - - // ack! - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPINACK); - - // return list of my auth_pins (if any) - for (set::iterator p = mdr->auth_pins.begin(); - p != mdr->auth_pins.end(); - ++p) { - MDSCacheObjectInfo info; - (*p)->set_object_info(info); - reply->get_authpins().push_back(info); - } - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // clean up this request - delete mdr->slave_request; - mdr->slave_request = 0; - return; -} - -void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack) -{ - dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << endl; - int from = ack->get_source().num(); - - // added auth pins? - set pinned; - for (list::iterator p = ack->get_authpins().begin(); - p != ack->get_authpins().end(); - ++p) { - MDSCacheObject *object = mdcache->get_object(*p); - assert(object); // we pinned it - dout(10) << " remote has pinned " << *object << endl; - if (!mdr->is_auth_pinned(object)) - mdr->auth_pins.insert(object); - pinned.insert(object); - } - - // removed auth pins? 
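// The loop below erases elements from a std::set while walking it.  A short
// standalone illustration of that pre-C++11 pattern (set::erase did not yet
// return the next iterator): remember the doomed element, advance first, then
// erase the copy, so the live iterator is never invalidated.
#include <set>
static void drop_odd(std::set<int> &s)
{
  std::set<int>::iterator p = s.begin();
  while (p != s.end()) {
    if (*p % 2) {
      std::set<int>::iterator o = p;   // remember the element to remove
      ++p;                             // step off it first
      s.erase(o);                      // now erasing it cannot touch p
    } else {
      ++p;
    }
  }
}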
- set::iterator p = mdr->auth_pins.begin(); - while (p != mdr->auth_pins.end()) { - if ((*p)->authority().first == from && - pinned.count(*p) == 0) { - dout(10) << " remote has unpinned " << **p << endl; - set::iterator o = p; - ++p; - mdr->auth_pins.erase(o); - } else { - ++p; - } - } - - // note slave - mdr->slaves.insert(from); - - // clear from waiting list - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); - - // go again? - if (mdr->waiting_on_slave.empty()) - dispatch_client_request(mdr); - else - dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << endl; -} - - -// --------------------------------------- -// HELPERS - - -/** validate_dentry_dir - * - * verify that the dir exists and would own the dname. - * do not check if the dentry exists. - */ -CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname) -{ - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "validate_dentry_dir: not a dir" << endl; - reply_request(mdr, -ENOTDIR); - return false; - } - - // which dirfrag? - frag_t fg = diri->pick_dirfrag(dname); - - CDir *dir = try_open_auth_dir(diri, fg, mdr); - if (!dir) - return 0; - - // frozen? - if (dir->is_frozen()) { - dout(7) << "dir is frozen " << *dir << endl; - dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - return dir; -} - - -/** prepare_null_dentry - * prepare a null (or existing) dentry in given dir. - * wait for any dn lock. - */ -CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist) -{ - dout(10) << "prepare_null_dentry " << dname << " in " << *dir << endl; - assert(dir->is_auth()); - - // does it already exist? - CDentry *dn = dir->lookup(dname); - if (dn) { - if (dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << endl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - if (!dn->is_null()) { - // name already exists - dout(10) << "dentry " << dname << " exists in " << *dir << endl; - if (!okexist) { - reply_request(mdr, -EEXIST); - return 0; - } - } - - return dn; - } - - // make sure dir is complete - if (!dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // create - dn = dir->add_dentry(dname, 0); - dn->mark_new(); - dout(10) << "prepare_null_dentry added " << *dn << endl; - - return dn; -} - - -/** prepare_new_inode - * - * create a new inode. set c/m/atime. hit dir pop. 
- */ -CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir) -{ - CInode *in = mdcache->create_inode(); - in->inode.uid = mdr->client_request->get_caller_uid(); - in->inode.gid = mdr->client_request->get_caller_gid(); - in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now - dout(10) << "prepare_new_inode " << *in << endl; - - // bump modify pop - mds->balancer->hit_dir(dir, META_POP_DWR); - - return in; -} - - - -CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath) -{ - // figure parent dir vs dname - if (refpath.depth() == 0) { - dout(7) << "can't do that to root" << endl; - reply_request(mdr, -EINVAL); - return 0; - } - string dname = refpath.last_dentry(); - refpath.pop_dentry(); - - dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << endl; - - // traverse to parent dir - int r = mdcache->path_traverse(mdr, mdr->client_request, - 0, refpath, trace, true, - MDS_TRAVERSE_FORWARD); - if (r > 0) return 0; // delayed - if (r < 0) { - reply_request(mdr, r); - return 0; - } - - // open inode - CInode *diri; - if (trace.empty()) - diri = mdcache->get_root(); - else - diri = mdcache->get_dentry_inode(trace[trace.size()-1], mdr); - if (!diri) - return 0; // opening inode. - - // is it an auth dir? - CDir *dir = validate_dentry_dir(mdr, diri, dname); - if (!dir) - return 0; // forwarded or waiting for freeze - - dout(10) << "traverse_to_auth_dir " << *dir << endl; - return dir; -} - - - -CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) -{ - // already got ref? - if (mdr->ref) - return mdr->ref; - - MClientRequest *req = mdr->client_request; - - // traverse - filepath refpath = req->get_filepath(); - vector trace; - int r = mdcache->path_traverse(mdr, req, - 0, refpath, - trace, req->follow_trailing_symlink(), - MDS_TRAVERSE_FORWARD); - if (r > 0) return false; // delayed - if (r < 0) { // error - reply_request(mdr, r); - return 0; - } - - // open ref inode - CInode *ref = 0; - if (trace.empty()) - ref = mdcache->get_root(); - else { - CDentry *dn = trace[trace.size()-1]; - - // if no inode (null or unattached remote), fw to dentry auth? - if (want_auth && !dn->is_auth() && - (dn->is_null() || - (dn->is_remote() && dn->inode))) { - if (dn->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *dn << endl; - dn->dir->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *dn << endl; - mdcache->request_forward(mdr, dn->authority().first); - return 0; - } - } - - // open ref inode - ref = mdcache->get_dentry_inode(dn, mdr); - if (!ref) return 0; - } - dout(10) << "ref is " << *ref << endl; - - // fw to inode auth? - if (want_auth && !ref->is_auth()) { - if (ref->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *ref << endl; - ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *ref << endl; - mdcache->request_forward(mdr, ref->authority().first); - } - return 0; - } - - // auth_pin? 
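// rdlock_path_pin_ref() auth-pins the ref inode when the caller needs it to
// stay authoritative for the duration of the request.  A minimal sketch of
// that idea: an object cannot be frozen (e.g. for export) while pins are held,
// and cannot be pinned once it is frozen.  Pinnable is a pared-down stand-in,
// not the real MDSCacheObject interface.
struct Pinnable {
  int  auth_pins;
  bool frozen;
  Pinnable() : auth_pins(0), frozen(false) {}
  bool can_auth_pin() const { return !frozen; }
  void auth_pin()           { ++auth_pins; }
  void auth_unpin()         { --auth_pins; }
  bool can_freeze() const   { return auth_pins == 0; }
  bool freeze()             { if (!can_freeze()) return false; frozen = true; return true; }
};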
- if (want_auth) { - if (ref->is_frozen()) { - dout(7) << "waiting for !frozen/authpinnable on " << *ref << endl; - ref->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - mdr->auth_pin(ref); - } - - // lock the path - set rdlocks, empty; - - for (int i=0; i<(int)trace.size(); i++) - rdlocks.insert(&trace[i]->lock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, empty, empty)) - return 0; - - // set and pin ref - mdr->pin(ref); - mdr->ref = ref; - - // save the locked trace. - mdr->trace.swap(trace); - - return ref; -} - - -/** rdlock_path_xlock_dentry - * traverse path to the directory that could/would contain dentry. - * make sure i am auth for that dentry, forward as necessary. - * create null dentry in place (or use existing if okexist). - * get rdlocks on traversed dentries, xlock on new dentry. - */ -CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist) -{ - MClientRequest *req = mdr->client_request; - - vector trace; - CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath()); - if (!dir) return 0; - dout(10) << "rdlock_path_xlock_dentry dir " << *dir << endl; - - // make sure we can auth_pin (or have already authpinned) dir - if (dir->is_frozen()) { - dout(7) << "waiting for !frozen/authpinnable on " << *dir << endl; - dir->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // make a null dentry? - const string &dname = req->get_filepath().last_dentry(); - CDentry *dn; - if (mustexist) { - dn = dir->lookup(dname); - - // make sure dir is complete - if (!dn && !dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // readable? - if (dn && dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << endl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // exists? - if (!dn || dn->is_null()) { - dout(7) << "dentry " << dname << " dne in " << *dir << endl; - reply_request(mdr, -ENOENT); - return 0; - } - } else { - dn = prepare_null_dentry(mdr, dir, dname, okexist); - if (!dn) - return 0; - } - - // -- lock -- - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)trace.size(); i++) - rdlocks.insert(&trace[i]->lock); - if (dn->is_null()) { - xlocks.insert(&dn->lock); // new dn, xlock - wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime - } else - rdlocks.insert(&dn->lock); // existing dn, rdlock - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return 0; - - // save the locked trace. - mdr->trace.swap(trace); - - return dn; -} - - - - - -CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr) -{ - CDir *dir = diri->get_dirfrag(fg); - - // not open and inode not mine? - if (!dir && !diri->is_auth()) { - int inauth = diri->authority().first; - dout(7) << "try_open_auth_dir: not open, not inode auth, fw to mds" << inauth << endl; - mdcache->request_forward(mdr, inauth); - return 0; - } - - // not open and inode frozen? - if (!dir && diri->is_frozen_dir()) { - dout(10) << "try_open_auth_dir: dir inode is frozen, waiting " << *diri << endl; - assert(diri->get_parent_dir()); - diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // invent? 
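// try_open_auth_dir() is handed a frag_t chosen earlier by pick_dirfrag().  A
// very loose illustration of the underlying idea -- hash the dentry name and
// keep only the bits the directory is currently split on -- with the caveat
// that the real code consults the inode's fragtree rather than a fixed split
// count; pick_frag_simplified is purely hypothetical.
#include <string>
static unsigned pick_frag_simplified(const std::string &dname, unsigned split_bits)
{
  unsigned h = 0;
  for (size_t i = 0; i < dname.size(); i++)
    h = h * 131 + (unsigned char)dname[i];      // any stable string hash would do here
  return h & ((1u << split_bits) - 1);          // one of 2^split_bits fragments
}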
- if (!dir) { - assert(diri->is_auth()); - dir = diri->get_or_open_dirfrag(mds->mdcache, fg); - } - assert(dir); - - // am i auth for the dirfrag? - if (!dir->is_auth()) { - int auth = dir->authority().first; - dout(7) << "try_open_auth_dir: not auth for " << *dir - << ", fw to mds" << auth << endl; - mdcache->request_forward(mdr, auth); - return 0; - } - - return dir; -} - -/* -CDir* Server::try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr) -{ - CDir *dir = diri->get_dirfrag(fg); - if (dir) - return dir; - - if (diri->is_auth()) { - // auth - // not open and inode frozen? - if (!dir && diri->is_frozen_dir()) { - dout(10) << "try_open_dir: dir inode is auth+frozen, waiting " << *diri << endl; - assert(diri->get_parent_dir()); - diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // invent? - if (!dir) { - assert(diri->is_auth()); - dir = diri->get_or_open_dirfrag(mds->mdcache, fg); - } - assert(dir); - return dir; - } else { - // not auth - mdcache->open_remote_dir(diri, fg, - new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } -} -*/ - - -/** predirty_dn_diri - * predirty the directory inode for a new dentry, if it is auth (and not root) - * BUG: root inode doesn't get dirtied properly, currently. blech. - */ -version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) -{ - version_t dirpv = 0; - CInode *diri = dn->dir->inode; - - if (diri->is_root()) return 0; - - if (diri->is_auth()) { - assert(mdr->wrlocks.count(&diri->dirlock)); - - dirpv = diri->pre_dirty(); - dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << endl; - - // predirty+journal - inode_t *pi = diri->project_inode(); - if (dirpv) pi->version = dirpv; - pi->ctime = pi->mtime = mdr->now; - blob->add_dir_context(diri->get_parent_dir()); - blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi); - } else { - // journal the mtime change anyway. - inode_t *ji = blob->add_primary_dentry(diri->get_parent_dn(), true); - ji->ctime = ji->mtime = mdr->now; - - dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << endl; - - blob->add_dirtied_inode_mtime(diri->ino(), mdr->now); - } - - return dirpv; -} - -/** dirty_dn_diri - * follow-up with actual dirty of inode after journal entry commits. - */ -void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) -{ - CInode *diri = dn->dir->inode; - - if (diri->is_root()) return; - - if (dirpv) { - // we journaled and predirtied. - assert(diri->is_auth() && !diri->is_root()); - diri->pop_and_dirty_projected_inode(); - dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << endl; - } else { - // dirlock scatterlock will propagate the update. - diri->inode.ctime = diri->inode.mtime = mtime; - diri->dirlock.set_updated(); - dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mtime << " on " << *diri << endl; - } -} - - - - - - -// =============================================================================== -// STAT - -void Server::handle_client_stat(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *ref = rdlock_path_pin_ref(mdr, false); - if (!ref) return; - - // which inode locks do I want? - /* note: this works because we include existing locks in our lists, - and because all new locks are on inodes and sort to the right of - the dentry rdlocks previous acquired by rdlock_path_pin_ref(). 
*/ - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - - int mask = req->args.stat.mask; - if (mask & INODE_MASK_LINK) rdlocks.insert(&ref->linklock); - if (mask & INODE_MASK_AUTH) rdlocks.insert(&ref->authlock); - if (ref->is_file() && - mask & INODE_MASK_FILE) rdlocks.insert(&ref->filelock); - if (ref->is_dir() && - mask & INODE_MASK_MTIME) rdlocks.insert(&ref->dirlock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // reply - dout(10) << "reply to stat on " << *req << endl; - MClientReply *reply = new MClientReply(req); - reply_request(mdr, reply, ref); -} - - - - -// =============================================================================== -// INODE UPDATES - - -/* - * finisher for basic inode updates - */ -class C_MDS_inode_update_finish : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; -public: - C_MDS_inode_update_finish(MDS *m, MDRequest *r, CInode *i) : - mds(m), mdr(r), in(i) { } - void finish(int r) { - assert(r == 0); - - // apply - in->pop_and_dirty_projected_inode(); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, in); - } -}; - - -// utime - -void Server::handle_client_utime(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - // xlock inode - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // project update - inode_t *pi = cur->project_inode(); - pi->mtime = req->args.utime.mtime; - pi->atime = req->args.utime.atime; - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - EUpdate *le = new EUpdate("utime"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - -// chmod - -void Server::handle_client_chmod(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - // write - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->authlock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // project update - inode_t *pi = cur->project_inode(); - pi->mode = req->args.chmod.mode & 04777; - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - EUpdate *le = new EUpdate("chmod"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - -// chown - -void Server::handle_client_chown(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - // write - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->authlock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - 
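// handle_client_utime/chmod/chown all follow the same update pattern: stage
// the new inode contents with project_inode(), reserve a version with
// pre_dirty(), journal the projected values, and only apply them
// (pop_and_dirty_projected_inode) once the journal entry is safe.  A tiny
// standalone model of that staging step; Projected<T> is a hypothetical
// stand-in for the projected-inode list, not the real CInode code.
#include <list>
template <class T>
struct Projected {
  T current;
  std::list<T> projected;                       // staged updates, oldest first
  T *project() {                                // stage a new copy to modify
    projected.push_back(projected.empty() ? current : projected.back());
    return &projected.back();
  }
  void apply_oldest() {                         // call when its journal entry is on disk
    current = projected.front();
    projected.pop_front();
  }
};
// usage sketch:  T *pi = in.project();  modify *pi;  journal it;  later in.apply_oldest();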
mds->balancer->hit_inode(cur, META_POP_IWR); - - // project update - inode_t *pi = cur->project_inode(); - pi->uid = MAX(req->args.chown.uid, 0); - pi->gid = MAX(req->args.chown.gid, 0); - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - EUpdate *le = new EUpdate("chown"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - - - -// ================================================================= -// DIRECTORY and NAMESPACE OPS - -// READDIR - -int Server::encode_dir_contents(CDir *dir, - list& inls, - list& dnls) -{ - int numfiles = 0; - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - - if (dn->is_null()) continue; - - CInode *in = dn->inode; - if (!in) - continue; // hmm, fixme!, what about REMOTE links? - - dout(12) << "including inode " << *in << endl; - - // add this item - // note: InodeStat makes note of whether inode data is readable. - dnls.push_back( it->first ); - inls.push_back( new InodeStat(in, mds->get_nodeid()) ); - numfiles++; - } - return numfiles; -} - - -void Server::handle_client_readdir(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *diri = rdlock_path_pin_ref(mdr, false); - if (!diri) return; - - // it's a directory, right? - if (!diri->is_dir()) { - // not a dir - dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; - reply_request(mdr, -ENOTDIR); - return; - } - - // which frag? - frag_t fg = req->args.readdir.frag; - - // does it exist? - if (diri->dirfragtree[fg] != fg) { - dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << endl; - reply_request(mdr, -EAGAIN); - return; - } - - CDir *dir = try_open_auth_dir(diri, fg, mdr); - if (!dir) return; - - // ok! - assert(dir->is_auth()); - - // check perm - /* - if (!mds->locker->inode_hard_rdlock_start(diri, mdr)) - return; - mds->locker->inode_hard_rdlock_finish(diri, mdr); - */ - - if (!dir->is_complete()) { - // fetch - dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - // build dir contents - list inls; - list dnls; - int numfiles = encode_dir_contents(dir, inls, dnls); - - // . 
too - dnls.push_back("."); - inls.push_back(new InodeStat(diri, mds->get_nodeid())); - ++numfiles; - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(inls, dnls, numfiles); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; - reply->set_result(fg); - - //balancer->hit_dir(diri->dir); - - // reply - reply_request(mdr, reply, diri); -} - - - -// ------------------------------------------------ - -// MKNOD - -class C_MDS_mknod_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *newi; - version_t dirpv; -public: - C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_) : - mds(m), mdr(r), dn(d), newi(ni), - dirpv(dirpv_) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(newi->inode.version + 1); - - // dir inode's mtime - mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime); - - // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, newi); - } -}; - - -void Server::handle_client_mknod(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a file. - newi->inode.mode = req->args.mknod.mode; - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_FILE; - newi->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - EUpdate *le = new EUpdate("mknod"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); -} - - - -// MKDIR - -void Server::handle_client_mkdir(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - // new inode - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a directory. - newi->inode.mode = req->args.mkdir.mode; - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_DIR; - newi->inode.layout = g_OSD_MDDirLayout; - newi->inode.version = dn->pre_dirty() - 1; - - // ...and that new dir is empty. - CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); - newdir->mark_complete(); - newdir->mark_dirty(newdir->pre_dirty()); - - // prepare finisher - EUpdate *le = new EUpdate("mkdir"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - le->metablob.add_dir(newdir, true, true); // dirty AND complete - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); - - - /* old export heuristic. 
pbly need to reimplement this at some point. - if ( - diri->dir->is_auth() && - diri->dir->is_rep() && - newdir->is_auth() && - !newdir->is_hashing()) { - int dest = rand() % mds->mdsmap->get_num_mds(); - if (dest != whoami) { - dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl; - mdcache->migrator->export_dir(newdir, dest); - } - } - */ -} - - -// SYMLINK - -void Server::handle_client_symlink(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a symlink - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_SYMLINK; - newi->symlink = req->get_sarg(); - newi->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - EUpdate *le = new EUpdate("symlink"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); -} - - - - - -// LINK - -void Server::handle_client_link(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - dout(7) << "handle_client_link " << req->get_filepath() - << " to " << req->get_sarg() - << endl; - - // traverse to dest dir, make sure it's ours. - const filepath &linkpath = req->get_filepath(); - const string &dname = linkpath.last_dentry(); - vector linktrace; - CDir *dir = traverse_to_auth_dir(mdr, linktrace, linkpath); - if (!dir) return; - dout(7) << "handle_client_link link " << dname << " in " << *dir << endl; - - // traverse to link target - filepath targetpath = req->get_sarg(); - dout(7) << "handle_client_link discovering target " << targetpath << endl; - vector targettrace; - int r = mdcache->path_traverse(mdr, req, - 0, targetpath, targettrace, false, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; // wait - if (targettrace.empty()) r = -EINVAL; - if (r < 0) { - reply_request(mdr, r); - return; - } - - // identify target inode - CInode *targeti = targettrace[targettrace.size()-1]->inode; - assert(targeti); - - // dir? - dout(7) << "target is " << *targeti << endl; - if (targeti->is_dir()) { - dout(7) << "target is a dir, failing..." << endl; - reply_request(mdr, -EINVAL); - return; - } - - // get/make null link dentry - CDentry *dn = prepare_null_dentry(mdr, dir, dname, false); - if (!dn) return; - - // create lock lists - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)linktrace.size(); i++) - rdlocks.insert(&linktrace[i]->lock); - xlocks.insert(&dn->lock); - wrlocks.insert(&dn->dir->inode->dirlock); - for (int i=0; i<(int)targettrace.size(); i++) - rdlocks.insert(&targettrace[i]->lock); - xlocks.insert(&targeti->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mdr->done_locking = true; // avoid wrlock moving target issues. - - // pick mtime - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // does the target need an anchor? 
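// Hard-linking an inode that lives in another directory means it must stay
// findable by inode number alone, which is why the code below checks whether
// the target is anchored.  A small sketch of what an anchor trace amounts to
// -- the chain of (parent dir ino, dentry name) hops from the inode up toward
// the root -- using hypothetical pared-down Node/AnchorHop types rather than
// the real CInode/Anchor classes.
#include <string>
#include <vector>
struct Node      { Node *parent; unsigned long ino; std::string dname; };
struct AnchorHop { unsigned long dirino; std::string dname; };
static void make_trace(Node *n, std::vector<AnchorHop> &trace)
{
  for (; n && n->parent; n = n->parent) {
    AnchorHop hop = { n->parent->ino, n->dname };   // how to reach n from its parent dir
    trace.push_back(hop);
  }
}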
- if (targeti->is_auth()) { - /*if (targeti->get_parent_dir() == dn->dir) { - dout(7) << "target is in the same dirfrag, sweet" << endl; - } - else - */ - if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; - } - else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; - - mdcache->anchor_create(mdr, targeti, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // go! - - // local or remote? - if (targeti->is_auth()) - _link_local(mdr, dn, targeti); - else - _link_remote(mdr, dn, targeti); -} - - -class C_MDS_link_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *targeti; - version_t dnpv; - version_t tipv; - version_t dirpv; -public: - C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, - version_t dnpv_, version_t tipv_, version_t dirpv_) : - mds(m), mdr(r), dn(d), targeti(ti), - dnpv(dnpv_), tipv(tipv_), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, dirpv); - } -}; - - -void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) -{ - dout(10) << "_link_local " << *dn << " to " << *targeti << endl; - - // predirty NEW dentry - version_t dnpv = dn->pre_dirty(); - version_t tipv = targeti->pre_dirty(); - - // project inode update - inode_t *pi = targeti->project_inode(); - pi->nlink++; - pi->ctime = mdr->now; - pi->version = tipv; - - // log + wait - EUpdate *le = new EUpdate("link_local"); - le->metablob.add_client_req(mdr->reqid); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - le->metablob.add_dir_context(targeti->get_parent_dir()); - le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi); // update old primary - - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv)); -} - -void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dnpv, version_t tipv, version_t dirpv) -{ - dout(10) << "_link_local_finish " << *dn << " to " << *targeti << endl; - - // link and unlock the NEW dentry - dn->dir->link_inode(dn, targeti->ino()); - dn->mark_dirty(dnpv); - - // target inode - targeti->pop_and_dirty_projected_inode(); - - // new dentry dir mtime - dirty_dn_diri(dn, dirpv, mdr->now); - - // bump target popularity - mds->balancer->hit_inode(targeti, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref -} - - -// remote - -class C_MDS_link_remote_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *targeti; - version_t dpv; - version_t dirpv; -public: - C_MDS_link_remote_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_) : - mds(m), mdr(r), dn(d), targeti(ti), - dpv(d->get_projected_version()), - dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_link_remote_finish(mdr, dn, targeti, dpv, dirpv); - } -}; - -void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) -{ - dout(10) << "_link_remote " << *dn << " to " << *targeti << endl; - - // 1. 
send LinkPrepare to dest (journal nlink++ prepare) - int linkauth = targeti->authority().first; - if (mdr->witnessed.count(linkauth) == 0) { - dout(10) << " targeti auth must prepare nlink++" << endl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREP); - targeti->set_object_info(req->get_object_info()); - req->now = mdr->now; - mds->send_message_mds(req, linkauth, MDS_PORT_SERVER); - - assert(mdr->waiting_on_slave.count(linkauth) == 0); - mdr->waiting_on_slave.insert(linkauth); - return; - } - dout(10) << " targeti auth has prepared nlink++" << endl; - - // go. - // predirty dentry - dn->pre_dirty(); - - // add to event - EUpdate *le = new EUpdate("link_remote"); - le->metablob.add_client_req(mdr->reqid); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv)); -} - -void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dpv, version_t dirpv) -{ - dout(10) << "_link_remote_finish " << *dn << " to " << *targeti << endl; - - // link the new dentry - dn->dir->link_inode(dn, targeti->ino()); - dn->mark_dirty(dpv); - - // dir inode's mtime - dirty_dn_diri(dn, dirpv, mdr->now); - - // bump target popularity - mds->balancer->hit_inode(targeti, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref -} - - -// remote linking/unlinking - -class C_MDS_SlaveLinkPrep : public Context { - Server *server; - MDRequest *mdr; - CInode *targeti; - utime_t old_ctime; - bool inc; -public: - C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t, utime_t oct, bool in) : - server(s), mdr(r), targeti(t), old_ctime(oct), inc(in) { } - void finish(int r) { - assert(r == 0); - server->_logged_slave_link(mdr, targeti, old_ctime, inc); - } -}; - -void Server::handle_slave_link_prep(MDRequest *mdr) -{ - dout(10) << "handle_slave_link_prep " << *mdr - << " on " << mdr->slave_request->get_object_info() - << endl; - - CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino); - assert(targeti); - dout(10) << "targeti " << *targeti << endl; - CDentry *dn = targeti->get_parent_dn(); - assert(dn->is_primary()); - - mdr->now = mdr->slave_request->now; - - // anchor? 
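// The slave side below is a small two-phase update: journal an OP_PREPARE with
// the nlink change, ack the master, and later journal either OP_COMMIT or
// OP_ROLLBACK (restoring the in-memory value).  A compact standalone model of
// that participant; SlaveNlinkUpdate is a hypothetical stand-in for the real
// MDRequest/ESlaveUpdate plumbing.
struct SlaveNlinkUpdate {
  int  nlink;
  int  old_nlink;
  bool prepared;
  SlaveNlinkUpdate(int n) : nlink(n), old_nlink(n), prepared(false) {}
  void prepare(bool inc) {               // journal OP_PREPARE, apply the change in memory
    old_nlink = nlink;
    nlink += inc ? 1 : -1;
    prepared = true;
  }
  void commit()   { prepared = false; }  // journal OP_COMMIT, keep the new value
  void rollback() {                      // journal OP_ROLLBACK, restore the old value
    nlink = old_nlink;
    prepared = false;
  }
};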
- if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { - if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; - } - else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; - mdcache->anchor_create(mdr, targeti, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - - inode_t *pi = dn->inode->project_inode(); - - // update journaled target inode - bool inc; - if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { - inc = true; - pi->nlink++; - } else { - inc = false; - pi->nlink--; - } - utime_t old_ctime = pi->ctime; - pi->ctime = mdr->now; - pi->version = targeti->pre_dirty(); - - dout(10) << " projected inode " << pi << " v " << pi->version << endl; - - // journal it - ESlaveUpdate *le = new ESlaveUpdate("slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - le->metablob.add_dir_context(targeti->get_parent_dir()); - le->metablob.add_primary_dentry(dn, true, targeti, pi); // update old primary - mds->mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); -} - -class C_MDS_SlaveLinkCommit : public Context { - Server *server; - MDRequest *mdr; - CInode *targeti; - utime_t old_ctime; - version_t old_version; - bool inc; -public: - C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t, utime_t oct, version_t ov, bool in) : - server(s), mdr(r), targeti(t), old_ctime(oct), old_version(ov), inc(in) { } - void finish(int r) { - server->_commit_slave_link(mdr, r, targeti, - old_ctime, old_version, inc); - } -}; - -void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc) -{ - dout(10) << "_logged_slave_link " << *mdr - << " inc=" << inc - << " " << *targeti << endl; - - version_t old_version = targeti->inode.version; - - // update the target - targeti->pop_and_dirty_projected_inode(); - - // ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK); - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // set up commit waiter - mdr->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc); - - // done. - delete mdr->slave_request; - mdr->slave_request = 0; -} - - -void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, - utime_t old_ctime, version_t old_version, bool inc) -{ - dout(10) << "_commit_slave_link " << *mdr - << " r=" << r - << " inc=" << inc - << " " << *targeti << endl; - - ESlaveUpdate *le; - if (r == 0) { - // write a commit to the journal - le = new ESlaveUpdate("slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - } else { - le = new ESlaveUpdate("slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - - // -- rollback in memory -- - assert(targeti->inode.ctime == mdr->now); - assert(targeti->projected_inode.empty()); // we're holding the version lock. - - targeti->inode.ctime = old_ctime; - targeti->inode.version = old_version; - if (inc) - targeti->inode.nlink++; - else - targeti->inode.nlink--; - } - - mds->mdlog->submit_entry(le); -} - - - -void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_link_prep_ack " << *mdr - << " " << *m << endl; - int from = m->get_source().num(); - - // note slave - mdr->slaves.insert(from); - - // witnessed! 
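// The ack handler continued below is one instance of a simple gather pattern:
// each ack removes its sender from a waiting set, and the request is
// re-dispatched only once that set drains.  A minimal standalone version of
// the bookkeeping; Gather is a simplified stand-in, not the real MDRequest
// fields.
#include <set>
struct Gather {
  std::set<int> waiting_on;                     // mds ranks we still expect acks from
  std::set<int> witnessed;                      // ranks whose prepare we have seen
  void expect(int who) { waiting_on.insert(who); }
  bool ack(int who) {                           // returns true when the gather completes
    witnessed.insert(who);
    waiting_on.erase(who);
    return waiting_on.empty();
  }
};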
- assert(mdr->witnessed.count(from) == 0); - mdr->witnessed.insert(from); - - // remove from waiting list - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); - - assert(mdr->waiting_on_slave.empty()); - - dispatch_client_request(mdr); // go again! -} - - - - - -// UNLINK - -void Server::handle_client_unlink(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - // traverse to path - vector trace; - int r = mdcache->path_traverse(mdr, req, - 0, req->get_filepath(), trace, false, - MDS_TRAVERSE_FORWARD); - if (r > 0) return; - if (trace.empty()) r = -EINVAL; // can't unlink root - if (r < 0) { - reply_request(mdr, r); - return; - } - - CDentry *dn = trace[trace.size()-1]; - assert(dn); - - // is it my dentry? - if (!dn->is_auth()) { - // fw to auth - mdcache->request_forward(mdr, dn->authority().first); - return; - } - - // rmdir or unlink? - bool rmdir = false; - if (req->get_op() == MDS_OP_RMDIR) rmdir = true; - - if (rmdir) { - dout(7) << "handle_client_rmdir on " << *dn << endl; - } else { - dout(7) << "handle_client_unlink on " << *dn << endl; - } - - // readable? - if (dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << endl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - // dn looks ok. - - // get/open inode. - mdr->trace.swap(trace); - CInode *in = mdcache->get_dentry_inode(dn, mdr); - if (!in) return; - dout(7) << "dn links to " << *in << endl; - - // rmdir vs is_dir - if (in->is_dir()) { - if (rmdir) { - // do empty directory checks - if (!_verify_rmdir(mdr, in)) - return; - } else { - dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl; - reply_request(mdr, -EISDIR); - return; - } - } else { - if (rmdir) { - // unlink - dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl; - reply_request(mdr, -ENOTDIR); - return; - } - } - - // lock - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)trace.size()-1; i++) - rdlocks.insert(&trace[i]->lock); - xlocks.insert(&dn->lock); - wrlocks.insert(&dn->dir->inode->dirlock); - xlocks.insert(&in->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // yay! - mdr->done_locking = true; // avoid wrlock racing - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // get stray dn ready? - CDentry *straydn = 0; - if (dn->is_primary()) { - straydn = mdcache->get_or_create_stray_dentry(dn->inode); - dout(10) << " straydn is " << *straydn << endl; - - if (!mdr->dst_reanchor_atid && - dn->inode->is_anchored()) { - dout(10) << "reanchoring to stray " << *dn->inode << endl; - vector trace; - straydn->make_anchor_trace(trace, dn->inode); - mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->dst_reanchor_atid, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // ok! 
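// The unlink below goes one of two ways: a remote dentry just drops its
// reference, while a primary dentry re-homes the inode under a per-MDS
// "stray" directory (the straydn prepared above) until the last reference is
// gone.  A small sketch of that distinction, using pared-down stand-in types
// rather than the real CDentry/CInode.
struct SimpleDentry { bool primary; };
enum UnlinkAction { DROP_REMOTE_LINK, MOVE_TO_STRAY };
static UnlinkAction choose_unlink_action(const SimpleDentry &dn)
{
  // primary link: the inode's metadata lives with this dentry, so park it in
  // the stray dir; remote link: only the reference itself is removed.
  return dn.primary ? MOVE_TO_STRAY : DROP_REMOTE_LINK;
}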
- if (dn->is_remote() && !dn->inode->is_auth()) - _unlink_remote(mdr, dn); - else - _unlink_local(mdr, dn, straydn); -} - - - -class C_MDS_unlink_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CDentry *straydn; - version_t dnpv; // deleted dentry - version_t dirpv; -public: - C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, - version_t dirpv_) : - mds(m), mdr(r), dn(d), straydn(sd), - dnpv(d->get_projected_version()), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_unlink_local_finish(mdr, dn, straydn, dnpv, dirpv); - } -}; - - -void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) -{ - dout(10) << "_unlink_local " << *dn << endl; - - // ok, let's do it. - // prepare log entry - EUpdate *le = new EUpdate("unlink_local"); - le->metablob.add_client_req(mdr->reqid); - - version_t ipv = 0; // dirty inode version - inode_t *ji = 0; // journaled projected inode - if (dn->is_primary()) { - // primary link. add stray dentry. - assert(straydn); - ipv = straydn->pre_dirty(dn->inode->inode.version); - le->metablob.add_dir_context(straydn->dir); - ji = le->metablob.add_primary_dentry(straydn, true, dn->inode); - } else { - // remote link. update remote inode. - ipv = dn->inode->pre_dirty(); - le->metablob.add_dir_context(dn->inode->get_parent_dir()); - ji = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode); - } - - // update journaled target inode - inode_t *pi = dn->inode->project_inode(); - pi->nlink--; - pi->ctime = mdr->now; - pi->version = ipv; - *ji = *pi; // copy into journal - - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - - if (mdr->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid); - - // finisher - C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, - dirpv); - - journal_opens(); // journal pending opens, just in case - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); -} - -void Server::_unlink_local_finish(MDRequest *mdr, - CDentry *dn, CDentry *straydn, - version_t dnpv, version_t dirpv) -{ - dout(10) << "_unlink_local_finish " << *dn << endl; - - // unlink main dentry - CInode *in = dn->inode; - dn->dir->unlink_inode(dn); - - // relink as stray? (i.e. was primary link?) - if (straydn) straydn->dir->link_inode(straydn, in); - - // nlink--, dirty old dentry - in->pop_and_dirty_projected_inode(); - dn->mark_dirty(dnpv); - - // dir inode's mtime - dirty_dn_diri(dn, dirpv, mdr->now); - - // bump target popularity - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - - // share unlink news with replicas - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - dout(7) << "_unlink_local_finish sending MDentryUnlink to mds" << it->first << endl; - MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name); - if (straydn) { - unlink->strayin = straydn->dir->inode->replicate_to(it->first); - unlink->straydir = straydn->dir->replicate_to(it->first); - unlink->straydn = straydn->replicate_to(it->first); - } - mds->send_message_mds(unlink, it->first, MDS_PORT_CACHE); - } - - // commit anchor update? 
- if (mdr->dst_reanchor_atid) - mds->anchorclient->commit(mdr->dst_reanchor_atid); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - - // clean up? - if (straydn) - mdcache->eval_stray(straydn); - - // removing a new dn? - dn->dir->try_remove_unlinked_dn(dn); -} - - -class C_MDS_unlink_remote_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - version_t dnpv; // deleted dentry - version_t dirpv; -public: - C_MDS_unlink_remote_finish(MDS *m, MDRequest *r, CDentry *d, - version_t dirpv_) : - mds(m), mdr(r), dn(d), - dnpv(d->get_projected_version()), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_unlink_remote_finish(mdr, dn, dnpv, dirpv); - } -}; - -void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) -{ - dout(10) << "_unlink_remote " << *dn << " " << *dn->inode << endl; - - // 1. send LinkPrepare to dest (journal nlink-- prepare) - int inauth = dn->inode->authority().first; - if (mdr->witnessed.count(inauth) == 0) { - dout(10) << " inode auth must prepare nlink--" << endl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNLINKPREP); - dn->inode->set_object_info(req->get_object_info()); - req->now = mdr->now; - mds->send_message_mds(req, inauth, MDS_PORT_SERVER); - - assert(mdr->waiting_on_slave.count(inauth) == 0); - mdr->waiting_on_slave.insert(inauth); - return; - } - dout(10) << " inode auth has prepared nlink--" << endl; - - // ok, let's do it. - // prepare log entry - EUpdate *le = new EUpdate("unlink_remote"); - le->metablob.add_client_req(mdr->reqid); - - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - - if (mdr->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid); - - // finisher - C_MDS_unlink_remote_finish *fin = new C_MDS_unlink_remote_finish(mds, mdr, dn, dirpv); - - journal_opens(); // journal pending opens, just in case - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); -} - -void Server::_unlink_remote_finish(MDRequest *mdr, - CDentry *dn, - version_t dnpv, version_t dirpv) -{ - dout(10) << "_unlink_remote_finish " << *dn << endl; - - // unlink main dentry - dn->dir->unlink_inode(dn); - dn->mark_dirty(dnpv); // dirty old dentry - - // dir inode's mtime - dirty_dn_diri(dn, dirpv, mdr->now); - - // bump target popularity - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - - // share unlink news with replicas - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - dout(7) << "_unlink_remote_finish sending MDentryUnlink to mds" << it->first << endl; - MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name); - mds->send_message_mds(unlink, it->first, MDS_PORT_CACHE); - } - - // commit anchor update? - if (mdr->dst_reanchor_atid) - mds->anchorclient->commit(mdr->dst_reanchor_atid); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - - // removing a new dn? - dn->dir->try_remove_unlinked_dn(dn); -} - - - - -/** _verify_rmdir - * - * verify that a directory is empty (i.e. 
we can rmdir it), - * and make sure it is part of the same subtree (i.e. local) - * so that rmdir will occur locally. - * - * @param in is the inode being rmdir'd. - */ -bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) -{ - dout(10) << "_verify_rmdir " << *in << endl; - assert(in->is_auth()); - - list frags; - in->dirfragtree.get_leaves(frags); - - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = in->get_dirfrag(*p); - if (!dir) - dir = in->get_or_open_dirfrag(mdcache, *p); - assert(dir); - - // dir looks empty but incomplete? - if (dir->is_auth() && - dir->get_size() == 0 && - !dir->is_complete()) { - dout(7) << "_verify_rmdir fetching incomplete dir " << *dir << endl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // does the frag _look_ empty? - if (dir->get_size()) { - dout(10) << "_verify_rmdir still " << dir->get_size() << " items in frag " << *dir << endl; - reply_request(mdr, -ENOTEMPTY); - return false; - } - - // not dir auth? - if (!dir->is_auth()) { - dout(10) << "_verify_rmdir not auth for " << *dir << ", FIXME BUG" << endl; - reply_request(mdr, -ENOTEMPTY); - return false; - } - } - - return true; -} -/* - // export sanity check - if (!in->is_auth()) { - // i should be exporting this now/soon, since the dir is empty. - dout(7) << "handle_client_rmdir dir is auth, but not inode." << endl; - mdcache->migrator->export_empty_import(in->dir); - in->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, diri)); - return; - } -*/ - - - - -// ====================================================== - - -class C_MDS_rename_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *srcdn; - CDentry *destdn; - CDentry *straydn; -public: - C_MDS_rename_finish(MDS *m, MDRequest *r, - CDentry *sdn, CDentry *ddn, CDentry *stdn) : - mds(m), mdr(r), - srcdn(sdn), destdn(ddn), straydn(stdn) { } - void finish(int r) { - assert(r == 0); - mds->server->_rename_finish(mdr, srcdn, destdn, straydn); - } -}; - - -/** handle_client_rename - * - */ -void Server::handle_client_rename(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - dout(7) << "handle_client_rename " << *req << endl; - - // traverse to dest dir (not dest) - // we do this FIRST, because the rename should occur on the - // destdn's auth. - const filepath &destpath = req->get_sarg(); - const string &destname = destpath.last_dentry(); - vector desttrace; - CDir *destdir = traverse_to_auth_dir(mdr, desttrace, destpath); - if (!destdir) return; // fw or error out - dout(10) << "dest will be " << destname << " in " << *destdir << endl; - assert(destdir->is_auth()); - - // traverse to src - filepath srcpath = req->get_filepath(); - vector srctrace; - int r = mdcache->path_traverse(mdr, req, - 0, srcpath, srctrace, false, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; - if (srctrace.empty()) r = -EINVAL; // can't rename root - if (r < 0) { - reply_request(mdr, r); - return; - } - CDentry *srcdn = srctrace[srctrace.size()-1]; - dout(10) << " srcdn " << *srcdn << endl; - CInode *srci = mdcache->get_dentry_inode(srcdn, mdr); - dout(10) << " srci " << *srci << endl; - - // -- some sanity checks -- - // src == dest? - if (srcdn->get_dir() == destdir && srcdn->name == destname) { - dout(7) << "rename src=dest, noop" << endl; - reply_request(mdr, 0); - return; - } - - // dest a child of src? - // e.g. 
mv /usr /usr/foo - CDentry *pdn = destdir->inode->parent; - while (pdn) { - if (pdn == srcdn) { - dout(7) << "cannot rename item to be a child of itself" << endl; - reply_request(mdr, -EINVAL); - return; - } - pdn = pdn->dir->inode->parent; - } - - - // identify/create dest dentry - CDentry *destdn = destdir->lookup(destname); - if (destdn && destdn->lock.is_xlocked_by_other(mdr)) { - destdn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - CInode *oldin = 0; - if (destdn && !destdn->is_null()) { - //dout(10) << "dest dn exists " << *destdn << endl; - oldin = mdcache->get_dentry_inode(destdn, mdr); - if (!oldin) return; - dout(10) << " oldin " << *oldin << endl; - - // mv /some/thing /to/some/existing_other_thing - if (oldin->is_dir() && !srci->is_dir()) { - reply_request(mdr, -EISDIR); - return; - } - if (!oldin->is_dir() && srci->is_dir()) { - reply_request(mdr, -ENOTDIR); - return; - } - - // non-empty dir? - if (oldin->is_dir() && !_verify_rmdir(mdr, oldin)) - return; - } - if (!destdn) { - // mv /some/thing /to/some/non_existent_name - destdn = prepare_null_dentry(mdr, destdir, destname); - if (!destdn) return; - } - - dout(10) << " destdn " << *destdn << endl; - - - // -- locks -- - set rdlocks, wrlocks, xlocks; - - // rdlock sourcedir path, xlock src dentry - for (int i=0; i<(int)srctrace.size()-1; i++) - rdlocks.insert(&srctrace[i]->lock); - xlocks.insert(&srcdn->lock); - wrlocks.insert(&srcdn->dir->inode->dirlock); - - // rdlock destdir path, xlock dest dentry - for (int i=0; i<(int)desttrace.size(); i++) - rdlocks.insert(&desttrace[i]->lock); - xlocks.insert(&destdn->lock); - wrlocks.insert(&destdn->dir->inode->dirlock); - - // xlock oldin (for nlink--) - if (oldin) xlocks.insert(&oldin->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // set done_locking flag, to avoid problems with wrlock moving auth target - mdr->done_locking = true; - - // -- open all srcdn inode frags, if any -- - // we need these open so that auth can properly delegate from inode to dirfrags - // after the inode is _ours_. - if (srcdn->is_primary() && - !srcdn->is_auth() && - srci->is_dir()) { - dout(10) << "srci is remote dir, opening all frags" << endl; - list frags; - srci->dirfragtree.get_leaves(frags); - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = srci->get_dirfrag(*p); - if (dir) { - dout(10) << " opened " << *dir << endl; - mdr->pin(dir); - } else { - mdcache->open_remote_dir(srci, *p, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - } - - // -- declare now -- - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // -- create stray dentry? 
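Before touching anything, the rename path gathers every needed lock (rdlocks along both paths, xlocks on the two dentries, wrlocks on the directory inodes) and hands the whole set to the locker at once. A toy illustration of the collect-then-acquire-in-one-deterministic-order idea; LockId and acquire_all are invented for this sketch (wrlocks omitted) and are not the Locker API:

// Toy illustration: gather lock requests into sets, then take them in one
// sorted order so two competing requests cannot deadlock each other.
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <tuple>
#include <vector>

struct LockId {
  std::string object;   // which dentry/inode the lock protects
  int type;             // 0 = rdlock, 1 = xlock (for printing only)
  bool operator<(const LockId& o) const {
    return std::tie(object, type) < std::tie(o.object, o.type);
  }
};

bool acquire_all(const std::set<LockId>& rdlocks, const std::set<LockId>& xlocks) {
  std::vector<LockId> order(rdlocks.begin(), rdlocks.end());
  order.insert(order.end(), xlocks.begin(), xlocks.end());
  std::sort(order.begin(), order.end());          // one total order for everyone
  for (const auto& l : order)
    std::cout << "take " << (l.type ? "xlock" : "rdlock") << " on " << l.object << "\n";
  return true;                                    // real code can block, wait, and retry
}

int main() {
  std::set<LockId> rd{{"src/dir", 0}, {"dst/dir", 0}};
  std::set<LockId> x{{"src/dn", 1}, {"dst/dn", 1}};
  acquire_all(rd, x);
}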
-- - CDentry *straydn = 0; - if (destdn->is_primary()) { - straydn = mdcache->get_or_create_stray_dentry(destdn->inode); - dout(10) << "straydn is " << *straydn << endl; - } - - // -- prepare witnesses -- - set witnesses = mdr->extra_witnesses; - if (srcdn->is_auth()) - srcdn->list_replicas(witnesses); - else - witnesses.insert(srcdn->authority().first); - destdn->list_replicas(witnesses); - - for (set::iterator p = witnesses.begin(); - p != witnesses.end(); - ++p) { - if (mdr->witnessed.count(*p)) { - dout(10) << " already witnessed by mds" << *p << endl; - } else { - dout(10) << " not yet witnessed by mds" << *p << ", sending prepare" << endl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); - srcdn->make_path(req->srcdnpath); - destdn->make_path(req->destdnpath); - req->now = mdr->now; - - if (straydn) { - CInodeDiscover *indis = straydn->dir->inode->replicate_to(*p); - CDirDiscover *dirdis = straydn->dir->replicate_to(*p); - CDentryDiscover *dndis = straydn->replicate_to(*p); - indis->_encode(req->stray); - dirdis->_encode(req->stray); - dndis->_encode(req->stray); - delete indis; - delete dirdis; - delete dndis; - } - - mds->send_message_mds(req, *p, MDS_PORT_SERVER); - - assert(mdr->waiting_on_slave.count(*p) == 0); - mdr->waiting_on_slave.insert(*p); - } - } - if (!mdr->waiting_on_slave.empty()) - return; // we're waiting for a witness. - - // -- inode migration? -- - if (!srcdn->is_auth() && - srcdn->is_primary()) { - if (mdr->inode_import.length() == 0) { - // get inode - int auth = srcdn->authority().first; - dout(10) << " requesting inode export from srcdn auth mds" << auth << endl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEGETINODE); - srcdn->make_path(req->srcdnpath); - mds->send_message_mds(req, auth, MDS_PORT_SERVER); - - assert(mdr->waiting_on_slave.count(auth) == 0); - mdr->waiting_on_slave.insert(auth); - return; - } else { - dout(10) << " already (just!) 
got inode export from srcdn auth" << endl; - /*int off = 0; - mdcache->migrator->decode_import_inode(destdn, mdr->inode_import, off, - srcdn->authority().first); - srcdn->inode->force_auth.first = srcdn->authority().first; - */ - } - } - - // -- prepare anchor updates -- - bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - if (!linkmerge) { - C_Gather *anchorgather = 0; - - if (srcdn->is_primary() && srcdn->inode->is_anchored() && - srcdn->dir != destdn->dir && - !mdr->src_reanchor_atid) { - dout(10) << "reanchoring src->dst " << *srcdn->inode << endl; - vector trace; - destdn->make_anchor_trace(trace, srcdn->inode); - - anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->src_reanchor_atid, - anchorgather->new_sub()); - } - if (destdn->is_primary() && - destdn->inode->is_anchored() && - !mdr->dst_reanchor_atid) { - dout(10) << "reanchoring dst->stray " << *destdn->inode << endl; - - assert(straydn); - vector trace; - straydn->make_anchor_trace(trace, destdn->inode); - - if (!anchorgather) - anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->dst_reanchor_atid, - anchorgather->new_sub()); - } - - if (anchorgather) - return; // waiting for anchor prepares - } - - // -- prepare journal entry -- - EUpdate *le = new EUpdate("rename"); - le->metablob.add_client_req(mdr->reqid); - - _rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn); - - // -- commit locally -- - C_MDS_rename_finish *fin = new C_MDS_rename_finish(mds, mdr, srcdn, destdn, straydn); - - journal_opens(); // journal pending opens, just in case - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - -void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_finish " << *mdr << endl; - - // apply - _rename_apply(mdr, srcdn, destdn, straydn); - - // commit anchor updates? - if (mdr->src_reanchor_atid) mds->anchorclient->commit(mdr->src_reanchor_atid); - if (mdr->dst_reanchor_atid) mds->anchorclient->commit(mdr->dst_reanchor_atid); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, destdn->dir->get_inode()); // FIXME: imprecise ref - - // clean up? - if (straydn) - mdcache->eval_stray(straydn); -} - - - -// helpers - -void Server::_rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << endl; - - // primary+remote link merge? 
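The reanchor prepares hang off a C_Gather so the request resumes only after every outstanding prepare has called back. A tiny barrier with the same shape (the Gather class here is a hypothetical simplification, not Ceph's C_Gather):

// Minimal gather/barrier sketch: each asynchronous prepare takes a
// sub-callback; the final callback fires once all sub-callbacks complete.
#include <functional>
#include <iostream>

class Gather {
  std::function<void()> on_done;
  int outstanding = 0;
public:
  explicit Gather(std::function<void()> done) : on_done(std::move(done)) {}
  std::function<void()> new_sub() {
    ++outstanding;
    return [this] { if (--outstanding == 0) on_done(); };
  }
};

int main() {
  Gather g([] { std::cout << "all prepares acked, retry the request\n"; });
  auto sub1 = g.new_sub();   // e.g. src reanchor prepare
  auto sub2 = g.new_sub();   // e.g. dst reanchor prepare
  sub1();
  sub2();                    // the last completion fires the gathered callback
}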
- bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - if (mdr->is_master()) { - mdr->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); - if (destdn->dir != srcdn->dir) - mdr->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); - } - - inode_t *ji = 0; // journaled inode getting nlink-- - version_t ipv; // it's version - - if (linkmerge) { - dout(10) << "will merge remote+primary links" << endl; - - // destdn -> primary - metablob->add_dir_context(destdn->dir); - if (destdn->is_auth()) - ipv = mdr->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version); - ji = metablob->add_primary_dentry(destdn, true, destdn->inode); - - // do src dentry - metablob->add_dir_context(srcdn->dir); - if (srcdn->is_auth()) - mdr->pvmap[srcdn] = srcdn->pre_dirty(); - metablob->add_null_dentry(srcdn, true); - - } else { - // move to stray? - if (destdn->is_primary()) { - // primary. we'll move inode to stray dir. - assert(straydn); - - // link-- inode, move to stray dir. - metablob->add_dir_context(straydn->dir); - if (straydn->is_auth()) - ipv = mdr->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version); - ji = metablob->add_primary_dentry(straydn, true, destdn->inode); - } - else if (destdn->is_remote()) { - // remote. - // nlink-- targeti - metablob->add_dir_context(destdn->inode->get_parent_dir()); - if (destdn->inode->is_auth()) - ipv = mdr->pvmap[destdn->inode] = destdn->inode->pre_dirty(); - ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary - dout(10) << "remote targeti (nlink--) is " << *destdn->inode << endl; - } - else { - assert(destdn->is_null()); - } - - // add dest dentry - metablob->add_dir_context(destdn->dir); - if (srcdn->is_primary()) { - dout(10) << "src is a primary dentry" << endl; - if (destdn->is_auth()) { - version_t siv; - if (srcdn->is_auth()) - siv = srcdn->inode->get_projected_version(); - else - siv = mdr->inode_import_v; - mdr->pvmap[destdn] = destdn->pre_dirty(siv+1); - } - metablob->add_primary_dentry(destdn, true, srcdn->inode); - - } else { - assert(srcdn->is_remote()); - dout(10) << "src is a remote dentry" << endl; - if (destdn->is_auth()) - mdr->pvmap[destdn] = destdn->pre_dirty(); - metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino()); - } - - // remove src dentry - metablob->add_dir_context(srcdn->dir); - if (srcdn->is_auth()) - mdr->pvmap[srcdn] = srcdn->pre_dirty(); - metablob->add_null_dentry(srcdn, true); - - // new subtree? - if (srcdn->is_primary() && - srcdn->inode->is_dir()) { - list ls; - srcdn->inode->get_nested_dirfrags(ls); - int auth = srcdn->authority().first; - for (list::iterator p = ls.begin(); p != ls.end(); ++p) - mdcache->adjust_subtree_auth(*p, auth, auth); - } - } - - if (ji) { - // update journaled target inode - inode_t *pi = destdn->inode->project_inode(); - pi->nlink--; - pi->ctime = mdr->now; - pi->version = ipv; - *ji = *pi; // copy into journal - } - - // anchor updates? - if (mdr->src_reanchor_atid) - metablob->add_anchor_transaction(mdr->src_reanchor_atid); - if (mdr->dst_reanchor_atid) - metablob->add_anchor_transaction(mdr->dst_reanchor_atid); -} - - -void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << endl; - dout(10) << " pvs " << mdr->pvmap << endl; - - CInode *oldin = destdn->inode; - - // primary+remote link merge? 
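_rename_prepare journals a projected copy of the inode whose link count drops: the post-change values are computed up front, copied into the journal entry, and only installed in the cache later. A simplified stand-in (inode_like is invented):

// Sketch of the "projected inode" idea with a stand-in struct: prepare the
// post-change values, copy them into the journal entry, apply them last.
#include <cstdint>
#include <iostream>

struct inode_like { uint64_t version = 1; int nlink = 2; };

int main() {
  inode_like current;                 // what the cache serves right now
  inode_like projected = current;     // what it will look like after the op
  projected.nlink--;                  // e.g. rename over an existing target
  projected.version = current.version + 1;

  inode_like journaled = projected;   // copied into the journal entry (like *ji = *pi)
  current = projected;                // applied once the journal entry is durable

  std::cout << "nlink=" << current.nlink << " v=" << current.version << "\n";
  (void)journaled;
}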
- bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - // dir mtimes - if (mdr->is_master()) { - dirty_dn_diri(destdn, mdr->pvmap[destdn->dir->inode], mdr->now); - if (destdn->dir != srcdn->dir) - dirty_dn_diri(srcdn, mdr->pvmap[srcdn->dir->inode], mdr->now); - } - - if (linkmerge) { - if (destdn->is_primary()) { - dout(10) << "merging remote onto primary link" << endl; - - // nlink-- in place - destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = mdr->now; - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->pvmap[destdn]); - - // unlink srcdn - srcdn->dir->unlink_inode(srcdn); - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->pvmap[srcdn]); - } else { - dout(10) << "merging primary onto remote link" << endl; - assert(srcdn->is_primary()); - - // move inode to dest - srcdn->dir->unlink_inode(srcdn); - destdn->dir->unlink_inode(destdn); - destdn->dir->link_inode(destdn, oldin); - - // nlink-- - destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = mdr->now; - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->pvmap[destdn]); - - // mark src dirty - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->pvmap[srcdn]); - } - } - else { - // unlink destdn? - if (!destdn->is_null()) - destdn->dir->unlink_inode(destdn); - - if (straydn) { - dout(10) << "straydn is " << *straydn << endl; - - // relink oldin to stray dir. destdn was primary. - assert(oldin); - straydn->dir->link_inode(straydn, oldin); - //assert(straypv == ipv); - - // nlink-- in stray dir. - oldin->inode.nlink--; - oldin->inode.ctime = mdr->now; - if (oldin->is_auth()) - oldin->mark_dirty(mdr->pvmap[straydn]); - } - else if (oldin) { - // nlink-- remote. destdn was remote. - oldin->inode.nlink--; - oldin->inode.ctime = mdr->now; - if (oldin->is_auth()) - oldin->mark_dirty(mdr->pvmap[oldin]); - } - - CInode *in = srcdn->inode; - assert(in); - if (srcdn->is_remote()) { - // srcdn was remote. - srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_inode(destdn, in->ino()); - if (destdn->is_auth()) - destdn->mark_dirty(mdr->pvmap[destdn]); - } else { - // srcdn was primary. - srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_inode(destdn, in); - - // srcdn inode import? - if (!srcdn->is_auth() && destdn->is_auth()) { - assert(mdr->inode_import.length() > 0); - int off = 0; - map imported_client_map; - ::_decode(imported_client_map, mdr->inode_import, off); - mdcache->migrator->decode_import_inode(destdn, mdr->inode_import, off, - srcdn->authority().first, - imported_client_map); - } - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->pvmap[destdn]); - } - - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->pvmap[srcdn]); - } - - // update subtree map? - if (destdn->is_primary() && destdn->inode->is_dir()) - mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir); - - // removing a new dn? 
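_rename_apply is mostly pointer surgery: park the old destination target on a stray dentry, relink the source inode under the destination name, and null out the source dentry. A toy, map-based version of that relinking (not the CDir/CDentry structures, and ignoring versions, link counts, and replication):

// Toy relink: the source inode pointer moves to the destination name, and an
// existing destination target is parked under a "stray" name rather than lost.
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Inode { int ino; };
using Dir = std::map<std::string, std::shared_ptr<Inode>>;

int main() {
  Dir src{{"a", std::make_shared<Inode>(Inode{10})}};
  Dir dst{{"b", std::make_shared<Inode>(Inode{20})}};
  Dir stray;

  stray["20"] = dst["b"];       // old destination target -> stray dentry
  dst["b"] = src["a"];          // link the source inode at the destination name
  src.erase("a");               // null out the source dentry

  std::cout << "dst/b now ino " << dst["b"]->ino
            << ", stray holds " << stray.size() << " inode(s)\n";
}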
- srcdn->dir->try_remove_unlinked_dn(srcdn); -} - - - - - -// ------------ -// SLAVE - -class C_MDS_SlaveRenamePrep : public Context { - Server *server; - MDRequest *mdr; - CDentry *srcdn, *destdn, *straydn; -public: - C_MDS_SlaveRenamePrep(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) : - server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {} - void finish(int r) { - server->_logged_slave_rename(mdr, srcdn, destdn, straydn); - } -}; - -class C_MDS_SlaveRenameCommit : public Context { - Server *server; - MDRequest *mdr; - CDentry *srcdn, *destdn, *straydn; -public: - C_MDS_SlaveRenameCommit(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) : - server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {} - void finish(int r) { - server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn); - } -}; - -void Server::handle_slave_rename_prep(MDRequest *mdr) -{ - dout(10) << "handle_slave_rename_prep " << *mdr - << " " << mdr->slave_request->srcdnpath - << " to " << mdr->slave_request->destdnpath - << endl; - - // discover destdn - filepath destpath(mdr->slave_request->destdnpath); - dout(10) << " dest " << destpath << endl; - vector trace; - int r = mdcache->path_traverse(mdr, mdr->slave_request, - 0, destpath, trace, false, - MDS_TRAVERSE_DISCOVERXLOCK); - if (r > 0) return; - assert(r == 0); // we shouldn't get an error here! - - CDentry *destdn = trace[trace.size()-1]; - dout(10) << " destdn " << *destdn << endl; - mdr->pin(destdn); - - - // discover srcdn - filepath srcpath(mdr->slave_request->srcdnpath); - dout(10) << " src " << srcpath << endl; - r = mdcache->path_traverse(mdr, mdr->slave_request, - 0, srcpath, trace, false, - MDS_TRAVERSE_DISCOVERXLOCK); - if (r > 0) return; - assert(r == 0); // we shouldn't get an error here! - - CDentry *srcdn = trace[trace.size()-1]; - dout(10) << " srcdn " << *srcdn << endl; - mdr->pin(srcdn); - assert(srcdn->inode); - mdr->pin(srcdn->inode); - - // stray? - CDentry *straydn = 0; - if (destdn->is_primary()) { - assert(mdr->slave_request->stray.length() > 0); - straydn = mdcache->add_replica_stray(mdr->slave_request->stray, - destdn->inode, mdr->slave_to_mds); - assert(straydn); - mdr->pin(straydn); - } - - mdr->now = mdr->slave_request->now; - - // journal it? - if (srcdn->is_auth() || - destdn->inode->is_auth() || - srcdn->inode->is_any_caps()) { - // journal. - ESlaveUpdate *le = new ESlaveUpdate("slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - _rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn); - mds->mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn)); - } else { - // don't journal. - dout(10) << "not journaling, i'm not auth for anything, and srci isn't open" << endl; - _logged_slave_rename(mdr, srcdn, destdn, straydn); - } -} - -void Server::_logged_slave_rename(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_logged_slave_rename " << *mdr << endl; - - // ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - if (srcdn->is_auth()) { - // share the replica list, so that they can all witness the rename. - srcdn->list_replicas(reply->srcdn_replicas); - - // note srcdn, we'll get asked for inode momentarily - mdr->srcdn = srcdn; - } - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // set up commit waiter - mdr->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); - - // done. 
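The slave-side rename handlers amount to a small two-phase protocol: prepare (journaled only when this MDS is auth for something or has client caps open), ack the master, then commit or roll back when the outcome arrives. A rough sketch with invented names and none of the real bookkeeping:

// Rough sketch of the slave's two-phase participation; the enum and helpers
// are invented stand-ins, not the MDS request machinery.
#include <iostream>

enum class SlaveState { Idle, Prepared, Committed, Aborted };

SlaveState on_prepare(bool need_journal) {
  if (need_journal) std::cout << "journal slave_rename_prep\n";
  std::cout << "send prepare ack to master\n";
  return SlaveState::Prepared;
}

SlaveState on_outcome(SlaveState s, int r) {
  if (s != SlaveState::Prepared) return s;
  if (r == 0) { std::cout << "journal commit, apply rename\n"; return SlaveState::Committed; }
  std::cout << "journal rollback\n";
  return SlaveState::Aborted;
}

int main() {
  SlaveState s = on_prepare(/*need_journal=*/true);
  s = on_outcome(s, 0);
  std::cout << "final state " << static_cast<int>(s) << "\n";
}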
- delete mdr->slave_request; - mdr->slave_request = 0; -} - -void Server::_commit_slave_rename(MDRequest *mdr, int r, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << endl; - - ESlaveUpdate *le; - if (r == 0) { - // commit - _rename_apply(mdr, srcdn, destdn, straydn); - - // write a commit to the journal - le = new ESlaveUpdate("slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - } else { - // abort - le = new ESlaveUpdate("slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - } - mds->mdlog->submit_entry(le); -} - -void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_rename_prep_ack " << *mdr - << " witnessed by " << m->get_source() - << " " << *m << endl; - int from = m->get_source().num(); - - // note slave - mdr->slaves.insert(from); - - // witnessed! - assert(mdr->witnessed.count(from) == 0); - mdr->witnessed.insert(from); - - - // add extra witnesses? - if (!m->srcdn_replicas.empty()) { - dout(10) << " extra witnesses (srcdn replicas) are " << m->srcdn_replicas << endl; - mdr->extra_witnesses = m->srcdn_replicas; - mdr->extra_witnesses.erase(mds->get_nodeid()); // not me! - } - - // remove from waiting list - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); - - if (mdr->waiting_on_slave.empty()) - dispatch_client_request(mdr); // go again! - else - dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << endl; -} - - - -void Server::handle_slave_rename_get_inode(MDRequest *mdr) -{ - dout(10) << "handle_slave_rename_get_inode " << *mdr << endl; - - assert(mdr->srcdn); - assert(mdr->srcdn->is_auth()); - assert(mdr->srcdn->is_primary()); - - // reply - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEGETINODEACK); - dout(10) << " replying with inode export info " << *mdr->srcdn->inode << endl; - - map exported_client_map; - bufferlist inodebl; - mdcache->migrator->encode_export_inode(mdr->srcdn->inode, inodebl, mdr->slave_to_mds, - exported_client_map); - ::_encode(exported_client_map, reply->inode_export); - reply->inode_export.claim_append(inodebl); - - reply->inode_export_v = mdr->srcdn->inode->inode.version; - - mdr->inode_import = reply->inode_export; // keep a copy locally, in case we have to rollback - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // clean up. - delete mdr->slave_request; - mdr->slave_request = 0; -} - -void Server::handle_slave_rename_get_inode_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_rename_get_inode_ack " << *mdr - << " " << *m << endl; - int from = m->get_source().num(); - - assert(m->inode_export.length()); - dout(10) << " got inode export, saving in " << *mdr << endl; - mdr->inode_import.claim(m->inode_export); - mdr->inode_import_v = m->inode_export_v; - - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); - - if (mdr->waiting_on_slave.empty()) - dispatch_client_request(mdr); // go again! 
- else - dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << endl; -} - - - - - - -// =================================== -// TRUNCATE, FSYNC - -class C_MDS_truncate_purged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - off_t size; - utime_t ctime; -public: - C_MDS_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - size(sz), ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // apply to cache - in->inode.size = size; - in->inode.ctime = ctime; - in->inode.mtime = ctime; - in->mark_dirty(pv); - - // hit pop - mds->balancer->hit_inode(in, META_POP_IWR); - - // reply - mds->server->reply_request(mdr, 0); - } -}; - -class C_MDS_truncate_logged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - off_t size; - utime_t ctime; -public: - C_MDS_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - size(sz), ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // purge - mds->mdcache->purge_inode(&in->inode, size); - mds->mdcache->wait_for_purge(in->inode.ino, size, - new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime)); - } -}; - -void Server::handle_client_truncate(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - // check permissions? - - // xlock inode - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // already small enough? - if (cur->inode.size >= req->args.truncate.length) { - reply_request(mdr, 0); - return; - } - - // prepare - version_t pdv = cur->pre_dirty(); - utime_t ctime = g_clock.real_now(); - Context *fin = new C_MDS_truncate_logged(mds, mdr, cur, - pdv, req->args.truncate.length, ctime); - - // log + wait - EUpdate *le = new EUpdate("truncate"); - le->metablob.add_client_req(mdr->reqid); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->inode, req->args.truncate.length); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = ctime; - pi->ctime = ctime; - pi->version = pdv; - pi->size = req->args.truncate.length; - - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - -// =========================== -// open, openc, close - -void Server::handle_client_open(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - int flags = req->args.open.flags; - int cmode = req->get_open_file_mode(); - bool need_auth = ((cmode != FILE_MODE_R && cmode != FILE_MODE_LAZY) || - (flags & O_TRUNC)); - dout(10) << "open flags = " << flags - << ", filemode = " << cmode - << ", need_auth = " << need_auth - << endl; - - CInode *cur = rdlock_path_pin_ref(mdr, need_auth); - if (!cur) return; - - // regular file? - if ((cur->inode.mode & INODE_TYPE_MASK) != INODE_MODE_FILE) { - dout(7) << "not a regular file " << *cur << endl; - reply_request(mdr, -EINVAL); // FIXME what error do we want? - return; - } - - // hmm, check permissions or something. 
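The truncate path chains three gated steps: journal the intent, purge the trimmed file data, and only then apply the new size and times to the cached inode. A simplified sketch of that chaining; the helper functions here are invented, whereas the real code uses Context objects and the MDCache purge machinery:

// Simplified chained completions for truncate; wait_for_log_sync and purge_to
// are invented helpers that call their continuations immediately.
#include <functional>
#include <iostream>

struct FakeInode { long size = 4096; };

void wait_for_log_sync(std::function<void()> fin) { fin(); }   // pretend: durable
void purge_to(long newsize, std::function<void()> fin) {
  std::cout << "purging data beyond " << newsize << " bytes\n";
  fin();
}

int main() {
  FakeInode in;
  long target = 0;
  wait_for_log_sync([&] {                  // like C_MDS_truncate_logged
    purge_to(target, [&] {                 // like C_MDS_truncate_purged
      in.size = target;                    // apply to the cached inode last
      std::cout << "truncated, size now " << in.size << "\n";
    });
  });
}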
- - - // O_TRUNC - if (flags & O_TRUNC) { - assert(cur->is_auth()); - - // xlock file size - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - if (cur->inode.size > 0) { - handle_client_opent(mdr); - return; - } - } - - // do it - _do_open(mdr, cur); -} - -void Server::_do_open(MDRequest *mdr, CInode *cur) -{ - MClientRequest *req = mdr->client_request; - int cmode = req->get_open_file_mode(); - - // can we issue the caps they want? - version_t fdv = mds->locker->issue_file_data_version(cur); - Capability *cap = mds->locker->issue_new_caps(cur, cmode, req); - if (!cap) return; // can't issue (yet), so wait! - - dout(12) << "_do_open issuing caps " << cap_string(cap->pending()) - << " for " << req->get_source() - << " on " << *cur << endl; - - // hit pop - if (cmode == FILE_MODE_RW || - cmode == FILE_MODE_W) - mds->balancer->hit_inode(cur, META_POP_IWR); - else - mds->balancer->hit_inode(cur, META_POP_IRD); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_file_caps(cap->pending()); - reply->set_file_caps_seq(cap->get_last_seq()); - reply->set_file_data_version(fdv); - reply_request(mdr, reply, cur); - - // journal? - if (cur->last_open_journaled == 0) { - queue_journal_open(cur); - maybe_journal_opens(); - } - -} - -void Server::queue_journal_open(CInode *in) -{ - dout(10) << "queue_journal_open on " << *in << endl; - - if (journal_open_queue.count(in) == 0) { - // pin so our pointer stays valid - in->get(CInode::PIN_BATCHOPENJOURNAL); - - // queue it up for a bit - journal_open_queue.insert(in); - } -} - - -void Server::journal_opens() -{ - dout(10) << "journal_opens " << journal_open_queue.size() << " inodes" << endl; - if (journal_open_queue.empty()) return; - - EOpen *le = 0; - - // check queued inodes - for (set::iterator p = journal_open_queue.begin(); - p != journal_open_queue.end(); - ++p) { - (*p)->put(CInode::PIN_BATCHOPENJOURNAL); - if ((*p)->is_any_caps()) { - if (!le) le = new EOpen; - le->add_inode(*p); - (*p)->last_open_journaled = mds->mdlog->get_write_pos(); - } - } - journal_open_queue.clear(); - - if (le) { - // journal - mds->mdlog->submit_entry(le); - - // add waiters to journal entry - for (list::iterator p = journal_open_waiters.begin(); - p != journal_open_waiters.end(); - ++p) - mds->mdlog->wait_for_sync(*p); - journal_open_waiters.clear(); - } else { - // nothing worth journaling here, just kick the waiters. - mds->queue_waiters(journal_open_waiters); - } -} - - - - -class C_MDS_open_truncate_purged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - utime_t ctime; -public: - C_MDS_open_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // apply to cache - in->inode.size = 0; - in->inode.ctime = ctime; - in->inode.mtime = ctime; - in->mark_dirty(pv); - - // hit pop - mds->balancer->hit_inode(in, META_POP_IWR); - - // do the open - mds->server->_do_open(mdr, in); - } -}; - -class C_MDS_open_truncate_logged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - utime_t ctime; -public: - C_MDS_open_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // purge also... 
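queue_journal_open/journal_opens batch many recently opened inodes into one EOpen entry instead of journaling each open on its own. A toy version of that batching with an invented BatchJournal and an arbitrary threshold standing in for the configured batch size:

// Toy batched-open journaling: opens queue up, and one log entry covers the
// whole batch once the queue is large enough (or on an explicit flush).
#include <cstddef>
#include <iostream>
#include <set>

struct BatchJournal {
  std::set<int> queue;                // inode numbers waiting to be journaled
  std::size_t batch_size = 3;         // stand-in for the config threshold
  void queue_open(int ino) {
    queue.insert(ino);
    if (queue.size() >= batch_size) flush();
  }
  void flush() {
    if (queue.empty()) return;
    std::cout << "one EOpen-style entry for " << queue.size() << " inodes\n";
    queue.clear();
  }
};

int main() {
  BatchJournal j;
  int inos[] = {101, 102, 103, 104};
  for (int ino : inos) j.queue_open(ino);
  j.flush();                          // flush any stragglers
}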
- mds->mdcache->purge_inode(&in->inode, 0); - mds->mdcache->wait_for_purge(in->inode.ino, 0, - new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime)); - } -}; - - -void Server::handle_client_opent(MDRequest *mdr) -{ - CInode *cur = mdr->ref; - assert(cur); - - // prepare - version_t pdv = cur->pre_dirty(); - utime_t ctime = g_clock.real_now(); - Context *fin = new C_MDS_open_truncate_logged(mds, mdr, cur, - pdv, ctime); - - // log + wait - EUpdate *le = new EUpdate("open_truncate"); - le->metablob.add_client_req(mdr->reqid); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->inode, 0); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = ctime; - pi->ctime = ctime; - pi->version = pdv; - pi->size = 0; - - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - - -class C_MDS_openc_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *newi; - version_t pv; -public: - C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : - mds(m), mdr(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv); - - // downgrade xlock to rdlock - //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); - - // set/pin ref inode for open() - mdr->ref = newi; - mdr->pin(newi); - - // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); - - // ok, do the open. - mds->server->handle_client_open(mdr); - } -}; - - -void Server::handle_client_openc(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; - - bool excl = (req->args.open.flags & O_EXCL); - CDentry *dn = rdlock_path_xlock_dentry(mdr, !excl, false); - if (!dn) return; - - if (!dn->is_null()) { - // it existed. - if (req->args.open.flags & O_EXCL) { - dout(10) << "O_EXCL, target exists, failing with -EEXIST" << endl; - reply_request(mdr, -EEXIST, dn->get_dir()->get_inode()); - return; - } - - // pass to regular open handler. - handle_client_open(mdr); - return; - } - - // created null dn. - - // create inode. - mdr->now = g_clock.real_now(); - CInode *in = prepare_new_inode(mdr, dn->dir); - assert(in); - - // it's a file. - dn->pre_dirty(); - in->inode.mode = req->args.open.mode; - in->inode.mode |= INODE_MODE_FILE; - in->inode.version = dn->get_projected_version(); - - // prepare finisher - C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); - EUpdate *le = new EUpdate("openc"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version()); - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, in, &in->inode); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - /* - FIXME. this needs to be rewritten when the write capability stuff starts - getting journaled. 
- */ -} - - - - - - - - - - - - - - diff --git a/branches/sage/pgs/mds/Server.h b/branches/sage/pgs/mds/Server.h deleted file mode 100644 index 59d00e1fa777b..0000000000000 --- a/branches/sage/pgs/mds/Server.h +++ /dev/null @@ -1,178 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_SERVER_H -#define __MDS_SERVER_H - -#include "MDS.h" - -class LogEvent; -class C_MDS_rename_finish; -class MDRequest; -class EMetaBlob; -class PVList; -class MMDSSlaveRequest; - -class Server { - MDS *mds; - MDCache *mdcache; - MDLog *mdlog; - Messenger *messenger; - -public: - Server(MDS *m) : - mds(m), - mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger) { - } - - // message handler - void dispatch(Message *m); - - - // -- sessions and recovery -- - utime_t reconnect_start; - set client_reconnect_gather; // clients i need a reconnect msg from. - set reconnected_caps; - - void handle_client_session(class MClientSession *m); - void _session_logged(entity_inst_t ci, bool open, version_t cmapv); - void terminate_sessions(); - void reconnect_clients(); - void handle_client_reconnect(class MClientReconnect *m); - void process_reconnect_cap(CInode *in, int from, inode_caps_reconnect_t& capinfo); - void add_reconnected_cap_inode(CInode *in) { - reconnected_caps.insert(in); - } - void process_reconnected_caps(); - void client_reconnect_failure(int from); - void reconnect_gather_finish(); - - - // -- requests -- - void handle_client_request(MClientRequest *m); - - void dispatch_client_request(MDRequest *mdr); - void reply_request(MDRequest *mdr, int r = 0, CInode *tracei = 0); - void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei); - - void handle_slave_request(MMDSSlaveRequest *m); - void dispatch_slave_request(MDRequest *mdr); - void handle_slave_auth_pin(MDRequest *mdr); - void handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack); - - // some helpers - CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); - CDir *traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath); - CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); - CInode* prepare_new_inode(MDRequest *mdr, CDir *dir); - - CInode* rdlock_path_pin_ref(MDRequest *mdr, bool want_auth); - CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist); - - CDir* try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr); - //CDir* try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr); - - version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); - void dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime); - - - // requests on existing inodes. 
- void handle_client_stat(MDRequest *mdr); - void handle_client_utime(MDRequest *mdr); - void handle_client_chmod(MDRequest *mdr); - void handle_client_chown(MDRequest *mdr); - void handle_client_readdir(MDRequest *mdr); - int encode_dir_contents(CDir *dir, list& inls, list& dnls); - void handle_client_truncate(MDRequest *mdr); - void handle_client_fsync(MDRequest *mdr); - - // open - void handle_client_open(MDRequest *mdr); - void handle_client_openc(MDRequest *mdr); // O_CREAT variant. - void handle_client_opent(MDRequest *mdr); // O_TRUNC variant. - void _do_open(MDRequest *mdr, CInode *ref); - - set journal_open_queue; // to be journal - list journal_open_waiters; - void queue_journal_open(CInode *in); - void add_journal_open_waiter(Context *c) { - journal_open_waiters.push_back(c); - } - void maybe_journal_opens() { - if (journal_open_queue.size() >= (unsigned)g_conf.mds_log_eopen_size) - journal_opens(); - } - void journal_opens(); - - // namespace changes - void handle_client_mknod(MDRequest *mdr); - void handle_client_mkdir(MDRequest *mdr); - void handle_client_symlink(MDRequest *mdr); - - // link - void handle_client_link(MDRequest *mdr); - void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti); - void _link_local_finish(MDRequest *mdr, - CDentry *dn, CInode *targeti, - version_t, version_t, version_t); - - void _link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti); - void _link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t, version_t); - - void handle_slave_link_prep(MDRequest *mdr); - void _logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc); - void _commit_slave_link(MDRequest *mdr, int r, CInode *targeti, - utime_t old_ctime, version_t old_version, bool inc); - void handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); - - // unlink - void handle_client_unlink(MDRequest *mdr); - bool _verify_rmdir(MDRequest *mdr, CInode *rmdiri); - void _unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn); - void _unlink_local_finish(MDRequest *mdr, - CDentry *dn, CDentry *straydn, - version_t, version_t); - - void _unlink_remote(MDRequest *mdr, CDentry *dn); - void _unlink_remote_finish(MDRequest *mdr, - CDentry *dn, - version_t, version_t); - - // rename - void handle_client_rename(MDRequest *mdr); - void _rename_finish(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - - // helpers - void _rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - - // slaving - void handle_slave_rename_prep(MDRequest *mdr); - void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); - void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void handle_slave_rename_get_inode(MDRequest *mdr); - void handle_slave_rename_get_inode_ack(MDRequest *mdr, MMDSSlaveRequest *m); - -}; - - - - -#endif diff --git a/branches/sage/pgs/mds/SimpleLock.h b/branches/sage/pgs/mds/SimpleLock.h deleted file mode 100644 index 42ab3a596d61f..0000000000000 --- a/branches/sage/pgs/mds/SimpleLock.h +++ /dev/null @@ -1,301 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free 
software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SIMPLELOCK_H -#define __SIMPLELOCK_H - -// -- lock types -- -// NOTE: this also defines the lock ordering! -#define LOCK_OTYPE_DN 1 - -#define LOCK_OTYPE_IVERSION 2 -#define LOCK_OTYPE_IFILE 3 -#define LOCK_OTYPE_IAUTH 4 -#define LOCK_OTYPE_ILINK 5 -#define LOCK_OTYPE_IDIRFRAGTREE 6 -#define LOCK_OTYPE_IDIR 7 - -//#define LOCK_OTYPE_DIR 7 // not used - -inline const char *get_lock_type_name(int t) { - switch (t) { - case LOCK_OTYPE_DN: return "dn"; - case LOCK_OTYPE_IVERSION: return "iversion"; - case LOCK_OTYPE_IFILE: return "ifile"; - case LOCK_OTYPE_IAUTH: return "iauth"; - case LOCK_OTYPE_ILINK: return "ilink"; - case LOCK_OTYPE_IDIRFRAGTREE: return "idft"; - case LOCK_OTYPE_IDIR: return "idir"; - default: assert(0); - } -} - -// -- lock states -- -// sync <-> lock -#define LOCK_UNDEF 0 -// auth rep -#define LOCK_SYNC 1 // AR R . R . -#define LOCK_LOCK 2 // AR R W . . -#define LOCK_GLOCKR -3 // AR R . . . -#define LOCK_REMOTEXLOCK -50 // on NON-auth - -inline const char *get_simplelock_state_name(int n) { - switch (n) { - case LOCK_UNDEF: return "UNDEF"; - case LOCK_SYNC: return "sync"; - case LOCK_LOCK: return "lock"; - case LOCK_GLOCKR: return "glockr"; - case LOCK_REMOTEXLOCK: return "remote_xlock"; - default: assert(0); - } -} - -class MDRequest; - -class SimpleLock { -public: - static const int WAIT_RD = (1<<0); // to read - static const int WAIT_WR = (1<<1); // to write - static const int WAIT_XLOCK = (1<<2); // to xlock (** dup) - static const int WAIT_STABLE = (1<<2); // for a stable state - static const int WAIT_REMOTEXLOCK = (1<<3); // for a remote xlock - static const int WAIT_BITS = 4; - static const int WAIT_ALL = ((1< gather_set; // auth - - // local state - int num_rdlock; - MDRequest *xlock_by; - -public: - SimpleLock(MDSCacheObject *o, int t, int wo) : - parent(o), type(t), wait_offset(wo), - state(LOCK_SYNC), - num_rdlock(0), xlock_by(0) { } - virtual ~SimpleLock() {} - - // parent - MDSCacheObject *get_parent() { return parent; } - int get_type() { return type; } - - struct ptr_lt { - bool operator()(const SimpleLock* l, const SimpleLock* r) const { - // first sort by object type (dn < inode) - if ((l->type>LOCK_OTYPE_DN) < (r->type>LOCK_OTYPE_DN)) return true; - if ((l->type>LOCK_OTYPE_DN) == (r->type>LOCK_OTYPE_DN)) { - // then sort by object - if (l->parent->is_lt(r->parent)) return true; - if (l->parent == r->parent) { - // then sort by (inode) lock type - if (l->type < r->type) return true; - } - } - return false; - } - }; - - void decode_locked_state(bufferlist& bl) { - parent->decode_lock_state(type, bl); - } - void encode_locked_state(bufferlist& bl) { - parent->encode_lock_state(type, bl); - } - void finish_waiters(int mask, int r=0) { - parent->finish_waiting(mask << wait_offset, r); - } - void take_waiting(int mask, list& ls) { - parent->take_waiting(mask << wait_offset, ls); - } - void add_waiter(int mask, Context *c) { - parent->add_waiter(mask << wait_offset, c); - } - bool is_waiter_for(int mask) { - return parent->is_waiter_for(mask << wait_offset); - } - - - - // state - int get_state() { return state; } - int set_state(int s) { - state = s; - assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. 
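SimpleLock does not keep its own waiter list: add_waiter/finish_waiters forward to the parent MDSCacheObject, shifted by a per-lock wait_offset so several locks can share one mask-keyed list. A stand-alone sketch of that offsetting (WaiterHub and ToyLock are invented):

// Sketch of per-lock wait-bit offsetting into a shared waiter list.
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

struct WaiterHub {
  std::vector<std::pair<unsigned, std::function<void()>>> waiters;  // (mask, callback)
  void add_waiter(unsigned mask, std::function<void()> c) { waiters.push_back({mask, std::move(c)}); }
  void finish_waiting(unsigned mask) {
    auto pending = std::move(waiters);
    waiters.clear();
    for (auto& w : pending) {
      if (w.first & mask) w.second();           // fire matching waiters
      else waiters.push_back(std::move(w));     // keep the rest queued
    }
  }
};

struct ToyLock {
  static constexpr unsigned WAIT_RD = 1u << 0;  // bit local to this lock
  WaiterHub* parent;
  unsigned wait_offset;                         // where this lock's bits live in the parent
  void add_waiter(unsigned mask, std::function<void()> c) { parent->add_waiter(mask << wait_offset, std::move(c)); }
  void finish_waiters(unsigned mask) { parent->finish_waiting(mask << wait_offset); }
};

int main() {
  WaiterHub obj;
  ToyLock dnlock{&obj, 0}, filelock{&obj, 4};   // two locks, one shared waiter list
  dnlock.add_waiter(ToyLock::WAIT_RD, [] { std::cout << "dn lock readable\n"; });
  filelock.add_waiter(ToyLock::WAIT_RD, [] { std::cout << "file lock readable\n"; });
  filelock.finish_waiters(ToyLock::WAIT_RD);    // only the file lock's waiter fires
}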
- return s; - }; - bool is_stable() { - return state >= 0; - } - - - // gather set - const set& get_gather_set() { return gather_set; } - void init_gather() { - for (map::const_iterator p = parent->replicas_begin(); - p != parent->replicas_end(); - ++p) - gather_set.insert(p->first); - } - bool is_gathering() { return !gather_set.empty(); } - bool is_gathering(int i) { - return gather_set.count(i); - } - void clear_gather() { - gather_set.clear(); - } - void remove_gather(int i) { - gather_set.erase(i); - } - - // ref counting - bool is_rdlocked() { return num_rdlock > 0; } - int get_rdlock() { - if (!num_rdlock) parent->get(MDSCacheObject::PIN_LOCK); - return ++num_rdlock; - } - int put_rdlock() { - assert(num_rdlock>0); - --num_rdlock; - if (num_rdlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - return num_rdlock; - } - int get_num_rdlocks() { return num_rdlock; } - - void get_xlock(MDRequest *who) { - assert(xlock_by == 0); - parent->get(MDSCacheObject::PIN_LOCK); - xlock_by = who; - } - void put_xlock() { - assert(xlock_by); - parent->put(MDSCacheObject::PIN_LOCK); - xlock_by = 0; - } - bool is_xlocked() { return xlock_by ? true:false; } - bool is_xlocked_by_other(MDRequest *mdr) { - return is_xlocked() && xlock_by != mdr; - } - MDRequest *get_xlocked_by() { return xlock_by; } - bool is_used() { - return is_xlocked() || is_rdlocked(); - } - - // encode/decode - void _encode(bufferlist& bl) { - ::_encode(state, bl); - ::_encode(gather_set, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(state, bl, off); - ::_decode(gather_set, bl, off); - } - - - // simplelock specifics - int get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKR: - return LOCK_LOCK; - case LOCK_SYNC: - return LOCK_SYNC; - default: - assert(0); - } - return 0; - } - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - /** replicate_relax - * called on first replica creation. - */ - void replicate_relax() { - assert(parent->is_auth()); - assert(!parent->is_replicated()); - if (state == LOCK_LOCK && !is_used()) - state = LOCK_SYNC; - } - bool remove_replica(int from) { - if (is_gathering(from)) { - remove_gather(from); - if (!is_gathering()) - return true; - } - return false; - } - bool do_import(int from, int to) { - if (!is_stable()) { - remove_gather(from); - remove_gather(to); - if (!is_gathering()) - return true; - } - if (!is_stable() && !is_gathering()) - return true; - return false; - } - - bool can_rdlock(MDRequest *mdr) { - //if (state == LOCK_LOCK && mdr && xlock_by == mdr) return true; // xlocked by me. (actually, is this right?) - //if (state == LOCK_LOCK && !xlock_by && parent->is_auth()) return true; - return (state == LOCK_SYNC); - } - bool can_xlock(MDRequest *mdr) { - if (mdr && xlock_by == mdr) { - assert(state == LOCK_LOCK); - return true; // auth or replica! xlocked by me. 
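can_rdlock and can_xlock reduce to a small compatibility check against the current state: reads are allowed in sync, while an exclusive lock needs the locked state and either no holder or the same request already holding it. A condensed stand-alone version that ignores gathering, auth, and replication (SketchLock is invented):

// Condensed read/exclusive compatibility rules; SketchLock is a stand-in.
#include <cassert>
#include <iostream>

enum class SketchState { Sync, Lock };

struct SketchLock {
  SketchState state = SketchState::Sync;
  const void* xlocked_by = nullptr;                 // the request holding the xlock, if any

  bool can_rdlock() const { return state == SketchState::Sync; }
  bool can_xlock(const void* req) const {
    if (req && xlocked_by == req) return true;      // already mine
    return state == SketchState::Lock && !xlocked_by;
  }
};

int main() {
  SketchLock l;
  int request;                                      // stands in for a request pointer
  std::cout << std::boolalpha << "rdlock ok in sync: " << l.can_rdlock() << "\n";
  l.state = SketchState::Lock;
  assert(l.can_xlock(&request));
  l.xlocked_by = &request;
  std::cout << "re-xlock by holder: " << l.can_xlock(&request) << "\n";
  std::cout << "rdlock ok in lock: " << l.can_rdlock() << "\n";
}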
- } - if (state == LOCK_LOCK && parent->is_auth() && !xlock_by) return true; - return false; - } - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKR); - else - return false; - } - - virtual void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_simplelock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - out << ")"; - } -}; - -inline ostream& operator<<(ostream& out, SimpleLock& l) -{ - l.print(out); - return out; -} - - -#endif diff --git a/branches/sage/pgs/mds/events/EAnchor.h b/branches/sage/pgs/mds/events/EAnchor.h deleted file mode 100644 index 5980d40c17cd9..0000000000000 --- a/branches/sage/pgs/mds/events/EAnchor.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EANCHOR_H -#define __MDS_EANCHOR_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../Anchor.h" - -class EAnchor : public LogEvent { -protected: - int op; - inodeno_t ino; - version_t atid; - vector trace; - version_t version; // anchor table version - int reqmds; - - public: - EAnchor() : LogEvent(EVENT_ANCHOR) { } - EAnchor(int o, inodeno_t i, version_t v, int rm) : - LogEvent(EVENT_ANCHOR), - op(o), ino(i), atid(0), version(v), reqmds(rm) { } - EAnchor(int o, version_t a, version_t v) : - LogEvent(EVENT_ANCHOR), - op(o), atid(a), version(v), reqmds(-1) { } - - void set_trace(vector& t) { trace = t; } - vector& get_trace() { return trace; } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&op, sizeof(op)); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&atid, sizeof(atid)); - ::_encode(trace, bl); - bl.append((char*)&version, sizeof(version)); - bl.append((char*)&reqmds, sizeof(reqmds)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - ::_decode(trace, bl, off); - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - bl.copy(off, sizeof(reqmds), (char*)&reqmds); - off += sizeof(reqmds); - } - - void print(ostream& out) { - out << "EAnchor " << get_anchor_opname(op); - if (ino) out << " " << ino; - if (atid) out << " atid " << atid; - if (version) out << " v " << version; - if (reqmds >= 0) out << " by mds" << reqmds; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/pgs/mds/events/EAnchorClient.h b/branches/sage/pgs/mds/events/EAnchorClient.h deleted file mode 100644 index 7cd36453e17b9..0000000000000 --- a/branches/sage/pgs/mds/events/EAnchorClient.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you 
can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EANCHORCLIENT_H -#define __MDS_EANCHORCLIENT_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../Anchor.h" - -class EAnchorClient : public LogEvent { -protected: - int op; - version_t atid; - - public: - EAnchorClient() : LogEvent(EVENT_ANCHORCLIENT) { } - EAnchorClient(int o, version_t at) : - LogEvent(EVENT_ANCHORCLIENT), - op(o), atid(at) { } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&op, sizeof(op)); - bl.append((char*)&atid, sizeof(atid)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - bl.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - } - - void print(ostream& out) { - out << "EAnchorClient " << get_anchor_opname(op); - if (atid) out << " atid " << atid; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/pgs/mds/events/EExport.h b/branches/sage/pgs/mds/events/EExport.h deleted file mode 100644 index 29d8e0df08f49..0000000000000 --- a/branches/sage/pgs/mds/events/EExport.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EEXPORT_H -#define __EEXPORT_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EExport : public LogEvent { -public: - EMetaBlob metablob; // exported dir -protected: - dirfrag_t base; - set bounds; - -public: - EExport(CDir *dir) : LogEvent(EVENT_EXPORT), - base(dir->dirfrag()) { - metablob.add_dir_context(dir); - } - EExport() : LogEvent(EVENT_EXPORT) { } - - set &get_bounds() { return bounds; } - - void print(ostream& out) { - out << "EExport " << base << " " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - metablob._encode(bl); - bl.append((char*)&base, sizeof(base)); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/pgs/mds/events/EImportFinish.h b/branches/sage/pgs/mds/events/EImportFinish.h deleted file mode 100644 index 0ee6d71ffdc13..0000000000000 --- a/branches/sage/pgs/mds/events/EImportFinish.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTFINISH_H -#define __EIMPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EImportFinish : public LogEvent { - protected: - dirfrag_t base; // imported dir - bool success; - - public: - EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), - base(dir->dirfrag()), - success(s) { } - EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } - - void print(ostream& out) { - out << "EImportFinish " << base; - if (success) - out << " success"; - else - out << " failed"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&base, sizeof(base)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/pgs/mds/events/EImportStart.h b/branches/sage/pgs/mds/events/EImportStart.h deleted file mode 100644 index aa1902576542d..0000000000000 --- a/branches/sage/pgs/mds/events/EImportStart.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EIMPORTSTART_H -#define __EIMPORTSTART_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EImportStart : public LogEvent { -protected: - dirfrag_t base; - list bounds; - - public: - EMetaBlob metablob; - - EImportStart(dirfrag_t di, - list& b) : LogEvent(EVENT_IMPORTSTART), - base(di), bounds(b) { } - EImportStart() : LogEvent(EVENT_IMPORTSTART) { } - - void print(ostream& out) { - out << "EImportStart " << base << " " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&base, sizeof(base)); - metablob._encode(bl); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - metablob._decode(bl, off); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/pgs/mds/events/EMetaBlob.h b/branches/sage/pgs/mds/events/EMetaBlob.h deleted file mode 100644 index e20b2b794b59d..0000000000000 --- a/branches/sage/pgs/mds/events/EMetaBlob.h +++ /dev/null @@ -1,445 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EMETABLOB_H -#define __MDS_EMETABLOB_H - -#include -#include -using namespace std; - -#include "../CInode.h" -#include "../CDir.h" -#include "../CDentry.h" - -class MDS; - -/* - * a bunch of metadata in the journal - */ - -/* notes: - * - * - make sure you adjust the inode.version for any modified inode you - * journal. CDir and CDentry maintain a projected_version, but CInode - * doesn't, since the journaled inode usually has to be modifed - * manually anyway (to delay the change in the MDS's cache until after - * it is journaled). - * - */ - - -class EMetaBlob { - - /* fullbit - a regular dentry + inode - */ - struct fullbit { - string dn; // dentry - version_t dnv; - inode_t inode; // if it's not - string symlink; - bool dirty; - - fullbit(const string& d, version_t v, inode_t& i, bool dr) : dn(d), dnv(v), inode(i), dirty(dr) { } - fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { } - fullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&inode, sizeof(inode)); - if (inode.is_symlink()) - ::_encode(symlink, bl); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - void print(ostream& out) { - out << " fullbit dn " << dn << " dnv " << dnv - << " inode " << inode.ino - << " dirty=" << dirty << endl; - } - }; - - /* remotebit - a dentry + remote inode link (i.e. just an ino) - */ - struct remotebit { - string dn; - version_t dnv; - inodeno_t ino; - bool dirty; - - remotebit(const string& d, version_t v, inodeno_t i, bool dr) : dn(d), dnv(v), ino(i), dirty(dr) { } - remotebit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - void print(ostream& out) { - out << " remotebit dn " << dn << " dnv " << dnv - << " ino " << ino - << " dirty=" << dirty << endl; - } - }; - - /* - * nullbit - a null dentry - */ - struct nullbit { - string dn; - version_t dnv; - bool dirty; - nullbit(const string& d, version_t v, bool dr) : dn(d), dnv(v), dirty(dr) { } - nullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - void print(ostream& out) { - out << " nullbit dn " << dn << " dnv " << dnv - << " dirty=" << dirty << endl; - } - }; - - - /* dirlump - contains metadata for any dir we have contents for. 
- */ - struct dirlump { - static const int STATE_COMPLETE = (1<<1); - static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! - - version_t dirv; - int state; - int nfull, nremote, nnull; - - private: - bufferlist dnbl; - bool dn_decoded; - list dfull; - list dremote; - list dnull; - - public: - dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } - - bool is_complete() { return state & STATE_COMPLETE; } - void mark_complete() { state |= STATE_COMPLETE; } - bool is_dirty() { return state & STATE_DIRTY; } - void mark_dirty() { state |= STATE_DIRTY; } - - list &get_dfull() { return dfull; } - list &get_dremote() { return dremote; } - list &get_dnull() { return dnull; } - - void print(dirfrag_t dirfrag, ostream& out) { - out << "dirlump " << dirfrag << " dirv " << dirv - << " state " << state - << " num " << nfull << "/" << nremote << "/" << nnull - << endl; - _decode_bits(); - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->print(out); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->print(out); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->print(out); - } - - void _encode_bits() { - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->_encode(dnbl); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->_encode(dnbl); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->_encode(dnbl); - } - void _decode_bits() { - if (dn_decoded) return; - int off = 0; - for (int i=0; i lump_order; - map lump_map; - - // anchor transactions included in this update. - list atids; - - // inode dirlocks (scatterlocks) i've touched. - map dirty_inode_mtimes; - - // ino's i've allocated - list allocated_inos; - version_t alloc_tablev; - - // inodes i've destroyed. 
- list< pair > truncated_inodes; - - // idempotent op(s) - list client_reqs; - - public: - void print(ostream& out) { - for (list::iterator p = lump_order.begin(); - p != lump_order.end(); - ++p) { - lump_map[*p].print(*p, out); - } - } - - void add_client_req(metareqid_t r) { - client_reqs.push_back(r); - } - - void add_anchor_transaction(version_t atid) { - atids.push_back(atid); - } - - void add_dirtied_inode_mtime(inodeno_t ino, utime_t ctime) { - dirty_inode_mtimes[ino] = ctime; - } - - void add_allocated_ino(inodeno_t ino, version_t tablev) { - allocated_inos.push_back(ino); - alloc_tablev = tablev; - } - - void add_inode_truncate(const inode_t& inode, off_t newsize) { - truncated_inodes.push_back(pair(inode, newsize)); - } - - void add_null_dentry(CDentry *dn, bool dirty) { - // add the dir - dirlump& lump = add_dir(dn->get_dir(), false); - - lump.nnull++; - if (dirty) - lump.get_dnull().push_front(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - else - lump.get_dnull().push_back(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - } - - void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino=0) { - if (!rino) - rino = dn->get_remote_ino(); - - dirlump& lump = add_dir(dn->get_dir(), false); - - lump.nremote++; - if (dirty) - lump.get_dremote().push_front(remotebit(dn->get_name(), - dn->get_projected_version(), - rino, - dirty)); - else - lump.get_dremote().push_back(remotebit(dn->get_name(), - dn->get_projected_version(), - rino, - dirty)); - } - - // return remote pointer to to-be-journaled inode - inode_t *add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { - if (!in) in = dn->get_inode(); - - dirlump& lump = add_dir(dn->get_dir(), false); - - lump.nfull++; - if (dirty) { - lump.get_dfull().push_front(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->symlink, - dirty)); - if (pi) lump.get_dfull().front().inode = *pi; - return &lump.get_dfull().front().inode; - } else { - lump.get_dfull().push_back(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->symlink, - dirty)); - if (pi) lump.get_dfull().back().inode = *pi; - return &lump.get_dfull().back().inode; - } - } - - // convenience: primary or remote? figure it out. - inode_t *add_dentry(CDentry *dn, bool dirty) { - // primary or remote - if (dn->is_remote()) { - add_remote_dentry(dn, dirty); - return 0; - } else if (dn->is_null()) { - add_null_dentry(dn, dirty); - return 0; - } - assert(dn->is_primary()); - return add_primary_dentry(dn, dirty); - } - - - dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) { - dirfrag_t df = dir->dirfrag(); - if (lump_map.count(df) == 0) { - lump_order.push_back(df); - lump_map[df].dirv = dir->get_projected_version(); - } - dirlump& l = lump_map[df]; - if (complete) l.mark_complete(); - if (dirty) l.mark_dirty(); - return l; - } - - static const int TO_AUTH_SUBTREE_ROOT = 0; // default. - static const int TO_ROOT = 1; - - void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT) { - // already have this dir? (we must always add in order) - if (lump_map.count(dir->dirfrag())) - return; - - // stop at subtree root? 
- if (mode == TO_AUTH_SUBTREE_ROOT && - dir->is_subtree_root() && dir->is_auth()) - return; - - // stop at root/stray - CInode *diri = dir->get_inode(); - if (!diri->get_parent_dn()) - return; - - // add parent dn - CDentry *parent = diri->get_parent_dn(); - add_dir_context(parent->get_dir(), mode); - add_dentry(parent, false); - } - - - // encoding - - void _encode(bufferlist& bl) { - int32_t n = lump_map.size(); - ::_encode(n, bl); - for (list::iterator i = lump_order.begin(); - i != lump_order.end(); - ++i) { - dirfrag_t dirfrag = *i; - ::_encode(dirfrag, bl); - lump_map[*i]._encode(bl); - } - ::_encode(atids, bl); - ::_encode(dirty_inode_mtimes, bl); - ::_encode(allocated_inos, bl); - if (!allocated_inos.empty()) - ::_encode(alloc_tablev, bl); - ::_encode(truncated_inodes, bl); - ::_encode(client_reqs, bl); - } - void _decode(bufferlist& bl, int& off) { - int32_t n; - ::_decode(n, bl, off); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EOPEN_H -#define __MDS_EOPEN_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EOpen : public LogEvent { -public: - EMetaBlob metablob; - list inos; - - EOpen() : LogEvent(EVENT_OPEN) { } - EOpen(CInode *in) : LogEvent(EVENT_OPEN) { - add_inode(in); - } - void print(ostream& out) { - out << "EOpen " << metablob; - } - - void add_inode(CInode *in) { - inos.push_back(in->ino()); - metablob.add_primary_dentry(in->get_parent_dn(), false); - } - - void encode_payload(bufferlist& bl) { - ::_encode(inos, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(inos, bl, off); - metablob._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/pgs/mds/events/EPurgeFinish.h b/branches/sage/pgs/mds/events/EPurgeFinish.h deleted file mode 100644 index b0c727bff305b..0000000000000 --- a/branches/sage/pgs/mds/events/EPurgeFinish.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
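
EMetaBlob::_encode above writes the lump count first, then each (dirfrag, dirlump) pair in lump_order, and finally the auxiliary lists (anchor tids, dirty mtimes, allocated inos, truncated inodes, client requests). The sketch below shows only the count-prefixed, order-preserving part of that layout, with simplified stand-in types (Frag, Lump); it illustrates the framing, not the real dirfrag_t/dirlump encoding.

#include <cstdint>
#include <cstddef>
#include <cstring>
#include <list>
#include <map>
#include <vector>

// Simplified stand-ins; both are trivially copyable, so raw byte copies are
// safe in this sketch.
struct Frag {
  uint64_t ino;
  uint32_t frag;
  bool operator<(const Frag& o) const {
    return ino < o.ino || (ino == o.ino && frag < o.frag);
  }
};
struct Lump { uint64_t dirv; int32_t state; };

static void put(std::vector<char>& bl, const void* p, std::size_t n) {
  bl.insert(bl.end(), (const char*)p, (const char*)p + n);
}
static void get(const std::vector<char>& bl, int& off, void* p, std::size_t n) {
  std::memcpy(p, &bl[off], n);
  off += (int)n;
}

// Count first, then each (key, lump) pair in journal order.
void encode_lumps(const std::list<Frag>& order,
                  const std::map<Frag, Lump>& lumps,
                  std::vector<char>& bl) {
  int32_t n = (int32_t)lumps.size();
  put(bl, &n, sizeof(n));
  for (const Frag& df : order) {
    put(bl, &df, sizeof(df));
    put(bl, &lumps.at(df), sizeof(Lump));
  }
}

void decode_lumps(const std::vector<char>& bl, int& off,
                  std::list<Frag>& order, std::map<Frag, Lump>& lumps) {
  int32_t n;
  get(bl, off, &n, sizeof(n));
  for (int32_t i = 0; i < n; i++) {
    Frag df; Lump l;
    get(bl, off, &df, sizeof(df));
    get(bl, off, &l, sizeof(l));
    order.push_back(df);                // rebuild the same journal order
    lumps[df] = l;
  }
}
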
- * - */ - -#ifndef __EPURGE_H -#define __EPURGE_H - -#include -#include "config.h" -#include "include/types.h" - -class EPurgeFinish : public LogEvent { - protected: - inodeno_t ino; - off_t newsize; - - public: - EPurgeFinish(inodeno_t i, off_t s) : - LogEvent(EVENT_PURGEFINISH), - ino(i), newsize(s) { } - EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { } - - void print(ostream& out) { - out << "purgefinish " << ino << " to " << newsize; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&newsize, sizeof(newsize)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(newsize), (char*)&newsize); - off += sizeof(newsize); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/pgs/mds/events/ESession.h b/branches/sage/pgs/mds/events/ESession.h deleted file mode 100644 index 953eff2d0e01c..0000000000000 --- a/branches/sage/pgs/mds/events/ESession.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_ESESSION_H -#define __MDS_ESESSION_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" - -class ESession : public LogEvent { - protected: - entity_inst_t client_inst; - bool open; // open or close - version_t cmapv; // client map version - - public: - ESession() : LogEvent(EVENT_SESSION) { } - ESession(entity_inst_t inst, bool o, version_t v) : - LogEvent(EVENT_SESSION), - client_inst(inst), - open(o), - cmapv(v) { - } - - void encode_payload(bufferlist& bl) { - ::_encode(client_inst, bl); - ::_encode(open, bl); - ::_encode(cmapv, bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(client_inst, bl, off); - ::_decode(open, bl, off); - ::_decode(cmapv, bl, off); - } - - - void print(ostream& out) { - if (open) - out << "ESession " << client_inst << " open cmapv " << cmapv; - else - out << "ESession " << client_inst << " close cmapv " << cmapv; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/pgs/mds/events/ESlaveUpdate.h b/branches/sage/pgs/mds/events/ESlaveUpdate.h deleted file mode 100644 index 51539234d4617..0000000000000 --- a/branches/sage/pgs/mds/events/ESlaveUpdate.h +++ /dev/null @@ -1,70 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESLAVEUPDATE_H -#define __MDS_ESLAVEUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class ESlaveUpdate : public LogEvent { -public: - const static int OP_PREPARE = 1; - const static int OP_COMMIT = 2; - const static int OP_ROLLBACK = 3; - - string type; - metareqid_t reqid; - int master; - int op; // prepare, commit, abort - EMetaBlob metablob; - - ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } - ESlaveUpdate(const char *s, metareqid_t ri, int mastermds, int o) : - LogEvent(EVENT_SLAVEUPDATE), - type(s), - reqid(ri), - master(mastermds), - op(o) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << " " << op; - out << " " << reqid; - out << " for mds" << master; - out << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - ::_encode(reqid, bl); - ::_encode(master, bl); - ::_encode(op, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - ::_decode(reqid, bl, off); - ::_decode(master, bl, off); - ::_decode(op, bl, off); - metablob._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/pgs/mds/events/EString.h b/branches/sage/pgs/mds/events/EString.h deleted file mode 100644 index bb414160ca690..0000000000000 --- a/branches/sage/pgs/mds/events/EString.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ESTRING_H -#define __ESTRING_H - -#include -#include -using namespace std; - -#include "../LogEvent.h" - -// generic log event -class EString : public LogEvent { - protected: - string event; - - public: - EString(string e) : - LogEvent(EVENT_STRING) { - event = e; - } - EString() : - LogEvent(EVENT_STRING) { - } - - void decode_payload(bufferlist& bl, int& off) { - event = bl.c_str() + off; - off += event.length() + 1; - } - void encode_payload(bufferlist& bl) { - bl.append(event.c_str(), event.length()+1); - } - - void print(ostream& out) { - out << '"' << event << '"'; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/pgs/mds/events/ESubtreeMap.h b/branches/sage/pgs/mds/events/ESubtreeMap.h deleted file mode 100644 index 3997a6b5686c1..0000000000000 --- a/branches/sage/pgs/mds/events/ESubtreeMap.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESUBTREEMAP_H -#define __MDS_ESUBTREEMAP_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class ESubtreeMap : public LogEvent { -public: - EMetaBlob metablob; - map > subtrees; - - ESubtreeMap() : LogEvent(EVENT_SUBTREEMAP) { } - - void print(ostream& out) { - out << "subtree_map " << subtrees.size() << " subtrees " - << metablob; - } - - void encode_payload(bufferlist& bl) { - metablob._encode(bl); - ::_encode(subtrees, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - ::_decode(subtrees, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/pgs/mds/events/EUpdate.h b/branches/sage/pgs/mds/events/EUpdate.h deleted file mode 100644 index 02c5d3ece2569..0000000000000 --- a/branches/sage/pgs/mds/events/EUpdate.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EUPDATE_H -#define __MDS_EUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EUpdate : public LogEvent { -public: - EMetaBlob metablob; - string type; - - EUpdate() : LogEvent(EVENT_UPDATE) { } - EUpdate(const char *s) : LogEvent(EVENT_UPDATE), - type(s) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - metablob._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/pgs/mds/journal.cc b/branches/sage/pgs/mds/journal.cc deleted file mode 100644 index e169cee1f51b7..0000000000000 --- a/branches/sage/pgs/mds/journal.cc +++ /dev/null @@ -1,1007 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "events/EString.h" -#include "events/ESubtreeMap.h" -#include "events/ESession.h" - -#include "events/EMetaBlob.h" - -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EOpen.h" - -#include "events/EPurgeFinish.h" - -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "events/EAnchor.h" -#include "events/EAnchorClient.h" - -#include "MDS.h" -#include "MDLog.h" -#include "MDCache.h" -#include "Server.h" -#include "Migrator.h" -#include "AnchorTable.h" -#include "AnchorClient.h" -#include "IdAllocator.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// EString - -bool EString::has_expired(MDS *mds) { - dout(10) << "EString.has_expired " << event << endl; - return true; -} -void EString::expire(MDS *mds, Context *c) -{ - dout(10) << "EString.expire " << event << endl; -} -void EString::replay(MDS *mds) -{ - dout(10) << "EString.replay " << event << endl; -} - - - -// ----------------------- -// EMetaBlob - -/* - * we need to ensure that a journaled item has either - * - * - been safely committed to its dirslice. - * - * - has been safely exported. i.e., authority().first != us. - * in particular, auth of is not enough, we need to - * wait for . - * - * note that this check is overly conservative, in that we'll - * try to flush the dir again if we reimport the subtree, even though - * later journal entries contain the same dirty data (from the import). - * - */ -bool EMetaBlob::has_expired(MDS *mds) -{ - // examine dirv's for my lumps - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CDir *dir = mds->mdcache->get_dirfrag(lp->first); - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->authority().first != mds->get_nodeid()) { - dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - continue; // not our problem - } - if (dir->get_committed_version() >= lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv - << " for " << *dir << endl; - continue; // yay - } - - if (dir->is_ambiguous_dir_auth()) { - CDir *ex = mds->mdcache->get_subtree_root(dir); - if (ex->is_exporting()) { - // wait until export is acked (logged on remote) and committed (logged locally) - dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir - << ", exporting on " << *ex << endl; - return false; - } else { - dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir - << ", importing on " << *ex << endl; - return false; - } - } - - if (dir->get_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv - << " for " << *dir << endl; - return false; // not committed. - } - - assert(0); // i goofed the logic - } - - // have my anchortable ops committed? 
- for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - if (!mds->anchorclient->has_committed(*p)) { - dout(10) << "EMetaBlob.has_expired anchor transaction " << *p - << " not yet acked" << endl; - return false; - } - } - - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - if (in) { - if (in->inode.ctime == p->second && - in->dirlock.is_updated()) { - dout(10) << "EMetaBlob.has_expired dirty mtime dirlock hasn't flushed on " << *in << endl; - return false; - } - } - } - - // allocated_ios - if (!allocated_inos.empty()) { - version_t cv = mds->idalloc->get_committed_version(); - if (cv < alloc_tablev) { - dout(10) << "EMetaBlob.has_expired idalloc tablev " << alloc_tablev << " > " << cv - << ", still dirty" << endl; - return false; // still dirty - } else { - dout(10) << "EMetaBlob.has_expired idalloc tablev " << alloc_tablev << " <= " << cv - << ", already flushed" << endl; - } - } - - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - if (mds->mdcache->is_purging(p->first.ino, p->second)) { - dout(10) << "EMetaBlob.has_expired still purging inode " << p->first.ino - << " to " << p->second << endl; - return false; - } - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - dout(10) << "EMetaBlob.has_expired still have completed request " << *p - << endl; - return false; - } - } - - - return true; // all dirlumps expired, etc. -} - - -void EMetaBlob::expire(MDS *mds, Context *c) -{ - map commit; // dir -> version needed - list waitfor_export; - list waitfor_import; - int ncommit = 0; - - // examine dirv's for my lumps - // make list of dir slices i need to commit - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CDir *dir = mds->mdcache->get_dirfrag(lp->first); - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->authority().first != mds->get_nodeid()) { - dout(10) << "EMetaBlob.expire not auth, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - continue; // not our problem - } - if (dir->get_committed_version() >= lp->second.dirv) { - dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv - << " on " << *dir << endl; - continue; // yay - } - - if (dir->is_ambiguous_dir_auth()) { - CDir *ex = mds->mdcache->get_subtree_root(dir); - if (ex->is_exporting()) { - // wait until export is acked (logged on remote) and committed (logged locally) - dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir - << ", waiting for export finish on " << *ex << endl; - waitfor_export.push_back(ex); - continue; - } else { - dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir - << ", waiting for import finish on " << *ex << endl; - waitfor_import.push_back(ex); - continue; - } - } - if (dir->get_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv - << ", committing " << *dir << endl; - commit[dir] = MAX(commit[dir], lp->second.dirv); - ncommit++; - continue; - } - - assert(0); // hrm - } - - // set up gather context - C_Gather *gather = new C_Gather(c); - - // do or wait for exports and commits - for (map::iterator p = commit.begin(); - p != commit.end(); - ++p) { - if (p->first->can_auth_pin()) - p->first->commit(p->second, gather->new_sub()); - 
else - // pbly about to export|split|merge. - // just wait for it to unfreeze, then retry - p->first->add_waiter(CDir::WAIT_AUTHPINNABLE, gather->new_sub()); - } - for (list::iterator p = waitfor_export.begin(); - p != waitfor_export.end(); - ++p) - mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); - for (list::iterator p = waitfor_import.begin(); - p != waitfor_import.end(); - ++p) - (*p)->add_waiter(CDir::WAIT_IMPORTED, gather->new_sub()); - - - // have my anchortable ops committed? - for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - if (!mds->anchorclient->has_committed(*p)) { - dout(10) << "EMetaBlob.expire anchor transaction " << *p - << " not yet acked, waiting" << endl; - mds->anchorclient->wait_for_ack(*p, gather->new_sub()); - } - } - - // dirtied inode mtimes - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - if (in) { - if (in->inode.ctime == p->second && - in->dirlock.is_updated()) { - dout(10) << "EMetaBlob.expire dirty mtime dirlock hasn't flushed, waiting on " - << *in << endl; - in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } - } - } - - // allocated_inos - if (!allocated_inos.empty()) { - version_t cv = mds->idalloc->get_committed_version(); - if (cv < alloc_tablev) { - dout(10) << "EMetaBlob.expire saving idalloc table, need " << alloc_tablev << endl; - mds->idalloc->save(gather->new_sub(), alloc_tablev); - } - } - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - if (mds->mdcache->is_purging(p->first.ino, p->second)) { - dout(10) << "EMetaBlob.expire waiting for purge of inode " << p->first.ino - << " to " << p->second << endl; - mds->mdcache->wait_for_purge(p->first.ino, p->second, gather->new_sub()); - } - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - dout(10) << "EMetaBlob.expire waiting on completed request " << *p - << endl; - mds->clientmap.add_trim_waiter(*p, gather->new_sub()); - } - } - -} - -void EMetaBlob::replay(MDS *mds) -{ - dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << endl; - - // walk through my dirs (in order!) - for (list::iterator lp = lump_order.begin(); - lp != lump_order.end(); - ++lp) { - dout(10) << "EMetaBlob.replay dir " << *lp << endl; - dirlump &lump = lump_map[*lp]; - - // the dir - CDir *dir = mds->mdcache->get_dirfrag(*lp); - if (!dir) { - // hmm. do i have the inode? 
- CInode *diri = mds->mdcache->get_inode((*lp).ino); - if (!diri) { - if ((*lp).ino == MDS_INO_ROOT) { - diri = mds->mdcache->create_root_inode(); - dout(10) << "EMetaBlob.replay created root " << *diri << endl; - } else if (MDS_INO_IS_STRAY((*lp).ino)) { - int whose = (*lp).ino - MDS_INO_STRAY_OFFSET; - diri = mds->mdcache->create_stray_inode(whose); - dout(10) << "EMetaBlob.replay created stray " << *diri << endl; - } else { - assert(0); - } - } - // create the dirfrag - dir = diri->get_or_open_dirfrag(mds->mdcache, (*lp).frag); - - if ((*lp).ino < MDS_INO_BASE) - mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNKNOWN); - - dout(10) << "EMetaBlob.replay added dir " << *dir << endl; - } - dir->set_version( lump.dirv ); - if (lump.is_dirty()) - dir->_mark_dirty(); - if (lump.is_complete()) - dir->mark_complete(); - - // decode bits - lump._decode_bits(); - - // full dentry+inode pairs - for (list::iterator p = lump.get_dfull().begin(); - p != lump.get_dfull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_dentry( p->dn ); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << endl; - } else { - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *dn << endl; - } - - CInode *in = mds->mdcache->get_inode(p->inode.ino); - if (!in) { - in = new CInode(mds->mdcache); - in->inode = p->inode; - if (in->inode.is_symlink()) in->symlink = p->symlink; - mds->mdcache->add_inode(in); - dir->link_inode(dn, in); - if (p->dirty) in->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *in << endl; - } else { - if (in->get_parent_dn()) { - dout(10) << "EMetaBlob.replay unlinking " << *in << endl; - in->get_parent_dn()->get_dir()->unlink_inode(in->get_parent_dn()); - } - in->inode = p->inode; - if (in->inode.is_symlink()) in->symlink = p->symlink; - dir->link_inode(dn, in); - if (p->dirty) in->_mark_dirty(); - dout(10) << "EMetaBlob.replay linked " << *in << endl; - } - } - - // remote dentries - for (list::iterator p = lump.get_dremote().begin(); - p != lump.get_dremote().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_dentry(p->dn, p->ino); - dn->set_remote_ino(p->ino); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << endl; - } else { - if (!dn->is_null()) { - dout(10) << "EMetaBlob.replay unlinking " << *dn << endl; - dir->unlink_inode(dn); - } - dn->set_remote_ino(p->ino); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *dn << endl; - } - } - - // null dentries - for (list::iterator p = lump.get_dnull().begin(); - p != lump.get_dnull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_dentry(p->dn); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << endl; - } else { - if (!dn->is_null()) { - dout(10) << "EMetaBlob.replay unlinking " << *dn << endl; - dir->unlink_inode(dn); - } - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *dn << endl; - } - } - } - - // anchor transactions - for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - dout(10) << "EMetaBlob.replay noting anchor transaction " << *p << endl; - mds->anchorclient->got_journaled_agree(*p); - } - - // dirtied inode mtimes - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = 
dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - dout(10) << "EMetaBlob.replay setting dirlock updated flag on " << *in << endl; - in->dirlock.set_updated(); - } - - // allocated_inos - if (!allocated_inos.empty()) { - if (mds->idalloc->get_version() >= alloc_tablev) { - dout(10) << "EMetaBlob.replay idalloc tablev " << alloc_tablev - << " <= table " << mds->idalloc->get_version() << endl; - } else { - for (list::iterator p = allocated_inos.begin(); - p != allocated_inos.end(); - ++p) { - dout(10) << " EMetaBlob.replay idalloc " << *p << " tablev " << alloc_tablev - << " - 1 == table " << mds->idalloc->get_version() << endl; - assert(alloc_tablev-1 == mds->idalloc->get_version()); - - inodeno_t ino = mds->idalloc->alloc_id(); - assert(ino == *p); // this should match. - - assert(alloc_tablev == mds->idalloc->get_version()); - } - } - } - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - dout(10) << "EMetaBlob.replay will purge truncated inode " << p->first.ino - << " to " << p->second << endl; - mds->mdcache->add_recovered_purge(p->first, p->second); - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) - mds->clientmap.add_completed_request(*p); -} - -// ----------------------- -// ESession -bool ESession::has_expired(MDS *mds) -{ - if (mds->clientmap.get_committed() >= cmapv) { - dout(10) << "ESession.has_expired newer clientmap " << mds->clientmap.get_committed() - << " >= " << cmapv << " has committed" << endl; - return true; - } else if (mds->clientmap.get_committing() >= cmapv) { - dout(10) << "ESession.has_expired newer clientmap " << mds->clientmap.get_committing() - << " >= " << cmapv << " is still committing" << endl; - return false; - } else { - dout(10) << "ESession.has_expired clientmap " << mds->clientmap.get_version() - << " > " << cmapv << ", need to save" << endl; - return false; - } -} - -void ESession::expire(MDS *mds, Context *c) -{ - dout(10) << "ESession.expire saving clientmap" << endl; - mds->clientmap.save(c, cmapv); -} - -void ESession::replay(MDS *mds) -{ - if (mds->clientmap.get_version() >= cmapv) { - dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() - << " >= " << cmapv << ", noop" << endl; - - // hrm, this isn't very pretty. - if (!open) - mds->clientmap.trim_completed_requests(client_inst.name.num(), 0); - - } else { - dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() - << " < " << cmapv << endl; - assert(mds->clientmap.get_version() + 1 == cmapv); - if (open) { - mds->clientmap.open_session(client_inst); - } else { - mds->clientmap.close_session(client_inst.name.num()); - mds->clientmap.trim_completed_requests(client_inst.name.num(), 0); - } - mds->clientmap.reset_projected(); // make it follow version. 
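
ESession::replay (and the idalloc path just above it) applies a journaled change only when the in-memory table is exactly one version behind the event, which is what makes replay idempotent. A minimal sketch of that version gate, using a toy table type rather than the real ClientMap, looks like this:

#include <cassert>
#include <cstdint>

struct ToyTable {
  uint64_t version = 0;
  int open_sessions = 0;
  void apply_open() { open_sessions++; version++; }
};

// Returns true if the event was applied, false if the table already covers it.
bool replay_session_open(ToyTable& t, uint64_t journaled_v) {
  if (t.version >= journaled_v)
    return false;                        // newer table state; event is a no-op
  assert(t.version + 1 == journaled_v);  // events must apply in version order
  t.apply_open();
  return true;
}
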
- } -} - - - -// ----------------------- -// EAnchor - -bool EAnchor::has_expired(MDS *mds) -{ - version_t cv = mds->anchortable->get_committed_version(); - if (cv < version) { - dout(10) << "EAnchor.has_expired v " << version << " > " << cv - << ", still dirty" << endl; - return false; // still dirty - } else { - dout(10) << "EAnchor.has_expired v " << version << " <= " << cv - << ", already flushed" << endl; - return true; // already flushed - } -} - -void EAnchor::expire(MDS *mds, Context *c) -{ - dout(10) << "EAnchor.expire saving anchor table" << endl; - mds->anchortable->save(c); -} - -void EAnchor::replay(MDS *mds) -{ - if (mds->anchortable->get_version() >= version) { - dout(10) << "EAnchor.replay event " << version - << " <= table " << mds->anchortable->get_version() << endl; - } else { - dout(10) << " EAnchor.replay event " << version - << " - 1 == table " << mds->anchortable->get_version() << endl; - assert(version-1 == mds->anchortable->get_version()); - - switch (op) { - // anchortable - case ANCHOR_OP_CREATE_PREPARE: - mds->anchortable->create_prepare(ino, trace, reqmds); - break; - case ANCHOR_OP_DESTROY_PREPARE: - mds->anchortable->destroy_prepare(ino, reqmds); - break; - case ANCHOR_OP_UPDATE_PREPARE: - mds->anchortable->update_prepare(ino, trace, reqmds); - break; - case ANCHOR_OP_COMMIT: - mds->anchortable->commit(atid); - break; - - default: - assert(0); - } - - assert(version == mds->anchortable->get_version()); - } -} - - -// EAnchorClient - -bool EAnchorClient::has_expired(MDS *mds) -{ - return true; -} - -void EAnchorClient::expire(MDS *mds, Context *c) -{ - assert(0); -} - -void EAnchorClient::replay(MDS *mds) -{ - dout(10) << " EAnchorClient.replay op " << op << " atid " << atid << endl; - - switch (op) { - // anchorclient - case ANCHOR_OP_ACK: - mds->anchorclient->got_journaled_ack(atid); - break; - - default: - assert(0); - } -} - - -// ----------------------- -// EUpdate - -bool EUpdate::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EUpdate::expire(MDS *mds, Context *c) -{ - metablob.expire(mds, c); -} - -void EUpdate::replay(MDS *mds) -{ - metablob.replay(mds); -} - - -// ------------------------ -// EOpen - -bool EOpen::has_expired(MDS *mds) -{ - for (list::iterator p = inos.begin(); p != inos.end(); ++p) { - CInode *in = mds->mdcache->get_inode(*p); - if (in && - in->is_any_caps() && - !(in->last_open_journaled > get_start_off() || - in->last_open_journaled == 0)) { - dout(10) << "EOpen.has_expired still refer to caps on " << *in << endl; - return false; - } - } - return true; -} - -void EOpen::expire(MDS *mds, Context *c) -{ - dout(10) << "EOpen.expire " << endl; - - if (mds->mdlog->is_capped()) { - dout(0) << "uh oh, log is capped, but i have unexpired opens." 
<< endl; - assert(0); - } - - for (list::iterator p = inos.begin(); p != inos.end(); ++p) { - CInode *in = mds->mdcache->get_inode(*p); - if (!in) continue; - if (!in->is_any_caps()) continue; - - dout(10) << "EOpen.expire " << in->ino() - << " last_open_journaled " << in->last_open_journaled << endl; - - mds->server->queue_journal_open(in); - } - mds->server->add_journal_open_waiter(c); - mds->server->maybe_journal_opens(); -} - -void EOpen::replay(MDS *mds) -{ - dout(10) << "EOpen.replay " << endl; - metablob.replay(mds); -} - - -// ----------------------- -// ESlaveUpdate - -bool ESlaveUpdate::has_expired(MDS *mds) -{ - switch (op) { - case ESlaveUpdate::OP_PREPARE: - if (mds->mdcache->ambiguous_slave_updates.count(reqid) == 0) { - dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master - << ": haven't yet seen commit|rollback" << endl; - return false; - } - else if (mds->mdcache->ambiguous_slave_updates[reqid]) { - dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master - << ": committed, checking metablob" << endl; - bool exp = metablob.has_expired(mds); - if (exp) - mds->mdcache->ambiguous_slave_updates.erase(reqid); - return exp; - } - else { - dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master - << ": aborted" << endl; - mds->mdcache->ambiguous_slave_updates.erase(reqid); - return true; - } - - case ESlaveUpdate::OP_COMMIT: - case ESlaveUpdate::OP_ROLLBACK: - if (mds->mdcache->waiting_for_slave_update_commit.count(reqid)) { - dout(10) << "ESlaveUpdate.has_expired " - << ((op == ESlaveUpdate::OP_COMMIT) ? "commit ":"rollback ") - << reqid << " for mds" << master - << ": noting commit, kicking prepare waiter" << endl; - mds->mdcache->ambiguous_slave_updates[reqid] = (op == ESlaveUpdate::OP_COMMIT); - mds->mdcache->waiting_for_slave_update_commit[reqid]->finish(0); - delete mds->mdcache->waiting_for_slave_update_commit[reqid]; - mds->mdcache->waiting_for_slave_update_commit.erase(reqid); - } else { - dout(10) << "ESlaveUpdate.has_expired " - << ((op == ESlaveUpdate::OP_COMMIT) ? "commit ":"rollback ") - << reqid << " for mds" << master - << ": no prepare waiter, ignoring" << endl; - } - return true; - - default: - assert(0); - } -} - -void ESlaveUpdate::expire(MDS *mds, Context *c) -{ - assert(op == ESlaveUpdate::OP_PREPARE); - - if (mds->mdcache->ambiguous_slave_updates.count(reqid) == 0) { - // wait - dout(10) << "ESlaveUpdate.expire prepare " << reqid << " for mds" << master - << ": waiting for commit|rollback" << endl; - mds->mdcache->waiting_for_slave_update_commit[reqid] = c; - } else { - // we committed.. 
expire the metablob - assert(mds->mdcache->ambiguous_slave_updates[reqid] == true); - dout(10) << "ESlaveUpdate.expire prepare " << reqid << " for mds" << master - << ": waiting for metablob to expire" << endl; - metablob.expire(mds, c); - } -} - -void ESlaveUpdate::replay(MDS *mds) -{ - switch (op) { - case ESlaveUpdate::OP_PREPARE: - // FIXME: horribly inefficient copy; EMetaBlob needs a swap() or something - dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds" << master - << ": saving blob for later commit" << endl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0); - mds->mdcache->uncommitted_slave_updates[master][reqid] = metablob; - break; - - case ESlaveUpdate::OP_COMMIT: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": applying previously saved blob" << endl; - mds->mdcache->uncommitted_slave_updates[master][reqid].replay(mds); - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - } else { - dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": ignoring, no previously saved blob" << endl; - } - break; - - case ESlaveUpdate::OP_ROLLBACK: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": discarding previously saved blob" << endl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid)); - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - } else { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": ignoring, no previously saved blob" << endl; - } - break; - - default: - assert(0); - } -} - - -// ----------------------- -// ESubtreeMap - -bool ESubtreeMap::has_expired(MDS *mds) -{ - if (mds->mdlog->get_last_subtree_map_offset() > get_start_off()) { - dout(10) << "ESubtreeMap.has_expired -- there's a newer map" << endl; - return true; - } else if (mds->mdlog->is_capped()) { - dout(10) << "ESubtreeMap.has_expired -- log is capped, allowing map to expire" << endl; - return true; - } else { - dout(10) << "ESubtreeMap.has_expired -- not until there's a newer map written" - << " (" << get_start_off() << " >= " << mds->mdlog->get_last_subtree_map_offset() << ")" - << endl; - return false; - } -} - -void ESubtreeMap::expire(MDS *mds, Context *c) -{ - dout(10) << "ESubtreeMap.has_expire -- waiting for a newer map to be written (or for shutdown)" << endl; - mds->mdlog->add_subtree_map_expire_waiter(c); -} - -void ESubtreeMap::replay(MDS *mds) -{ - if (mds->mdcache->is_subtrees()) { - dout(10) << "ESubtreeMap.replay -- ignoring, already have import map" << endl; - } else { - dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << endl; - - // first, stick the spanning tree in my cache - //metablob.print(cout); - metablob.replay(mds); - - // restore import/export maps - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid()); - } - } - mds->mdcache->show_subtrees(); -} - - - - -// ----------------------- -// EPurgeFinish - - -bool EPurgeFinish::has_expired(MDS *mds) -{ - return true; -} - -void EPurgeFinish::expire(MDS *mds, Context *c) -{ - assert(0); -} - -void EPurgeFinish::replay(MDS *mds) -{ - dout(10) << "EPurgeFinish.replay " << ino << " to " << newsize << endl; - 
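
ESlaveUpdate::replay above is a small two-phase protocol: a prepare stashes its metablob keyed by (master mds, request id), a later commit applies and drops it, and a rollback just drops it. A self-contained sketch of that bookkeeping, with Blob standing in for EMetaBlob, might look like:

#include <cstdint>
#include <map>
#include <string>

using Blob  = std::string;                     // placeholder for EMetaBlob
using ReqId = uint64_t;

struct SlaveUpdateLog {
  std::map<int, std::map<ReqId, Blob>> uncommitted;  // master -> reqid -> blob

  void replay_prepare(int master, ReqId reqid, const Blob& b) {
    uncommitted[master][reqid] = b;            // hold until its fate is known
  }
  bool replay_commit(int master, ReqId reqid) {
    auto it = uncommitted[master].find(reqid);
    if (it == uncommitted[master].end())
      return false;                            // no saved blob: nothing to apply
    // apply it->second to the cache here
    uncommitted[master].erase(it);
    return true;
  }
  void replay_rollback(int master, ReqId reqid) {
    uncommitted[master].erase(reqid);          // discard the prepared blob
  }
};
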
mds->mdcache->remove_recovered_purge(ino, newsize); -} - - - - - -// ========================================================================= - -// ----------------------- -// EExport - -bool EExport::has_expired(MDS *mds) -{ - CDir *dir = mds->mdcache->get_dirfrag(base); - if (!dir) return true; - if (!mds->mdcache->migrator->is_exporting(dir)) - return true; - dout(10) << "EExport.has_expired still exporting " << *dir << endl; - return false; -} - -void EExport::expire(MDS *mds, Context *c) -{ - CDir *dir = mds->mdcache->get_dirfrag(base); - assert(dir); - assert(mds->mdcache->migrator->is_exporting(dir)); - - dout(10) << "EExport.expire waiting for export of " << *dir << endl; - mds->mdcache->migrator->add_export_finish_waiter(dir, c); -} - -void EExport::replay(MDS *mds) -{ - dout(10) << "EExport.replay " << base << endl; - metablob.replay(mds); - - CDir *dir = mds->mdcache->get_dirfrag(base); - assert(dir); - - set realbounds; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = mds->mdcache->get_dirfrag(*p); - assert(bd); - realbounds.insert(bd); - } - - // adjust auth away - mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, pair(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)); - mds->mdcache->try_subtree_merge(dir); -} - - -// ----------------------- -// EImportStart - -bool EImportStart::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EImportStart::expire(MDS *mds, Context *c) -{ - dout(10) << "EImportStart.expire " << base << endl; - metablob.expire(mds, c); -} - -void EImportStart::replay(MDS *mds) -{ - dout(10) << "EImportStart.replay " << base << endl; - metablob.replay(mds); - - // put in ambiguous import list - mds->mdcache->add_ambiguous_import(base, bounds); -} - -// ----------------------- -// EImportFinish - -bool EImportFinish::has_expired(MDS *mds) -{ - return true; -} -void EImportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // shouldn't ever happen -} - -void EImportFinish::replay(MDS *mds) -{ - if (mds->mdcache->have_ambiguous_import(base)) { - dout(10) << "EImportFinish.replay " << base << " success=" << success << endl; - if (success) - mds->mdcache->finish_ambiguous_import(base); - else - mds->mdcache->cancel_ambiguous_import(base); - } else { - dout(10) << "EImportFinish.replay " << base << " success=" << success - << ", predates my subtree_map start point, ignoring" - << endl; - // verify that? 
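
EImportStart and EImportFinish replay as a pair: the start registers the imported subtree as ambiguous, and the finish either claims it or cancels it, ignoring finishes that predate the subtree map. A simplified sketch of that tracker, with DirFrag standing in for dirfrag_t, is shown below.

#include <cstdint>
#include <list>
#include <map>

using DirFrag = uint64_t;
using Bounds  = std::list<DirFrag>;

struct ImportTracker {
  std::map<DirFrag, Bounds> ambiguous;        // base dirfrag -> bound dirfrags

  void replay_import_start(DirFrag base, const Bounds& b) {
    ambiguous[base] = b;                      // outcome unknown until finish
  }
  void replay_import_finish(DirFrag base, bool success) {
    auto it = ambiguous.find(base);
    if (it == ambiguous.end())
      return;                                 // predates our subtree map; ignore
    if (success) {
      // claim authority over the imported subtree here
    } else {
      // drop it; the exporter retained authority
    }
    ambiguous.erase(it);
  }
};
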
- } -} - - - - - diff --git a/branches/sage/pgs/mds/mdstypes.h b/branches/sage/pgs/mds/mdstypes.h deleted file mode 100644 index 92299115c9f2e..0000000000000 --- a/branches/sage/pgs/mds/mdstypes.h +++ /dev/null @@ -1,584 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __MDSTYPES_H -#define __MDSTYPES_H - - -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#include "common/DecayCounter.h" -#include "include/Context.h" - -#include - -#include "include/frag.h" - - -#define MDS_PORT_MAIN 0 -#define MDS_PORT_SERVER 1 -#define MDS_PORT_CACHE 2 -#define MDS_PORT_LOCKER 3 -#define MDS_PORT_STORE 4 -#define MDS_PORT_BALANCER 5 -#define MDS_PORT_MIGRATOR 6 -#define MDS_PORT_RENAMER 7 -#define MDS_PORT_ANCHORCLIENT 10 -#define MDS_PORT_ANCHORTABLE 11 - -#define MAX_MDS 0x100 - -#define MDS_INO_ROOT 1 -#define MDS_INO_PGTABLE 2 -#define MDS_INO_ANCHORTABLE 3 -#define MDS_INO_LOG_OFFSET 0x100 -#define MDS_INO_IDS_OFFSET 0x200 -#define MDS_INO_CLIENTMAP_OFFSET 0x300 -#define MDS_INO_STRAY_OFFSET 0x400 -#define MDS_INO_BASE 0x1000 - -#define MDS_INO_STRAY(x) (MDS_INO_STRAY_OFFSET+((unsigned)x)) -#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < MDS_INO_STRAY_OFFSET+MAX_MDS) - -#define MDS_TRAVERSE_FORWARD 1 -#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. -#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. -#define MDS_TRAVERSE_FAIL 4 - - -struct metareqid_t { - int client; - tid_t tid; - metareqid_t() : client(-1), tid(0) {} - metareqid_t(int c, tid_t t) : client(c), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const metareqid_t& r) { - return out << "client" << r.client << ":" << r.tid; -} - -inline bool operator==(const metareqid_t& l, const metareqid_t& r) { - return (l.client == r.client) && (l.tid == r.tid); -} -inline bool operator!=(const metareqid_t& l, const metareqid_t& r) { - return (l.client != r.client) || (l.tid != r.tid); -} -inline bool operator<(const metareqid_t& l, const metareqid_t& r) { - return (l.client < r.client) || - (l.client == r.client && l.tid < r.tid); -} -inline bool operator<=(const metareqid_t& l, const metareqid_t& r) { - return (l.client < r.client) || - (l.client == r.client && l.tid <= r.tid); -} -inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); } -inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const metareqid_t &r) const { - hash H; - return H(r.client) ^ H(r.tid); - } - }; -} - - -// inode caps info for client reconnect -struct inode_caps_reconnect_t { - int32_t wanted; - int32_t issued; - off_t size; - utime_t mtime, atime; - - inode_caps_reconnect_t() {} - inode_caps_reconnect_t(int w, int i) : - wanted(w), issued(i), size(0) {} - inode_caps_reconnect_t(int w, int i, off_t sz, utime_t mt, utime_t at) : - wanted(w), issued(i), size(sz), mtime(mt), atime(at) {} -}; - - -// ================================================================ -// dir frag - -struct dirfrag_t { - inodeno_t ino; - frag_t frag; - - dirfrag_t() { } - dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { } -}; - -inline ostream& operator<<(ostream& out, const dirfrag_t& df) { - return out << df.ino << "#" << df.frag; -} -inline bool operator<(dirfrag_t l, dirfrag_t r) { - if (l.ino < r.ino) return true; - if (l.ino == r.ino && l.frag < r.frag) return true; - 
return false; -} -inline bool operator==(dirfrag_t l, dirfrag_t r) { - return l.ino == r.ino && l.frag == r.frag; -} - - -// ================================================================ - -/* meta_load_t - * hierarchical load for an inode/dir and it's children - */ -#define META_POP_IRD 0 -#define META_POP_IWR 1 -#define META_POP_DWR 2 -//#define META_POP_LOG 3 -//#define META_POP_FDIR 4 -//#define META_POP_CDIR 4 -#define META_NPOP 3 - -class meta_load_t { - public: - DecayCounter pop[META_NPOP]; - - double meta_load() { - return pop[META_POP_IRD].get() + 2*pop[META_POP_IWR].get(); - } - - void take(meta_load_t& other) { - for (int i=0; i"; -} - - -inline meta_load_t& operator-=(meta_load_t& l, meta_load_t& r) -{ - for (int i=0; i"; -} - -/* -inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) -{ - l.root_pop += r.root_pop; - l.req_rate += r.req_rate; - l.queue_len += r.queue_len; - return l; -} - -inline mds_load_t operator/( mds_load_t& a, double d ) -{ - mds_load_t r; - r.root_pop = a.root_pop / d; - r.req_rate = a.req_rate / d; - r.queue_len = a.queue_len / d; - return r; -} -*/ - - - - -// ================================================================ - -//#define MDS_PIN_REPLICATED 1 -//#define MDS_STATE_AUTH (1<<0) - -class MLock; -class SimpleLock; - -class MDSCacheObject; - -// -- authority delegation -- -// directory authority types -// >= 0 is the auth mds -#define CDIR_AUTH_PARENT -1 // default -#define CDIR_AUTH_UNKNOWN -2 -#define CDIR_AUTH_DEFAULT pair(-1, -2) -#define CDIR_AUTH_UNDEF pair(-2, -2) -//#define CDIR_AUTH_ROOTINODE pair( 0, -2) - - - -// print hack -struct mdsco_db_line_prefix { - MDSCacheObject *object; - mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} -}; -ostream& operator<<(ostream& out, mdsco_db_line_prefix o); - -// printer -ostream& operator<<(ostream& out, MDSCacheObject &o); - -class MDSCacheObjectInfo { -public: - inodeno_t ino; - dirfrag_t dirfrag; - string dname; - - void _encode(bufferlist& bl) const { - ::_encode(ino, bl); - ::_encode(dirfrag, bl); - ::_encode(dname, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(dirfrag, bl, off); - ::_decode(dname, bl, off); - } -}; - - -class MDSCacheObject { - public: - // -- pins -- - const static int PIN_REPLICATED = 1000; - const static int PIN_DIRTY = 1001; - const static int PIN_LOCK = -1002; - const static int PIN_REQUEST = -1003; - const static int PIN_WAITER = 1004; - const static int PIN_DIRTYSCATTERED = 1005; - - const char *generic_pin_name(int p) { - switch (p) { - case PIN_REPLICATED: return "replicated"; - case PIN_DIRTY: return "dirty"; - case PIN_LOCK: return "lock"; - case PIN_REQUEST: return "request"; - case PIN_WAITER: return "waiter"; - case PIN_DIRTYSCATTERED: return "dirtyscattered"; - default: assert(0); - } - } - - // -- state -- - const static int STATE_AUTH = (1<<30); - const static int STATE_DIRTY = (1<<29); - const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy - - // -- wait -- - const static int WAIT_SINGLEAUTH = (1<<30); - const static int WAIT_AUTHPINNABLE = (1<<29); - - - // ============================================ - // cons - public: - MDSCacheObject() : - state(0), - ref(0), - replica_nonce(0) {} - virtual ~MDSCacheObject() {} - - // printing - virtual void print(ostream& out) = 0; - virtual ostream& print_db_line_prefix(ostream& out) { - return out << "mdscacheobject(" << this << ") "; - } - - // -------------------------------------------- - // state - protected: - 
unsigned state; // state bits - - public: - unsigned get_state() { return state; } - void state_clear(unsigned mask) { state &= ~mask; } - void state_set(unsigned mask) { state |= mask; } - unsigned state_test(unsigned mask) { return state & mask; } - void state_reset(unsigned s) { state = s; } - - bool is_auth() { return state_test(STATE_AUTH); } - bool is_dirty() { return state_test(STATE_DIRTY); } - bool is_clean() { return !is_dirty(); } - bool is_rejoining() { return state_test(STATE_REJOINING); } - - // -------------------------------------------- - // authority - virtual pair authority() = 0; - bool is_ambiguous_auth() { - return authority().second != CDIR_AUTH_UNKNOWN; - } - - // -------------------------------------------- - // pins -protected: - int ref; // reference count - multiset ref_set; - - public: - int get_num_ref() { return ref; } - bool is_pinned_by(int by) { return ref_set.count(by); } - multiset& get_ref_set() { return ref_set; } - virtual const char *pin_name(int by) = 0; - - virtual void last_put() {} - virtual void bad_put(int by) { - assert(ref_set.count(by) > 0); - assert(ref > 0); - } - void put(int by) { - if (ref == 0 || ref_set.count(by) == 0) { - bad_put(by); - } else { - ref--; - ref_set.erase(ref_set.find(by)); - assert(ref == (int)ref_set.size()); - if (ref == 0) - last_put(); - } - } - - virtual void first_get() {} - virtual void bad_get(int by) { - assert(by < 0 || ref_set.count(by) == 0); - assert(0); - } - void get(int by) { - if (by >= 0 && ref_set.count(by)) { - bad_get(by); - } else { - if (ref == 0) - first_get(); - ref++; - ref_set.insert(by); - assert(ref == (int)ref_set.size()); - } - } - - void print_pin_set(ostream& out) { - multiset::iterator it = ref_set.begin(); - while (it != ref_set.end()) { - out << " " << pin_name(*it); - int last = *it; - int c = 1; - do { - it++; - if (it == ref_set.end()) break; - } while (*it == last); - if (c > 1) - out << "*" << c; - } - } - - - // -------------------------------------------- - // auth pins - virtual bool can_auth_pin() = 0; - virtual void auth_pin() = 0; - virtual void auth_unpin() = 0; - - - // -------------------------------------------- - // replication - protected: - map replicas; // [auth] mds -> nonce - int replica_nonce; // [replica] defined on replica - - public: - bool is_replicated() { return !replicas.empty(); } - bool is_replica(int mds) { return replicas.count(mds); } - int num_replicas() { return replicas.size(); } - int add_replica(int mds) { - if (replicas.count(mds)) - return ++replicas[mds]; // inc nonce - if (replicas.empty()) - get(PIN_REPLICATED); - return replicas[mds] = 1; - } - void add_replica(int mds, int nonce) { - if (replicas.empty()) - get(PIN_REPLICATED); - replicas[mds] = nonce; - } - int get_replica_nonce(int mds) { - assert(replicas.count(mds)); - return replicas[mds]; - } - void remove_replica(int mds) { - assert(replicas.count(mds)); - replicas.erase(mds); - if (replicas.empty()) - put(PIN_REPLICATED); - } - void clear_replicas() { - if (!replicas.empty()) - put(PIN_REPLICATED); - replicas.clear(); - } - map::iterator replicas_begin() { return replicas.begin(); } - map::iterator replicas_end() { return replicas.end(); } - const map& get_replicas() { return replicas; } - void list_replicas(set& ls) { - for (map::const_iterator p = replicas.begin(); - p != replicas.end(); - ++p) - ls.insert(p->first); - } - - int get_replica_nonce() { return replica_nonce;} - void set_replica_nonce(int n) { replica_nonce = n; } - - - // 
--------------------------------------------- - // waiting - protected: - multimap waiting; - - public: - bool is_waiter_for(int mask) { - return waiting.count(mask) > 0; // FIXME: not quite right. - } - virtual void add_waiter(int mask, Context *c) { - if (waiting.empty()) - get(PIN_WAITER); - waiting.insert(pair(mask, c)); - pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) - << "add_waiter " << hex << mask << dec << " " << c - << " on " << *this - << endl; - - } - virtual void take_waiting(int mask, list& ls) { - if (waiting.empty()) return; - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) - << "take_waiting mask " << hex << mask << dec << " took " << it->second - << " tag " << it->first - << " on " << *this - << endl; - waiting.erase(it++); - } else { - pdout(10,g_conf.debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second - << " tag " << it->first - << " on " << *this - << endl; - it++; - } - } - if (waiting.empty()) - put(PIN_WAITER); - } - void finish_waiting(int mask, int result = 0) { - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); - } - - - // --------------------------------------------- - // locking - // noop unless overloaded. - virtual SimpleLock* get_lock(int type) { assert(0); return 0; } - virtual void set_object_info(MDSCacheObjectInfo &info) { assert(0); } - virtual void encode_lock_state(int type, bufferlist& bl) { assert(0); } - virtual void decode_lock_state(int type, bufferlist& bl) { assert(0); } - virtual void finish_lock_waiters(int type, int mask, int r=0) { assert(0); } - virtual void add_lock_waiter(int type, int mask, Context *c) { assert(0); } - virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; } - - - // --------------------------------------------- - // ordering - virtual bool is_lt(const MDSCacheObject *r) const = 0; - struct ptr_lt { - bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { - return l->is_lt(r); - } - }; - -}; - -inline ostream& operator<<(ostream& out, MDSCacheObject &o) { - o.print(out); - return out; -} - -inline ostream& operator<<(ostream& out, const MDSCacheObjectInfo &info) { - if (info.ino) return out << info.ino; - if (info.dname.length()) return out << info.dirfrag << "/" << info.dname; - return out << info.dirfrag; -} - -inline ostream& operator<<(ostream& out, mdsco_db_line_prefix o) { - o.object->print_db_line_prefix(out); - return out; -} - - -#endif diff --git a/branches/sage/pgs/messages/MAnchor.h b/branches/sage/pgs/messages/MAnchor.h deleted file mode 100644 index 6ceb8981244fa..0000000000000 --- a/branches/sage/pgs/messages/MAnchor.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MANCHORREQUEST_H -#define __MANCHORREQUEST_H - -#include - -#include "msg/Message.h" -#include "mds/Anchor.h" - - -class MAnchor : public Message { - int op; - inodeno_t ino; - vector trace; - version_t atid; // anchor table version. 
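The waiting multimap above keys each queued Context on a bitmask, and take_waiting() collects every callback whose tag intersects the requested mask, dropping entries while iterating with the erase-and-post-increment idiom. A self-contained sketch of that pattern follows; Callback and ToyWaiter are invented stand-ins for Context and the cache object, and the assumption that finished callbacks are deleted after finish() mirrors what finish_contexts does elsewhere in the tree.

#include <list>
#include <map>
#include <utility>

// Callback stands in for Context. Waiters are queued under a bitmask tag;
// take_waiting() moves out every callback whose tag intersects the mask.
struct Callback {
  virtual void finish(int r) = 0;
  virtual ~Callback() {}
};

class ToyWaiter {
  std::multimap<int, Callback*> waiting;   // mask -> callback
public:
  void add_waiter(int mask, Callback* c) {
    waiting.insert(std::make_pair(mask, c));
  }
  void take_waiting(int mask, std::list<Callback*>& out) {
    std::multimap<int, Callback*>::iterator it = waiting.begin();
    while (it != waiting.end()) {
      if (it->first & mask) {
        out.push_back(it->second);
        waiting.erase(it++);               // erase, then advance
      } else {
        ++it;
      }
    }
  }
  void finish_waiting(int mask, int result) {
    std::list<Callback*> done;
    take_waiting(mask, done);
    for (std::list<Callback*>::iterator p = done.begin(); p != done.end(); ++p) {
      (*p)->finish(result);
      delete *p;                           // assumes callbacks are owned here
    }
  }
};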
- - public: - MAnchor() {} - MAnchor(int o, inodeno_t i, version_t v=0) : - Message(MSG_MDS_ANCHOR), - op(o), ino(i), atid(v) { } - - virtual char *get_type_name() { return "anchor"; } - void print(ostream& o) { - o << "anchor(" << get_anchor_opname(op); - if (ino) o << " " << ino; - if (atid) o << " atid " << atid; - if (!trace.empty()) o << ' ' << trace; - o << ")"; - } - - void set_trace(vector& trace) { - this->trace = trace; - } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - version_t get_atid() { return atid; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - ::_decode(trace, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&atid, sizeof(atid)); - ::_encode(trace, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MCacheExpire.h b/branches/sage/pgs/messages/MCacheExpire.h deleted file mode 100644 index 015aa562038a7..0000000000000 --- a/branches/sage/pgs/messages/MCacheExpire.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCACHEEXPIRE_H -#define __MCACHEEXPIRE_H - -class MCacheExpire : public Message { - int from; - -public: - /* - group things by realm (auth delgation root), since that's how auth is determined. - that makes it less work to process when exports are in progress. 
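MAnchor's encode_payload()/decode_payload() above show the convention most messages in this directory follow for fixed-size fields: encoding appends the raw bytes of each field in order, and decoding copies them back out while advancing a running offset (variable-length members such as the trace vector go through ::_encode/::_decode instead). A simplified sketch of that convention, with std::string standing in for bufferlist and invented field names:

#include <cassert>
#include <cstring>
#include <string>

// ToyPayload stands in for bufferlist; ToyAnchorMsg for a message with only
// fixed-size, trivially copyable fields.
struct ToyPayload {
  std::string data;
  template <class T> void append(const T& v) {
    data.append(reinterpret_cast<const char*>(&v), sizeof(v));
  }
  template <class T> void copy_out(int& off, T& v) const {
    assert(off + (int)sizeof(v) <= (int)data.size());
    std::memcpy(&v, data.data() + off, sizeof(v));
    off += (int)sizeof(v);
  }
};

struct ToyAnchorMsg {
  int op;
  long long ino, atid;
  void encode(ToyPayload& p) const { p.append(op); p.append(ino); p.append(atid); }
  void decode(const ToyPayload& p) {
    int off = 0;                 // running offset, advanced by each copy_out
    p.copy_out(off, op);
    p.copy_out(off, ino);
    p.copy_out(off, atid);
  }
};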
- */ - struct realm { - map inodes; - map dirs; - map > dentries; - }; - map realms; - - int get_from() { return from; } - - MCacheExpire() {} - MCacheExpire(int f) : - Message(MSG_MDS_CACHEEXPIRE), - from(f) { } - - virtual char *get_type_name() { return "CEx";} - - void add_inode(dirfrag_t r, inodeno_t ino, int nonce) { - realms[r].inodes[ino] = nonce; - } - void add_dir(dirfrag_t r, dirfrag_t df, int nonce) { - realms[r].dirs[df] = nonce; - } - void add_dentry(dirfrag_t r, dirfrag_t df, const string& dn, int nonce) { - realms[r].dentries[df][dn] = nonce; - } - - void add_realm(dirfrag_t df, realm& r) { - realm& myr = realms[df]; - for (map::iterator p = r.inodes.begin(); - p != r.inodes.end(); - ++p) - myr.inodes[p->first] = p->second; - for (map::iterator p = r.dirs.begin(); - p != r.dirs.end(); - ++p) - myr.dirs[p->first] = p->second; - for (map >::iterator p = r.dentries.begin(); - p != r.dentries.end(); - ++p) - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - myr.dentries[p->first][q->first] = q->second; - } - - void decode_payload() { - int off = 0; - - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - - int nr; - payload.copy(off, sizeof(nr), (char*)&nr); - off += sizeof(nr); - - while (nr--) { - dirfrag_t r; - payload.copy(off, sizeof(r), (char*)&r); - off += sizeof(r); - - ::_decode(realms[r].inodes, payload, off); - ::_decode(realms[r].dirs, payload, off); - - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i::iterator q = realms.begin(); - q != realms.end(); - ++q) { - payload.append((char*)&q->first, sizeof(q->first)); - - ::_encode(q->second.inodes, payload); - ::_encode(q->second.dirs, payload); - - int n = q->second.dentries.size(); - payload.append((char*)&n, sizeof(n)); - for (map >::iterator p = q->second.dentries.begin(); - p != q->second.dentries.end(); - ++p) { - payload.append((char*)&p->first, sizeof(p->first)); - ::_encode(p->second, payload); - } - } - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MClientFileCaps.h b/branches/sage/pgs/messages/MClientFileCaps.h deleted file mode 100644 index c584fd63f5a1f..0000000000000 --- a/branches/sage/pgs/messages/MClientFileCaps.h +++ /dev/null @@ -1,109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTFILECAPS_H -#define __MCLIENTFILECAPS_H - -#include "msg/Message.h" -#include "mds/Capability.h" - -class MClientFileCaps : public Message { - public: - static const int OP_GRANT = 0; // mds->client grant. 
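As the comment above explains, MCacheExpire batches expired replicas per realm, i.e. per auth-delegation root, so a whole batch can be handled (or re-routed via add_realm) at once while a subtree export is in flight. A reduced sketch of that per-realm bookkeeping with invented stand-in types (ino_key, dirfrag_key, ToyRealm):

#include <map>
#include <string>
#include <utility>

typedef unsigned long long ino_key;                  // stands in for inodeno_t
typedef std::pair<ino_key, unsigned> dirfrag_key;    // stands in for dirfrag_t

// Expirations batched per realm (the auth delegation root they fall under).
struct ToyRealm {
  std::map<ino_key, int> inodes;                                // ino  -> nonce
  std::map<dirfrag_key, int> dirs;                              // frag -> nonce
  std::map<dirfrag_key, std::map<std::string, int> > dentries;  // frag -> name -> nonce
};

struct ToyCacheExpire {
  std::map<dirfrag_key, ToyRealm> realms;
  void add_inode(dirfrag_key realm, ino_key ino, int nonce) {
    realms[realm].inodes[ino] = nonce;
  }
  void add_dir(dirfrag_key realm, dirfrag_key df, int nonce) {
    realms[realm].dirs[df] = nonce;
  }
  void add_dentry(dirfrag_key realm, dirfrag_key df, const std::string& dn, int nonce) {
    realms[realm].dentries[df][dn] = nonce;
  }
};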
- static const int OP_ACK = 1; // client->mds ack (if prior grant was a recall) - static const int OP_RELEASE = 2; // mds closed the cap - static const int OP_STALE = 3; // mds has exported the cap - static const int OP_REAP = 4; // mds has imported the cap from get_mds() - static const char* get_opname(int op) { - switch (op) { - case OP_GRANT: return "grant"; - case OP_ACK: return "ack"; - case OP_RELEASE: return "release"; - case OP_STALE: return "stale"; - case OP_REAP: return "reap"; - default: assert(0); return 0; - } - } - - private: - int32_t op; - inode_t inode; - capseq_t seq; - int32_t caps; - int32_t wanted; - - int32_t mds; - - public: - inodeno_t get_ino() { return inode.ino; } - inode_t& get_inode() { return inode; } - int get_caps() { return caps; } - int get_wanted() { return wanted; } - long get_seq() { return seq; } - - // for cap migration - int get_mds() { return mds; } - int get_op() { return op; } - - void set_caps(int c) { caps = c; } - void set_wanted(int w) { wanted = w; } - - void set_mds(int m) { mds = m; } - void set_op(int s) { op = s; } - - MClientFileCaps() {} - MClientFileCaps(int op_, - inode_t& inode_, - long seq_, - int caps_, - int wanted_, - int mds_=0) : - Message(MSG_CLIENT_FILECAPS), - op(op_), - inode(inode_), - seq(seq_), - caps(caps_), - wanted(wanted_), - mds(mds_) { } - - char *get_type_name() { return "Cfcap";} - void print(ostream& out) { - out << "client_file_caps(" << get_opname(op) - << " " << inode.ino - << " seq " << seq - << " caps " << cap_string(caps) - << " wanted" << cap_string(wanted) - << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(seq, payload, off); - ::_decode(inode, payload, off); - ::_decode(caps, payload, off); - ::_decode(wanted, payload, off); - ::_decode(mds, payload, off); - } - void encode_payload() { - ::_encode(op, payload); - ::_encode(seq, payload); - ::_encode(inode, payload); - ::_encode(caps, payload); - ::_encode(wanted, payload); - ::_encode(mds, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MClientMount.h b/branches/sage/pgs/messages/MClientMount.h deleted file mode 100644 index d083d72833830..0000000000000 --- a/branches/sage/pgs/messages/MClientMount.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MCLIENTMOUNT_H -#define __MCLIENTMOUNT_H - -#include "msg/Message.h" - -class MClientMount : public Message { -public: - entity_addr_t addr; - - MClientMount() : Message(MSG_CLIENT_MOUNT) { } - MClientMount(entity_addr_t a) : - Message(MSG_CLIENT_MOUNT), - addr(a) { } - - char *get_type_name() { return "client_mount"; } - - void decode_payload() { - int off = 0; - ::_decode(addr, payload, off); - } - void encode_payload() { - ::_encode(addr, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MClientReconnect.h b/branches/sage/pgs/messages/MClientReconnect.h deleted file mode 100644 index bf1fbacd4b75c..0000000000000 --- a/branches/sage/pgs/messages/MClientReconnect.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTRECONNECT_H -#define __MCLIENTRECONNECT_H - -#include "msg/Message.h" -#include "mds/mdstypes.h" - -class MClientReconnect : public Message { -public: - map inode_caps; - map inode_path; - bool closed; // true if this session was closed by the client. - - MClientReconnect() : Message(MSG_CLIENT_RECONNECT), - closed(false) { } - - char *get_type_name() { return "client_reconnect"; } - void print(ostream& out) { - out << "client_reconnect(" << inode_caps.size() << " caps)"; - } - - void add_inode_caps(inodeno_t ino, - int wanted, int issued, - off_t sz, utime_t mt, utime_t at) { - inode_caps[ino] = inode_caps_reconnect_t(wanted, issued, sz, mt, at); - } - void add_inode_path(inodeno_t ino, const string& path) { - inode_path[ino] = path; - } - - void encode_payload() { - ::_encode(closed, payload); - ::_encode(inode_caps, payload); - ::_encode(inode_path, payload); - } - void decode_payload() { - int off = 0; - ::_decode(closed, payload, off); - ::_decode(inode_caps, payload, off); - ::_decode(inode_path, payload, off); - } - -}; - - -#endif diff --git a/branches/sage/pgs/messages/MClientReply.h b/branches/sage/pgs/messages/MClientReply.h deleted file mode 100644 index e88c31ca47400..0000000000000 --- a/branches/sage/pgs/messages/MClientReply.h +++ /dev/null @@ -1,294 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTREPLY_H -#define __MCLIENTREPLY_H - -#include "include/types.h" - -#include "MClientRequest.h" - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "mds/CDir.h" -#include "mds/CDentry.h" - -#include -using namespace std; - -class CInode; - -/*** - * - * MClientReply - container message for MDS reply to a client's MClientRequest - * - * key fields: - * long tid - transaction id, so the client can match up with pending request - * int result - error code, or fh if it was open - * - * for most requests: - * trace is a vector of InodeStat's tracing from root to the file/dir/whatever - * the operation referred to, so that the client can update it's info about what - * metadata lives on what MDS. - * - * for readdir replies: - * dir_contents is a vector of InodeStat*'s. - * - * that's mostly it, i think! - * - */ - -class InodeStat { - - public: - inode_t inode; - string symlink; // symlink content (if symlink) - fragtree_t dirfragtree; - - // mds distribution hints - map dirfrag_auth; - map > dirfrag_dist; - set dirfrag_rep; - - public: - InodeStat() {} - InodeStat(CInode *in, int whoami) : - inode(in->inode) - { - // inode.mask - inode.mask = INODE_MASK_BASE; - if (in->authlock.can_rdlock(0)) inode.mask |= INODE_MASK_AUTH; - if (in->linklock.can_rdlock(0)) inode.mask |= INODE_MASK_LINK; - if (in->filelock.can_rdlock(0)) inode.mask |= INODE_MASK_FILE; - - // symlink content? - if (in->is_symlink()) - symlink = in->symlink; - - // dirfragtree - dirfragtree = in->dirfragtree; - - // dirfrag info - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - CDir *dir = *p; - dirfrag_auth[dir->dirfrag().frag] = dir->get_dir_auth().first; - if (dir->is_auth()) - dir->get_dist_spec(dirfrag_dist[dir->dirfrag().frag], whoami); - if (dir->is_rep()) - dirfrag_rep.insert(dir->dirfrag().frag); - } - } - - void _encode(bufferlist &bl) { - ::_encode(inode, bl); - ::_encode(dirfrag_auth, bl); - ::_encode(dirfrag_dist, bl); - ::_encode(dirfrag_rep, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - } - - void _decode(bufferlist &bl, int& off) { - ::_decode(inode, bl, off); - ::_decode(dirfrag_auth, bl, off); - ::_decode(dirfrag_dist, bl, off); - ::_decode(dirfrag_rep, bl, off); - ::_decode(symlink, bl, off); - dirfragtree._decode(bl, off); - } -}; - - -class MClientReply : public Message { - // reply data - struct { - long tid; - int op; - int result; // error code - unsigned char file_caps; // for open - long file_caps_seq; - uint64_t file_data_version; // for client buffercache consistency - - int _num_trace_in; - int _dir_size; - } st; - - string path; - list trace_in; - list trace_dn; - - list dir_in; - list dir_dn; - - public: - long get_tid() { return st.tid; } - int get_op() { return st.op; } - - int get_result() { return st.result; } - const string& get_path() { return path; } - - inodeno_t get_ino() { return trace_in.back()->inode.ino; } - const inode_t& get_inode() { return trace_in.back()->inode; } - - const list& get_trace_in() { return trace_in; } - const list& get_trace_dn() { return trace_dn; } - - const list& get_dir_in() { return dir_in; } - const list& get_dir_dn() { return dir_dn; } - - unsigned char get_file_caps() { return st.file_caps; } - long get_file_caps_seq() { return st.file_caps_seq; } - uint64_t get_file_data_version() { return st.file_data_version; } - - void set_result(int r) { st.result = r; } - void set_file_caps(unsigned char c) { st.file_caps = c; } - void set_file_caps_seq(long s) { 
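Given the layout described in the comment above (a trace of InodeStats from the root down, with the referring dentry name between consecutive inodes, so there is one fewer name than inode), a client-side consumer might walk it roughly as follows. This is an illustrative sketch, not the actual client code; ToyInodeStat and walk_trace are invented.

#include <iostream>
#include <list>
#include <string>

// ToyInodeStat stands in for InodeStat. The first inode in the trace is the
// root and has no referring dentry, so trace_dn holds one fewer entry than
// trace_in; each name joins an inode to its parent, matching the order the
// reply's set_trace_dist()/encode_payload() produce.
struct ToyInodeStat { unsigned long long ino; };

void walk_trace(const std::list<ToyInodeStat>& trace_in,
                const std::list<std::string>& trace_dn) {
  std::list<ToyInodeStat>::const_iterator pin = trace_in.begin();
  std::list<std::string>::const_iterator  pdn = trace_dn.begin();
  std::string path = "/";
  for (; pin != trace_in.end(); ++pin) {
    if (pin != trace_in.begin()) {         // root has no referring dentry
      path += *pdn;
      path += "/";
      ++pdn;
    }
    std::cout << "would cache ino " << pin->ino << " at " << path << "\n";
  }
}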
st.file_caps_seq = s; } - void set_file_data_version(uint64_t v) { st.file_data_version = v; } - - MClientReply() {}; - MClientReply(MClientRequest *req, int result = 0) : - Message(MSG_CLIENT_REPLY) { - memset(&st, 0, sizeof(st)); - this->st.tid = req->get_tid(); - this->st.op = req->get_op(); - this->path = req->get_path(); - - this->st.result = result; - - st._dir_size = 0; - st._num_trace_in = 0; - } - virtual ~MClientReply() { - list::iterator it; - - for (it = trace_in.begin(); it != trace_in.end(); ++it) - delete *it; - for (it = dir_in.begin(); it != dir_in.end(); ++it) - delete *it; - } - virtual char *get_type_name() { return "creply"; } - void print(ostream& o) { - o << "creply(" << env.dst.name << "." << st.tid; - o << " = " << st.result; - if (st.result <= 0) - o << " " << strerror(-st.result); - o << ")"; - } - - // serialization - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - - _decode(path, payload, off); - - for (int i=0; i_decode(payload, off); - trace_in.push_back(ci); - } - - for (int i=0; i_decode(payload, off); - dir_in.push_back(ci); - string dn; - ::_decode(dn, payload, off); - dir_dn.push_back(dn); - } - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - _encode(path, payload); - - // trace - list::iterator pdn = trace_dn.begin(); - list::iterator pin; - for (pin = trace_in.begin(); - pin != trace_in.end(); - ++pin) { - if (pin != trace_in.begin()) { - ::_encode(*pdn, payload); - ++pdn; - } - (*pin)->_encode(payload); - } - - // dir contents - pdn = dir_dn.begin(); - for (pin = dir_in.begin(); - pin != dir_in.end(); - ++pin, ++pdn) { - (*pin)->_encode(payload); - ::_encode(*pdn, payload); - } - } - - // builders - /* - void add_dir_item(string& dn, InodeStat *in) { - dir_dn.push_back(dn); - dir_in.push_back(in); - ++st._dir_size; - }*/ - void take_dir_items(list& inls, - list& dnls, - int num) { - dir_in.swap(inls); - dir_dn.swap(dnls); - st._dir_size = num; - } - void copy_dir_items(const list& inls, - const list& dnls) { - list::const_iterator pdn = dnls.begin(); - list::const_iterator pin = inls.begin(); - while (pin != inls.end()) { - // copy! - InodeStat *i = new InodeStat; - *i = **pin; - dir_in.push_back(i); - dir_dn.push_back(*pdn); - ++pin; - ++pdn; - ++st._dir_size; - } - } - - void set_trace_dist(CInode *in, int whoami) { - st._num_trace_in = 0; - while (in) { - // add this inode to trace, along with referring dentry name - if (in->get_parent_dn()) - trace_dn.push_front(in->get_parent_dn()->get_name()); - trace_in.push_front(new InodeStat(in, whoami)); - ++st._num_trace_in; - - in = in->get_parent_inode(); - } - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MClientRequest.h b/branches/sage/pgs/messages/MClientRequest.h deleted file mode 100644 index 805bf562c062c..0000000000000 --- a/branches/sage/pgs/messages/MClientRequest.h +++ /dev/null @@ -1,315 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUEST_H -#define __MCLIENTREQUEST_H - -/** - * - * MClientRequest - container for a client METADATA request. created/sent by clients. 
- * can be forwarded around between MDS's. - * - * int client - the originating client - * long tid - transaction id, unique among requests for that client. probably just a counter! - * -> the MDS passes the Request to the Reply constructor, so this always matches. - * - * int op - the metadata op code. MDS_OP_RENAME, etc. - * int caller_uid, _gid - guess - * - * fixed size arguments are in a union. - * there's also a string argument, for e.g. symlink(). - * - */ - -#include "msg/Message.h" -#include "include/filepath.h" -#include "mds/mdstypes.h" - -#include -#include -#include -#include -#include - - -// metadata ops. -// >=1000 --> an update, non-idempotent (i.e. an update) -#define MDS_OP_STATFS 1 - -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_FSTAT 102 -#define MDS_OP_UTIME 1102 -#define MDS_OP_CHMOD 1104 -#define MDS_OP_CHOWN 1105 - -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 1201 -#define MDS_OP_LINK 1202 -#define MDS_OP_UNLINK 1203 -#define MDS_OP_RENAME 1204 - -#define MDS_OP_MKDIR 1220 -#define MDS_OP_RMDIR 1221 -#define MDS_OP_SYMLINK 1222 - -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 1306 -#define MDS_OP_FSYNC 307 - -#define MDS_OP_RELEASE 308 // used only by SyntheticClient op_dist thinger - - -class MClientRequest : public Message { - struct { - tid_t tid; - tid_t oldest_client_tid; - int num_fwd; - int retry_attempt; - inodeno_t mds_wants_replica_in_dirino; - - entity_inst_t client_inst; - - int op; - int caller_uid, caller_gid; - inodeno_t cwd_ino; - } st; - - // path arguments - filepath path; - string sarg; - - public: - // fixed size arguments. in a union. - // note: nothing with a constructor can go here; use underlying base - // types for _inodeno_t, _frag_t. - union { - struct { - int mask; - } stat; - struct { - _inodeno_t ino; - int mask; - } fstat; - struct { - _frag_t frag; - } readdir; - struct { - _utime_t mtime; - _utime_t atime; - } utime; - struct { - mode_t mode; - } chmod; - struct { - uid_t uid; - gid_t gid; - } chown; - struct { - mode_t mode; - } mknod; - struct { - mode_t mode; - } mkdir; - struct { - int flags; - mode_t mode; - } open; - struct { - _inodeno_t ino; // optional - off_t length; - } truncate; - struct { - _inodeno_t ino; - } fsync; - } args; - - // cons - MClientRequest() : Message(MSG_CLIENT_REQUEST) {} - MClientRequest(int op, entity_inst_t ci) : Message(MSG_CLIENT_REQUEST) { - memset(&st, 0, sizeof(st)); - memset(&args, 0, sizeof(args)); - this->st.op = op; - this->st.client_inst = ci; - } - - metareqid_t get_reqid() { - // FIXME: for now, assume clients always have 1 incarnation - return metareqid_t(st.client_inst.name.num(), st.tid); - } - - int get_open_file_mode() { - if (args.open.flags & O_LAZY) - return FILE_MODE_LAZY; - if (args.open.flags & O_WRONLY) - return FILE_MODE_W; - if (args.open.flags & O_RDWR) - return FILE_MODE_RW; - if (args.open.flags & O_APPEND) - return FILE_MODE_W; - return FILE_MODE_R; - } - bool open_file_mode_is_readonly() { - return get_open_file_mode() == FILE_MODE_R; - } - bool is_idempotent() { - if (st.op == MDS_OP_OPEN) - return open_file_mode_is_readonly(); - return (st.op < 1000); - } - bool auth_is_best() { - if (!is_idempotent()) return true; - if (st.op == MDS_OP_READDIR) return true; - return false; - } - bool follow_trailing_symlink() { - switch (st.op) { - case MDS_OP_LSTAT: - case MDS_OP_LINK: - case MDS_OP_UNLINK: - case MDS_OP_RENAME: - return false; - - case MDS_OP_STAT: - case MDS_OP_UTIME: - case MDS_OP_CHMOD: - case MDS_OP_CHOWN: - case MDS_OP_READDIR: - 
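The op-code table above encodes idempotency in the numbering itself: mutating operations were given codes >= 1000, so is_idempotent() only needs to special-case open, whose effect depends on its flags. A reduced sketch of that check (the op constants are copied from the #defines above; the real code also treats O_LAZY opens as non-read-only, which this sketch omits):

#include <fcntl.h>

static const int TOY_OP_STAT   = 100;    // read-only:  code < 1000
static const int TOY_OP_OPEN   = 301;    // special-cased by flags
static const int TOY_OP_RENAME = 1204;   // update:     code >= 1000

bool toy_is_idempotent(int op, int open_flags) {
  if (op == TOY_OP_OPEN)                  // only read-only opens are idempotent
    return (open_flags & (O_WRONLY | O_RDWR | O_APPEND)) == 0;
  return op < 1000;                       // updates were numbered >= 1000
}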
case MDS_OP_OPEN: - return true; - - default: - assert(0); - } - } - - - - // normal fields - void set_tid(tid_t t) { st.tid = t; } - void set_oldest_client_tid(tid_t t) { st.oldest_client_tid = t; } - void inc_num_fwd() { st.num_fwd++; } - void set_retry_attempt(int a) { st.retry_attempt = a; } - void set_path(string& p) { path.set_path(p); } - void set_path(const char *p) { path.set_path(p); } - void set_path(const filepath& fp) { path = fp; } - void set_caller_uid(int u) { st.caller_uid = u; } - void set_caller_gid(int g) { st.caller_gid = g; } - void set_sarg(string& arg) { this->sarg = arg; } - void set_sarg(const char *arg) { this->sarg = arg; } - void set_mds_wants_replica_in_dirino(inodeno_t dirino) { - st.mds_wants_replica_in_dirino = dirino; } - - void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } - const entity_inst_t& get_client_inst() { return st.client_inst; } - - int get_client() { return st.client_inst.name.num(); } - tid_t get_tid() { return st.tid; } - tid_t get_oldest_client_tid() { return st.oldest_client_tid; } - int get_num_fwd() { return st.num_fwd; } - int get_retry_attempt() { return st.retry_attempt; } - int get_op() { return st.op; } - int get_caller_uid() { return st.caller_uid; } - int get_caller_gid() { return st.caller_gid; } - //inodeno_t get_ino() { return st.ino; } - const string& get_path() { return path.get_path(); } - filepath& get_filepath() { return path; } - string& get_sarg() { return sarg; } - inodeno_t get_mds_wants_replica_in_dirino() { - return st.mds_wants_replica_in_dirino; } - - inodeno_t get_cwd_ino() { return st.cwd_ino ? st.cwd_ino:inodeno_t(MDS_INO_ROOT); } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - payload.copy(off, sizeof(args), (char*)&args); - off += sizeof(args); - path._decode(payload, off); - ::_decode(sarg, payload, off); - } - - void encode_payload() { - payload.append((char*)&st, sizeof(st)); - payload.append((char*)&args, sizeof(args)); - path._encode(payload); - ::_encode(sarg, payload); - } - - char *get_type_name() { return "creq"; } - void print(ostream& out) { - out << "clientreq(client" << get_client() - << "." 
<< get_tid() - << " "; - switch(get_op()) { - case MDS_OP_STATFS: - out << "statfs"; break; - - case MDS_OP_STAT: - out << "stat"; break; - case MDS_OP_LSTAT: - out << "lstat"; break; - case MDS_OP_FSTAT: - out << "fstat"; break; - case MDS_OP_UTIME: - out << "utime"; break; - case MDS_OP_CHMOD: - out << "chmod"; break; - case MDS_OP_CHOWN: - out << "chown"; break; - - case MDS_OP_READDIR: - out << "readdir"; break; - case MDS_OP_MKNOD: - out << "mknod"; break; - case MDS_OP_LINK: - out << "link"; break; - case MDS_OP_UNLINK: - out << "unlink"; break; - case MDS_OP_RENAME: - out << "rename"; break; - - case MDS_OP_MKDIR: - out << "mkdir"; break; - case MDS_OP_RMDIR: - out << "rmdir"; break; - case MDS_OP_SYMLINK: - out << "symlink"; break; - - case MDS_OP_OPEN: - out << "open"; break; - case MDS_OP_TRUNCATE: - out << "truncate"; break; - case MDS_OP_FSYNC: - out << "fsync"; break; - // case MDS_OP_RELEASE: - //out << "release"; break; - default: - out << "unknown=" << get_op(); - assert(0); - } - if (get_path().length()) - out << " " << get_path(); - if (get_sarg().length()) - out << " " << get_sarg(); - if (st.retry_attempt) - out << " RETRY=" << st.retry_attempt; - out << ")"; - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MClientRequestForward.h b/branches/sage/pgs/messages/MClientRequestForward.h deleted file mode 100644 index c81e3b3c06ce8..0000000000000 --- a/branches/sage/pgs/messages/MClientRequestForward.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTREQUESTFORWARD_H -#define __MCLIENTREQUESTFORWARD_H - -class MClientRequestForward : public Message { - tid_t tid; - int dest_mds; - int num_fwd; - - public: - MClientRequestForward() : Message(MSG_CLIENT_REQUEST_FORWARD) {} - MClientRequestForward(tid_t t, int dm, int nf) : - Message(MSG_CLIENT_REQUEST_FORWARD), - tid(t), dest_mds(dm), num_fwd(nf) { } - - tid_t get_tid() { return tid; } - int get_dest_mds() { return dest_mds; } - int get_num_fwd() { return num_fwd; } - - char *get_type_name() { return "cfwd"; } - void print(ostream& o) { - o << "client_request_forward(" << tid - << " to " << dest_mds - << " num_fwd=" << num_fwd - << ")"; - } - - void encode_payload() { - payload.append((char*)&tid, sizeof(tid)); - payload.append((char*)&dest_mds, sizeof(dest_mds)); - payload.append((char*)&num_fwd, sizeof(num_fwd)); - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - payload.copy(off, sizeof(dest_mds), (char*)&dest_mds); - off += sizeof(dest_mds); - payload.copy(off, sizeof(num_fwd), (char*)&num_fwd); - off += sizeof(num_fwd); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MClientSession.h b/branches/sage/pgs/messages/MClientSession.h deleted file mode 100644 index c84eadbccb117..0000000000000 --- a/branches/sage/pgs/messages/MClientSession.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTSESSION_H -#define __MCLIENTSESSION_H - -#include "msg/Message.h" - -class MClientSession : public Message { -public: - const static int OP_REQUEST_OPEN = 1; - const static int OP_OPEN = 2; - const static int OP_REQUEST_CLOSE = 3; - const static int OP_CLOSE = 4; - static const char *get_opname(int o) { - switch (o) { - case OP_REQUEST_OPEN: return "request_open"; - case OP_OPEN: return "open"; - case OP_REQUEST_CLOSE: return "request_close"; - case OP_CLOSE: return "close"; - default: assert(0); - } - } - - int32_t op; - version_t seq; - - MClientSession() : Message(MSG_CLIENT_SESSION) { } - MClientSession(int o, version_t s=0) : - Message(MSG_CLIENT_SESSION), - op(o), seq(s) { } - - char *get_type_name() { return "client_session"; } - void print(ostream& out) { - out << "client_session(" << get_opname(op); - if (seq) out << " seq " << seq; - out << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(seq, payload, off); - } - void encode_payload() { - ::_encode(op, payload); - ::_encode(seq, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MClientUnmount.h b/branches/sage/pgs/messages/MClientUnmount.h deleted file mode 100644 index 42fa07db7ba05..0000000000000 --- a/branches/sage/pgs/messages/MClientUnmount.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#ifndef __MCLIENTUNMOUNT_H -#define __MCLIENTUNMOUNT_H - -#include "msg/Message.h" - -class MClientUnmount : public Message { -public: - entity_inst_t inst; - - MClientUnmount() : Message(MSG_CLIENT_UNMOUNT) { } - MClientUnmount(entity_inst_t i) : - Message(MSG_CLIENT_UNMOUNT), - inst(i) { } - - char *get_type_name() { return "client_unmount"; } - - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - } - void encode_payload() { - ::_encode(inst, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MDentryUnlink.h b/branches/sage/pgs/messages/MDentryUnlink.h deleted file mode 100644 index 6e24d6f45410f..0000000000000 --- a/branches/sage/pgs/messages/MDentryUnlink.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDENTRYUNLINK_H -#define __MDENTRYUNLINK_H - -class MDentryUnlink : public Message { - dirfrag_t dirfrag; - string dn; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - string& get_dn() { return dn; } - - CInodeDiscover *strayin; - CDirDiscover *straydir; - CDentryDiscover *straydn; - - MDentryUnlink() : - Message(MSG_MDS_DENTRYUNLINK), - strayin(0), straydir(0), straydn(0) { } - MDentryUnlink(dirfrag_t df, string& n) : - Message(MSG_MDS_DENTRYUNLINK), - dirfrag(df), - dn(n), - strayin(0), straydir(0), straydn(0) { } - ~MDentryUnlink() { - delete strayin; - delete straydir; - delete straydn; - } - - char *get_type_name() { return "dentry_unlink";} - void print(ostream& o) { - o << "dentry_unlink(" << dirfrag << " " << dn << ")"; - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(dn, payload, off); - - bool isstray; - payload.copy(off, sizeof(isstray), (char*)&isstray); - off += sizeof(isstray); - if (isstray) { - strayin = new CInodeDiscover; - strayin->_decode(payload, off); - straydir = new CDirDiscover; - straydir->_decode(payload, off); - straydn = new CDentryDiscover; - straydn->_decode(payload, off); - } - } - void encode_payload() { - payload.append((char*)&dirfrag,sizeof(dirfrag)); - ::_encode(dn, payload); - - bool isstray = strayin ? true:false; - payload.append((char*)&isstray, sizeof(isstray)); - if (isstray) { - strayin->_encode(payload); - straydir->_encode(payload); - straydn->_encode(payload); - } - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MDirUpdate.h b/branches/sage/pgs/messages/MDirUpdate.h deleted file mode 100644 index 0db32208efd45..0000000000000 --- a/branches/sage/pgs/messages/MDirUpdate.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
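MDentryUnlink above also illustrates the optional-payload idiom used by several of these messages: a bool is encoded first, and the stray CInode/CDir/CDentry discover records follow only when it is set, so the decoder knows whether to expect them. A compact sketch of just that idiom with invented stand-in types (ToyStray, ToyUnlinkMsg) and std::string in place of bufferlist:

#include <cstring>
#include <string>

// ToyStray stands in for the stray discover records; the flag tells the
// decoder whether the optional record is present.
struct ToyStray { int nonce; };

struct ToyUnlinkMsg {
  ToyStray* stray;                        // optional; may be NULL
  ToyUnlinkMsg() : stray(0) {}

  void encode(std::string& payload) const {
    bool has_stray = (stray != 0);
    payload.append(reinterpret_cast<const char*>(&has_stray), sizeof(has_stray));
    if (has_stray)
      payload.append(reinterpret_cast<const char*>(&stray->nonce), sizeof(stray->nonce));
  }
  void decode(const std::string& payload) {
    int off = 0;
    bool has_stray = false;
    std::memcpy(&has_stray, payload.data() + off, sizeof(has_stray));
    off += (int)sizeof(has_stray);
    if (has_stray) {                       // optional record only if flagged
      stray = new ToyStray();
      std::memcpy(&stray->nonce, payload.data() + off, sizeof(stray->nonce));
      off += (int)sizeof(stray->nonce);
    }
  }
};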
- * - */ - - -#ifndef __MDIRUPDATE_H -#define __MDIRUPDATE_H - -#include "msg/Message.h" - -class MDirUpdate : public Message { - struct { - dirfrag_t dirfrag; - int dir_rep; - int discover; - } st; - set dir_rep_by; - string path; - - public: - dirfrag_t get_dirfrag() { return st.dirfrag; } - int get_dir_rep() { return st.dir_rep; } - set& get_dir_rep_by() { return dir_rep_by; } - bool should_discover() { return st.discover > 0; } - string& get_path() { return path; } - - void tried_discover() { - if (st.discover) st.discover--; - } - - MDirUpdate() {} - MDirUpdate(dirfrag_t dirfrag, - int dir_rep, - set& dir_rep_by, - string& path, - bool discover = false) : - Message(MSG_MDS_DIRUPDATE) { - this->st.dirfrag = dirfrag; - this->st.dir_rep = dir_rep; - this->dir_rep_by = dir_rep_by; - if (discover) this->st.discover = 5; - this->path = path; - } - virtual char *get_type_name() { return "dir_update"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(dir_rep_by, payload, off); - ::_decode(path, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(dir_rep_by, payload); - ::_encode(path, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MDiscover.h b/branches/sage/pgs/messages/MDiscover.h deleted file mode 100644 index 5917c719a8af4..0000000000000 --- a/branches/sage/pgs/messages/MDiscover.h +++ /dev/null @@ -1,107 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVER_H -#define __MDISCOVER_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - - -class MDiscover : public Message { - int asker; - inodeno_t base_ino; // 1 -> root - frag_t base_dir_frag; - - filepath want; // ... [/]need/this/stuff - inodeno_t want_ino; - - bool want_base_dir; - bool want_xlocked; - - public: - int get_asker() { return asker; } - inodeno_t get_base_ino() { return base_ino; } - frag_t get_base_dir_frag() { return base_dir_frag; } - filepath& get_want() { return want; } - inodeno_t get_want_ino() { return want_ino; } - const string& get_dentry(int n) { return want[n]; } - - bool wants_base_dir() { return want_base_dir; } - bool wants_xlocked() { return want_xlocked; } - - void set_base_dir_frag(frag_t f) { base_dir_frag = f; } - - MDiscover() { } - MDiscover(int asker_, - inodeno_t base_ino_, - filepath& want_, - bool want_base_dir_ = true, - bool discover_xlocks_ = false) : - Message(MSG_MDS_DISCOVER), - asker(asker_), - base_ino(base_ino_), - want(want_), - want_ino(0), - want_base_dir(want_base_dir_), - want_xlocked(discover_xlocks_) { } - MDiscover(int asker_, - dirfrag_t base_dirfrag, - inodeno_t want_ino_, - bool want_base_dir_ = true) : - Message(MSG_MDS_DISCOVER), - asker(asker_), - base_ino(base_dirfrag.ino), - base_dir_frag(base_dirfrag.frag), - want_ino(want_ino_), - want_base_dir(want_base_dir_), - want_xlocked(false) { } - - char *get_type_name() { return "Dis"; } - void print(ostream &out) { - out << "discover(" << base_ino << "." 
<< base_dir_frag - << " " << want; - if (want_ino) out << want_ino; - out << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(asker, payload, off); - ::_decode(base_ino, payload, off); - ::_decode(base_dir_frag, payload, off); - want._decode(payload, off); - ::_decode(want_ino, payload, off); - ::_decode(want_base_dir, payload, off); - ::_decode(want_xlocked, payload, off); - } - void encode_payload() { - ::_encode(asker, payload); - ::_encode(base_ino, payload); - ::_encode(base_dir_frag, payload); - want._encode(payload); - ::_encode(want_ino, payload); - ::_encode(want_base_dir, payload); - ::_encode(want_xlocked, payload); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MDiscoverReply.h b/branches/sage/pgs/messages/MDiscoverReply.h deleted file mode 100644 index 5821bc85db38e..0000000000000 --- a/branches/sage/pgs/messages/MDiscoverReply.h +++ /dev/null @@ -1,276 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVERREPLY_H -#define __MDISCOVERREPLY_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "mds/CInode.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - -#define max(a,b) ((a)>(b) ? (a):(b)) - - -/** - * MDiscoverReply - return new replicas (of inodes, dirs, dentries) - * - * we group returned items by (dir, dentry, inode). each - * item in each set shares an index (it's "depth"). - * - * we can start and end with any type. - * no_base_dir = true if the first group has an inode but no dir - * no_base_dentry = true if the first group has an inode but no dentry - * they are false if there is no returned data, ie the first group is empty. - * - * we also return errors: - * error_flag_dn(string) - the specified dentry dne - * error_flag_dir - the last item wasn't a dir, so we couldn't continue. - * - * and sometimes, - * dir_auth_hint - where we think the dir auth is - * - * depth() gives us the number of depth units/indices for which we have - * information. this INCLUDES those for which we have errors but no data. - * - * see MDCache::handle_discover, handle_discover_reply. - * - * - * so basically, we get - * - * dir den ino i - * x 0 - * x x x 1 - * or - * x x 0 - * x x x 1 - * or - * x x x 0 - * x x x 1 - * ...and trail off however we want. - * - * - */ - -class MDiscoverReply : public Message { - inodeno_t base_ino; - bool no_base_dir; // no base dir (but IS dentry+inode) - bool no_base_dentry; // no base dentry (but IS inode) - bool flag_error_dn; - bool flag_error_ino; - bool flag_error_dir; - string error_dentry; // dentry that was not found (to trigger waiters on asker) - int dir_auth_hint; - bool wanted_xlocks_hint; - - vector dirs; // not inode-aligned if no_base_dir = true. 
- vector dentries; // not inode-aligned if no_base_dentry = true - vector inodes; - - string path; - - public: - // accessors - inodeno_t get_base_ino() { return base_ino; } - int get_num_inodes() { return inodes.size(); } - int get_num_dentries() { return dentries.size(); } - int get_num_dirs() { return dirs.size(); } - - int get_last_inode() { return inodes.size(); } - int get_last_dentry() { return dentries.size() + no_base_dentry; } - int get_last_dir() { return dirs.size() + no_base_dir; } - - int get_depth() { // return depth of deepest object (in dir/dentry/inode units) - return max( inodes.size(), // at least this many - max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error - dirs.size() + no_base_dir )); // dn/inode + dirs - } - - bool has_base_dir() { return !no_base_dir && dirs.size(); } - bool has_base_dentry() { return !no_base_dentry && dentries.size(); } - bool has_base_inode() { return no_base_dir && no_base_dentry; } - - const string& get_path() { return path; } - - // bool is_flag_forward() { return flag_forward; } - bool is_flag_error_dn() { return flag_error_dn; } - bool is_flag_error_ino() { return flag_error_ino; } - bool is_flag_error_dir() { return flag_error_dir; } - string& get_error_dentry() { return error_dentry; } - int get_dir_auth_hint() { return dir_auth_hint; } - bool get_wanted_xlocks_hint() { return wanted_xlocks_hint; } - - void set_wanted_xlocks_hint(bool w) { wanted_xlocks_hint = w; } - - // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. - CInodeDiscover& get_inode(int n) { return *(inodes[n]); } - CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } - CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } - inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } - - // cons - MDiscoverReply() {} - MDiscoverReply(inodeno_t base_ino) : - Message(MSG_MDS_DISCOVERREPLY) { - this->base_ino = base_ino; - flag_error_dn = false; - flag_error_dir = false; - no_base_dir = no_base_dentry = false; - dir_auth_hint = CDIR_AUTH_UNKNOWN; - } - ~MDiscoverReply() { - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - delete *it; - for (vector::iterator it = dentries.begin(); - it != dentries.end(); - it++) - delete *it; - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete *it; - } - virtual char *get_type_name() { return "DisR"; } - - // builders - bool is_empty() { - return dirs.empty() && dentries.empty() && inodes.empty() && - !flag_error_dn && - !flag_error_dir && - dir_auth_hint == CDIR_AUTH_UNKNOWN; - } - void add_dentry(CDentryDiscover* ddis) { - if (dentries.empty() && dirs.empty()) no_base_dir = true; - dentries.push_back(ddis); - if (path.length()) path += "/"; - path += ddis->get_dname(); - } - - void add_inode(CInodeDiscover* din) { - if (inodes.empty() && dentries.empty()) no_base_dir = no_base_dentry = true; - inodes.push_back( din ); - } - - void add_dir(CDirDiscover* dir) { - dirs.push_back( dir ); - } - - // void set_flag_forward() { flag_forward = true; } - void set_flag_error_dn(const string& dn) { - flag_error_dn = true; - error_dentry = dn; - } - void set_flag_error_ino() { - flag_error_ino = true; - } - void set_flag_error_dir() { - flag_error_dir = true; - } - void set_dir_auth_hint(int a) { - dir_auth_hint = a; - } - void set_error_dentry(const string& dn) { - error_dentry = dn; - } - - - // ... 
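The get_depth() formula above re-aligns the three per-type counts onto a common depth axis before taking the maximum, since a reply may start at a dir, a dentry, or a bare inode. A small worked sketch with made-up counts (three inodes returned starting from a bare base inode, followed by two dir+dentry+inode groups):

#include <algorithm>
#include <cstdio>

// Mirrors get_depth() above: each per-type count is shifted onto the common
// depth axis (bool flags count as 0/1) before taking the maximum.
int toy_depth(int n_inodes, int n_dentries, int n_dirs,
              bool no_base_dir, bool no_base_dentry, bool flag_error_dn) {
  int via_dentries = (no_base_dentry ? 1 : 0) + n_dentries + (flag_error_dn ? 1 : 0);
  int via_dirs     = n_dirs + (no_base_dir ? 1 : 0);
  return std::max(n_inodes, std::max(via_dentries, via_dirs));
}

int main() {
  // Reply starting with a bare inode (no base dir, no base dentry), then two
  // dir+dentry+inode groups: 3 inodes, 2 dentries, 2 dirs, no dentry error.
  std::printf("depth = %d\n", toy_depth(3, 2, 2, true, true, false));  // -> 3
  return 0;
}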
- virtual void decode_payload() { - int off = 0; - ::_decode(base_ino, payload, off); - ::_decode(no_base_dir, payload, off); - ::_decode(no_base_dentry, payload, off); - ::_decode(flag_error_dn, payload, off); - ::_decode(flag_error_ino, payload, off); - ::_decode(flag_error_dir, payload, off); - ::_decode(error_dentry, payload, off); - ::_decode(dir_auth_hint, payload, off); - ::_decode(wanted_xlocks_hint, payload, off); - - // dirs - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - //dout(12) << n << " dirs out" << endl; - - // inodes - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - //dout(12) << n << " inodes out" << endl; - - // dentries - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - } - void encode_payload() { - ::_encode(base_ino, payload); - ::_encode(no_base_dir, payload); - ::_encode(no_base_dentry, payload); - ::_encode(flag_error_dn, payload); - ::_encode(flag_error_ino, payload); - ::_encode(flag_error_dir, payload); - ::_encode(error_dentry, payload); - ::_encode(dir_auth_hint, payload); - ::_encode(wanted_xlocks_hint, payload); - - // dirs - int n = dirs.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " dirs in" << endl; - - // inodes - n = inodes.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " inodes in" << endl; - - // dentries - n = dentries.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dentries.begin(); - it != dentries.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " dentries in" << endl; - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDir.h b/branches/sage/pgs/messages/MExportDir.h deleted file mode 100644 index 8fafbe0312636..0000000000000 --- a/branches/sage/pgs/messages/MExportDir.h +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIR_H -#define __MEXPORTDIR_H - -#include "msg/Message.h" - - -class MExportDir : public Message { - dirfrag_t dirfrag; - - list dirstate; // a bl for reach dir - list bounds; - - public: - MExportDir() {} - MExportDir(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIR), - dirfrag(df) { - } - virtual char *get_type_name() { return "Ex"; } - void print(ostream& o) { - o << "export(" << dirfrag << ")"; - } - - dirfrag_t get_dirfrag() { return dirfrag; } - list& get_dirstate() { return dirstate; } - list& get_bounds() { return bounds; } - - void add_dir(bufferlist& dir) { - dirstate.push_back(dir); - } - void set_dirstate(const list& ls) { - dirstate = ls; - } - void add_export(dirfrag_t df) { - bounds.push_back(df); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(bounds, payload, off); - ::_decode(dirstate, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - ::_encode(bounds, payload); - ::_encode(dirstate, payload); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirAck.h b/branches/sage/pgs/messages/MExportDirAck.h deleted file mode 100644 index 1b9d683b4e36f..0000000000000 --- a/branches/sage/pgs/messages/MExportDirAck.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRACK_H -#define __MEXPORTDIRACK_H - -#include "MExportDir.h" - -class MExportDirAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirAck() {} - MExportDirAck(dirfrag_t i) : - Message(MSG_MDS_EXPORTDIRACK), dirfrag(i) { } - - virtual char *get_type_name() { return "ExAck"; } - void print(ostream& o) { - o << "export_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirCancel.h b/branches/sage/pgs/messages/MExportDirCancel.h deleted file mode 100644 index f13ee1a44fa21..0000000000000 --- a/branches/sage/pgs/messages/MExportDirCancel.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRCANCEL_H -#define __MEXPORTDIRCANCEL_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirCancel : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirCancel() {} - MExportDirCancel(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRCANCEL), - dirfrag(df) { } - - virtual char *get_type_name() { return "ExCancel"; } - void print(ostream& o) { - o << "export_cancel(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirDiscover.h b/branches/sage/pgs/messages/MExportDirDiscover.h deleted file mode 100644 index 7375fad6c5057..0000000000000 --- a/branches/sage/pgs/messages/MExportDirDiscover.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRDISCOVER_H -#define __MEXPORTDIRDISCOVER_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscover : public Message { - dirfrag_t dirfrag; - string path; - - public: - inodeno_t get_ino() { return dirfrag.ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - string& get_path() { return path; } - - bool started; - - MExportDirDiscover() : - Message(MSG_MDS_EXPORTDIRDISCOVER), - started(false) { } - MExportDirDiscover(CDir *dir) : - Message(MSG_MDS_EXPORTDIRDISCOVER), - started(false) { - dir->get_inode()->make_path(path); - dirfrag = dir->dirfrag(); - } - virtual char *get_type_name() { return "ExDis"; } - void print(ostream& o) { - o << "export_discover(" << dirfrag << " " << path << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(path, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - ::_encode(path, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirDiscoverAck.h b/branches/sage/pgs/messages/MExportDirDiscoverAck.h deleted file mode 100644 index 5e1924bc57e38..0000000000000 --- a/branches/sage/pgs/messages/MExportDirDiscoverAck.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRDISCOVERACK_H -#define __MEXPORTDIRDISCOVERACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscoverAck : public Message { - dirfrag_t dirfrag; - bool success; - - public: - inodeno_t get_ino() { return dirfrag.ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - bool is_success() { return success; } - - MExportDirDiscoverAck() {} - MExportDirDiscoverAck(dirfrag_t df, bool s=true) : - Message(MSG_MDS_EXPORTDIRDISCOVERACK), - dirfrag(df), - success(s) { } - - virtual char *get_type_name() { return "ExDisA"; } - void print(ostream& o) { - o << "export_discover_ack(" << dirfrag; - if (success) - o << " success)"; - else - o << " failure)"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - payload.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - payload.append((char*)&success, sizeof(success)); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirFinish.h b/branches/sage/pgs/messages/MExportDirFinish.h deleted file mode 100644 index 03f5e1fcc9ef3..0000000000000 --- a/branches/sage/pgs/messages/MExportDirFinish.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRFINISH_H -#define __MEXPORTDIRFINISH_H - -#include "msg/Message.h" - -class MExportDirFinish : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirFinish() {} - MExportDirFinish(dirfrag_t dirfrag) : - Message(MSG_MDS_EXPORTDIRFINISH) { - this->dirfrag = dirfrag; - } - virtual char *get_type_name() { return "ExFin"; } - void print(ostream& o) { - o << "export_finish(" << dirfrag << ")"; - } - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirNotify.h b/branches/sage/pgs/messages/MExportDirNotify.h deleted file mode 100644 index c7a79a64f9317..0000000000000 --- a/branches/sage/pgs/messages/MExportDirNotify.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRNOTIFY_H -#define __MEXPORTDIRNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotify : public Message { - dirfrag_t base; - bool ack; - pair<int,int> old_auth, new_auth; - list<dirfrag_t> bounds; // bounds; these dirs are _not_ included (tho the dirfragdes are) - - public: - dirfrag_t get_dirfrag() { return base; } - pair<int,int> get_old_auth() { return old_auth; } - pair<int,int> get_new_auth() { return new_auth; } - bool wants_ack() { return ack; } - list<dirfrag_t>& get_bounds() { return bounds; } - - MExportDirNotify() {} - MExportDirNotify(dirfrag_t i, bool a, pair<int,int> oa, pair<int,int> na) : - Message(MSG_MDS_EXPORTDIRNOTIFY), - base(i), ack(a), old_auth(oa), new_auth(na) { } - - virtual char *get_type_name() { return "ExNot"; } - void print(ostream& o) { - o << "export_notify(" << base; - o << " " << old_auth << " -> " << new_auth; - if (ack) - o << " ack)"; - else - o << " no ack)"; - } - - void copy_bounds(list<dirfrag_t>& ex) { - this->bounds = ex; - } - void copy_bounds(set<dirfrag_t>& ex) { - for (set<dirfrag_t>::iterator i = ex.begin(); - i != ex.end(); ++i) - bounds.push_back(*i); - } - void copy_bounds(set<CDir*>& ex) { - for (set<CDir*>::iterator i = ex.begin(); - i != ex.end(); ++i) - bounds.push_back((*i)->dirfrag()); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - payload.copy(off, sizeof(ack), (char*)&ack); - off += sizeof(ack); - payload.copy(off, sizeof(old_auth), (char*)&old_auth); - off += sizeof(old_auth); - payload.copy(off, sizeof(new_auth), (char*)&new_auth); - off += sizeof(new_auth); - ::_decode(bounds, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&base, sizeof(base)); - payload.append((char*)&ack, sizeof(ack)); - payload.append((char*)&old_auth, sizeof(old_auth)); - payload.append((char*)&new_auth, sizeof(new_auth)); - ::_encode(bounds, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirNotifyAck.h b/branches/sage/pgs/messages/MExportDirNotifyAck.h deleted file mode 100644 index 6a41aee83b5f3..0000000000000 --- a/branches/sage/pgs/messages/MExportDirNotifyAck.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING.
- * - */ - -#ifndef __MEXPORTDIRNOTIFYACK_H -#define __MEXPORTDIRNOTIFYACK_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotifyAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirNotifyAck() {} - MExportDirNotifyAck(dirfrag_t dirfrag) : - Message(MSG_MDS_EXPORTDIRNOTIFYACK) { - this->dirfrag = dirfrag; - } - virtual char *get_type_name() { return "ExNotA"; } - void print(ostream& o) { - o << "export_notify_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirPrep.h b/branches/sage/pgs/messages/MExportDirPrep.h deleted file mode 100644 index 8d54276f0bd83..0000000000000 --- a/branches/sage/pgs/messages/MExportDirPrep.h +++ /dev/null @@ -1,189 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRPREP_H -#define __MEXPORTDIRPREP_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirPrep : public Message { - dirfrag_t dirfrag; - - /* nested export discover payload. - not all inodes will have dirs; they may require a separate discover. - dentries are the links to each inode. 
- dirs map includes base dir (ino) - */ - list<dirfrag_t> bounds; - - list<CInodeDiscover*> inodes; - map<inodeno_t, dirfrag_t> inode_dirfrag; - map<inodeno_t, string> inode_dentry; - - map<inodeno_t, list<frag_t> > frags_by_ino; - map<dirfrag_t, CDirDiscover*> dirfrags; - - set<int> bystanders; - - bool b_did_assim; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - list<dirfrag_t>& get_bounds() { return bounds; } - list<CInodeDiscover*>& get_inodes() { return inodes; } - list<frag_t>& get_inode_dirfrags(inodeno_t ino) { - return frags_by_ino[ino]; - } - dirfrag_t get_containing_dirfrag(inodeno_t ino) { - return inode_dirfrag[ino]; - } - string& get_dentry(inodeno_t ino) { - return inode_dentry[ino]; - } - bool have_dirfrag(dirfrag_t df) { - return dirfrags.count(df); - } - CDirDiscover* get_dirfrag_discover(dirfrag_t df) { - return dirfrags[df]; - } - set<int> &get_bystanders() { return bystanders; } - - bool did_assim() { return b_did_assim; } - void mark_assim() { b_did_assim = true; } - - MExportDirPrep() { - b_did_assim = false; - } - MExportDirPrep(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRPREP), - dirfrag(df), - b_did_assim(false) { } - ~MExportDirPrep() { - for (list<CInodeDiscover*>::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) - delete *iit; - for (map<dirfrag_t, CDirDiscover*>::iterator dit = dirfrags.begin(); - dit != dirfrags.end(); - dit++) - delete dit->second; - } - - - virtual char *get_type_name() { return "ExP"; } - void print(ostream& o) { - o << "export_prep(" << dirfrag << ")"; - } - - void add_export(dirfrag_t df) { - bounds.push_back( df ); - } - void add_inode(dirfrag_t df, const string& dentry, CInodeDiscover *in) { - inodes.push_back(in); - inode_dirfrag[in->get_ino()] = df; - inode_dentry[in->get_ino()] = dentry; - } - void add_dirfrag(CDirDiscover *dir) { - dirfrags[dir->get_dirfrag()] = dir; - frags_by_ino[dir->get_dirfrag().ino].push_back(dir->get_dirfrag().frag); - } - void add_bystander(int who) { - bystanders.insert(who); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - - ::_decode(bounds, payload, off); - - // inodes - int ni; - payload.copy(off, sizeof(int), (char*)&ni); - off += sizeof(int); - for (int i=0; i<ni; i++) { - CInodeDiscover *in = new CInodeDiscover; - in->_decode(payload, off); - inodes.push_back(in); - - // dentry - string d; - _decode(d, payload, off); - inode_dentry[in->get_ino()] = d; - - // dir ino - dirfrag_t df; - payload.copy(off, sizeof(df), (char*)&df); - off += sizeof(df); - inode_dirfrag[in->get_ino()] = df; - - // child frags - ::_decode(frags_by_ino[in->get_ino()], payload, off); - } - - // dirs - int nd; - payload.copy(off, sizeof(int), (char*)&nd); - off += sizeof(int); - for (int i=0; i<nd; i++) { - CDirDiscover *dir = new CDirDiscover; - dir->_decode(payload, off); - dirfrags[dir->get_dirfrag()] = dir; - } - - ::_decode(bystanders, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - - ::_encode(bounds, payload); - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - for (list<CInodeDiscover*>::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) { - (*iit)->_encode(payload); - - // dentry - _encode(inode_dentry[(*iit)->get_ino()], payload); - - // dir ino - dirfrag_t df = inode_dirfrag[(*iit)->get_ino()]; - payload.append((char*)&df, sizeof(df)); - - // child frags - ::_encode(frags_by_ino[(*iit)->get_ino()], payload); - } - - // dirs - int nd = dirfrags.size(); - payload.append((char*)&nd, sizeof(int)); - for (map<dirfrag_t, CDirDiscover*>::iterator dit = dirfrags.begin(); - dit != dirfrags.end(); - dit++) - dit->second->_encode(payload); - - ::_encode(bystanders, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirPrepAck.h
b/branches/sage/pgs/messages/MExportDirPrepAck.h deleted file mode 100644 index 355541e9f1b5c..0000000000000 --- a/branches/sage/pgs/messages/MExportDirPrepAck.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRPREPACK_H -#define __MEXPORTDIRPREPACK_H - -#include "msg/Message.h" -#include "include/types.h" - -class MExportDirPrepAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirPrepAck() {} - MExportDirPrepAck(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRPREPACK), - dirfrag(df) { } - - virtual char *get_type_name() { return "ExPAck"; } - void print(ostream& o) { - o << "export_prep_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirWarning.h b/branches/sage/pgs/messages/MExportDirWarning.h deleted file mode 100644 index b59e2eb12251c..0000000000000 --- a/branches/sage/pgs/messages/MExportDirWarning.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRWARNING_H -#define __MEXPORTDIRWARNING_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarning : public Message { - inodeno_t ino; - int new_dir_auth; - - public: - inodeno_t get_ino() { return ino; } - int get_new_dir_auth() { return new_dir_auth; } - - MExportDirWarning() {} - MExportDirWarning(inodeno_t i, int nda) : - Message(MSG_MDS_EXPORTDIRWARNING), - ino(i), new_dir_auth(nda) {} - - virtual char *get_type_name() { return "ExW"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(new_dir_auth), (char*)&new_dir_auth); - off += sizeof(new_dir_auth); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&new_dir_auth, sizeof(new_dir_auth)); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MExportDirWarningAck.h b/branches/sage/pgs/messages/MExportDirWarningAck.h deleted file mode 100644 index 7ee3078e61973..0000000000000 --- a/branches/sage/pgs/messages/MExportDirWarningAck.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRWARNINGACK_H -#define __MEXPORTDIRWARNINGACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarningAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirWarningAck() {} - MExportDirWarningAck(inodeno_t i) : - Message(MSG_MDS_EXPORTDIRWARNINGACK), - ino(i) {} - - virtual char *get_type_name() { return "ExWAck"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MGenericMessage.h b/branches/sage/pgs/messages/MGenericMessage.h deleted file mode 100644 index fee4e014edaf8..0000000000000 --- a/branches/sage/pgs/messages/MGenericMessage.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MGENERICMESSAGE_H -#define __MGENERICMESSAGE_H - -#include "msg/Message.h" - -class MGenericMessage : public Message { - char tname[20]; - //long pcid; - - public: - MGenericMessage(int t) : Message(t) { - sprintf(tname, "generic%d", get_type()); - } - - //void set_pcid(long pcid) { this->pcid = pcid; } - //long get_pcid() { return pcid; } - - char *get_type_name() { return tname; } - - virtual void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(pcid), (char*)&pcid); - //off += sizeof(pcid); - } - virtual void encode_payload() { - //payload.append((char*)&pcid, sizeof(pcid)); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MHeartbeat.h b/branches/sage/pgs/messages/MHeartbeat.h deleted file mode 100644 index 964f2a3bd49f2..0000000000000 --- a/branches/sage/pgs/messages/MHeartbeat.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MHEARTBEAT_H -#define __MHEARTBEAT_H - -#include "include/types.h" -#include "msg/Message.h" - -class MHeartbeat : public Message { - mds_load_t load; - int beat; - map import_map; - - public: - mds_load_t& get_load() { return load; } - int get_beat() { return beat; } - - map& get_import_map() { - return import_map; - } - - MHeartbeat() {} - MHeartbeat(mds_load_t& load, int beat) : - Message(MSG_MDS_HEARTBEAT) { - this->load = load; - this->beat = beat; - } - - virtual char *get_type_name() { return "HB"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off,sizeof(load), (char*)&load); - off += sizeof(load); - payload.copy(off, sizeof(beat), (char*)&beat); - off += sizeof(beat); - ::_decode(import_map, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&load, sizeof(load)); - payload.append((char*)&beat, sizeof(beat)); - ::_encode(import_map, payload); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MInodeFileCaps.h b/branches/sage/pgs/messages/MInodeFileCaps.h deleted file mode 100644 index 05ade1094c9c8..0000000000000 --- a/branches/sage/pgs/messages/MInodeFileCaps.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEFILECAPS_H -#define __MINODEFILECAPS_H - -class MInodeFileCaps : public Message { - inodeno_t ino; - int from; - int caps; - - public: - inodeno_t get_ino() { return ino; } - int get_from() { return from; } - int get_caps() { return caps; } - - MInodeFileCaps() {} - // from auth - MInodeFileCaps(inodeno_t ino, int from, int caps) : - Message(MSG_MDS_INODEFILECAPS) { - - this->ino = ino; - this->from = from; - this->caps = caps; - } - - virtual char *get_type_name() { return "Icap";} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - } - virtual void encode_payload() { - payload.append((char*)&from, sizeof(from)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&caps, sizeof(caps)); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MLock.h b/branches/sage/pgs/messages/MLock.h deleted file mode 100644 index 62f5b174de702..0000000000000 --- a/branches/sage/pgs/messages/MLock.h +++ /dev/null @@ -1,128 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MLOCK_H -#define __MLOCK_H - -#include "msg/Message.h" -#include "mds/SimpleLock.h" - -// for replicas -#define LOCK_AC_SYNC -1 -#define LOCK_AC_MIXED -2 -#define LOCK_AC_LOCK -3 - -#define LOCK_AC_SCATTER -6 - -// for auth -#define LOCK_AC_SYNCACK 1 -#define LOCK_AC_MIXEDACK 2 -#define LOCK_AC_LOCKACK 3 - -#define LOCK_AC_REQSCATTER 7 - -#define LOCK_AC_FOR_REPLICA(a) ((a) < 0) -#define LOCK_AC_FOR_AUTH(a) ((a) > 0) - - -static const char *get_lock_action_name(int a) { - switch (a) { - case LOCK_AC_SYNC: return "sync"; - case LOCK_AC_MIXED: return "mixed"; - case LOCK_AC_LOCK: return "lock"; - case LOCK_AC_SCATTER: return "scatter"; - case LOCK_AC_SYNCACK: return "syncack"; - case LOCK_AC_MIXEDACK: return "mixedack"; - case LOCK_AC_LOCKACK: return "lockack"; - case LOCK_AC_REQSCATTER: return "reqscatter"; - default: assert(0); - } -} - - -class MLock : public Message { - int asker; // who is initiating this request - int action; // action type - metareqid_t reqid; // for remote lock requests - - char lock_type; // lock object type - MDSCacheObjectInfo object_info; - - bufferlist data; // and possibly some data - - public: - bufferlist& get_data() { return data; } - int get_asker() { return asker; } - int get_action() { return action; } - metareqid_t get_reqid() { return reqid; } - - int get_lock_type() { return lock_type; } - MDSCacheObjectInfo &get_object_info() { return object_info; } - - MLock() {} - MLock(int action, int asker) : - Message(MSG_MDS_LOCK) { - this->action = action; - this->asker = asker; - } - MLock(SimpleLock *lock, int action, int asker) : - Message(MSG_MDS_LOCK) { - this->lock_type = lock->get_type(); - lock->get_parent()->set_object_info(object_info); - this->action = action; - this->asker = asker; - } - MLock(SimpleLock *lock, int action, int asker, bufferlist& bl) : - Message(MSG_MDS_LOCK) { - this->lock_type = lock->get_type(); - 
lock->get_parent()->set_object_info(object_info); - this->action = action; - this->asker = asker; - data.claim(bl); - } - virtual char *get_type_name() { return "ILock"; } - void print(ostream& out) { - out << "lock(a=" << get_lock_action_name(action) - << " " << get_lock_type_name(lock_type) - << " " << object_info - << ")"; - } - - void set_reqid(metareqid_t ri) { reqid = ri; } - void set_data(const bufferlist& data) { - this->data = data; - } - - void decode_payload() { - int off = 0; - ::_decode(asker, payload, off); - ::_decode(action, payload, off); - ::_decode(reqid, payload, off); - ::_decode(lock_type, payload, off); - object_info._decode(payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - ::_encode(asker, payload); - ::_encode(action, payload); - ::_encode(reqid, payload); - ::_encode(lock_type, payload); - object_info._encode(payload); - ::_encode(data, payload); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MMDSBeacon.h b/branches/sage/pgs/messages/MMDSBeacon.h deleted file mode 100644 index d8b73a45a3122..0000000000000 --- a/branches/sage/pgs/messages/MMDSBeacon.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSBEACON_H -#define __MMDSBEACON_H - -#include "msg/Message.h" - -#include "include/types.h" - -#include "mds/MDSMap.h" - -class MMDSBeacon : public Message { - entity_inst_t inst; - int state; - version_t seq; - - public: - MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(entity_inst_t i, int st, version_t se) : - Message(MSG_MDS_BEACON), - inst(i), state(st), seq(se) { } - - entity_inst_t& get_mds_inst() { return inst; } - int get_state() { return state; } - version_t get_seq() { return seq; } - char *get_type_name() { return "mdsbeacon"; } - - void print(ostream& out) { - out << "mdsbeacon(" << inst - << " " << MDSMap::get_state_name(state) - << " seq " << seq << ")"; - } - - void encode_payload() { - ::_encode(inst, payload); - ::_encode(state, payload); - ::_encode(seq, payload); - } - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - ::_decode(state, payload, off); - ::_decode(seq, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMDSBoot.h b/branches/sage/pgs/messages/MMDSBoot.h deleted file mode 100644 index 8529578e29d56..0000000000000 --- a/branches/sage/pgs/messages/MMDSBoot.h +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBOOT_H -#define __MMDSBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSBoot : public Message { - public: - MMDSBoot() : Message(MSG_MDS_BOOT) { - } - - char *get_type_name() { return "mdsboot"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMDSCacheRejoin.h b/branches/sage/pgs/messages/MMDSCacheRejoin.h deleted file mode 100644 index c0303fd1af455..0000000000000 --- a/branches/sage/pgs/messages/MMDSCacheRejoin.h +++ /dev/null @@ -1,237 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSCACHEREJOIN_H -#define __MMDSCACHEREJOIN_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "include/encodable.h" - -// sent from replica to auth - -class MMDSCacheRejoin : public Message { - public: - static const int OP_WEAK = 1; // replica -> auth, i exist, + maybe open files. - static const int OP_STRONG = 2; // replica -> auth, i exist, + open files and lock state. - static const int OP_ACK = 3; // auth -> replica, here is your lock state. - static const int OP_PURGE = 4; // auth -> replica, remove these items, they are old/obsolete. - static const int OP_MISSING = 5; // auth -> replica, i am missing these items - static const int OP_FULL = 6; // replica -> auth, here is the full object. 
- static const char *get_opname(int op) { - switch (op) { - case OP_WEAK: return "weak"; - case OP_STRONG: return "strong"; - case OP_ACK: return "ack"; - case OP_MISSING: return "missing"; - case OP_FULL: return "full"; - default: assert(0); - } - } - - // -- types -- - struct inode_strong { - int32_t caps_wanted; - int32_t nonce; - int32_t authlock; - int32_t linklock; - int32_t dirfragtreelock; - int32_t filelock; - __int32_t dirlock; - inode_strong() {} - inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0, int dl=0) : - caps_wanted(cw), - nonce(n), - authlock(a), linklock(l), dirfragtreelock(dft), filelock(f), dirlock(dl) { } - }; - struct inode_full { - inode_t inode; - string symlink; - fragtree_t dirfragtree; - inode_full() {} - inode_full(const inode_t& i, const string& s, const fragtree_t& f) : - inode(i), symlink(s), dirfragtree(f) {} - - void _decode(bufferlist& bl, int& off) { - ::_decode(inode, bl, off); - ::_decode(symlink, bl, off); - ::_decode(dirfragtree, bl, off); - } - void _encode(bufferlist& bl) const { - ::_encode(inode, bl); - ::_encode(symlink, bl); - ::_encode(dirfragtree, bl); - } - }; - - struct dirfrag_strong { - int32_t nonce; - dirfrag_strong() {} - dirfrag_strong(int n) : nonce(n) {} - }; - struct dn_strong { - inodeno_t ino; - inodeno_t remote_ino; - int32_t nonce; - int32_t lock; - dn_strong() : ino(0), remote_ino(0), nonce(0), lock(0) {} - dn_strong(inodeno_t pi, inodeno_t ri, int n, int l) : - ino(pi), remote_ino(ri), nonce(n), lock(l) {} - bool is_primary() { return ino > 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return ino == 0 && remote_ino == 0; } - }; - - struct dn_weak { - inodeno_t ino; - inodeno_t remote_ino; - dn_weak() : ino(0), remote_ino(0) {} - dn_weak(inodeno_t pi, inodeno_t ri) : ino(pi), remote_ino(ri) {} - bool is_primary() { return ino > 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return ino == 0 && remote_ino == 0; } - }; - - // -- data -- - int32_t op; - - // weak - map<dirfrag_t, map<string, dn_weak> > weak; - set<inodeno_t> weak_inodes; - - // strong - map<dirfrag_t, dirfrag_strong> strong_dirfrags; - map<dirfrag_t, map<string, dn_strong> > strong_dentries; - map<inodeno_t, inode_strong> strong_inodes; - - // open - bufferlist cap_export_bl; - map > cap_exports; - map cap_export_paths; - - // full - list<inode_full> full_inodes; - - // authpins, xlocks - map<inodeno_t, metareqid_t> authpinned_inodes; - map<inodeno_t, map<int, metareqid_t> > xlocked_inodes; - map<dirfrag_t, map<string, metareqid_t> > authpinned_dentries; - map<dirfrag_t, map<string, metareqid_t> > xlocked_dentries; - - MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {} - MMDSCacheRejoin(int o) : - Message(MSG_MDS_CACHEREJOIN), - op(o) {} - - char *get_type_name() { return "cache_rejoin"; } - void print(ostream& out) { - out << "cache_rejoin " << get_opname(op); - } - - // -- builders -- - // inodes - void add_weak_inode(inodeno_t i) { - weak_inodes.insert(i); - } - void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl) { - strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl); - } - void add_full_inode(inode_t &i, const string& s, const fragtree_t &f) { - full_inodes.push_back(inode_full(i, s, f)); - } - void add_inode_authpin(inodeno_t ino, const metareqid_t& ri) { - authpinned_inodes[ino] = ri; - } - void add_inode_xlock(inodeno_t ino, int lt, const metareqid_t& ri) { - xlocked_inodes[ino][lt] = ri; - } - - void copy_cap_exports(bufferlist &bl) { - cap_export_bl = bl; - } - - // dirfrags - void add_weak_dirfrag(dirfrag_t df) { - weak[df]; - } - void add_weak_dirfrag(dirfrag_t df, map<string, dn_weak>& dnmap) { - weak[df] = dnmap; - } - void add_strong_dirfrag(dirfrag_t df, int n) { - strong_dirfrags[df] = dirfrag_strong(n); - } - - //
dentries - void add_weak_dentry(dirfrag_t df, const string& dname, dn_weak& dnw) { - weak[df][dname] = dnw; - } - void add_weak_null_dentry(dirfrag_t df, const string& dname) { - weak[df][dname] = dn_weak(0, 0); - } - void add_weak_primary_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { - weak[df][dname] = dn_weak(ino, 0); - } - void add_weak_remote_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { - weak[df][dname] = dn_weak(0, ino); - } - void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, int n, int ls) { - strong_dentries[df][dname] = dn_strong(pi, ri, n, ls); - } - void add_dentry_authpin(dirfrag_t df, const string& dname, const metareqid_t& ri) { - authpinned_dentries[df][dname] = ri; - } - void add_dentry_xlock(dirfrag_t df, const string& dname, const metareqid_t& ri) { - xlocked_dentries[df][dname] = ri; - } - - // -- encoding -- - void encode_payload() { - ::_encode(op, payload); - ::_encode(strong_inodes, payload); - ::_encode_complex(full_inodes, payload); - ::_encode(authpinned_inodes, payload); - ::_encode(xlocked_inodes, payload); - ::_encode(cap_export_bl, payload); - ::_encode(strong_dirfrags, payload); - ::_encode(weak, payload); - ::_encode(weak_inodes, payload); - ::_encode(strong_dentries, payload); - ::_encode(authpinned_dentries, payload); - ::_encode(xlocked_dentries, payload); - } - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(strong_inodes, payload, off); - ::_decode_complex(full_inodes, payload, off); - ::_decode(authpinned_inodes, payload, off); - ::_decode(xlocked_inodes, payload, off); - ::_decode(cap_export_bl, payload, off); - if (cap_export_bl.length()) { - int off = 0; - ::_decode(cap_exports, cap_export_bl, off); - ::_decode(cap_export_paths, cap_export_bl, off); - } - ::_decode(strong_dirfrags, payload, off); - ::_decode(weak, payload, off); - ::_decode(weak_inodes, payload, off); - ::_decode(strong_dentries, payload, off); - ::_decode(authpinned_dentries, payload, off); - ::_decode(xlocked_dentries, payload, off); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MMDSGetMap.h b/branches/sage/pgs/messages/MMDSGetMap.h deleted file mode 100644 index eab9a3506a40b..0000000000000 --- a/branches/sage/pgs/messages/MMDSGetMap.h +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSGETMAP_H -#define __MMDSGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSGetMap : public Message { - public: - MMDSGetMap() : Message(MSG_MDS_GETMAP) { - } - - char *get_type_name() { return "mdsgetmap"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMDSMap.h b/branches/sage/pgs/messages/MMDSMap.h deleted file mode 100644 index 164e547cc513a..0000000000000 --- a/branches/sage/pgs/messages/MMDSMap.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MMDSMAP_H -#define __MMDSMAP_H - -#include "msg/Message.h" -#include "mds/MDSMap.h" - -class MMDSMap : public Message { - public: - /* - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - */ - - version_t epoch; - bufferlist encoded; - - version_t get_epoch() const { return epoch; } - bufferlist& get_encoded() { return encoded; } - - MMDSMap() : - Message(MSG_MDS_MAP) {} - MMDSMap(MDSMap *mm) : - Message(MSG_MDS_MAP) { - epoch = mm->get_epoch(); - mm->encode(encoded); - } - - char *get_type_name() { return "mdsmap"; } - void print(ostream& out) { - out << "mdsmap(e " << epoch << ")"; - } - - // marshalling - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(encoded, payload, off); - } - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(encoded, payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMDSResolve.h b/branches/sage/pgs/messages/MMDSResolve.h deleted file mode 100644 index 2103a0115081d..0000000000000 --- a/branches/sage/pgs/messages/MMDSResolve.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSRESOLVE_H -#define __MMDSRESOLVE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSResolve : public Message { - public: - map<dirfrag_t, list<dirfrag_t> > subtrees; - map<dirfrag_t, list<dirfrag_t> > ambiguous_imports; - list<metareqid_t> slave_requests; - - MMDSResolve() : Message(MSG_MDS_RESOLVE) {} - - char *get_type_name() { return "mds_resolve"; } - - void print(ostream& out) { - out << "mds_resolve(" << subtrees.size() - << "+" << ambiguous_imports.size() - << " subtrees +" << slave_requests.size() << " slave requests)"; - } - - void add_subtree(dirfrag_t im) { - subtrees[im].clear(); - } - void add_subtree_bound(dirfrag_t im, dirfrag_t ex) { - subtrees[im].push_back(ex); - } - - void add_ambiguous_import(dirfrag_t im, const list<dirfrag_t>& m) { - ambiguous_imports[im] = m; - } - - void add_slave_request(metareqid_t reqid) { - slave_requests.push_back(reqid); - } - - void encode_payload() { - ::_encode(subtrees, payload); - ::_encode(ambiguous_imports, payload); - ::_encode(slave_requests, payload); - } - void decode_payload() { - int off = 0; - ::_decode(subtrees, payload, off); - ::_decode(ambiguous_imports, payload, off); - ::_decode(slave_requests, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMDSResolveAck.h b/branches/sage/pgs/messages/MMDSResolveAck.h deleted file mode 100644 index 1870e226b4161..0000000000000 --- a/branches/sage/pgs/messages/MMDSResolveAck.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSRESOLVEACK_H -#define __MMDSRESOLVEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - - -class MMDSResolveAck : public Message { - public: - list<metareqid_t> commit; - list<metareqid_t> abort; - - MMDSResolveAck() : Message(MSG_MDS_RESOLVEACK) {} - - char *get_type_name() { return "resolve_ack"; } - /*void print(ostream& out) { - out << "resolve_ack.size() - << "+" << ambiguous_imap.size() - << " imports +" << slave_requests.size() << " slave requests)"; - } - */ - - void add_commit(metareqid_t r) { - commit.push_back(r); - } - void add_abort(metareqid_t r) { - abort.push_back(r); - } - - void encode_payload() { - ::_encode(commit, payload); - ::_encode(abort, payload); - } - void decode_payload() { - int off = 0; - ::_decode(commit, payload, off); - ::_decode(abort, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMDSSlaveRequest.h b/branches/sage/pgs/messages/MMDSSlaveRequest.h deleted file mode 100644 index e2dbbd8f7298a..0000000000000 --- a/branches/sage/pgs/messages/MMDSSlaveRequest.h +++ /dev/null @@ -1,150 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING.
- * - */ - - -#ifndef __MMDSSLAVEREQUEST_H -#define __MMDSSLAVEREQUEST_H - -#include "msg/Message.h" -#include "mds/mdstypes.h" -#include "include/encodable.h" - -class MMDSSlaveRequest : public Message { - public: - static const int OP_XLOCK = 1; - static const int OP_XLOCKACK = -1; - static const int OP_UNXLOCK = 2; - static const int OP_AUTHPIN = 3; - static const int OP_AUTHPINACK = -3; - - static const int OP_LINKPREP = 4; - static const int OP_UNLINKPREP = 5; - static const int OP_LINKPREPACK = -4; - - static const int OP_RENAMEPREP = 7; - static const int OP_RENAMEPREPACK = -7; - - static const int OP_RENAMEGETINODE = 8; - static const int OP_RENAMEGETINODEACK = -8; - - static const int OP_FINISH = 17; - - static const int OP_ABORT = 20; // used for recovery only - //static const int OP_COMMIT = 21; // used for recovery only - - - const static char *get_opname(int o) { - switch (o) { - case OP_XLOCK: return "xlock"; - case OP_XLOCKACK: return "xlock_ack"; - case OP_UNXLOCK: return "unxlock"; - case OP_AUTHPIN: return "authpin"; - case OP_AUTHPINACK: return "authpin_ack"; - - case OP_LINKPREP: return "link_prep"; - case OP_LINKPREPACK: return "link_prep_ack"; - case OP_UNLINKPREP: return "unlink_prep"; - - case OP_RENAMEPREP: return "rename_prep"; - case OP_RENAMEPREPACK: return "rename_prep_ack"; - case OP_RENAMEGETINODE: return "rename_get_inode"; - case OP_RENAMEGETINODEACK: return "rename_get_inode_ack"; - - case OP_FINISH: return "finish"; // commit - case OP_ABORT: return "abort"; - //case OP_COMMIT: return "commit"; - - default: assert(0); return 0; - } - } - - private: - metareqid_t reqid; - char op; - - // for locking - char lock_type; // lock object type - MDSCacheObjectInfo object_info; - - // for authpins - list authpins; - - public: - // for rename prep - string srcdnpath; - string destdnpath; - set srcdn_replicas; - bufferlist inode_export; - version_t inode_export_v; - utime_t now; - - bufferlist stray; // stray dir + dentry - -public: - metareqid_t get_reqid() { return reqid; } - int get_op() { return op; } - bool is_reply() { return op < 0; } - - int get_lock_type() { return lock_type; } - MDSCacheObjectInfo &get_object_info() { return object_info; } - - list& get_authpins() { return authpins; } - - void set_lock_type(int t) { lock_type = t; } - - // ---- - MMDSSlaveRequest() : Message(MSG_MDS_SLAVE_REQUEST) { } - MMDSSlaveRequest(metareqid_t ri, int o) : - Message(MSG_MDS_SLAVE_REQUEST), - reqid(ri), op(o) { } - void encode_payload() { - ::_encode(reqid, payload); - ::_encode(op, payload); - ::_encode(lock_type, payload); - object_info._encode(payload); - ::_encode_complex(authpins, payload); - ::_encode(srcdnpath, payload); - ::_encode(destdnpath, payload); - ::_encode(srcdn_replicas, payload); - ::_encode(now, payload); - ::_encode(inode_export, payload); - ::_encode(inode_export_v, payload); - ::_encode(stray, payload); - } - void decode_payload() { - int off = 0; - ::_decode(reqid, payload, off); - ::_decode(op, payload, off); - ::_decode(lock_type, payload, off); - object_info._decode(payload, off); - ::_decode_complex(authpins, payload, off); - ::_decode(srcdnpath, payload, off); - ::_decode(destdnpath, payload, off); - ::_decode(srcdn_replicas, payload, off); - ::_decode(now, payload, off); - ::_decode(inode_export, payload, off); - ::_decode(inode_export_v, payload, off); - ::_decode(stray, payload, off); - } - - char *get_type_name() { return "slave_request"; } - void print(ostream& out) { - out << "slave_request(" << reqid - << " " << get_opname(op) - << 
")"; - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonCommand.h b/branches/sage/pgs/messages/MMonCommand.h deleted file mode 100644 index 19d25dd7a4d77..0000000000000 --- a/branches/sage/pgs/messages/MMonCommand.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONCOMMAND_H -#define __MMONCOMMAND_H - -#include "msg/Message.h" - -#include -using std::vector; - -class MMonCommand : public Message { - public: - entity_inst_t inst; - vector cmd; - - MMonCommand() : Message(MSG_MON_COMMAND) {} - MMonCommand(entity_inst_t i) : - Message(MSG_MON_COMMAND), - inst(i) { } - - virtual char *get_type_name() { return "mon_command"; } - void print(ostream& o) { - o << "mon_command("; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONCOMMANDACK_H -#define __MMONCOMMANDACK_H - -#include "msg/Message.h" - -class MMonCommandAck : public Message { - public: - int r; - string rs; - - MMonCommandAck() : Message(MSG_MON_COMMAND_ACK) {} - MMonCommandAck(int _r, string s) : Message(MSG_MON_COMMAND_ACK), - r(_r), rs(s) { } - - virtual char *get_type_name() { return "mon_command"; } - void print(ostream& o) { - o << "mon_command_ack(" << r << " " << rs << ")"; - } - - void encode_payload() { - payload.append((char*)&r, sizeof(r)); - ::_encode(rs, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(r), (char*)&r); - off += sizeof(r); - ::_decode(rs, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonElection.h b/branches/sage/pgs/messages/MMonElection.h deleted file mode 100644 index 14a29af9140f9..0000000000000 --- a/branches/sage/pgs/messages/MMonElection.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTION_H -#define __MMONELECTION_H - -#include "msg/Message.h" - - -class MMonElection : public Message { -public: - static const int OP_PROPOSE = 1; - static const int OP_ACK = 2; - static const int OP_NAK = 3; - static const int OP_VICTORY = 4; - static const char *get_opname(int o) { - switch (o) { - case OP_PROPOSE: return "propose"; - case OP_ACK: return "ack"; - case OP_NAK: return "nak"; - case OP_VICTORY: return "victory"; - default: assert(0); return 0; - } - } - - int32_t op; - epoch_t epoch; - - MMonElection() : Message(MSG_MON_ELECTION) {} - MMonElection(int o, epoch_t e) : - Message(MSG_MON_ELECTION), - op(o), epoch(e) {} - - char *get_type_name() { return "election"; } - void print(ostream& out) { - out << "election(" << get_opname(op) << " " << epoch << ")"; - } - - void encode_payload() { - ::_encode(op, payload); - ::_encode(epoch, payload); - } - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(epoch, payload, off); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonElectionCollect.h b/branches/sage/pgs/messages/MMonElectionCollect.h deleted file mode 100644 index f9f0c12d1ac2e..0000000000000 --- a/branches/sage/pgs/messages/MMonElectionCollect.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MMONELECTIONCOLLECT_H -#define __MMONELECTIONCOLLECT_H - -#include "msg/Message.h" - - -class MMonElectionCollect : public Message { - public: - int read_num; - - MMonElectionCollect() {} - MMonElectionCollect(int n) : - Message(MSG_MON_ELECTION_COLLECT), - read_num(n) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - } - void encode_payload() { - payload.append((char*)&read_num, sizeof(read_num)); - } - - virtual char *get_type_name() { return "MonElCollect"; } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonElectionRefresh.h b/branches/sage/pgs/messages/MMonElectionRefresh.h deleted file mode 100644 index bc0337b8720dc..0000000000000 --- a/branches/sage/pgs/messages/MMonElectionRefresh.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONREFRESH_H -#define __MMONELECTIONREFRESH_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionRefresh : public Message { - public: - int p; - Elector::State state; - int refresh_num; - - MMonElectionRefresh() {} - MMonElectionRefresh(int _p, Elector::State& s, int r) : - Message(MSG_MON_ELECTION_REFRESH), - p(_p), state(s), refresh_num(r) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(p), (char*)&p); - off += sizeof(p); - payload.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); - off += sizeof(refresh_num); - } - void encode_payload() { - payload.append((char*)&p, sizeof(p)); - payload.append((char*)&state, sizeof(state)); - payload.append((char*)&refresh_num, sizeof(refresh_num)); - } - - virtual char *get_type_name() { return "MonElRefresh"; } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonElectionStatus.h b/branches/sage/pgs/messages/MMonElectionStatus.h deleted file mode 100644 index f91e42d64b184..0000000000000 --- a/branches/sage/pgs/messages/MMonElectionStatus.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MMONELECTIONSTATUS_H -#define __MMONELECTIONSTATUS_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionStatus : public Message { - public: - int q; - int read_num; - map registry; - - MMonElectionStatus() {} - MMonElectionStatus(int _q, int r, map reg) : - Message(MSG_MON_ELECTION_STATUS), - q(_q), read_num(r), registry(reg) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(q), (char*)&q); - off += sizeof(q); - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - ::_decode(registry, payload, off); - } - void encode_payload() { - payload.append((char*)&q, sizeof(q)); - payload.append((char*)&read_num, sizeof(read_num)); - ::_encode(registry, payload); - } - - virtual char *get_type_name() { return "MonElStatus"; } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonOSDMapInfo.h b/branches/sage/pgs/messages/MMonOSDMapInfo.h deleted file mode 100644 index 329c05e657d46..0000000000000 --- a/branches/sage/pgs/messages/MMonOSDMapInfo.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPINFO_H -#define __MMONOSDMAPINFO_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapInfo : public Message { - public: - epoch_t epoch; - epoch_t mon_epoch; - - epoch_t get_epoch() { return epoch; } - epoch_t get_mon_epoch() { return mon_epoch; } - - MMonOSDMapInfo(epoch_t e, epoch_t me) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), mon_epoch(me) { - } - - char *get_type_name() { return "omap_info"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&mon_epoch, sizeof(mon_epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonOSDMapLease.h b/branches/sage/pgs/messages/MMonOSDMapLease.h deleted file mode 100644 index 3f4ed8ea4db85..0000000000000 --- a/branches/sage/pgs/messages/MMonOSDMapLease.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONOSDMAPLEASE_H -#define __MMONOSDMAPLEASE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLease : public Message { - epoch_t epoch; - utime_t lease_expire; - - public: - epoch_t get_epoch() { return epoch; } - const utime_t& get_lease_expire() { return lease_expire; } - - MMonOSDMapLease(epoch_t e, utime_t le) : - Message(MSG_MON_OSDMAP_LEASE), - epoch(e), lease_expire(le) { - } - - char *get_type_name() { return "omap_lease"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&lease_expire, sizeof(lease_expire)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(lease_expire), (char*)&lease_expire); - off += sizeof(lease_expire); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonOSDMapLeaseAck.h b/branches/sage/pgs/messages/MMonOSDMapLeaseAck.h deleted file mode 100644 index 449a0ac61a84f..0000000000000 --- a/branches/sage/pgs/messages/MMonOSDMapLeaseAck.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASEACK_H -#define __MMONOSDMAPLEASEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLeaseAck : public Message { - epoch_t epoch; - -public: - epoch_t get_epoch() { return epoch; } - - MMonOSDMapLeaseAck(epoch_t e) : - Message(MSG_MON_OSDMAP_LEASE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_lease_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonOSDMapUpdateAck.h b/branches/sage/pgs/messages/MMonOSDMapUpdateAck.h deleted file mode 100644 index 9655548dfcb00..0000000000000 --- a/branches/sage/pgs/messages/MMonOSDMapUpdateAck.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONOSDMAPUPDATEACK_H -#define __MMONOSDMAPUPDATEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateAck : public Message { -public: - epoch_t epoch; - - MMonOSDMapUpdateAck(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_update_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonOSDMapUpdateCommit.h b/branches/sage/pgs/messages/MMonOSDMapUpdateCommit.h deleted file mode 100644 index 8aa6929c2ed9a..0000000000000 --- a/branches/sage/pgs/messages/MMonOSDMapUpdateCommit.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATECOMMIT_H -#define __MMONOSDMAPUPDATECOMMIT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateCommit : public Message { - public: - epoch_t epoch; - - MMonOSDMapUpdateCommit(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_COMMIT), - epoch(e) { - } - - char *get_type_name() { return "omap_update_commit"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonOSDMapUpdatePrepare.h b/branches/sage/pgs/messages/MMonOSDMapUpdatePrepare.h deleted file mode 100644 index 8e908e2ed0664..0000000000000 --- a/branches/sage/pgs/messages/MMonOSDMapUpdatePrepare.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONOSDMAPUPDATEPREPARE_H -#define __MMONOSDMAPUPDATEPREPARE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdatePrepare : public Message { - public: - epoch_t epoch; - bufferlist map_bl; - bufferlist inc_map_bl; - - epoch_t get_epoch() { return epoch; } - - MMonOSDMapUpdatePrepare(epoch_t e, - bufferlist& mbl, bufferlist& incmbl) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), - map_bl(mbl), inc_map_bl(incmbl) { - } - - char *get_type_name() { return "omap_update_prepare"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(map_bl, payload); - ::_encode(inc_map_bl, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(map_bl, payload, off); - ::_decode(inc_map_bl, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MMonPaxos.h b/branches/sage/pgs/messages/MMonPaxos.h deleted file mode 100644 index 7210b179c9a42..0000000000000 --- a/branches/sage/pgs/messages/MMonPaxos.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONPAXOS_H -#define __MMONPAXOS_H - -#include "msg/Message.h" -#include "mon/mon_types.h" - -class MMonPaxos : public Message { - public: - // op types - const static int OP_COLLECT = 1; // proposer: propose round - const static int OP_LAST = 2; // voter: accept proposed round - const static int OP_BEGIN = 3; // proposer: value proposed for this round - const static int OP_ACCEPT = 4; // voter: accept propsed value - const static int OP_COMMIT = 5; // proposer: notify learners of agreed value - const static int OP_LEASE = 6; // leader: extend peon lease - const static int OP_LEASE_ACK = 7; // peon: lease ack - const static char *get_opname(int op) { - switch (op) { - case OP_COLLECT: return "collect"; - case OP_LAST: return "last"; - case OP_BEGIN: return "begin"; - case OP_ACCEPT: return "accept"; - case OP_COMMIT: return "commit"; - case OP_LEASE: return "lease"; - case OP_LEASE_ACK: return "lease_ack"; - default: assert(0); return 0; - } - } - - epoch_t epoch; // monitor epoch - int op; // paxos op - int machine_id; // which state machine? - - version_t last_committed; // i've committed to - version_t pn_from; // i promise to accept after - version_t pn; // with with proposal - version_t uncommitted_pn; // previous pn, if we are a LAST with an uncommitted value - utime_t lease_expire; - - map values; - - MMonPaxos() : Message(MSG_MON_PAXOS) {} - MMonPaxos(epoch_t e, int o, int mid) : - Message(MSG_MON_PAXOS), - epoch(e), - op(o), machine_id(mid), - last_committed(0), pn_from(0), pn(0), uncommitted_pn(0) { } - - virtual char *get_type_name() { return "paxos"; } - - void print(ostream& out) { - out << "paxos(" << get_paxos_name(machine_id) - << " " << get_opname(op) << " lc " << last_committed - << " pn " << pn << " opn " << uncommitted_pn - << ")"; - } - - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(op, payload); - ::_encode(machine_id, payload); - ::_encode(last_committed, payload); - ::_encode(pn_from, payload); - ::_encode(pn, payload); - ::_encode(uncommitted_pn, payload); - ::_encode(lease_expire, payload); - ::_encode(values, payload); - } - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(op, payload, off); - ::_decode(machine_id, payload, off); - ::_decode(last_committed, payload, off); - ::_decode(pn_from, payload, off); - ::_decode(pn, payload, off); - ::_decode(uncommitted_pn, payload, off); - ::_decode(lease_expire, payload, off); - ::_decode(values, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDBoot.h b/branches/sage/pgs/messages/MOSDBoot.h deleted file mode 100644 index 00c94ad1a2a80..0000000000000 --- a/branches/sage/pgs/messages/MOSDBoot.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MOSDBOOT_H -#define __MOSDBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "osd/osd_types.h" - -class MOSDBoot : public Message { - public: - entity_inst_t inst; - OSDSuperblock sb; - - MOSDBoot() {} - MOSDBoot(entity_inst_t i, OSDSuperblock& s) : - Message(MSG_OSD_BOOT), - inst(i), - sb(s) { - } - - char *get_type_name() { return "osd_boot"; } - void print(ostream& out) { - out << "osd_boot(" << inst << ")"; - } - - void encode_payload() { - ::_encode(inst, payload); - ::_encode(sb, payload); - } - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - ::_decode(sb, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDFailure.h b/branches/sage/pgs/messages/MOSDFailure.h deleted file mode 100644 index 965d622a5f5e2..0000000000000 --- a/branches/sage/pgs/messages/MOSDFailure.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDFAILURE_H -#define __MOSDFAILURE_H - -#include "msg/Message.h" - - -class MOSDFailure : public Message { - public: - entity_inst_t from; - entity_inst_t failed; - epoch_t epoch; - - MOSDFailure() {} - MOSDFailure(entity_inst_t fr, entity_inst_t f, epoch_t e) : - Message(MSG_OSD_FAILURE), - from(fr), failed(f), epoch(e) {} - - entity_inst_t get_from() { return from; } - entity_inst_t get_failed() { return failed; } - epoch_t get_epoch() { return epoch; } - - void decode_payload() { - int off = 0; - ::_decode(from, payload, off); - ::_decode(failed, payload, off); - ::_decode(epoch, payload, off); - } - void encode_payload() { - ::_encode(from, payload); - ::_encode(failed, payload); - ::_encode(epoch, payload); - } - - virtual char *get_type_name() { return "osd_failure"; } - void print(ostream& out) { - out << "osd_failure(" << failed << " e" << epoch << ")"; - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDGetMap.h b/branches/sage/pgs/messages/MOSDGetMap.h deleted file mode 100644 index 68e1b7d137dae..0000000000000 --- a/branches/sage/pgs/messages/MOSDGetMap.h +++ /dev/null @@ -1,48 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MOSDGETMAP_H -#define __MOSDGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MOSDGetMap : public Message { - public: - epoch_t since; - - MOSDGetMap(epoch_t s=0) : - Message(MSG_OSD_GETMAP), - since(s) { - } - - epoch_t get_since() { return since; } - - char *get_type_name() { return "get_osd_map"; } - void print(ostream& out) { - out << "get_osd_map(since " << since << ")"; - } - - void encode_payload() { - payload.append((char*)&since, sizeof(since)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(since), (char*)&since); - off += sizeof(since); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDIn.h b/branches/sage/pgs/messages/MOSDIn.h deleted file mode 100644 index 8f8cb4b7877ae..0000000000000 --- a/branches/sage/pgs/messages/MOSDIn.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MOSDIN_H -#define __MOSDIN_H - -#include "msg/Message.h" - - -class MOSDIn : public Message { - public: - epoch_t map_epoch; - - MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) { - } - MOSDIn() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oin"; } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDMap.h b/branches/sage/pgs/messages/MOSDMap.h deleted file mode 100644 index b6de1b027557c..0000000000000 --- a/branches/sage/pgs/messages/MOSDMap.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDGETMAPACK_H -#define __MOSDGETMAPACK_H - -#include "msg/Message.h" -#include "osd/OSDMap.h" - - -class MOSDMap : public Message { - public: - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - - - MOSDMap() : Message(MSG_OSD_MAP) { } - MOSDMap(OSDMap *oc) : Message(MSG_OSD_MAP) { - oc->encode(maps[oc->get_epoch()]); - } - - - // marshalling - virtual void decode_payload() { - int off = 0; - ::_decode(maps, payload, off); - ::_decode(incremental_maps, payload, off); - } - virtual void encode_payload() { - ::_encode(maps, payload); - ::_encode(incremental_maps, payload); - } - - virtual char *get_type_name() { return "omap"; } - void print(ostream& out) { - out << "osdmap(" << get_first() << "," << get_last() << ")"; - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDOp.h b/branches/sage/pgs/messages/MOSDOp.h deleted file mode 100644 index 96b389b119a7d..0000000000000 --- a/branches/sage/pgs/messages/MOSDOp.h +++ /dev/null @@ -1,252 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOP_H -#define __MOSDOP_H - -#include "msg/Message.h" -#include "osd/osd_types.h" - -/* - * OSD op - * - * oid - object id - * op - OSD_OP_DELETE, etc. 
- * - */ - -// osd client ops -#define OSD_OP_READ 1 -#define OSD_OP_STAT 2 - -#define OSD_OP_REPLICATE 3 -#define OSD_OP_UNREPLICATE 4 - -#define OSD_OP_WRNOOP 10 -#define OSD_OP_WRITE 11 -#define OSD_OP_DELETE 12 -#define OSD_OP_TRUNCATE 13 -#define OSD_OP_ZERO 14 - -#define OSD_OP_WRLOCK 20 -#define OSD_OP_WRUNLOCK 21 -#define OSD_OP_RDLOCK 22 -#define OSD_OP_RDUNLOCK 23 -#define OSD_OP_UPLOCK 24 -#define OSD_OP_DNLOCK 25 -#define OSD_OP_MININCLOCK 26 // minimum incarnation lock - -#define OSD_OP_PULL 30 -#define OSD_OP_PUSH 31 - -#define OSD_OP_BALANCEREADS 101 -#define OSD_OP_UNBALANCEREADS 102 - - - -class MOSDOp : public Message { -public: - static const char* get_opname(int op) { - switch (op) { - case OSD_OP_READ: return "read"; - case OSD_OP_STAT: return "stat"; - - case OSD_OP_WRNOOP: return "wrnoop"; - case OSD_OP_WRITE: return "write"; - case OSD_OP_ZERO: return "zero"; - case OSD_OP_DELETE: return "delete"; - case OSD_OP_TRUNCATE: return "truncate"; - case OSD_OP_WRLOCK: return "wrlock"; - case OSD_OP_WRUNLOCK: return "wrunlock"; - case OSD_OP_RDLOCK: return "rdlock"; - case OSD_OP_RDUNLOCK: return "rdunlock"; - case OSD_OP_UPLOCK: return "uplock"; - case OSD_OP_DNLOCK: return "dnlock"; - - case OSD_OP_MININCLOCK: return "mininclock"; - - case OSD_OP_BALANCEREADS: return "balance-reads"; - case OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - - case OSD_OP_PULL: return "pull"; - case OSD_OP_PUSH: return "push"; - default: assert(0); - } - return 0; - } - -private: - struct { - // who's asking? - entity_inst_t client; - osdreqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. - - // for replication - tid_t rep_tid; - - object_t oid; - objectrev_t rev; - ObjectLayout layout; - - epoch_t map_epoch; - - eversion_t pg_trim_to; // primary->replica: trim to here - - int op; - size_t length; - off_t offset; - - eversion_t version; - eversion_t old_version; - - bool want_ack; - bool want_commit; - bool retry_attempt; - } st; - - bufferlist data; - map attrset; - double request_received_time; - - - friend class MOSDOpReply; - - public: - const osdreqid_t& get_reqid() { return st.reqid; } - const tid_t get_client_tid() { return st.reqid.tid; } - int get_client_inc() { return st.reqid.inc; } - - const entity_name_t& get_client() { return st.client.name; } - const entity_inst_t& get_client_inst() { return st.client; } - void set_client_inst(const entity_inst_t& i) { st.client = i; } - - bool wants_reply() { - if (st.op < 100) return true; - return false; // no reply needed for primary-lock, -unlock. - } - - const tid_t get_rep_tid() { return st.rep_tid; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - bool get_retry_attempt() const { return st.retry_attempt; } - void set_retry_attempt(bool a) { st.retry_attempt = a; } - - const object_t get_oid() { return st.oid; } - const pg_t get_pg() { return st.layout.pgid; } - const ObjectLayout& get_layout() { return st.layout; } - const epoch_t get_map_epoch() { return st.map_epoch; } - - //const int get_pg_role() { return st.pg_role; } // who am i asking for? 
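  // Note on the wire format (see encode_payload/decode_payload below): the
  // fixed-size 'st' struct is appended to the payload byte-for-byte and
  // copied straight back out on decode, so sender and receiver must agree
  // on struct layout and endianness; only the variable-size attrset and
  // data are encoded with ::_encode/::_decode.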
- const eversion_t get_version() { return st.version; } - //const eversion_t get_old_version() { return st.old_version; } - - void set_rev(objectrev_t r) { st.rev = r; } - objectrev_t get_rev() { return st.rev; } - - const eversion_t get_pg_trim_to() { return st.pg_trim_to; } - void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; } - - const int get_op() { return st.op; } - void set_op(int o) { st.op = o; } - bool is_read() { - return st.op < 10; - } - - const size_t get_length() { return st.length; } - const off_t get_offset() { return st.offset; } - - map& get_attrset() { return attrset; } - void set_attrset(map &as) { attrset = as; } - - const bool wants_ack() { return st.want_ack; } - const bool wants_commit() { return st.want_commit; } - - void set_received_time(double time) { - request_received_time = time; - } - double get_received_time() { - return request_received_time; - } - - - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - size_t get_data_len() { return data.length(); } - - - MOSDOp(entity_inst_t asker, int inc, long tid, - object_t oid, ObjectLayout ol, epoch_t mapepoch, int op) : - Message(MSG_OSD_OP) { - memset(&st, 0, sizeof(st)); - this->st.client = asker; - this->st.reqid.name = asker.name; - this->st.reqid.inc = inc; - this->st.reqid.tid = tid; - - this->st.oid = oid; - this->st.layout = ol; - this->st.map_epoch = mapepoch; - this->st.op = op; - - this->st.rep_tid = 0; - - this->st.want_ack = true; - this->st.want_commit = true; - } - MOSDOp() {} - - //void set_pg_role(int r) { st.pg_role = r; } - //void set_rg_nrep(int n) { st.rg_nrep = n; } - - void set_layout(const ObjectLayout& l) { st.layout = l; } - - void set_length(size_t l) { st.length = l; } - void set_offset(off_t o) { st.offset = o; } - void set_version(eversion_t v) { st.version = v; } - void set_old_version(eversion_t ov) { st.old_version = ov; } - - void set_want_ack(bool b) { st.want_ack = b; } - void set_want_commit(bool b) { st.want_commit = b; } - - // marshalling - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "osd_op"; } - void print(ostream& out) { - out << "osd_op(" << st.reqid - << " " << get_opname(st.op) - << " " << st.oid; - if (st.retry_attempt) out << " RETRY"; - out << ")"; - } -}; - - -#endif diff --git a/branches/sage/pgs/messages/MOSDOpReply.h b/branches/sage/pgs/messages/MOSDOpReply.h deleted file mode 100644 index e81f14d4558b1..0000000000000 --- a/branches/sage/pgs/messages/MOSDOpReply.h +++ /dev/null @@ -1,153 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOPREPLY_H -#define __MOSDOPREPLY_H - -#include "msg/Message.h" - -#include "MOSDOp.h" -#include "osd/ObjectStore.h" - -/* - * OSD op reply - * - * oid - object id - * op - OSD_OP_DELETE, etc. 
- * - */ - -class MOSDOpReply : public Message { - struct { - // req - osdreqid_t reqid; - - tid_t rep_tid; - - object_t oid; - ObjectLayout layout; // pgid, etc. - - int op; - - // reply - int result; - bool commit; - size_t length, offset; - size_t object_size; - eversion_t version; - - eversion_t pg_complete_thru; - - epoch_t map_epoch; - } st; - - bufferlist data; - map attrset; - - public: - const osdreqid_t& get_reqid() { return st.reqid; } - long get_tid() { return st.reqid.tid; } - long get_rep_tid() { return st.rep_tid; } - object_t get_oid() { return st.oid; } - pg_t get_pg() { return st.layout.pgid; } - int get_op() { return st.op; } - bool get_commit() { return st.commit; } - - int get_result() { return st.result; } - size_t get_length() { return st.length; } - size_t get_offset() { return st.offset; } - size_t get_object_size() { return st.object_size; } - eversion_t get_version() { return st.version; } - map& get_attrset() { return attrset; } - - eversion_t get_pg_complete_thru() { return st.pg_complete_thru; } - void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; } - - void set_result(int r) { st.result = r; } - void set_length(size_t s) { st.length = s; } - void set_offset(size_t o) { st.offset = o; } - void set_object_size(size_t s) { st.object_size = s; } - void set_version(eversion_t v) { st.version = v; } - void set_attrset(map &as) { attrset = as; } - - void set_op(int op) { st.op = op; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - // data payload - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - - // osdmap - epoch_t get_map_epoch() { return st.map_epoch; } - - -public: - MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : - Message(MSG_OSD_OPREPLY) { - memset(&st, 0, sizeof(st)); - this->st.reqid = req->st.reqid; - this->st.op = req->st.op; - this->st.rep_tid = req->st.rep_tid; - - this->st.oid = req->st.oid; - this->st.layout = req->st.layout; - this->st.result = result; - this->st.commit = commit; - - this->st.length = req->st.length; // speculative... OSD should ensure these are correct - this->st.offset = req->st.offset; - this->st.version = req->st.version; - - this->st.map_epoch = e; - } - MOSDOpReply() {} - - - // marshalling - virtual void decode_payload() { - payload.copy(0, sizeof(st), (char*)&st); - payload.splice(0, sizeof(st)); - int off = 0; - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "osd_op_reply"; } - - void print(ostream& out) { - out << "osd_op_reply(" << st.reqid - << " " << MOSDOp::get_opname(st.op) - << " " << st.oid; - if (st.commit) - out << " commit"; - else - out << " ack"; - out << " = " << st.result; - out << ")"; - } - -}; - - -#endif diff --git a/branches/sage/pgs/messages/MOSDOut.h b/branches/sage/pgs/messages/MOSDOut.h deleted file mode 100644 index 798356f663f9e..0000000000000 --- a/branches/sage/pgs/messages/MOSDOut.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - - -#ifndef __MOSDOUT_H -#define __MOSDOUT_H - -#include "msg/Message.h" - - -class MOSDOut : public Message { - public: - epoch_t map_epoch; - - MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) { - } - MOSDOut() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oout"; } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPGLog.h b/branches/sage/pgs/messages/MOSDPGLog.h deleted file mode 100644 index b7ed19dd64d4b..0000000000000 --- a/branches/sage/pgs/messages/MOSDPGLog.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGLOG_H -#define __MOSDPGLOG_H - -#include "msg/Message.h" - -class MOSDPGLog : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::Info info; - PG::Log log; - PG::Missing missing; - - epoch_t get_epoch() { return epoch; } - pg_t get_pgid() { return pgid; } - - MOSDPGLog() {} - MOSDPGLog(version_t mv, pg_t pgid) : - Message(MSG_OSD_PG_LOG) { - this->epoch = mv; - this->pgid = pgid; - } - - char *get_type_name() { return "PGlog"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - log._encode(payload); - missing._encode(payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - log._decode(payload, off); - missing._decode(payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPGNotify.h b/branches/sage/pgs/messages/MOSDPGNotify.h deleted file mode 100644 index 76a984276b66b..0000000000000 --- a/branches/sage/pgs/messages/MOSDPGNotify.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPGPEERNOTIFY_H -#define __MOSDPGPEERNOTIFY_H - -#include "msg/Message.h" - -#include "osd/PG.h" - -/* - * PGNotify - notify primary of my PGs and versions. 
- */ - -class MOSDPGNotify : public Message { - epoch_t epoch; - list pg_list; // pgid -> version - - public: - version_t get_epoch() { return epoch; } - list& get_pg_list() { return pg_list; } - - MOSDPGNotify() {} - MOSDPGNotify(epoch_t e, list& l) : - Message(MSG_OSD_PG_NOTIFY) { - this->epoch = e; - pg_list.splice(pg_list.begin(),l); - } - - char *get_type_name() { return "PGnot"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPGPeer.h b/branches/sage/pgs/messages/MOSDPGPeer.h deleted file mode 100644 index dd3164cdc1124..0000000000000 --- a/branches/sage/pgs/messages/MOSDPGPeer.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGPEER_H -#define __MOSDPGPEER_H - -#include "msg/Message.h" - - -class MOSDPGPeer : public Message { - uint64_t map_version; - list pg_list; - - bool complete; - - public: - uint64_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - bool get_complete() { return complete; } - - MOSDPGPeer() {} - MOSDPGPeer(uint64_t v, list& l, bool c=false) : - Message(MSG_OSD_PG_PEER) { - this->map_version = v; - this->complete = c; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&complete, sizeof(complete)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPGPeerAck.h b/branches/sage/pgs/messages/MOSDPGPeerAck.h deleted file mode 100644 index dc4fac1a9436b..0000000000000 --- a/branches/sage/pgs/messages/MOSDPGPeerAck.h +++ /dev/null @@ -1,70 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGPEERACK_H -#define __MOSDPGPEERACK_H - -#include "msg/Message.h" -#include "osd/OSD.h" - -class MOSDPGPeerAck : public Message { - version_t map_version; - - public: - list pg_dne; // pg dne - map pg_state; // state, lists, etc. 
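  // Unlike pg_dne, pg_state has no generic _encode helper here:
  // encode_payload() below writes an explicit entry count and then each
  // key and its per-PG state by hand, and decode_payload() reads them
  // back in the same order.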
- - version_t get_version() { return map_version; } - - MOSDPGPeerAck() {} - MOSDPGPeerAck(version_t v) : - Message(MSG_OSD_PG_PEERACK) { - this->map_version = v; - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_dne, payload); - - int n = pg_state.size(); - payload.append((char*)&n, sizeof(n)); - for (map::iterator it = pg_state.begin(); - it != pg_state.end(); - it++) { - payload.append((char*)&it->first, sizeof(it->first)); - it->second._encode(payload); - } - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_dne, payload, off); - - int n; - payload.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPEERREQUEST_H -#define __MOSDPEERREQUEST_H - -#include "msg/Message.h" - - -class MOSDPGPeerRequest : public Message { - version_t map_version; - list pg_list; - - public: - version_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - - MOSDPGPeerRequest() {} - MOSDPGPeerRequest(version_t v, list& l) : - Message(MSG_OSD_PG_PEERREQUEST) { - this->map_version = v; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPR"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPGQuery.h b/branches/sage/pgs/messages/MOSDPGQuery.h deleted file mode 100644 index 70dbfdbb96fd7..0000000000000 --- a/branches/sage/pgs/messages/MOSDPGQuery.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERY_H -#define __MOSDPGQUERY_H - -#include "msg/Message.h" - -/* - * PGQuery - query another OSD as to the contents of their PGs - */ - -class MOSDPGQuery : public Message { - version_t epoch; - - public: - version_t get_epoch() { return epoch; } - map pg_list; - - MOSDPGQuery() {} - MOSDPGQuery(epoch_t e, map& ls) : - Message(MSG_OSD_PG_QUERY), - epoch(e), pg_list(ls) { - } - - char *get_type_name() { return "PGq"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPGRemove.h b/branches/sage/pgs/messages/MOSDPGRemove.h deleted file mode 100644 index 17cb28a3c95a1..0000000000000 --- a/branches/sage/pgs/messages/MOSDPGRemove.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGREMOVE_H -#define __MOSDPGREMOVE_H - -#include "msg/Message.h" - - -class MOSDPGRemove : public Message { - epoch_t epoch; - - public: - set pg_list; - - epoch_t get_epoch() { return epoch; } - - MOSDPGRemove() {} - MOSDPGRemove(epoch_t e, set& l) : - Message(MSG_OSD_PG_REMOVE) { - this->epoch = e; - pg_list = l; - } - - char *get_type_name() { return "PGrm"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } - -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPGSummary.h b/branches/sage/pgs/messages/MOSDPGSummary.h deleted file mode 100644 index f41c6954b4c27..0000000000000 --- a/branches/sage/pgs/messages/MOSDPGSummary.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERYREPLY_H -#define __MOSDPGQUERYREPLY_H - -#include "msg/Message.h" - -class MOSDPGSummary : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::PGInfo info; - bufferlist sumbl; - - epoch_t get_epoch() { return epoch; } - - MOSDPGSummary() {} - MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) : - Message(MSG_OSD_PG_SUMMARY) { - this->epoch = mv; - this->pgid = pgid; - summary._encode(sumbl); - } - - pg_t get_pgid() { return pgid; } - bufferlist& get_summary_bl() { - return sumbl; - } - - char *get_type_name() { return "PGsum"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - payload.claim_append(sumbl); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - - payload.splice(0, off); - sumbl.claim(payload); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPGUpdate.h b/branches/sage/pgs/messages/MOSDPGUpdate.h deleted file mode 100644 index 20453b3e73e2f..0000000000000 --- a/branches/sage/pgs/messages/MOSDPGUpdate.h +++ /dev/null @@ -1,65 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGUPDATE_H -#define __MOSDPGUPDATE_H - -#include "msg/Message.h" - -class MOSDPGUpdate : public Message { - version_t map_version; - pg_t pgid; - //pginfo_t info; - bool complete; - version_t last_any_complete; - - public: - version_t get_version() { return map_version; } - pg_t get_pgid() { return pgid; } - //pginfo_t& get_pginfo() { return info; } - bool is_complete() { return complete; } - version_t get_last_any_complete() { return last_any_complete; } - - MOSDPGUpdate() {} - MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) : - Message(MSG_OSD_PG_UPDATE) { - this->map_version = mv; - this->pgid = pgid; - this->complete = complete; - this->last_any_complete = last_any_complete; - } - - char *get_type_name() { return "PGUp"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&complete, sizeof(complete)); - payload.append((char*)&last_any_complete, sizeof(last_any_complete)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - payload.copy(off, sizeof(last_any_complete), (char*)&last_any_complete); - off += sizeof(last_any_complete); - } -}; - -#endif diff --git a/branches/sage/pgs/messages/MOSDPing.h b/branches/sage/pgs/messages/MOSDPing.h deleted file mode 100644 index 739875479749d..0000000000000 --- a/branches/sage/pgs/messages/MOSDPing.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPING_H -#define __MOSDPING_H - -#include "common/Clock.h" - -#include "msg/Message.h" - - -class MOSDPing : public Message { - public: - epoch_t map_epoch; - bool ack; - float avg_qlen; - double read_mean_time; - - MOSDPing(epoch_t e, - float aq, - double _read_mean_time, - bool a=false) : Message(MSG_OSD_PING), map_epoch(e), ack(a), avg_qlen(aq), read_mean_time(_read_mean_time) { - } - MOSDPing() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - payload.copy(off, sizeof(ack), (char*)&ack); - off += sizeof(ack); - payload.copy(off, sizeof(avg_qlen), (char*)&avg_qlen); - off += sizeof(avg_qlen); - payload.copy(off, sizeof(read_mean_time), (char*)&read_mean_time); - off += sizeof(read_mean_time); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - payload.append((char*)&ack, sizeof(ack)); - payload.append((char*)&avg_qlen, sizeof(avg_qlen)); - payload.append((char*)&read_mean_time, sizeof(read_mean_time)); - } - - virtual char *get_type_name() { return "oping"; } -}; - -#endif diff --git a/branches/sage/pgs/messages/MPing.h b/branches/sage/pgs/messages/MPing.h deleted file mode 100644 index 6b569666ed377..0000000000000 --- a/branches/sage/pgs/messages/MPing.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MPING_H -#define __MPING_H - -#include "msg/Message.h" - - -class MPing : public Message { - public: - int seq; - MPing(int s) : Message(MSG_PING) { - seq = s; - } - MPing() : Message(MSG_PING) {} - - virtual void decode_payload() { - int off = 0; - payload.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload() { - payload.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "ping"; } -}; - -#endif diff --git a/branches/sage/pgs/messages/MPingAck.h b/branches/sage/pgs/messages/MPingAck.h deleted file mode 100644 index f8f32aee43ee0..0000000000000 --- a/branches/sage/pgs/messages/MPingAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MPINGACK_H -#define __MPINGACK_H - -#include "MPing.h" - - -class MPingAck : public Message { - public: - int seq; - MPingAck() {} - MPingAck(MPing *p) : Message(MSG_PING_ACK) { - this->seq = p->seq; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload() { - payload.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "pinga"; } -}; - -#endif diff --git a/branches/sage/pgs/mkmonmap.cc b/branches/sage/pgs/mkmonmap.cc deleted file mode 100644 index 0b0813cb3cfea..0000000000000 --- a/branches/sage/pgs/mkmonmap.cc +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - - - - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - MonMap monmap; - - char *outfn = ".ceph_monmap"; - - for (unsigned i=0; i= 0); - - return 0; -} diff --git a/branches/sage/pgs/mon/ClientMonitor.cc b/branches/sage/pgs/mon/ClientMonitor.cc deleted file mode 100644 index 175f70477f7a5..0000000000000 --- a/branches/sage/pgs/mon/ClientMonitor.cc +++ /dev/null @@ -1,237 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ClientMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" -#include "OSDMonitor.h" -#include "MonitorStore.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".client " - - - -bool ClientMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - dout(10) << "update_from_paxos paxosv " << paxosv - << ", my v " << client_map.version << endl; - - if (paxosv == client_map.version) return true; - assert(paxosv >= client_map.version); - - if (client_map.version == 0 && paxosv > 1 && - mon->store->exists_bl_ss("clientmap","latest")) { - // starting up: load latest - dout(7) << "update_from_paxos startup: loading latest full clientmap" << endl; - bufferlist bl; - mon->store->get_bl_ss(bl, "clientmap", "latest"); - int off = 0; - client_map._decode(bl, off); - } - - // walk through incrementals - while (paxosv > client_map.version) { - bufferlist bl; - bool success = paxos->read(client_map.version+1, bl); - if (success) { - dout(7) << "update_from_paxos applying incremental " << client_map.version+1 << endl; - Incremental inc; - int off = 0; - inc._decode(bl, off); - client_map.apply_incremental(inc); - - } else { - dout(7) << "update_from_paxos couldn't read incremental " << client_map.version+1 << endl; - return false; - } - } - - // save latest - bufferlist bl; - client_map._encode(bl); - mon->store->put_bl_ss(bl, "clientmap", "latest"); - - return true; -} - -void ClientMonitor::create_pending() -{ - assert(mon->is_leader()); - pending_inc = Incremental(); - pending_inc.version = client_map.version + 1; - pending_inc.next_client = client_map.next_client; - dout(10) << "create_pending v " << pending_inc.version - << ", next is " << pending_inc.next_client - << endl; -} - -void ClientMonitor::create_initial() -{ - dout(1) << "create_initial -- creating initial map" << endl; -} - - - -void ClientMonitor::encode_pending(bufferlist &bl) -{ - assert(mon->is_leader()); - dout(10) << "encode_pending v " << pending_inc.version - << ", next is " << pending_inc.next_client - << endl; - - assert(paxos->get_version() + 1 == pending_inc.version); - pending_inc._encode(bl); -} - - -// ------- - - -bool ClientMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << endl; - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - { - // already mounted? - entity_addr_t addr = m->get_source_addr(); - if (client_map.addr_client.count(addr)) { - int client = client_map.addr_client[addr]; - dout(7) << " client" << client << " already mounted" << endl; - _mounted(client, (MClientMount*)m); - return true; - } - } - return false; - - case MSG_CLIENT_UNMOUNT: - { - // already unmounted? 
- int client = m->get_source().num(); - if (client_map.client_addr.count(client) == 0) { - dout(7) << " client" << client << " not mounted" << endl; - _unmounted((MClientUnmount*)m); - return true; - } - } - return false; - - - default: - assert(0); - delete m; - return true; - } -} - -bool ClientMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << endl; - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - { - MClientMount *mount = (MClientMount*)m; - entity_addr_t addr = mount->addr; - int client = -1; - if (mount->get_source().is_client()) - client = mount->get_source().num(); - - // choose a client id - if (client < 0 || - (client_map.client_addr.count(client) && - client_map.client_addr[client] != addr)) { - client = pending_inc.next_client; - dout(10) << "mount: assigned client" << client << " to " << addr << endl; - } else { - dout(10) << "mount: client" << client << " requested by " << addr << endl; - } - - pending_inc.add_mount(client, addr); - paxos->wait_for_commit(new C_Mounted(this, client, mount)); - } - return true; - - case MSG_CLIENT_UNMOUNT: - { - MClientUnmount *unmount = (MClientUnmount*)m; - assert(unmount->inst.name.is_client()); - int client = unmount->inst.name.num(); - - assert(client_map.client_addr.count(client)); - - pending_inc.add_unmount(client); - paxos->wait_for_commit(new C_Unmounted(this, unmount)); - } - return true; - - default: - assert(0); - delete m; - return false; - } - -} - - -// MOUNT - - -void ClientMonitor::_mounted(int client, MClientMount *m) -{ - entity_inst_t to; - to.addr = m->addr; - to.name = MSG_ADDR_CLIENT(client); - - dout(10) << "_mounted client" << client << " at " << to << endl; - - // reply with latest mds, osd maps - mon->mdsmon->send_latest(to); - mon->osdmon->send_latest(to); - - delete m; -} - -void ClientMonitor::_unmounted(MClientUnmount *m) -{ - dout(10) << "_unmounted " << m->inst << endl; - - // reply with (same) unmount message - mon->messenger->send_message(m, m->inst); - - // auto-shutdown? - // (hack for fakesyn/newsyn, mostly) - if (mon->is_leader() && - client_map.version > 1 && - client_map.client_addr.empty() && - g_conf.mon_stop_on_last_unmount) { - dout(1) << "last client unmounted" << endl; - mon->do_stop(); - } -} - - diff --git a/branches/sage/pgs/mon/ClientMonitor.h b/branches/sage/pgs/mon/ClientMonitor.h deleted file mode 100644 index 8321202fc24f1..0000000000000 --- a/branches/sage/pgs/mon/ClientMonitor.h +++ /dev/null @@ -1,176 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __CLIENTMONITOR_H -#define __CLIENTMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -#include "PaxosService.h" - -class Monitor; -class Paxos; -class MClientMount; -class MClientUnmount; - -class ClientMonitor : public PaxosService { -public: - - struct Incremental { - version_t version; - uint32_t next_client; - map mount; - set unmount; - - Incremental() : version(0), next_client() {} - - bool is_empty() { return mount.empty() && unmount.empty(); } - void add_mount(uint32_t client, entity_addr_t addr) { - next_client = MAX(next_client, client+1); - mount[client] = addr; - } - void add_unmount(uint32_t client) { - assert(client < next_client); - if (mount.count(client)) - mount.erase(client); - else - unmount.insert(client); - } - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(next_client, bl); - ::_encode(mount, bl); - ::_encode(unmount, bl); - } - void _decode(bufferlist &bl, int& off) { - ::_decode(version, bl, off); - ::_decode(next_client, bl, off); - ::_decode(mount, bl, off); - ::_decode(unmount, bl, off); - } - }; - - struct Map { - version_t version; - uint32_t next_client; - map client_addr; - hash_map addr_client; - - Map() : version(0), next_client(0) {} - - void reverse() { - addr_client.clear(); - for (map::iterator p = client_addr.begin(); - p != client_addr.end(); - ++p) { - addr_client[p->second] = p->first; - } - } - void apply_incremental(Incremental &inc) { - assert(inc.version == version+1); - version = inc.version; - next_client = inc.next_client; - for (map::iterator p = inc.mount.begin(); - p != inc.mount.end(); - ++p) { - client_addr[p->first] = p->second; - addr_client[p->second] = p->first; - } - - for (set::iterator p = inc.unmount.begin(); - p != inc.unmount.end(); - ++p) { - assert(client_addr.count(*p)); - addr_client.erase(client_addr[*p]); - client_addr.erase(*p); - } - } - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(next_client, bl); - ::_encode(client_addr, bl); - } - void _decode(bufferlist &bl, int& off) { - ::_decode(version, bl, off); - ::_decode(next_client, bl, off); - ::_decode(client_addr, bl, off); - reverse(); - } - }; - - class C_Mounted : public Context { - ClientMonitor *cmon; - int client; - MClientMount *m; - public: - C_Mounted(ClientMonitor *cm, int c, MClientMount *m_) : - cmon(cm), client(c), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_mounted(client, m); - else - cmon->dispatch((Message*)m); - } - }; - - class C_Unmounted : public Context { - ClientMonitor *cmon; - MClientUnmount *m; - public: - C_Unmounted(ClientMonitor *cm, MClientUnmount *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_unmounted(m); - else - cmon->dispatch((Message*)m); - } - }; - - -private: - Map client_map; - - // leader - Incremental pending_inc; - - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); // propose pending update to peers - - void _mounted(int c, MClientMount *m); - void _unmounted(MClientUnmount *m); - - bool preprocess_query(Message *m); // true if processed. 
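  // Split inherited from PaxosService (see ClientMonitor.cc above):
  // preprocess_query() handles requests that need no map change (e.g. a
  // client that is already mounted or already unmounted), while
  // prepare_update() stages the change in pending_inc and defers the
  // reply to a paxos->wait_for_commit() callback (C_Mounted / C_Unmounted).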
- bool prepare_update(Message *m); - - - public: - ClientMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - //void tick(); // check state, take actions - -}; - -#endif diff --git a/branches/sage/pgs/mon/Elector.cc b/branches/sage/pgs/mon/Elector.cc deleted file mode 100644 index 816946d3cbfe3..0000000000000 --- a/branches/sage/pgs/mon/Elector.cc +++ /dev/null @@ -1,293 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Elector.h" -#include "Monitor.h" - -#include "common/Timer.h" -#include "MonitorStore.h" -#include "messages/MMonElection.h" - -#include "config.h" -#undef dout -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector(" << epoch << ") " -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector(" << epoch << ") " - - -void Elector::init() -{ - epoch = mon->store->get_int("mon_epoch"); - if (!epoch) - epoch = 1; - dout(1) << "init, last seen epoch " << epoch << endl; -} - -void Elector::shutdown() -{ - if (expire_event) - mon->timer.cancel_event(expire_event); -} - -void Elector::bump_epoch(epoch_t e) -{ - dout(10) << "bump_epoch " << epoch << " to " << e << endl; - assert(epoch < e); - epoch = e; - mon->store->put_int(epoch, "mon_epoch"); - - // clear up some state - electing_me = false; - acked_me.clear(); - leader_acked = -1; -} - - -void Elector::start() -{ - dout(5) << "start -- can i be leader?" << endl; - - // start by trying to elect me - if (epoch % 2 == 0) - bump_epoch(epoch+1); // odd == election cycle - start_stamp = g_clock.now(); - electing_me = true; - acked_me.insert(whoami); - - // bcast to everyone else - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_PROPOSE, epoch), - mon->monmap->get_inst(i)); - } - - reset_timer(); -} - -void Elector::defer(int who) -{ - dout(5) << "defer to " << who << endl; - - if (electing_me) { - // drop out - acked_me.clear(); - electing_me = false; - } - - // ack them - leader_acked = who; - ack_stamp = g_clock.now(); - mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch), - mon->monmap->get_inst(who)); - - // set a timer - reset_timer(1.0); // give the leader some extra time to declare victory -} - - -void Elector::reset_timer(double plus) -{ - // set the timer - cancel_timer(); - expire_event = new C_ElectionExpire(this); - mon->timer.add_event_after(g_conf.mon_lease + plus, - expire_event); -} - - -void Elector::cancel_timer() -{ - if (expire_event) { - mon->timer.cancel_event(expire_event); - expire_event = 0; - } -} - -void Elector::expire() -{ - dout(5) << "election timer expired" << endl; - - // did i win? 
- if (electing_me && - acked_me.size() > (unsigned)(mon->monmap->num_mon / 2)) { - // i win - victory(); - } else { - // whoever i deferred to didn't declare victory quickly enough. - start(); - } -} - - -void Elector::victory() -{ - leader_acked = -1; - electing_me = false; - set quorum = acked_me; - - cancel_timer(); - - assert(epoch % 2 == 1); // election - bump_epoch(epoch+1); // is over! - - // tell everyone - for (set::iterator p = quorum.begin(); - p != quorum.end(); - ++p) { - if (*p == whoami) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_VICTORY, epoch), - mon->monmap->get_inst(*p)); - } - - // tell monitor - mon->win_election(epoch, quorum); -} - - -void Elector::handle_propose(MMonElection *m) -{ - dout(5) << "handle_propose from " << m->get_source() << endl; - int from = m->get_source().num(); - - assert(m->epoch % 2 == 1); // election - if (m->epoch > epoch) { - bump_epoch(m->epoch); - } - else if (m->epoch < epoch && // got an "old" propose, - epoch % 2 == 0 && // in a non-election cycle - mon->quorum.count(from) == 0) { // from someone outside the quorum - // a mon just started up, call a new election so they can rejoin! - dout(5) << " got propose from old epoch, " << m->get_source() << " must have just started" << endl; - start(); - } - - if (whoami < from) { - // i would win over them. - if (leader_acked >= 0) { // we already acked someone - assert(leader_acked < from); // and they still win, of course - dout(5) << "no, we already acked " << leader_acked << endl; - } else { - // wait, i should win! - if (!electing_me) - start(); - } - } else { - // they would win over me - if (leader_acked < 0 || // haven't acked anyone yet, or - leader_acked > from || // they would win over who you did ack, or - leader_acked == from) { // this is the guy we're already deferring to - defer(from); - } else { - // ignore them! - dout(5) << "no, we already acked " << leader_acked << endl; - } - } - - delete m; -} - -void Elector::handle_ack(MMonElection *m) -{ - dout(5) << "handle_ack from " << m->get_source() << endl; - int from = m->get_source().num(); - - assert(m->epoch % 2 == 1); // election - if (m->epoch > epoch) { - dout(5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << endl; - bump_epoch(m->epoch); - start(); - delete m; - return; - } - assert(m->epoch == epoch); - - if (electing_me) { - // thanks - acked_me.insert(from); - dout(5) << " so far i have " << acked_me << endl; - - // is that _everyone_? - if (acked_me.size() == (unsigned)mon->monmap->num_mon) { - // if yes, shortcut to election finish - victory(); - } - } else { - // ignore, i'm deferring already. - assert(leader_acked >= 0); - } - - delete m; -} - - -void Elector::handle_victory(MMonElection *m) -{ - dout(5) << "handle_victory from " << m->get_source() << endl; - int from = m->get_source().num(); - - assert(from < whoami); - assert(m->epoch % 2 == 0); - assert(m->epoch == epoch + 1); // i should have seen this election if i'm getting the victory. 
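  // Accepting the victory moves us from the odd (election) epoch to the
  // even (stable) epoch chosen by the winner, per the odd == election /
  // even == stable convention used throughout the elector.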
- bump_epoch(m->epoch); - - // they win - mon->lose_election(epoch, from); - - // cancel my timer - cancel_timer(); -} - - - - -void Elector::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MON_ELECTION: - { - MMonElection *em = (MMonElection*)m; - - switch (em->op) { - case MMonElection::OP_PROPOSE: - handle_propose(em); - return; - } - - if (em->epoch < epoch) { - dout(5) << "old epoch, dropping" << endl; - delete em; - break; - } - - switch (em->op) { - case MMonElection::OP_ACK: - handle_ack(em); - return; - case MMonElection::OP_VICTORY: - handle_victory(em); - return; - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - - - - diff --git a/branches/sage/pgs/mon/Elector.h b/branches/sage/pgs/mon/Elector.h deleted file mode 100644 index 9bfd7cb644fc7..0000000000000 --- a/branches/sage/pgs/mon/Elector.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MON_ELECTOR_H -#define __MON_ELECTOR_H - -#include -using namespace std; - -#include "include/types.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; - - -class Elector { - private: - Monitor *mon; - int whoami; - - Context *expire_event; - - void reset_timer(double plus=0.0); - void cancel_timer(); - - epoch_t epoch; // latest epoch we've seen. odd == election, even == stable, - - // electing me - bool electing_me; - utime_t start_stamp; - set acked_me; - - // electing them - int leader_acked; // who i've acked - utime_t ack_stamp; // and when - - void bump_epoch(epoch_t e=0); // i just saw a larger epoch - - class C_ElectionExpire : public Context { - Elector *elector; - public: - C_ElectionExpire(Elector *e) : elector(e) { } - void finish(int r) { - elector->expire(); - } - }; - - void start(); // start an electing me - void defer(int who); - void expire(); // timer goes off - void victory(); - - void handle_propose(class MMonElection *m); - void handle_ack(class MMonElection *m); - void handle_victory(class MMonElection *m); - - public: - Elector(Monitor *m, int w) : mon(m), whoami(w), - expire_event(0), - epoch(0), - electing_me(false), - leader_acked(-1) { } - - void init(); - void shutdown(); - - void dispatch(Message *m); - - void call_election() { - start(); - } - -}; - - -#endif diff --git a/branches/sage/pgs/mon/MDSMonitor.cc b/branches/sage/pgs/mon/MDSMonitor.cc deleted file mode 100644 index 7e3beae7971d1..0000000000000 --- a/branches/sage/pgs/mon/MDSMonitor.cc +++ /dev/null @@ -1,544 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "MDSMonitor.h" -#include "Monitor.h" -#include "MonitorStore.h" -#include "OSDMonitor.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSGetMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "messages/MGenericMessage.h" - - -#include "common/Timer.h" - -#include - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " - - - -// my methods - -void MDSMonitor::print_map(MDSMap &m) -{ - dout(7) << "print_map epoch " << m.get_epoch() << " target_num " << m.target_num << endl; - entity_inst_t blank; - set all; - m.get_mds_set(all); - for (set::iterator p = all.begin(); - p != all.end(); - ++p) { - dout(7) << " mds" << *p << "." << m.mds_inc[*p] - << " : " << MDSMap::get_state_name(m.get_state(*p)) - << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank) - << endl; - } -} - - - -// service methods - -void MDSMonitor::create_initial() -{ - dout(10) << "create_initial" << endl; - pending_mdsmap.target_num = g_conf.num_mds; - pending_mdsmap.created = g_clock.now(); - print_map(pending_mdsmap); -} - -bool MDSMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - dout(10) << "update_from_paxos paxosv " << paxosv - << ", my e " << mdsmap.epoch << endl; - - if (paxosv == mdsmap.epoch) return true; - assert(paxosv >= mdsmap.epoch); - - // read and decode - mdsmap_bl.clear(); - bool success = paxos->read(paxosv, mdsmap_bl); - assert(success); - dout(10) << "update_from_paxos got " << paxosv << endl; - mdsmap.decode(mdsmap_bl); - - // new map - print_map(mdsmap); - - // bcast map to mds, waiters - if (mon->is_leader()) - bcast_latest_mds(); - send_to_waiting(); - - // hackish: did all mds's shut down? 
- if (mon->is_leader() && - g_conf.mon_stop_with_last_mds && - mdsmap.get_epoch() > 1 && - mdsmap.is_stopped()) - mon->messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - mon->monmap->get_inst(mon->whoami)); - - return true; -} - -void MDSMonitor::create_pending() -{ - pending_mdsmap = mdsmap; - pending_mdsmap.epoch++; - dout(10) << "create_pending e" << pending_mdsmap.epoch << endl; -} - -void MDSMonitor::encode_pending(bufferlist &bl) -{ - dout(10) << "encode_pending e" << pending_mdsmap.epoch << endl; - - print_map(pending_mdsmap); - - // apply to paxos - assert(paxos->get_version() + 1 == pending_mdsmap.epoch); - pending_mdsmap.encode(bl); -} - - -bool MDSMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << endl; - - switch (m->get_type()) { - - case MSG_MDS_BEACON: - return preprocess_beacon((MMDSBeacon*)m); - - case MSG_MDS_GETMAP: - send_full(m->get_source_inst()); - return true; - - case MSG_MON_COMMAND: - return false; - - default: - assert(0); - delete m; - return true; - } -} - - -bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) -{ - dout(12) << "preprocess_beacon " << *m - << " from " << m->get_mds_inst() - << endl; - - // fw to leader? - if (!mon->is_leader()) { - dout(10) << "fw to leader" << endl; - mon->messenger->send_message(m, mon->monmap->get_inst(mon->get_leader())); - return true; - } - - // let's see. - int from = m->get_mds_inst().name.num(); - int state = m->get_state(); - version_t seq = m->get_seq(); - - // can i handle this query without a map update? - - // boot? - if (state == MDSMap::STATE_BOOT) { - // already booted? - int already = mdsmap.get_addr_rank(m->get_mds_inst().addr); - if (already < 0) - return false; // need to update map - - // already booted. just reply to beacon, as per usual. - from = already; - } - - // reply to beacon - if (mdsmap.mds_state_seq[from] > seq) { - dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << endl; - delete m; - return true; - } - - // reply to beacon? - if (state != MDSMap::STATE_STOPPED) { - last_beacon[from] = g_clock.now(); // note time - mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), state, seq), - m->get_mds_inst()); - } - - // is there a state change here? - if (mdsmap.mds_state.count(from) == 0 || - mdsmap.mds_state[from] != state) - return false; // yep, need to update map. - - // we're done. - delete m; - return true; -} - - -bool MDSMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << endl; - - switch (m->get_type()) { - - case MSG_MDS_BEACON: - return handle_beacon((MMDSBeacon*)m); - - case MSG_MON_COMMAND: - return handle_command((MMonCommand*)m); - - default: - assert(0); - delete m; - } - - return true; -} - -bool MDSMonitor::should_propose_now() -{ - return true; -} - - -bool MDSMonitor::handle_beacon(MMDSBeacon *m) -{ - // -- this is an update -- - dout(12) << "handle_beacon " << *m - << " from " << m->get_mds_inst() - << endl; - int from = m->get_mds_inst().name.num(); - int state = m->get_state(); - version_t seq = m->get_seq(); - - assert(state != mdsmap.get_state(from)); - - // boot? - if (state == MDSMap::STATE_BOOT) { - // assign a name. - if (from >= 0) { - // wants to be (or already is) a specific MDS. 
- if (mdsmap.is_failed(from)) { - dout(10) << "mds_beacon boot: mds" << from << " was failed, replaying" << endl; - state = MDSMap::STATE_REPLAY; - } else if (mdsmap.is_stopped(from)) { - dout(10) << "mds_beacon boot: mds" << from << " was stopped, starting" << endl; - state = MDSMap::STATE_STARTING; - } else if (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst()) { - dout(10) << "mds_beacon boot: mds" << from << " is someone else" << endl; - from = -1; - } - } - if (from < 0) { - from = pending_mdsmap.get_addr_rank(m->get_mds_inst().addr); - if (from >= 0) { - state = pending_mdsmap.mds_state[from]; - dout(10) << "mds_beacon boot: already pending mds" << from - << " " << MDSMap::get_state_name(state) << endl; - delete m; - return false; - } - } - if (from < 0) { - // pick a failed mds? - set failed; - pending_mdsmap.get_failed_mds_set(failed); - if (!failed.empty()) { - from = *failed.begin(); - dout(10) << "mds_beacon boot: assigned failed mds" << from << endl; - state = MDSMap::STATE_REPLAY; - } - } - if (from < 0) { - // ok, just pick any unused mds id. - for (from=0; ; ++from) { - if (pending_mdsmap.is_dne(from)) { - dout(10) << "mds_beacon boot: assigned new mds" << from << endl; - state = MDSMap::STATE_CREATING; - break; - } else if (pending_mdsmap.is_stopped(from)) { - dout(10) << "mds_beacon boot: assigned stopped mds" << from << endl; - state = MDSMap::STATE_STARTING; - break; - } - } - } - - assert(state == MDSMap::STATE_CREATING || - state == MDSMap::STATE_STARTING || - state == MDSMap::STATE_REPLAY); - - // put it in the map. - pending_mdsmap.mds_inst[from].addr = m->get_mds_inst().addr; - pending_mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from); - pending_mdsmap.mds_inc[from]++; - - // someone (new) has joined the cluster. - pending_mdsmap.same_inst_since = pending_mdsmap.epoch; - - // reset the beacon timer - last_beacon[from] = g_clock.now(); - } - - // created? - if (state == MDSMap::STATE_ACTIVE && - mdsmap.is_creating(from)) { - pending_mdsmap.mds_created.insert(from); - dout(10) << "mds_beacon created mds" << from << endl; - } - - // if starting|creating and degraded|full, go to standby - if ((state == MDSMap::STATE_STARTING || - state == MDSMap::STATE_CREATING || - mdsmap.is_starting(from) || - mdsmap.is_creating(from)) && - (pending_mdsmap.is_degraded() || - pending_mdsmap.is_full())) { - dout(10) << "mds_beacon cluster degraded|full, mds" << from << " will be standby" << endl; - state = MDSMap::STATE_STANDBY; - } - - // update the map - dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) - << " -> " << MDSMap::get_state_name(state) - << endl; - - // did someone leave the cluster? 
- if (state == MDSMap::STATE_STOPPED && - !mdsmap.is_stopped(from)) - pending_mdsmap.same_inst_since = pending_mdsmap.epoch; - - // change the state - pending_mdsmap.mds_state[from] = state; - if (pending_mdsmap.is_up(from)) - pending_mdsmap.mds_state_seq[from] = seq; - else - pending_mdsmap.mds_state_seq.erase(from); - - dout(7) << "pending map now:" << endl; - print_map(pending_mdsmap); - - paxos->wait_for_commit(new C_Updated(this, from, m)); - - return true; -} - - -void MDSMonitor::_updated(int from, MMDSBeacon *m) -{ - if (m->get_state() == MDSMap::STATE_BOOT) { - dout(10) << "_updated (booted) mds" << from << " " << *m << endl; - mon->osdmon->send_latest(mdsmap.get_inst(from)); - } else { - dout(10) << "_updated mds" << from << " " << *m << endl; - } - if (m->get_state() == MDSMap::STATE_STOPPED) { - // send the map manually (they're out of the map, so they won't get it automatic) - send_latest(m->get_mds_inst()); - } - delete m; -} - - - -bool MDSMonitor::handle_command(MMonCommand *m) -{ - int r = -EINVAL; - stringstream ss; - - if (m->cmd.size() > 1) { - if (m->cmd[1] == "stop" && m->cmd.size() > 2) { - int who = atoi(m->cmd[2].c_str()); - if (mdsmap.is_active(who)) { - r = 0; - ss << "telling mds" << who << " to stop"; - pending_mdsmap.mds_state[who] = MDSMap::STATE_STOPPING; - } else { - r = -EEXIST; - ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; - } - } - else if (m->cmd[1] == "set_target_num" && m->cmd.size() > 2) { - pending_mdsmap.target_num = atoi(m->cmd[2].c_str()); - r = 0; - ss << "target_num = " << pending_mdsmap.target_num << endl; - } - } - if (r == -EINVAL) { - ss << "unrecognized command"; - } - - // reply - string rs; - getline(ss,rs); - mon->messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); - delete m; - return r >= 0; -} - - - -void MDSMonitor::bcast_latest_mds() -{ - dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl; - - // tell mds - set up; - mdsmap.get_up_mds_set(up); - for (set::iterator p = up.begin(); - p != up.end(); - p++) - send_full(mdsmap.get_inst(*p)); -} - -void MDSMonitor::send_full(entity_inst_t dest) -{ - dout(11) << "send_full to " << dest << endl; - mon->messenger->send_message(new MMDSMap(&mdsmap), dest); -} - -void MDSMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << mdsmap.get_epoch() << endl; - for (list::iterator i = waiting_for_map.begin(); - i != waiting_for_map.end(); - i++) - send_full(*i); - waiting_for_map.clear(); -} - -void MDSMonitor::send_latest(entity_inst_t dest) -{ - if (paxos->is_readable()) - send_full(dest); - else - waiting_for_map.push_back(dest); -} - - -void MDSMonitor::tick() -{ - // make sure mds's are still alive - utime_t now = g_clock.now(); - - // ...if i am an active leader - if (!mon->is_leader()) return; - if (!paxos->is_active()) return; - - if (now > g_conf.mds_beacon_grace) { - utime_t cutoff = now; - cutoff -= g_conf.mds_beacon_grace; - - bool changed = false; - - set up; - mdsmap.get_up_mds_set(up); - - for (set::iterator p = up.begin(); - p != up.end(); - ++p) { - if (last_beacon.count(*p)) { - if (last_beacon[*p] < cutoff) { - - // failure! 
- int newstate; - switch (mdsmap.get_state(*p)) { - case MDSMap::STATE_STANDBY: - if (mdsmap.has_created(*p)) - newstate = MDSMap::STATE_STOPPED; - else - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_CREATING: - // didn't finish creating - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_STARTING: - newstate = MDSMap::STATE_STOPPED; - break; - - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_RESOLVE: - case MDSMap::STATE_REJOIN: - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - newstate = MDSMap::STATE_FAILED; - break; - - default: - assert(0); - } - - dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p] - << ", marking " << mdsmap.get_state_name(newstate) - << endl; - - // update map - pending_mdsmap.mds_state[*p] = newstate; - pending_mdsmap.mds_state_seq.erase(*p); - changed = true; - } - } else { - dout(10) << "no beacons from mds" << *p << ", assuming one " << now << endl; - last_beacon[*p] = now; - } - } - - if (changed) - propose_pending(); - } -} - - -void MDSMonitor::do_stop() -{ - // hrm... - if (!mon->is_leader() || - !paxos->is_active()) { - dout(-10) << "do_stop can't stop right now, mdsmap not writeable" << endl; - return; - } - - dout(10) << "do_stop stopping active mds nodes" << endl; - - print_map(mdsmap); - for (map::iterator p = mdsmap.mds_state.begin(); - p != mdsmap.mds_state.end(); - ++p) - if (mdsmap.is_active(p->first)) - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING; - - propose_pending(); -} diff --git a/branches/sage/pgs/mon/MDSMonitor.h b/branches/sage/pgs/mon/MDSMonitor.h deleted file mode 100644 index 082423aec33a0..0000000000000 --- a/branches/sage/pgs/mon/MDSMonitor.h +++ /dev/null @@ -1,96 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDSMONITOR_H -#define __MDSMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -#include "PaxosService.h" - -class MMDSBeacon; - -class MDSMonitor : public PaxosService { - public: - // mds maps - MDSMap mdsmap; // current - bufferlist mdsmap_bl; // encoded - - MDSMap pending_mdsmap; // current + pending updates - - // my helpers - void print_map(MDSMap &m); - - class C_Updated : public Context { - MDSMonitor *mm; - int mds; - MMDSBeacon *m; - public: - C_Updated(MDSMonitor *a, int b, MMDSBeacon *c) : - mm(a), mds(b), m(c) {} - void finish(int r) { - if (r >= 0) - mm->_updated(mds, m); // success - else - mm->dispatch((Message*)m); // try again - } - }; - - - // service methods - void create_initial(); - bool update_from_paxos(); - void create_pending(); - void encode_pending(bufferlist &bl); - - void _updated(int m, MMDSBeacon *m); - - bool preprocess_query(Message *m); // true if processed. 
- bool prepare_update(Message *m); - bool should_propose_now(); - - bool preprocess_beacon(class MMDSBeacon *m); - bool handle_beacon(class MMDSBeacon *m); - bool handle_command(class MMonCommand *m); - - // beacons - map last_beacon; - -public: - MDSMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - // sending the map -private: - list waiting_for_map; - - void bcast_latest_mds(); - void send_full(entity_inst_t dest); - void send_to_waiting(); - -public: - void send_latest(entity_inst_t dest); - - void tick(); // check state, take actions - void do_stop(); - -}; - -#endif diff --git a/branches/sage/pgs/mon/MonMap.h b/branches/sage/pgs/mon/MonMap.h deleted file mode 100644 index eb18579cd7e99..0000000000000 --- a/branches/sage/pgs/mon/MonMap.h +++ /dev/null @@ -1,105 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MONMAP_H -#define __MONMAP_H - -#include -#include -#include - -#include "msg/Message.h" -#include "include/types.h" - -class MonMap { - public: - epoch_t epoch; // what epoch/version of the monmap - int num_mon; - vector mon_inst; - - int last_mon; // last mon i talked to - - MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {} - - void add_mon(entity_inst_t inst) { - mon_inst.push_back(inst); - num_mon++; - } - - // pick a mon. - // choice should be stable, unless we explicitly ask for a new one. - int pick_mon(bool newmon=false) { - if (newmon || (last_mon < 0)) { - last_mon = rand() % num_mon; - } - return last_mon; - } - - const entity_inst_t &get_inst(int m) { - assert(m < num_mon); - return mon_inst[m]; - } - - void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&num_mon, sizeof(num_mon)); - - _encode(mon_inst, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(num_mon), (char*)&num_mon); - off += sizeof(num_mon); - - _decode(mon_inst, blist, off); - } - - // read from/write to a file - int write(char *fn) { - // encode - bufferlist bl; - encode(bl); - - // write - int fd = ::open(fn, O_RDWR|O_CREAT); - if (fd < 0) return fd; - ::fchmod(fd, 0644); - ::write(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - return 0; - } - - int read(char *fn) { - // read - bufferlist bl; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) return fd; - struct stat st; - ::fstat(fd, &st); - bufferptr bp(st.st_size); - bl.append(bp); - ::read(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - - // decode - decode(bl); - return 0; - } - -}; - -#endif diff --git a/branches/sage/pgs/mon/Monitor.cc b/branches/sage/pgs/mon/Monitor.cc deleted file mode 100644 index 299dbaf2f11c7..0000000000000 --- a/branches/sage/pgs/mon/Monitor.cc +++ /dev/null @@ -1,399 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - 
* Foundation. See file COPYING. - * - */ - -// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer - -#include "Monitor.h" - -#include "osd/OSDMap.h" - -#include "MonitorStore.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "messages/MMonPaxos.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#include "OSDMonitor.h" -#include "MDSMonitor.h" -#include "ClientMonitor.h" -#include "PGMonitor.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " - - - -void Monitor::init() -{ - lock.Lock(); - - dout(1) << "init" << endl; - - // store - char s[80]; - sprintf(s, "mondata/mon%d", whoami); - store = new MonitorStore(s); - - if (g_conf.mkfs) - store->mkfs(); - - store->mount(); - - // create - osdmon = new OSDMonitor(this, &paxos_osdmap); - mdsmon = new MDSMonitor(this, &paxos_mdsmap); - clientmon = new ClientMonitor(this, &paxos_clientmap); - pgmon = new PGMonitor(this, &paxos_pgmap); - - // init paxos - paxos_test.init(); - paxos_osdmap.init(); - paxos_mdsmap.init(); - paxos_clientmap.init(); - paxos_pgmap.init(); - - // i'm ready! - messenger->set_dispatcher(this); - - // start ticker - reset_tick(); - - // call election? - if (monmap->num_mon > 1) { - assert(monmap->num_mon != 2); - call_election(); - } else { - // we're standalone. - set q; - q.insert(whoami); - win_election(1, q); - } - - lock.Unlock(); -} - -void Monitor::shutdown() -{ - dout(1) << "shutdown" << endl; - - elector.shutdown(); - - if (is_leader()) { - // stop osds. - for (set::iterator it = osdmon->osdmap.get_osds().begin(); - it != osdmon->osdmap.get_osds().end(); - it++) { - if (osdmon->osdmap.is_down(*it)) continue; - dout(10) << "sending shutdown to osd" << *it << endl; - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - osdmon->osdmap.get_inst(*it)); - } - osdmon->mark_all_down(); - - // monitors too. - for (int i=0; inum_mon; i++) - if (i != whoami) - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - monmap->get_inst(i)); - } - - // cancel all events - cancel_tick(); - timer.cancel_all(); - timer.join(); - - // unmount my local storage - if (store) - delete store; - - // clean up - if (osdmon) delete osdmon; - if (mdsmon) delete mdsmon; - if (clientmon) delete clientmon; - if (pgmon) delete pgmon; - - // die. 
- messenger->shutdown(); - delete messenger; -} - - -void Monitor::call_election() -{ - if (monmap->num_mon == 1) return; - - dout(10) << "call_election" << endl; - state = STATE_STARTING; - - // tell paxos - paxos_test.election_starting(); - paxos_mdsmap.election_starting(); - paxos_osdmap.election_starting(); - paxos_clientmap.election_starting(); - - // call a new election - elector.call_election(); -} - -void Monitor::win_election(epoch_t epoch, set& active) -{ - state = STATE_LEADER; - leader = whoami; - mon_epoch = epoch; - quorum = active; - dout(10) << "win_election, epoch " << mon_epoch << " quorum is " << quorum << endl; - - // init paxos - paxos_test.leader_init(); - paxos_mdsmap.leader_init(); - paxos_osdmap.leader_init(); - paxos_clientmap.leader_init(); - paxos_pgmap.leader_init(); - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - clientmon->election_finished(); - pgmon->election_finished(); -} - -void Monitor::lose_election(epoch_t epoch, int l) -{ - state = STATE_PEON; - mon_epoch = epoch; - leader = l; - dout(10) << "lose_election, epoch " << mon_epoch << " leader is mon" << leader << endl; - - // init paxos - paxos_test.peon_init(); - paxos_mdsmap.peon_init(); - paxos_osdmap.peon_init(); - paxos_clientmap.peon_init(); - paxos_pgmap.peon_init(); - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - clientmon->election_finished(); - pgmon->election_finished(); -} - - -void Monitor::handle_command(MMonCommand *m) -{ - dout(0) << "handle_command " << *m << endl; - - int r = -1; - string rs = "unrecognized command"; - - if (!m->cmd.empty()) { - if (m->cmd[0] == "stop") { - r = 0; - rs = "stopping"; - do_stop(); - } - else if (m->cmd[0] == "mds") { - mdsmon->dispatch(m); - return; - } - else if (m->cmd[0] == "osd") { - - } - } - - // reply - messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); - delete m; -} - - -void Monitor::do_stop() -{ - dout(0) << "do_stop -- shutting down" << endl; - mdsmon->do_stop(); -} - - -void Monitor::dispatch(Message *m) -{ - lock.Lock(); - { - switch (m->get_type()) { - - // misc - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_SHUTDOWN: - if (m->get_source().is_osd()) - osdmon->dispatch(m); - else - handle_shutdown(m); - break; - - case MSG_MON_COMMAND: - handle_command((MMonCommand*)m); - break; - - - // OSDs - case MSG_OSD_GETMAP: - case MSG_OSD_FAILURE: - case MSG_OSD_BOOT: - case MSG_OSD_IN: - case MSG_OSD_OUT: - osdmon->dispatch(m); - break; - - - // MDSs - case MSG_MDS_BEACON: - case MSG_MDS_GETMAP: - mdsmon->dispatch(m); - break; - - // clients - case MSG_CLIENT_MOUNT: - case MSG_CLIENT_UNMOUNT: - clientmon->dispatch(m); - break; - - - // paxos - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // sanitize - if (pm->epoch > mon_epoch) - call_election(); - if (pm->epoch != mon_epoch) { - delete pm; - break; - } - - // send it to the right paxos instance - switch (pm->machine_id) { - case PAXOS_TEST: - paxos_test.dispatch(m); - break; - case PAXOS_OSDMAP: - paxos_osdmap.dispatch(m); - break; - case PAXOS_MDSMAP: - paxos_mdsmap.dispatch(m); - break; - case PAXOS_CLIENTMAP: - paxos_clientmap.dispatch(m); - break; - default: - assert(0); - } - } - break; - - // elector messages - case MSG_MON_ELECTION: - elector.dispatch(m); - break; - - - default: - dout(0) << "unknown message " << *m << endl; - assert(0); - } - } - lock.Unlock(); -} - - -void Monitor::handle_shutdown(Message *m) -{ - assert(m->get_source().is_mon()); - if 
(m->get_source().num() == get_leader()) { - dout(1) << "shutdown from leader " << m->get_source() << endl; - shutdown(); - } else { - dout(1) << "ignoring shutdown from non-leader " << m->get_source() << endl; - } - delete m; -} - -void Monitor::handle_ping_ack(MPingAck *m) -{ - // ... - - delete m; -} - - - - -/************ TICK ***************/ - -class C_Mon_Tick : public Context { - Monitor *mon; -public: - C_Mon_Tick(Monitor *m) : mon(m) {} - void finish(int r) { - mon->tick(); - } -}; - -void Monitor::cancel_tick() -{ - if (tick_timer) timer.cancel_event(tick_timer); -} - -void Monitor::reset_tick() -{ - cancel_tick(); - tick_timer = new C_Mon_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_timer); -} - - -void Monitor::tick() -{ - tick_timer = 0; - - // ok go. - dout(11) << "tick" << endl; - - osdmon->tick(); - mdsmon->tick(); - - // next tick! - reset_tick(); -} - - - - - - - diff --git a/branches/sage/pgs/mon/Monitor.h b/branches/sage/pgs/mon/Monitor.h deleted file mode 100644 index 934916760e28c..0000000000000 --- a/branches/sage/pgs/mon/Monitor.h +++ /dev/null @@ -1,149 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MONITOR_H -#define __MONITOR_H - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "common/Timer.h" - -#include "MonMap.h" -#include "Elector.h" -#include "Paxos.h" - - -class MonitorStore; -class OSDMonitor; -class MDSMonitor; -class ClientMonitor; -class PGMonitor; - -class Monitor : public Dispatcher { -public: - // me - int whoami; - Messenger *messenger; - Mutex lock; - - MonMap *monmap; - - // timer. 
- SafeTimer timer; - Context *tick_timer; - void cancel_tick(); - void reset_tick(); - friend class C_Mon_Tick; - - // -- local storage -- -public: - MonitorStore *store; - - // -- monitor state -- -private: - const static int STATE_STARTING = 0; // electing - const static int STATE_LEADER = 1; - const static int STATE_PEON = 2; - int state; - -public: - bool is_starting() { return state == STATE_STARTING; } - bool is_leader() { return state == STATE_LEADER; } - bool is_peon() { return state == STATE_PEON; } - - - // -- elector -- -private: - Elector elector; - friend class Elector; - - epoch_t mon_epoch; // monitor epoch (election instance) - int leader; // current leader (to best of knowledge) - set quorum; // current active set of monitors (if !starting) - utime_t last_called_election; // [starting] last time i called an election - -public: - epoch_t get_epoch() { return mon_epoch; } - int get_leader() { return leader; } - const set& get_quorum() { return quorum; } - - void call_election(); // initiate election - void win_election(epoch_t epoch, set& q); // end election (called by Elector) - void lose_election(epoch_t epoch, int l); // end election (called by Elector) - - - // -- paxos -- - Paxos paxos_test; - Paxos paxos_mdsmap; - Paxos paxos_osdmap; - Paxos paxos_clientmap; - Paxos paxos_pgmap; - friend class Paxos; - - - // -- services -- - OSDMonitor *osdmon; - MDSMonitor *mdsmon; - ClientMonitor *clientmon; - PGMonitor *pgmon; - - friend class OSDMonitor; - friend class MDSMonitor; - friend class ClientMonitor; - friend class PGMonitor; - - - // messages - void handle_shutdown(Message *m); - void handle_ping_ack(class MPingAck *m); - void handle_command(class MMonCommand *m); - - - - public: - Monitor(int w, Messenger *m, MonMap *mm) : - whoami(w), - messenger(m), - monmap(mm), - timer(lock), tick_timer(0), - store(0), - - state(STATE_STARTING), - - elector(this, w), - mon_epoch(0), - leader(0), - - paxos_test(this, w, PAXOS_TEST), - paxos_mdsmap(this, w, PAXOS_MDSMAP), - paxos_osdmap(this, w, PAXOS_OSDMAP), - paxos_clientmap(this, w, PAXOS_CLIENTMAP), - paxos_pgmap(this, w, PAXOS_PGMAP), - - osdmon(0), mdsmon(0), clientmon(0) - { - } - - void init(); - void shutdown(); - void dispatch(Message *m); - void tick(); - - void do_stop(); - -}; - -#endif diff --git a/branches/sage/pgs/mon/MonitorStore.cc b/branches/sage/pgs/mon/MonitorStore.cc deleted file mode 100644 index d260dfd7604e4..0000000000000 --- a/branches/sage/pgs/mon/MonitorStore.cc +++ /dev/null @@ -1,226 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MonitorStore.h" -#include "common/Clock.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " store(" << dir <<") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " store(" << dir <<") " - -#include -#include -#include -#include -#include - -void MonitorStore::mount() -{ - dout(1) << "mount" << endl; - // verify dir exists - DIR *d = ::opendir(dir.c_str()); - if (!d) { - derr(1) << "basedir " << dir << " dne" << endl; - assert(0); - } - ::closedir(d); - - if (g_conf.use_abspaths) { - // combine it with the cwd, in case fuse screws things up (i.e. fakefuse) - string old = dir; - char *cwd = get_current_dir_name(); - dir = cwd; - delete cwd; - dir += "/"; - dir += old; - } -} - - -void MonitorStore::mkfs() -{ - dout(1) << "mkfs" << endl; - - char cmd[200]; - sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str()); - dout(1) << cmd << endl; - system(cmd); -} - - -version_t MonitorStore::get_int(const char *a, const char *b) -{ - char fn[200]; - if (b) - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - else - sprintf(fn, "%s/%s", dir.c_str(), a); - - FILE *f = ::fopen(fn, "r"); - if (!f) - return 0; - - char buf[20]; - ::fgets(buf, 20, f); - ::fclose(f); - - version_t val = atoi(buf); - - if (b) { - dout(15) << "get_int " << a << "/" << b << " = " << val << endl; - } else { - dout(15) << "get_int " << a << " = " << val << endl; - } - return val; -} - - -void MonitorStore::put_int(version_t val, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "set_int " << a << "/" << b << " = " << val << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "set_int " << a << " = " << val << endl; - } - - char vs[30]; -#ifdef __LP64__ - sprintf(vs, "%ld\n", val); -#else - sprintf(vs, "%lld\n", val); -#endif - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - - int fd = ::open(tfn, O_WRONLY|O_CREAT); - assert(fd > 0); - ::fchmod(fd, 0644); - ::write(fd, vs, strlen(vs)); - ::close(fd); - ::rename(tfn, fn); -} - - -// ---------------------------------------- -// buffers - -bool MonitorStore::exists_bl_ss(const char *a, const char *b) -{ - char fn[200]; - if (b) { - dout(15) << "exists_bl " << a << "/" << b << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "exists_bl " << a << endl; - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - struct stat st; - int r = ::stat(fn, &st); - //dout(15) << "exists_bl stat " << fn << " r=" << r << " errno " << errno << " " << strerror(errno) << endl; - return r == 0; -} - - -int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - if (b) { - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - int fd = ::open(fn, O_RDONLY); - if (!fd) { - if (b) { - dout(15) << "get_bl " << a << "/" << b << " DNE" << endl; - } else { - dout(15) << "get_bl " << a << " DNE" << endl; - } - return 0; - } - - // get size - struct stat st; - int rc = ::fstat(fd, &st); - assert(rc == 0); - __int32_t len = st.st_size; - - // read buffer - bl.clear(); - bufferptr bp(len); - int off = 0; - while (off < len) { - dout(20) << "reading at off " << off << " of " << len << endl; - int r = ::read(fd, bp.c_str()+off, len-off); - if (r < 0) derr(0) << "errno on read " << strerror(errno) << endl; - assert(r>0); - off += r; - } - 
bl.append(bp); - ::close(fd); - - if (b) { - dout(15) << "get_bl " << a << "/" << b << " = " << bl.length() << " bytes" << endl; - } else { - dout(15) << "get_bl " << a << " = " << bl.length() << " bytes" << endl; - } - - return len; -} - -int MonitorStore::put_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "put_bl " << a << "/" << b << " = " << bl.length() << " bytes" << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "put_bl " << a << " = " << bl.length() << " bytes" << endl; - } - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - int fd = ::open(tfn, O_WRONLY|O_CREAT); - assert(fd); - - // chmod - ::fchmod(fd, 0644); - - // write data - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, it->c_str(), it->length()); - if (r != (int)it->length()) - derr(0) << "put_bl_ss ::write() returned " << r << " not " << it->length() << endl; - if (r < 0) - derr(0) << "put_bl_ss ::write() errored out, errno is " << strerror(errno) << endl; - } - - ::fsync(fd); - ::close(fd); - ::rename(tfn, fn); - - return 0; -} diff --git a/branches/sage/pgs/mon/MonitorStore.h b/branches/sage/pgs/mon/MonitorStore.h deleted file mode 100644 index 485bf972551c4..0000000000000 --- a/branches/sage/pgs/mon/MonitorStore.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MON_MONITORSTORE_H -#define __MON_MONITORSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -#include - -class MonitorStore { - string dir; - -public: - MonitorStore(char *d) : dir(d) { - } - ~MonitorStore() { - } - - void mkfs(); // wipe - void mount(); - - // ints (stored as ascii) - version_t get_int(const char *a, const char *b=0); - void put_int(version_t v, const char *a, const char *b=0); - - // buffers - // ss and sn varieties. 
- bool exists_bl_ss(const char *a, const char *b=0); - int get_bl_ss(bufferlist& bl, const char *a, const char *b); - int put_bl_ss(bufferlist& bl, const char *a, const char *b); - bool exists_bl_sn(const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return exists_bl_ss(a, bs); - } - int get_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return get_bl_ss(bl, a, bs); - } - int put_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return put_bl_ss(bl, a, bs); - } - - /* - version_t get_incarnation() { return get_int("incarnation"); } - void set_incarnation(version_t i) { set_int(i, "incarnation"); } - - version_t get_last_proposal() { return get_int("last_proposal"); } - void set_last_proposal(version_t i) { set_int(i, "last_proposal"); } - */ -}; - - -#endif diff --git a/branches/sage/pgs/mon/OSDMonitor.cc b/branches/sage/pgs/mon/OSDMonitor.cc deleted file mode 100644 index 6004ac2d24be0..0000000000000 --- a/branches/sage/pgs/mon/OSDMonitor.cc +++ /dev/null @@ -1,807 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "OSDMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" - -#include "MonitorStore.h" - -#include "messages/MOSDFailure.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MMonOSDMapInfo.h" -#include "messages/MMonOSDMapLease.h" -#include "messages/MMonOSDMapLeaseAck.h" -#include "messages/MMonOSDMapUpdatePrepare.h" -#include "messages/MMonOSDMapUpdateAck.h" -#include "messages/MMonOSDMapUpdateCommit.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(e" << osdmap.get_epoch() << ") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(e" << osdmap.get_epoch() << ") " - - -// FAKING - -class C_Mon_FakeOSDFailure : public Context { - OSDMonitor *mon; - int osd; - bool down; -public: - C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} - void finish(int r) { - mon->fake_osd_failure(osd,down); - } -}; - -void OSDMonitor::fake_osd_failure(int osd, bool down) -{ - if (down) { - dout(1) << "fake_osd_failure DOWN osd" << osd << endl; - pending_inc.new_down[osd] = osdmap.osd_inst[osd]; - } else { - dout(1) << "fake_osd_failure OUT osd" << osd << endl; - pending_inc.new_out.push_back(osd); - } - propose_pending(); - - // fixme - //bcast_latest_osd(); - //bcast_latest_mds(); -} - -void OSDMonitor::fake_osdmap_update() -{ - dout(1) << "fake_osdmap_update" << endl; - propose_pending(); - - // tell a random osd - int osd = rand() % g_conf.num_osd; - send_latest(osdmap.get_inst(osd)); -} - - -void OSDMonitor::fake_reorg() -{ - int r = rand() % g_conf.num_osd; - - if (osdmap.is_out(r)) { - dout(1) << "fake_reorg marking osd" << r << " in" << endl; - pending_inc.new_in.push_back(r); - } else { - dout(1) << "fake_reorg marking osd" << r << " out" << endl; - pending_inc.new_out.push_back(r); - } - - propose_pending(); - send_latest(osdmap.get_inst(r)); // after -} - - - -/************ MAPS ****************/ - -void OSDMonitor::create_initial() -{ - assert(mon->is_leader()); - assert(paxos->get_version() == 0); - - dout(1) << "create_initial -- creating initial osdmap from g_conf" << endl; - - // - OSDMap newmap; - newmap.mon_epoch = mon->mon_epoch; - newmap.ctime = g_clock.now(); - - if (g_conf.osd_pg_bits) { - newmap.set_pg_num(1 << g_conf.osd_pg_bits); - } else { - // 4 bits of pgs per osd. - newmap.set_pg_num(g_conf.num_osd << 4); - } - - // start at epoch 1 until all osds boot - newmap.inc_epoch(); // = 1 - assert(newmap.get_epoch() == 1); - - if (g_conf.num_osd >= 12) { - int ndom = g_conf.osd_max_rep; - UniformBucket *domain[ndom]; - int domid[ndom]; - for (int i=0; iadd_item(i, 1.0); - //cerr << "osd" << i << " in domain " << dom << endl; - i++; - if (i == g_conf.num_osd) break; - } - } - - // root - Bucket *root = new ListBucket(2); - for (int i=0; iget_weight() << endl; - root->add_item(domid[i], domain[i]->get_weight()); - } - int nroot = newmap.crush.add_bucket(root); - - // rules - // replication - for (int i=1; i<=ndom; i++) { - int r = CRUSH_REP_RULE(i); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - // raid - for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) { - int r = CRUSH_RAID_RULE(i); - if (ndom >= i) { - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 1)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, 1, 0)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } else { - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - } - - // test - //vector out; - //newmap.pg_to_osds(0x40200000110ULL, out); - - } else { - // one bucket - Bucket *b = new 
UniformBucket(1, 0); - int root = newmap.crush.add_bucket(b); - for (int i=0; iadd_item(i, 1.0); - } - - // rules - // replication - for (int i=1; i<=g_conf.osd_max_rep; i++) { - int r = CRUSH_REP_RULE(i); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - // raid - for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) { - int r = CRUSH_RAID_RULE(i); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0)); - newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - } - - if (g_conf.mds_local_osd) { - // add mds osds, but don't put them in the crush mapping func - for (int i=0; i - - // fake osd failures - for (map::iterator i = g_fake_osd_down.begin(); - i != g_fake_osd_down.end(); - i++) { - dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); - } - for (map::iterator i = g_fake_osd_out.begin(); - i != g_fake_osd_out.end(); - i++) { - dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); - } - - // encode into pending incremental - newmap.encode(pending_inc.fullmap); -} - -bool OSDMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - dout(15) << "update_from_paxos paxos e " << paxosv - << ", my e " << osdmap.epoch << endl; - - if (paxosv == osdmap.epoch) return true; - assert(paxosv >= osdmap.epoch); - - if (osdmap.epoch == 0 && paxosv > 1) { - // startup: just load latest full map - epoch_t lastfull = mon->store->get_int("osdmap_full","last_epoch"); - if (lastfull) { - dout(7) << "update_from_paxos startup: loading latest full map e" << lastfull << endl; - bufferlist bl; - mon->store->get_bl_sn(bl, "osdmap_full", lastfull); - osdmap.decode(bl); - } - } - - // walk through incrementals - while (paxosv > osdmap.epoch) { - bufferlist bl; - bool success = paxos->read(osdmap.epoch+1, bl); - assert(success); - - dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 << endl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - osdmap.apply_incremental(inc); - - // write out the full map, too. - bl.clear(); - osdmap.encode(bl); - mon->store->put_bl_sn(bl, "osdmap_full", osdmap.epoch); - } - mon->store->put_int(osdmap.epoch, "osdmap_full","last_epoch"); - - // new map! 
- bcast_latest_mds(); - - return true; -} - - -void OSDMonitor::create_pending() -{ - pending_inc = OSDMap::Incremental(osdmap.epoch+1); - dout(10) << "create_pending e " << pending_inc.epoch - << endl; -} - -void OSDMonitor::encode_pending(bufferlist &bl) -{ - dout(10) << "encode_pending e " << pending_inc.epoch - << endl; - - // finish up pending_inc - pending_inc.ctime = g_clock.now(); - pending_inc.mon_epoch = mon->mon_epoch; - - // tell me about it - for (map::iterator i = pending_inc.new_down.begin(); - i != pending_inc.new_down.end(); - i++) { - dout(0) << " osd" << i->first << " DOWN " << i->second << endl; - derr(0) << " osd" << i->first << " DOWN " << i->second << endl; - mon->messenger->mark_down(i->second.addr); - } - for (map::iterator i = pending_inc.new_up.begin(); - i != pending_inc.new_up.end(); - i++) { - dout(0) << " osd" << i->first << " UP " << i->second << endl; - derr(0) << " osd" << i->first << " UP " << i->second << endl; - } - for (list::iterator i = pending_inc.new_out.begin(); - i != pending_inc.new_out.end(); - i++) { - dout(0) << " osd" << *i << " OUT" << endl; - derr(0) << " osd" << *i << " OUT" << endl; - } - for (list::iterator i = pending_inc.new_in.begin(); - i != pending_inc.new_in.end(); - i++) { - dout(0) << " osd" << *i << " IN" << endl; - derr(0) << " osd" << *i << " IN" << endl; - } - - // encode - assert(paxos->get_version() + 1 == pending_inc.epoch); - pending_inc.encode(bl); -} - - -// ------------- - -bool OSDMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << endl; - - switch (m->get_type()) { - // READs - case MSG_OSD_GETMAP: - handle_osd_getmap((MOSDGetMap*)m); - return true; - - // damp updates - case MSG_OSD_FAILURE: - return preprocess_failure((MOSDFailure*)m); - case MSG_OSD_BOOT: - return preprocess_boot((MOSDBoot*)m); - /* - case MSG_OSD_IN: - return preprocess_in((MOSDIn*)m); - case MSG_OSD_OUT: - return preprocess_out((MOSDOut*)m); - */ - - default: - assert(0); - delete m; - return true; - } -} - -bool OSDMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << endl; - - switch (m->get_type()) { - // damp updates - case MSG_OSD_FAILURE: - return prepare_failure((MOSDFailure*)m); - case MSG_OSD_BOOT: - return prepare_boot((MOSDBoot*)m); - - /* - case MSG_OSD_IN: - return prepare_in((MOSDIn*)m); - case MSG_OSD_OUT: - return prepare_out((MOSDOut*)m); - */ - - default: - assert(0); - delete m; - } - - return false; -} - -bool OSDMonitor::should_propose_now() -{ - // don't propose initial map until _all_ osds boot. - //dout(10) << "should_propose_now " << pending_inc.new_up.size() << " vs " << osdmap.get_osds().size() << endl; - if (osdmap.epoch == 1 && - pending_inc.new_up.size() < osdmap.get_osds().size()) - return false; // not all up (yet) - - // FIXME do somethihng smart here. - return true; -} - - - -// --------------------------- -// READs - -void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) -{ - dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl; - - //if (m->get_since()) - send_incremental(m->get_since(), m->get_source_inst()); - //else - //send_full(m->get_source_inst()); - - delete m; -} - - - -// --------------------------- -// UPDATEs - -// failure -- - -bool OSDMonitor::preprocess_failure(MOSDFailure *m) -{ - int badboy = m->get_failed().name.num(); - - // weird? 
- if (!osdmap.have_inst(badboy)) { - dout(5) << "preprocess_failure dne(/dup?): " << m->get_failed() << ", from " << m->get_from() << endl; - send_incremental(m->get_epoch(), m->get_from()); - return true; - } - if (osdmap.get_inst(badboy) != m->get_failed()) { - dout(5) << "preprocess_failure wrong osd: report " << m->get_failed() << " != map's " << osdmap.get_inst(badboy) - << ", from " << m->get_from() << endl; - send_incremental(m->get_epoch(), m->get_from()); - return true; - } - // already reported? - if (osdmap.is_down(badboy)) { - dout(5) << "preprocess_failure dup: " << m->get_failed() << ", from " << m->get_from() << endl; - send_incremental(m->get_epoch(), m->get_from()); - return true; - } - - dout(10) << "preprocess_failure new: " << m->get_failed() << ", from " << m->get_from() << endl; - return false; -} - -bool OSDMonitor::prepare_failure(MOSDFailure *m) -{ - dout(1) << "prepare_failure " << m->get_failed() << " from " << m->get_from() << endl; - - // FIXME - // take their word for it - int badboy = m->get_failed().name.num(); - assert(osdmap.is_up(badboy)); - assert(osdmap.osd_inst[badboy] == m->get_failed()); - - pending_inc.new_down[badboy] = m->get_failed(); - - if (osdmap.is_in(badboy)) - down_pending_out[badboy] = g_clock.now(); - - paxos->wait_for_commit(new C_Reported(this, m)); - - return true; -} - -void OSDMonitor::_reported_failure(MOSDFailure *m) -{ - dout(7) << "_reported_failure on " << m->get_failed() << ", telling " << m->get_from() << endl; - send_latest(m->get_from(), m->get_epoch()); -} - - -// boot -- - -bool OSDMonitor::preprocess_boot(MOSDBoot *m) -{ - assert(m->inst.name.is_osd()); - int from = m->inst.name.num(); - - // already booted? - if (osdmap.is_up(from) && - osdmap.get_inst(from) == m->inst) { - // yup. - dout(7) << "preprocess_boot dup from " << m->inst << endl; - _booted(m); - return true; - } - - dout(10) << "preprocess_boot from " << m->inst << endl; - return false; -} - -bool OSDMonitor::prepare_boot(MOSDBoot *m) -{ - dout(7) << "prepare_boot from " << m->inst << endl; - assert(m->inst.name.is_osd()); - int from = m->inst.name.num(); - - // does this osd exist? - if (!osdmap.exists(from)) { - dout(1) << "boot from non-existent osd" << from << endl; - delete m; - return true; - } - - // already up? mark down first? - if (osdmap.is_up(from)) { - assert(osdmap.get_inst(from) != m->inst); // preproces should have caught it - - // mark previous guy down - pending_inc.new_down[from] = osdmap.osd_inst[from]; - } - - // mark new guy up. - down_pending_out.erase(from); // if any - pending_inc.new_up[from] = m->inst; - - // mark in? 
- if (osdmap.out_osds.count(from)) - pending_inc.new_in.push_back(from); - - // wait - paxos->wait_for_commit(new C_Booted(this, m)); - - return true; -} - -void OSDMonitor::_booted(MOSDBoot *m) -{ - dout(7) << "_booted " << m->inst << endl; - send_latest(m->inst, m->sb.current_epoch); - delete m; -} - - -// in -- - -/* -void OSDMonitor::handle_osd_in(MOSDIn *m) -{ - dout(7) << "osd_in from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (osdmap.is_out(from)) - pending_inc.new_in.push_back(from); - accept_pending(); - send_incremental(m->map_epoch, m->get_source_inst()); -} - -void OSDMonitor::handle_osd_out(MOSDOut *m) -{ - dout(7) << "osd_out from " << m->get_source() << endl; - int from = m->get_source().num(); - if (osdmap.is_in(from)) { - pending_inc.new_out.push_back(from); - accept_pending(); - send_incremental(m->map_epoch, m->get_source_inst()); - } -} -*/ - - - -// --------------- -// map helpers - -void OSDMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << osdmap.get_epoch() << endl; - - for (map >::iterator i = awaiting_map.begin(); - i != awaiting_map.end(); - i++) { - if (i->second.second) - send_incremental(i->second.second, i->second.first); - else - send_full(i->second.first); - } -} - -void OSDMonitor::send_latest(entity_inst_t who, epoch_t since) -{ - if (paxos->is_readable()) { - dout(5) << "send_latest to " << who << " now" << endl; - if (since == (epoch_t)(-1)) - send_full(who); - else - send_incremental(since, who); - } else { - dout(5) << "send_latest to " << who << " later" << endl; - awaiting_map[who.name].first = who; - awaiting_map[who.name].second = since; - } -} - - -void OSDMonitor::send_full(entity_inst_t who) -{ - dout(5) << "send_full to " << who << endl; - mon->messenger->send_message(new MOSDMap(&osdmap), who); -} - -void OSDMonitor::send_incremental(epoch_t since, entity_inst_t dest) -{ - dout(5) << "send_incremental " << since << " -> " << osdmap.get_epoch() - << " to " << dest << endl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap.get_epoch(); - e > since; - e--) { - bufferlist bl; - if (mon->store->get_bl_sn(bl, "osdmap", e) > 0) { - dout(20) << "send_incremental inc " << e << " " << bl.length() << " bytes" << endl; - m->incremental_maps[e] = bl; - } - else if (mon->store->get_bl_sn(bl, "osdmap_full", e) > 0) { - dout(20) << "send_incremental full " << e << endl; - m->maps[e] = bl; - } - else { - assert(0); // we should have all maps. 
- } - } - - mon->messenger->send_message(m, dest); -} - - -void OSDMonitor::bcast_latest_mds() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_mds epoch " << e << endl; - - // tell mds - set up; - mon->mdsmon->mdsmap.get_up_mds_set(up); - for (set::iterator i = up.begin(); - i != up.end(); - i++) { - send_incremental(osdmap.get_epoch()-1, mon->mdsmon->mdsmap.get_inst(*i)); - } -} - -void OSDMonitor::bcast_latest_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_osd epoch " << e << endl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - - send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(*it)); - } -} - -void OSDMonitor::bcast_full_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_full_osd epoch " << e << endl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - send_full(osdmap.get_inst(*it)); - } -} - - -// TICK - - -void OSDMonitor::tick() -{ - // mark down osds out? - utime_t now = g_clock.now(); - list mark_out; - for (map::iterator i = down_pending_out.begin(); - i != down_pending_out.end(); - i++) { - utime_t down = now; - down -= i->second; - - if (down.sec() >= g_conf.mon_osd_down_out_interval) { - dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << endl; - mark_out.push_back(i->first); - } - } - for (list::iterator i = mark_out.begin(); - i != mark_out.end(); - i++) { - down_pending_out.erase(*i); - pending_inc.new_out.push_back( *i ); - } - if (!mark_out.empty()) { - propose_pending(); - } -} - - - - - -/* -void OSDMonitor::init() -{ - // start with blank map - - // load my last state from the store - bufferlist bl; - if (get_map_bl(0, bl)) { // FIXME - // yay! - osdmap.decode(bl); - dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << endl; - - // set up pending_inc - pending_inc.epoch = osdmap.get_epoch()+1; - } -} -*/ - - - - -void OSDMonitor::mark_all_down() -{ - assert(mon->is_leader()); - - dout(7) << "mark_all_down" << endl; - - for (set::iterator it = osdmap.get_osds().begin(); - it != osdmap.get_osds().end(); - it++) { - if (osdmap.is_down(*it)) continue; - pending_inc.new_down[*it] = osdmap.get_inst(*it); - } - - propose_pending(); -} - - - - - - - - - - - - - - - -/* - - -void OSDMonitor::election_finished() -{ - dout(10) << "election_finished" << endl; - - if (mon->is_leader()) { - if (g_conf.mkfs) { - create_initial(); - save_map(); - } else { - // - epoch_t epoch = mon->store->get_int("osd_epoch"); - dout(10) << " last epoch was " << epoch << endl; - bufferlist bl, blinc; - int r = mon->store->get_bl_sn(bl, "osdmap_full", epoch); - assert(r>0); - osdmap.decode(bl); - - // pending_inc - pending_inc.epoch = epoch+1; - } - - } - -} - - - -*/ diff --git a/branches/sage/pgs/mon/OSDMonitor.h b/branches/sage/pgs/mon/OSDMonitor.h deleted file mode 100644 index 59424a6fbe9e8..0000000000000 --- a/branches/sage/pgs/mon/OSDMonitor.h +++ /dev/null @@ -1,124 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - -#ifndef __OSDMONITOR_H -#define __OSDMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" - -#include "PaxosService.h" - -class Monitor; -class MOSDBoot; - -class OSDMonitor : public PaxosService { -public: - OSDMap osdmap; - -private: - map > awaiting_map; - - // [leader] - OSDMap::Incremental pending_inc; - map down_pending_out; // osd down -> out - - // svc - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); - - void handle_query(Message *m); - bool preprocess_query(Message *m); // true if processed. - bool prepare_update(Message *m); - bool should_propose_now(); - - // ... - bool get_map_bl(epoch_t epoch, bufferlist &bl); - bool get_inc_map_bl(epoch_t epoch, bufferlist &bl); - - void send_to_waiting(); // send current map to waiters. - void send_full(entity_inst_t dest); - void send_incremental(epoch_t since, entity_inst_t dest); - void bcast_latest_mds(); - void bcast_latest_osd(); - void bcast_full_osd(); - - void handle_osd_getmap(class MOSDGetMap *m); - - bool preprocess_failure(class MOSDFailure *m); - bool prepare_failure(class MOSDFailure *m); - void _reported_failure(MOSDFailure *m); - - bool preprocess_boot(class MOSDBoot *m); - bool prepare_boot(class MOSDBoot *m); - void _booted(MOSDBoot *m); - - class C_Booted : public Context { - OSDMonitor *cmon; - MOSDBoot *m; - public: - C_Booted(OSDMonitor *cm, MOSDBoot *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_booted(m); - else - cmon->dispatch((Message*)m); - } - }; - class C_Reported : public Context { - OSDMonitor *cmon; - MOSDFailure *m; - public: - C_Reported(OSDMonitor *cm, MOSDFailure *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_reported_failure(m); - else - cmon->dispatch((Message*)m); - } - }; - - bool preprocess_in(class MOSDIn *m); - bool prepare_in(class MOSDIn *m); - - bool preprocess_out(class MOSDOut *m); - bool prepare_out(class MOSDOut *m); - - public: - OSDMonitor(Monitor *mn, Paxos *p) : - PaxosService(mn, p) { } - - void tick(); // check state, take actions - - void mark_all_down(); - - void send_latest(entity_inst_t i, epoch_t since=(epoch_t)(-1)); - - void fake_osd_failure(int osd, bool down); - void fake_osdmap_update(); - void fake_reorg(); -}; - -#endif diff --git a/branches/sage/pgs/mon/PGMap.h b/branches/sage/pgs/mon/PGMap.h deleted file mode 100644 index dc6b500111df0..0000000000000 --- a/branches/sage/pgs/mon/PGMap.h +++ /dev/null @@ -1,30 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __PGMAP_H -#define __PGMAP_H - -#include "osd/osd_types.h" - -class PGMap { - -public: - class Incremental { - - }; - - -}; - -#endif diff --git a/branches/sage/pgs/mon/PGMonitor.cc b/branches/sage/pgs/mon/PGMonitor.cc deleted file mode 100644 index 8280b87df3e9d..0000000000000 --- a/branches/sage/pgs/mon/PGMonitor.cc +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "PGMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" -#include "OSDMonitor.h" -#include "MonitorStore.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " - - - -void PGMonitor::create_initial() -{ -} - -bool PGMonitor::update_from_paxos() -{ - return true; -} - -void PGMonitor::create_pending() -{ - -} - -void PGMonitor::encode_pending(bufferlist &bl) -{ - -} - -bool PGMonitor::preprocess_query(Message *m) -{ - return true; -} - -bool PGMonitor::prepare_update(Message *m) -{ - return true; -} diff --git a/branches/sage/pgs/mon/PGMonitor.h b/branches/sage/pgs/mon/PGMonitor.h deleted file mode 100644 index 917d6e272a756..0000000000000 --- a/branches/sage/pgs/mon/PGMonitor.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PGMONITOR_H -#define __PGMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" -#include "PaxosService.h" - -#include "PGMap.h" - -class PGMonitor : public PaxosService { -public: - - -private: - PGMap pg_map; - PGMap::Incremental pending_inc; - - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); // propose pending update to peers - - bool preprocess_query(Message *m); // true if processed. 
- bool prepare_update(Message *m); - - - public: - PGMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - //void tick(); // check state, take actions - -}; - -#endif diff --git a/branches/sage/pgs/mon/Paxos.cc b/branches/sage/pgs/mon/Paxos.cc deleted file mode 100644 index 0ecf0f5a6caf8..0000000000000 --- a/branches/sage/pgs/mon/Paxos.cc +++ /dev/null @@ -1,784 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Paxos.h" -#include "Monitor.h" -#include "MonitorStore.h" - -#include "messages/MMonPaxos.h" - -#include "config.h" -#undef dout -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) cerr << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << " " << get_statename(state) << " lc " << last_committed << ") " -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) cout << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << " " << get_statename(state) << " lc " << last_committed << ") " - - -void Paxos::init() -{ - // load paxos variables from stable storage - last_pn = mon->store->get_int(machine_name, "last_pn"); - accepted_pn = mon->store->get_int(machine_name, "accepted_pn"); - last_committed = mon->store->get_int(machine_name, "last_committed"); - - dout(10) << "init" << endl; -} - -// --------------------------------- - -// PHASE 1 - -// leader -void Paxos::collect(version_t oldpn) -{ - // we're recoverying, it seems! - state = STATE_RECOVERING; - assert(mon->is_leader()); - - // reset the number of lasts received - uncommitted_v = 0; - uncommitted_pn = 0; - uncommitted_value.clear(); - - // look for uncommitted value - if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { - uncommitted_v = last_committed+1; - uncommitted_pn = accepted_pn; - mon->store->get_bl_sn(uncommitted_value, machine_name, last_committed+1); - dout(10) << "learned uncommitted " << (last_committed+1) - << " (" << uncommitted_value.length() << " bytes) from myself" - << endl; - } - - // pick new pn - accepted_pn = get_new_proposal_number(MAX(accepted_pn, oldpn)); - accepted_pn_from = last_committed; - num_last = 1; - dout(10) << "collect with pn " << accepted_pn << endl; - - // send collect - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, machine_id); - collect->last_committed = last_committed; - collect->pn = accepted_pn; - mon->messenger->send_message(collect, mon->monmap->get_inst(*p)); - } - -} - - -// peon -void Paxos::handle_collect(MMonPaxos *collect) -{ - dout(10) << "handle_collect " << *collect << endl; - - assert(mon->is_peon()); // mon epoch filter should catch strays - - // we're recoverying, it seems! 
- state = STATE_RECOVERING; - - // reply - MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST, machine_id); - last->last_committed = last_committed; - - // do we have an accepted but uncommitted value? - // (it'll be at last_committed+1) - bufferlist bl; - if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { - mon->store->get_bl_sn(bl, machine_name, last_committed+1); - assert(bl.length() > 0); - dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1 - << " (" << bl.length() << " bytes)" << endl; - last->values[last_committed+1] = bl; - last->uncommitted_pn = accepted_pn; - } - - // can we accept this pn? - if (collect->pn > accepted_pn) { - // ok, accept it - accepted_pn = collect->pn; - accepted_pn_from = collect->pn_from; - dout(10) << "accepting pn " << accepted_pn << " from " << accepted_pn_from << endl; - mon->store->put_int(accepted_pn, machine_name, "accepted_pn"); - } else { - // don't accept! - dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from - << ", we already accepted " << accepted_pn << " from " << accepted_pn_from - << endl; - } - last->pn = accepted_pn; - last->pn_from = accepted_pn_from; - - // and share whatever data we have - for (version_t v = collect->last_committed+1; - v <= last_committed; - v++) { - if (mon->store->exists_bl_sn(machine_name, v)) { - mon->store->get_bl_sn(last->values[v], machine_name, v); - dout(10) << " sharing " << v << " (" - << last->values[v].length() << " bytes)" << endl; - } - } - - // send reply - mon->messenger->send_message(last, collect->get_source_inst()); - delete collect; -} - - -// leader -void Paxos::handle_last(MMonPaxos *last) -{ - dout(10) << "handle_last " << *last << endl; - - if (!mon->is_leader()) { - dout(10) << "not leader, dropping" << endl; - delete last; - return; - } - - // share committed values? - if (last->last_committed < last_committed) { - // share committed values - dout(10) << "sending commit to " << last->get_source() << endl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id); - for (version_t v = last->last_committed+1; - v <= last_committed; - v++) { - mon->store->get_bl_sn(commit->values[v], machine_name, v); - dout(10) << " sharing " << v << " (" - << commit->values[v].length() << " bytes)" << endl; - } - commit->last_committed = last_committed; - mon->messenger->send_message(commit, last->get_source_inst()); - } - - // did we receive a committed value? - if (last->last_committed > last_committed) { - for (version_t v = last_committed+1; - v <= last->last_committed; - v++) { - mon->store->put_bl_sn(last->values[v], machine_name, v); - dout(10) << "committing " << v << " " - << last->values[v].length() << " bytes" << endl; - } - last_committed = last->last_committed; - mon->store->put_int(last_committed, machine_name, "last_committed"); - dout(10) << "last_committed now " << last_committed << endl; - } - - // do they accept your pn? - if (last->pn > accepted_pn) { - // no, try again. - dout(10) << " they had a higher pn than us, picking a new one." << endl; - collect(last->pn); - } else { - // yes, they accepted our pn. great. - num_last++; - dout(10) << " they accepted our pn, we now have " - << num_last << " peons" << endl; - - // did this person send back an accepted but uncommitted value? 
- if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn) { - uncommitted_v = last->last_committed+1; - uncommitted_pn = last->uncommitted_pn; - uncommitted_value = last->values[uncommitted_v]; - dout(10) << "we learned an uncommitted value for " << uncommitted_v - << " pn " << uncommitted_pn - << " " << uncommitted_value.length() << " bytes" - << endl; - } - - // is that everyone? - if (num_last == mon->get_quorum().size()) { - // almost... - state = STATE_ACTIVE; - - // did we learn an old value? - if (uncommitted_v == last_committed+1 && - uncommitted_value.length()) { - dout(10) << "that's everyone. begin on old learned value" << endl; - begin(uncommitted_value); - } else { - // active! - dout(10) << "that's everyone. active!" << endl; - extend_lease(); - - // wake people up - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - } - } - } - - delete last; -} - - -// leader -void Paxos::begin(bufferlist& v) -{ - dout(10) << "begin for " << last_committed+1 << " " - << v.length() << " bytes" - << endl; - - assert(mon->is_leader()); - assert(is_active()); - state = STATE_UPDATING; - - // we must already have a majority for this to work. - assert(mon->get_quorum().size() == 1 || - num_last > (unsigned)mon->monmap->num_mon/2); - - // and no value, yet. - assert(new_value.length() == 0); - - // accept it ourselves - accepted.clear(); - accepted.insert(whoami); - new_value = v; - mon->store->put_bl_sn(new_value, machine_name, last_committed+1); - - if (mon->get_quorum().size() == 1) { - // we're alone, take it easy - commit(); - state = STATE_ACTIVE; - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_commit); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - return; - } - - // ask others to accept it to! - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - dout(10) << " sending begin to mon" << *p << endl; - MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN, machine_id); - begin->values[last_committed+1] = new_value; - begin->last_committed = last_committed; - begin->pn = accepted_pn; - - mon->messenger->send_message(begin, mon->monmap->get_inst(*p)); - } - - // set timeout event - accept_timeout_event = new C_AcceptTimeout(this); - mon->timer.add_event_after(g_conf.mon_accept_timeout, accept_timeout_event); -} - -// peon -void Paxos::handle_begin(MMonPaxos *begin) -{ - dout(10) << "handle_begin " << *begin << endl; - - // can we accept this? - if (begin->pn < accepted_pn) { - dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << endl; - delete begin; - return; - } - assert(begin->pn == accepted_pn); - assert(begin->last_committed == last_committed); - - // set state. - state = STATE_UPDATING; - lease_expire = utime_t(); // cancel lease - - // yes. 
- version_t v = last_committed+1; - dout(10) << "accepting value for " << v << " pn " << accepted_pn << endl; - mon->store->put_bl_sn(begin->values[v], machine_name, v); - - // reply - MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, machine_id); - accept->pn = accepted_pn; - accept->last_committed = last_committed; - mon->messenger->send_message(accept, begin->get_source_inst()); - - delete begin; -} - -// leader -void Paxos::handle_accept(MMonPaxos *accept) -{ - dout(10) << "handle_accept " << *accept << endl; - int from = accept->get_source().num(); - - if (accept->pn != accepted_pn) { - // we accepted a higher pn, from some other leader - dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << endl; - delete accept; - return; - } - if (last_committed > 0 && - accept->last_committed < last_committed-1) { - dout(10) << " this is from an old round, ignoring" << endl; - delete accept; - return; - } - assert(accept->last_committed == last_committed || // not committed - accept->last_committed == last_committed-1); // committed - - assert(state == STATE_UPDATING); - assert(accepted.count(from) == 0); - accepted.insert(from); - dout(10) << " now " << accepted << " have accepted" << endl; - - // new majority? - if (accepted.size() == (unsigned)mon->monmap->num_mon/2+1) { - // yay, commit! - // note: this may happen before the lease is reextended (below) - dout(10) << " got majority, committing" << endl; - commit(); - } - - // done? - if (accepted == mon->get_quorum()) { - dout(10) << " got quorum, done with update" << endl; - // cancel timeout event - mon->timer.cancel_event(accept_timeout_event); - accept_timeout_event = 0; - - // yay! - state = STATE_ACTIVE; - extend_lease(); - - // wake people up - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_commit); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - } -} - -void Paxos::accept_timeout() -{ - dout(5) << "accept timeout, calling fresh election" << endl; - accept_timeout_event = 0; - assert(mon->is_leader()); - assert(is_updating()); - cancel_events(); - mon->call_election(); -} - -void Paxos::commit() -{ - dout(10) << "commit " << last_committed+1 << endl; - - // commit locally - last_committed++; - mon->store->put_int(last_committed, machine_name, "last_committed"); - - // tell everyone - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - dout(10) << " sending commit to mon" << *p << endl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id); - commit->values[last_committed] = new_value; - commit->pn = accepted_pn; - - mon->messenger->send_message(commit, mon->monmap->get_inst(*p)); - } - - // get ready for a new round. - new_value.clear(); -} - - -void Paxos::handle_commit(MMonPaxos *commit) -{ - dout(10) << "handle_commit on " << commit->last_committed << endl; - - if (!mon->is_peon()) { - dout(10) << "not a peon, dropping" << endl; - assert(0); - delete commit; - return; - } - - // commit locally. 
- for (map::iterator p = commit->values.begin(); - p != commit->values.end(); - ++p) { - assert(p->first == last_committed+1); - last_committed = p->first; - dout(10) << " storing " << last_committed << " (" << p->second.length() << " bytes)" << endl; - mon->store->put_bl_sn(p->second, machine_name, last_committed); - } - mon->store->put_int(last_committed, machine_name, "last_committed"); - - delete commit; -} - -void Paxos::extend_lease() -{ - assert(mon->is_leader()); - assert(is_active()); - - lease_expire = g_clock.now(); - lease_expire += g_conf.mon_lease; - acked_lease.clear(); - acked_lease.insert(whoami); - - dout(7) << "extend_lease now+" << g_conf.mon_lease << " (" << lease_expire << ")" << endl; - - // bcast - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE, machine_id); - lease->last_committed = last_committed; - lease->lease_expire = lease_expire; - mon->messenger->send_message(lease, mon->monmap->get_inst(*p)); - } - - // set timeout event. - // if old timeout is still in place, leave it. - if (!lease_ack_timeout_event) { - lease_ack_timeout_event = new C_LeaseAckTimeout(this); - mon->timer.add_event_after(g_conf.mon_lease_ack_timeout, lease_ack_timeout_event); - } - - // set renew event - lease_renew_event = new C_LeaseRenew(this); - utime_t at = lease_expire; - at -= g_conf.mon_lease; - at += g_conf.mon_lease_renew_interval; - mon->timer.add_event_at(at, lease_renew_event); -} - - -// peon -void Paxos::handle_lease(MMonPaxos *lease) -{ - // sanity - if (!mon->is_peon() || - last_committed != lease->last_committed) { - dout(10) << "handle_lease i'm not a peon, or they're not the leader, or the last_committed doesn't match, dropping" << endl; - delete lease; - return; - } - - // extend lease - if (lease_expire < lease->lease_expire) - lease_expire = lease->lease_expire; - - state = STATE_ACTIVE; - - dout(10) << "handle_lease on " << lease->last_committed - << " now " << lease_expire << endl; - - // ack - MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK, machine_id); - ack->last_committed = last_committed; - ack->lease_expire = lease_expire; - mon->messenger->send_message(ack, lease->get_source_inst()); - - // (re)set timeout event. - if (lease_timeout_event) - mon->timer.cancel_event(lease_timeout_event); - lease_timeout_event = new C_LeaseTimeout(this); - mon->timer.add_event_after(g_conf.mon_lease_ack_timeout, lease_timeout_event); - - // kick waiters - finish_contexts(waiting_for_active); - if (is_readable()) - finish_contexts(waiting_for_readable); - - delete lease; -} - -void Paxos::handle_lease_ack(MMonPaxos *ack) -{ - int from = ack->get_source().num(); - - if (!lease_ack_timeout_event) { - dout(10) << "handle_lease_ack from " << ack->get_source() << " -- stray (probably since revoked)" << endl; - } - else if (acked_lease.count(from) == 0) { - acked_lease.insert(from); - - if (acked_lease == mon->get_quorum()) { - // yay! 
- dout(10) << "handle_lease_ack from " << ack->get_source() - << " -- got everyone" << endl; - mon->timer.cancel_event(lease_ack_timeout_event); - lease_ack_timeout_event = 0; - } else { - dout(10) << "handle_lease_ack from " << ack->get_source() - << " -- still need " - << mon->get_quorum().size() - acked_lease.size() - << " more" << endl; - } - } else { - dout(10) << "handle_lease_ack from " << ack->get_source() - << " dup (lagging!), ignoring" << endl; - } - - delete ack; -} - -void Paxos::lease_ack_timeout() -{ - dout(5) << "lease_ack_timeout -- calling new election" << endl; - assert(mon->is_leader()); - assert(is_active()); - - lease_ack_timeout_event = 0; - cancel_events(); - mon->call_election(); -} - -void Paxos::lease_timeout() -{ - dout(5) << "lease_timeout -- calling new election" << endl; - assert(mon->is_peon()); - - lease_timeout_event = 0; - cancel_events(); - mon->call_election(); -} - -void Paxos::lease_renew_timeout() -{ - lease_renew_event = 0; - extend_lease(); -} - - -/* - * return a globally unique, monotonically increasing proposal number - */ -version_t Paxos::get_new_proposal_number(version_t gt) -{ - if (last_pn < gt) - last_pn = gt; - - // update. make it unique among all monitors. - last_pn /= 100; - last_pn++; - last_pn *= 100; - last_pn += (version_t)whoami; - - // write - mon->store->put_int(last_pn, machine_name, "last_pn"); - - dout(10) << "get_new_proposal_number = " << last_pn << endl; - return last_pn; -} - - -void Paxos::cancel_events() -{ - if (accept_timeout_event) { - mon->timer.cancel_event(accept_timeout_event); - accept_timeout_event = 0; - } - if (lease_renew_event) { - mon->timer.cancel_event(lease_renew_event); - lease_renew_event = 0; - } - if (lease_ack_timeout_event) { - mon->timer.cancel_event(lease_ack_timeout_event); - lease_ack_timeout_event = 0; - } - if (lease_timeout_event) { - mon->timer.cancel_event(lease_timeout_event); - lease_timeout_event = 0; - } -} - -void Paxos::leader_init() -{ - if (mon->get_quorum().size() == 1) { - state = STATE_ACTIVE; - return; - } - cancel_events(); - state = STATE_RECOVERING; - lease_expire = utime_t(); - dout(10) << "leader_init -- starting paxos recovery" << endl; - collect(0); -} - -void Paxos::peon_init() -{ - cancel_events(); - state = STATE_RECOVERING; - lease_expire = utime_t(); - dout(10) << "peon_init -- i am a peon" << endl; - - // no chance to write now! - finish_contexts(waiting_for_writeable, -1); - finish_contexts(waiting_for_commit, -1); -} - -void Paxos::election_starting() -{ - dout(10) << "election_starting -- canceling timeouts" << endl; - cancel_events(); - new_value.clear(); - - finish_contexts(waiting_for_commit, -1); -} - - -void Paxos::dispatch(Message *m) -{ - // election in progress? 
- if (mon->is_starting()) { - dout(5) << "election in progress, dropping " << *m << endl; - delete m; - return; - } - - // check sanity - assert(mon->is_leader() || - (mon->is_peon() && m->get_source().num() == mon->get_leader())); - - switch (m->get_type()) { - - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // NOTE: these ops are defined in messages/MMonPaxos.h - switch (pm->op) { - // learner - case MMonPaxos::OP_COLLECT: - handle_collect(pm); - break; - case MMonPaxos::OP_LAST: - handle_last(pm); - break; - case MMonPaxos::OP_BEGIN: - handle_begin(pm); - break; - case MMonPaxos::OP_ACCEPT: - handle_accept(pm); - break; - case MMonPaxos::OP_COMMIT: - handle_commit(pm); - break; - case MMonPaxos::OP_LEASE: - handle_lease(pm); - break; - case MMonPaxos::OP_LEASE_ACK: - handle_lease_ack(pm); - break; - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - - - - -// ----------------- -// service interface - -// -- READ -- - -bool Paxos::is_readable() -{ - //dout(15) << "is_readable now=" << g_clock.now() << " lease_expire=" << lease_expire << endl; - return - (mon->is_peon() || mon->is_leader()) && - is_active() && - last_committed > 0 && // must have a value - (mon->get_quorum().size() == 1 || // alone, or - g_clock.now() < lease_expire); // have lease -} - -bool Paxos::read(version_t v, bufferlist &bl) -{ - if (!is_readable()) - return false; - - if (!mon->store->get_bl_sn(bl, machine_name, v)) - return false; - return true; -} - -version_t Paxos::read_current(bufferlist &bl) -{ - if (!is_readable()) - return 0; - if (read(last_committed, bl)) - return last_committed; - return 0; -} - - - - -// -- WRITE -- - -bool Paxos::is_writeable() -{ - if (mon->get_quorum().size() == 1) return true; - return - mon->is_leader() && - is_active() && - g_clock.now() < lease_expire; -} - -bool Paxos::propose_new_value(bufferlist& bl, Context *oncommit) -{ - /* - // writeable? - if (!is_writeable()) { - dout(5) << "propose_new_value " << last_committed+1 << " " << bl.length() << " bytes" - << " -- not writeable" << endl; - if (oncommit) { - oncommit->finish(-1); - delete oncommit; - } - return false; - } - */ - - assert(mon->is_leader() && is_active()); - - // cancel lease renewal and timeout events. - cancel_events(); - - // ok! - dout(5) << "propose_new_value " << last_committed+1 << " " << bl.length() << " bytes" << endl; - if (oncommit) - waiting_for_commit.push_back(oncommit); - begin(bl); - - return true; -} - diff --git a/branches/sage/pgs/mon/Paxos.h b/branches/sage/pgs/mon/Paxos.h deleted file mode 100644 index 403e6d6eeaf96..0000000000000 --- a/branches/sage/pgs/mon/Paxos.h +++ /dev/null @@ -1,250 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* -time----> - -cccccccccccccccccca???????????????????????????????????????? -cccccccccccccccccca???????????????????????????????????????? -cccccccccccccccccca???????????????????????????????????????? leader -cccccccccccccccccc????????????????????????????????????????? -ccccc?????????????????????????????????????????????????????? 
- -last_committed - -pn_from -pn - -a 12v -b 12v -c 14v -d -e 12v - - -*/ - - -/* - * NOTE: This libary is based on the Paxos algorithm, but varies in a few key ways: - * 1- Only a single new value is generated at a time, simplifying the recovery logic. - * 2- Nodes track "committed" values, and share them generously (and trustingly) - * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is safe to - * "read" their copy of the last committed value. - * - * This provides a simple replication substrate that services can be built on top of. - */ - -#ifndef __MON_PAXOS_H -#define __MON_PAXOS_H - -#include "include/types.h" -#include "mon_types.h" -#include "include/buffer.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; -class MMonPaxos; - - -// i am one state machine. -class Paxos { - Monitor *mon; - int whoami; - - // my state machine info - int machine_id; - const char *machine_name; - - friend class PaxosService; - - // LEADER+PEON - - // -- generic state -- -public: - const static int STATE_RECOVERING = 1; // leader|peon: recovering paxos state - const static int STATE_ACTIVE = 2; // leader|peon: idle. peon may or may not have valid lease - const static int STATE_UPDATING = 3; // leader|peon: updating to new value - const char *get_statename(int s) { - switch (s) { - case STATE_RECOVERING: return "recovering"; - case STATE_ACTIVE: return "active"; - case STATE_UPDATING: return "updating"; - default: assert(0); return 0; - } - } - -private: - int state; - -public: - bool is_recovering() { return state == STATE_RECOVERING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_updating() { return state == STATE_UPDATING; } - -private: - // recovery (phase 1) - version_t last_pn; - version_t last_committed; - version_t accepted_pn; - version_t accepted_pn_from; - - // active (phase 2) - utime_t lease_expire; - list waiting_for_active; - list waiting_for_readable; - - - // -- leader -- - // recovery (paxos phase 1) - unsigned num_last; - version_t uncommitted_v; - version_t uncommitted_pn; - bufferlist uncommitted_value; - - // active - set acked_lease; - Context *lease_renew_event; - Context *lease_ack_timeout_event; - Context *lease_timeout_event; - - // updating (paxos phase 2) - bufferlist new_value; - set accepted; - - Context *accept_timeout_event; - - list waiting_for_writeable; - list waiting_for_commit; - - class C_AcceptTimeout : public Context { - Paxos *paxos; - public: - C_AcceptTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->accept_timeout(); - } - }; - - class C_LeaseAckTimeout : public Context { - Paxos *paxos; - public: - C_LeaseAckTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_ack_timeout(); - } - }; - - class C_LeaseTimeout : public Context { - Paxos *paxos; - public: - C_LeaseTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_timeout(); - } - }; - - class C_LeaseRenew : public Context { - Paxos *paxos; - public: - C_LeaseRenew(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_renew_timeout(); - } - }; - - - void collect(version_t oldpn); - void handle_collect(MMonPaxos*); - void handle_last(MMonPaxos*); - void begin(bufferlist& value); - void handle_begin(MMonPaxos*); - void handle_accept(MMonPaxos*); - void accept_timeout(); - void commit(); - void handle_commit(MMonPaxos*); - void extend_lease(); - void handle_lease(MMonPaxos*); - void handle_lease_ack(MMonPaxos*); - - void lease_ack_timeout(); // on leader, if 
lease isn't acked by all peons - void lease_renew_timeout(); // on leader, to renew the lease - void lease_timeout(); // on peon, if lease isn't extended - - void cancel_events(); - - version_t get_new_proposal_number(version_t gt=0); - -public: - Paxos(Monitor *m, int w, - int mid) : mon(m), whoami(w), - machine_id(mid), - machine_name(get_paxos_name(mid)), - state(STATE_RECOVERING), - lease_renew_event(0), - lease_ack_timeout_event(0), - lease_timeout_event(0), - accept_timeout_event(0) { } - - void dispatch(Message *m); - - void init(); - - void election_starting(); - void leader_init(); - void peon_init(); - - - // -- service interface -- - void wait_for_active(Context *c) { - assert(!is_active()); - waiting_for_active.push_back(c); - } - - // read - version_t get_version() { return last_committed; } - bool is_readable(); - bool read(version_t v, bufferlist &bl); - version_t read_current(bufferlist &bl); - void wait_for_readable(Context *onreadable) { - assert(!is_readable()); - waiting_for_readable.push_back(onreadable); - } - - // write - bool is_leader(); - bool is_writeable(); - void wait_for_writeable(Context *c) { - assert(!is_writeable()); - waiting_for_writeable.push_back(c); - } - - bool propose_new_value(bufferlist& bl, Context *oncommit=0); - void wait_for_commit(Context *oncommit) { - waiting_for_commit.push_back(oncommit); - } - void wait_for_commit_front(Context *oncommit) { - waiting_for_commit.push_front(oncommit); - } - -}; - - - -#endif - diff --git a/branches/sage/pgs/mon/PaxosService.cc b/branches/sage/pgs/mon/PaxosService.cc deleted file mode 100644 index 6f4fba2d6c27d..0000000000000 --- a/branches/sage/pgs/mon/PaxosService.cc +++ /dev/null @@ -1,136 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "PaxosService.h" -#include "common/Clock.h" -#include "Monitor.h" - - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxosservice(" << get_paxos_name(paxos->machine_id) << ") " -//#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << "." << get_paxos_name(paxos->machine_id) << " " - - - - -void PaxosService::dispatch(Message *m) -{ - dout(10) << "dispatch " << *m << " from " << m->get_source_inst() << endl; - - // make sure our map is readable and up to date - if (!paxos->is_readable()) { - dout(10) << " waiting for paxos -> readable" << endl; - paxos->wait_for_readable(new C_RetryMessage(this, m)); - return; - } - - // make sure service has latest from paxos. - update_from_paxos(); - - // preprocess - if (preprocess_query(m)) - return; // easy! - - // leader? 
- if (!mon->is_leader()) { - // fw to leader - dout(10) << " fw to leader mon" << mon->get_leader() << endl; - mon->messenger->send_message(m, mon->monmap->get_inst(mon->get_leader())); - return; - } - - // writeable? - if (!paxos->is_writeable()) { - dout(10) << " waiting for paxos -> writeable" << endl; - paxos->wait_for_writeable(new C_RetryMessage(this, m)); - return; - } - - // update - if (prepare_update(m) && - should_propose_now()) - propose_pending(); -} - -void PaxosService::_commit() -{ - dout(7) << "_commit" << endl; - update_from_paxos(); // notify service of new paxos state - - if (mon->is_leader()) { - dout(7) << "_commit creating new pending" << endl; - assert(have_pending == false); - create_pending(); - have_pending = true; - } -} - - -void PaxosService::propose_pending() -{ - dout(10) << "propose_pending" << endl; - assert(have_pending); - - // finish and encode - bufferlist bl; - encode_pending(bl); - have_pending = false; - - // apply to paxos - paxos->wait_for_commit_front(new C_Commit(this)); - paxos->propose_new_value(bl); -} - - -void PaxosService::election_finished() -{ - dout(10) << "election_finished" << endl; - - if (have_pending && - !mon->is_leader()) { - discard_pending(); - have_pending = false; - } - - // make sure we update our state - if (paxos->is_active()) - _active(); - else - paxos->wait_for_active(new C_Active(this)); -} - -void PaxosService::_active() -{ - dout(10) << "_active" << endl; - assert(paxos->is_active()); - - // pull latest from paxos - update_from_paxos(); - - // create pending state? - if (mon->is_leader()) { - if (!have_pending) { - create_pending(); - have_pending = true; - } - - if (g_conf.mkfs && - paxos->get_version() == 0) { - create_initial(); - propose_pending(); - } - } -} diff --git a/branches/sage/pgs/mon/PaxosService.h b/branches/sage/pgs/mon/PaxosService.h deleted file mode 100644 index 32bcb3e4b11fb..0000000000000 --- a/branches/sage/pgs/mon/PaxosService.h +++ /dev/null @@ -1,91 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __PAXOSSERVICE_H -#define __PAXOSSERVICE_H - -#include "msg/Dispatcher.h" -#include "include/Context.h" - -class Monitor; -class Paxos; - -class PaxosService : public Dispatcher { -protected: - Monitor *mon; - Paxos *paxos; - - class C_RetryMessage : public Context { - PaxosService *svc; - Message *m; - public: - C_RetryMessage(PaxosService *s, Message *m_) : svc(s), m(m_) {} - void finish(int r) { - svc->dispatch(m); - } - }; - class C_Active : public Context { - PaxosService *svc; - public: - C_Active(PaxosService *s) : svc(s) {} - void finish(int r) { - if (r >= 0) - svc->_active(); - } - }; - class C_Commit : public Context { - PaxosService *svc; - public: - C_Commit(PaxosService *s) : svc(s) {} - void finish(int r) { - if (r >= 0) - svc->_commit(); - } - }; - friend class C_Update; - -private: - bool have_pending; - -public: - PaxosService(Monitor *mn, Paxos *p) : mon(mn), paxos(p), - have_pending(false) { } - - // i implement and you ignore - void dispatch(Message *m); - void election_finished(); - -private: - void _active(); - void _commit(); - -public: - // i implement and you use - void propose_pending(); // propose current pending as new paxos state - - // you implement - virtual bool update_from_paxos() = 0; // assimilate latest paxos state - virtual void create_pending() = 0; // [leader] create new pending structures - virtual void create_initial() = 0; // [leader] populate pending with initial state (1) - virtual void encode_pending(bufferlist& bl) = 0; // [leader] finish and encode pending for next paxos state - virtual void discard_pending() { } // [leader] discard pending - - virtual bool preprocess_query(Message *m) = 0; // true if processed (e.g., read-only) - virtual bool prepare_update(Message *m) = 0; - virtual bool should_propose_now() { return true; } - -}; - -#endif - diff --git a/branches/sage/pgs/mon/mon_types.h b/branches/sage/pgs/mon/mon_types.h deleted file mode 100644 index 8d1ac92822356..0000000000000 --- a/branches/sage/pgs/mon/mon_types.h +++ /dev/null @@ -1,35 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MON_TYPES_H -#define __MON_TYPES_H - -#define PAXOS_TEST 0 -#define PAXOS_MDSMAP 1 -#define PAXOS_OSDMAP 2 -#define PAXOS_CLIENTMAP 3 -#define PAXOS_PGMAP 4 - -inline const char *get_paxos_name(int p) { - switch (p) { - case PAXOS_TEST: return "test"; - case PAXOS_MDSMAP: return "mdsmap"; - case PAXOS_OSDMAP: return "osdmap"; - case PAXOS_CLIENTMAP: return "clientmap"; - case PAXOS_PGMAP: return "pgmap"; - default: assert(0); return 0; - } -} - -#endif diff --git a/branches/sage/pgs/msg/Dispatcher.cc b/branches/sage/pgs/msg/Dispatcher.cc deleted file mode 100644 index 4fa04d7d4c92a..0000000000000 --- a/branches/sage/pgs/msg/Dispatcher.cc +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - - -#include "Dispatcher.h" -#include "Messenger.h" - -#include "mds/MDS.h" - -/* -int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port) -{ - assert(0); - //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER); // on my port! -} -*/ diff --git a/branches/sage/pgs/msg/Dispatcher.h b/branches/sage/pgs/msg/Dispatcher.h deleted file mode 100644 index 0a77de3d20369..0000000000000 --- a/branches/sage/pgs/msg/Dispatcher.h +++ /dev/null @@ -1,34 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __DISPATCHER_H -#define __DISPATCHER_H - -#include "Message.h" - -class Messenger; - -class Dispatcher { - public: - virtual ~Dispatcher() { } - - // how i receive messages - virtual void dispatch(Message *m) = 0; - - // how i deal with transmission failures. - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst) { delete m; } -}; - -#endif diff --git a/branches/sage/pgs/msg/FakeMessenger.cc b/branches/sage/pgs/msg/FakeMessenger.cc deleted file mode 100644 index 62e347b02c89d..0000000000000 --- a/branches/sage/pgs/msg/FakeMessenger.cc +++ /dev/null @@ -1,409 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Message.h" -#include "FakeMessenger.h" -#include "mds/MDS.h" - -#include "common/Timer.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" - -#undef dout -#define dout(x) if ((x) <= g_conf.debug_ms) cout << g_clock.now() << " " - - - -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - - -#include "common/Cond.h" -#include "common/Mutex.h" -#include - - -// global queue. 
- -int nranks = 0; // this identify each entity_inst_t - -map directory; -hash_map loggers; -LogType fakemsg_logtype; - -set shutdown_set; - -Mutex lock; -Cond cond; - -bool awake = false; -bool fm_shutdown = false; -pthread_t thread_id; - -extern std::map g_fake_kill_after; // in config.cc -utime_t start_time; -map fail_queue; -list sent_to_failed_queue; - -void *fakemessenger_thread(void *ptr) -{ - start_time = g_clock.now(); - - lock.Lock(); - while (1) { - if (fm_shutdown) break; - fakemessenger_do_loop_2(); - - if (directory.empty()) break; - - dout(20) << "thread waiting" << endl; - if (fm_shutdown) break; - awake = false; - cond.Wait(lock); - awake = true; - dout(20) << "thread woke up" << endl; - } - lock.Unlock(); - - dout(1) << "thread finish (i woke up but no messages, bye)" << endl; - return 0; -} - - -void fakemessenger_startthread() { - pthread_create(&thread_id, NULL, fakemessenger_thread, 0); -} - -void fakemessenger_stopthread() { - cout << "fakemessenger_stopthread setting stop flag" << endl; - lock.Lock(); - fm_shutdown = true; - lock.Unlock(); - cond.Signal(); - - fakemessenger_wait(); -} - -void fakemessenger_wait() -{ - cout << "fakemessenger_wait waiting" << endl; - void *ptr; - pthread_join(thread_id, &ptr); -} - - -// fake failure - - - -// lame main looper - -int fakemessenger_do_loop() -{ - lock.Lock(); - fakemessenger_do_loop_2(); - lock.Unlock(); - - g_timer.shutdown(); - return 0; -} - - -int fakemessenger_do_loop_2() -{ - //lock.Lock(); - dout(18) << "do_loop begin." << endl; - - while (1) { - bool didone = false; - - dout(18) << "do_loop top" << endl; - - // fail_queue - while (!fail_queue.empty() && - fail_queue.begin()->first < g_clock.now()) { - entity_name_t nm = fail_queue.begin()->second; - fail_queue.erase(fail_queue.begin()); - - dout(0) << "MUST FAKE KILL " << nm << endl; - - for (map::iterator p = directory.begin(); - p != directory.end(); - ++p) { - if (p->second->get_myname() == nm) { - dout(0) << "FAKING FAILURE of " << nm << " at " << p->first << endl; - directory.erase(p); - p->second->failed = true; - break; - } - } - } - - list ls; - ls.swap(sent_to_failed_queue); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - Message *m = *p; - FakeMessenger *mgr = directory[m->get_source_addr()]; - Dispatcher *dis = 0; - if (mgr) dis = mgr->get_dispatcher(); - if (dis) { - dout(1) << "fail on " << *m - << " to " << m->get_dest() << " from " << m->get_source() - << ", passing back to sender." << endl; - dis->ms_handle_failure(m, m->get_dest_inst()); - } else { - dout(1) << "fail on " << *m - << " to " << m->get_dest() << " from " << m->get_source() - << ", sender gone, dropping." 
<< endl; - delete m; - } - } - - // messages - map::iterator it = directory.begin(); - while (it != directory.end()) { - FakeMessenger *mgr = it->second; - - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has " << mgr->num_incoming() << " queued" << endl; - - if (!mgr->is_ready()) { - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << endl; - it++; - continue; - } - - Message *m = mgr->get_message(); - it++; - - if (m) { - //dout(18) << "got " << m << endl; - dout(1) << "==== " << m->get_dest() - << " <- " << m->get_source() - << " ==== " << *m - << " ---- " << m - << endl; - - if (g_conf.fakemessenger_serialize) { - // encode - if (m->empty_payload()) - m->encode_payload(); - msg_envelope_t env = m->get_envelope(); - bufferlist bl; - bl.claim( m->get_payload() ); - //bl.c_str(); // condense into 1 buffer - - delete m; - - // decode - m = decode_message(env, bl); - assert(m); - } - - didone = true; - - lock.Unlock(); - mgr->dispatch(m); - lock.Lock(); - } - } - - // deal with shutdowns.. delayed to avoid concurrent directory modification - if (!shutdown_set.empty()) { - for (set::iterator it = shutdown_set.begin(); - it != shutdown_set.end(); - it++) { - dout(7) << "fakemessenger: removing " << *it << " from directory" << endl; - assert(directory.count(*it)); - directory.erase(*it); - if (directory.empty()) { - dout(1) << "fakemessenger: last shutdown" << endl; - ::fm_shutdown = true; - } - } - shutdown_set.clear(); - } - - if (!didone) - break; - } - - - dout(18) << "do_loop end (no more messages)." << endl; - //lock.Unlock(); - return 0; -} - - -FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me) -{ - failed = false; - - lock.Lock(); - { - // assign rank - _myinst.name = me; - _myinst.addr.port = nranks++; - //if (!me.is_mon()) - //_myinst.addr.nonce = getpid(); - - // add to directory - directory[ _myinst.addr ] = this; - - // put myself in the fail queue? - if (g_fake_kill_after.count(me)) { - utime_t w = start_time; - w += g_fake_kill_after[me]; - dout(0) << "will fake failure of " << me << " at " << w << endl; - fail_queue[w] = me; - } - } - lock.Unlock(); - - - cout << "fakemessenger " << get_myname() << " messenger is " << this << " at " << _myinst << endl; - - qlen = 0; - - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(myaddr); - int w = MSG_ADDR_NUM(myaddr); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype); - */ -} - -FakeMessenger::~FakeMessenger() -{ - // hose any undelivered messages - for (list::iterator p = incoming.begin(); - p != incoming.end(); - ++p) - delete *p; -} - - -int FakeMessenger::shutdown() -{ - //cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl; - lock.Lock(); - assert(directory.count(_myinst.addr) == 1); - shutdown_set.insert(_myinst.addr); - - /* - if (loggers[myaddr]) { - delete loggers[myaddr]; - loggers.erase(myaddr); - } - */ - - lock.Unlock(); - return 0; -} - - -void FakeMessenger::reset_myname(entity_name_t m) -{ - dout(1) << "reset_myname from " << get_myname() << " to " << m << endl; - _set_myname(m); - - directory.erase(_myinst.addr); - _myinst.name = m; - directory[_myinst.addr] = this; - - // put myself in the fail queue? 
- if (g_fake_kill_after.count(m)) { - utime_t w = start_time; - w += g_fake_kill_after[m]; - dout(0) << "will fake failure of " << m << " at " << w << endl; - fail_queue[w] = m; - } - -} - - -int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fromport) -{ - entity_name_t dest = inst.name; - - m->set_source(get_myname(), fromport); - m->set_source_addr(get_myaddr()); - - m->set_dest(inst.name, port); - - lock.Lock(); - -#ifdef LOG_MESSAGES - // stats - loggers[get_myaddr()]->inc("+send",1); - loggers[dest]->inc("-recv",1); - - char s[20]; - sprintf(s,"+%s", m->get_type_name()); - loggers[get_myaddr()]->inc(s); - sprintf(s,"-%s", m->get_type_name()); - loggers[dest]->inc(s); -#endif - - // queue - if (directory.count(inst.addr) && - shutdown_set.count(inst.addr) == 0) { - dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << endl; - directory[inst.addr]->queue_incoming(m); - } else { - dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m << " -- " << m - << " *** destination DNE ***" - << endl; - for (map::iterator p = directory.begin(); - p != directory.end(); - ++p) { - dout(20) << "** have " << p->first << " to " << p->second << endl; - } - - // do the failure callback - sent_to_failed_queue.push_back(m); - } - - // wake up loop? - if (!awake) { - dout(10) << "waking up fakemessenger thread" << endl; - cond.Signal(); - lock.Unlock(); - } else - lock.Unlock(); - - return 0; -} - - diff --git a/branches/sage/pgs/msg/FakeMessenger.h b/branches/sage/pgs/msg/FakeMessenger.h deleted file mode 100644 index ae622e8e58dbd..0000000000000 --- a/branches/sage/pgs/msg/FakeMessenger.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __FAKEMESSENGER_H -#define __FAKEMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#include -#include - -class Timer; - -class FakeMessenger : public Messenger { - protected: - class Logger *logger; - - int qlen; - list incoming; // incoming queue - - entity_inst_t _myinst; - - public: - bool failed; - - FakeMessenger(entity_name_t me); - ~FakeMessenger(); - - virtual int shutdown(); - - const entity_inst_t& get_myinst() { - return _myinst; - }; - const entity_addr_t& get_myaddr() { - return _myinst.addr; - } - - void reset_myname(entity_name_t m); - - // msg interface - virtual int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0); - - // events - //virtual void trigger_timer(Timer *t); - - int get_dispatch_queue_len() { return qlen; } - - // -- incoming queue -- - // (that nothing uses) - Message *get_message() { - if (!incoming.empty()) { - Message *m = incoming.front(); - incoming.pop_front(); - qlen--; - return m; - } - return NULL; - } - bool queue_incoming(Message *m) { - incoming.push_back(m); - qlen++; - return true; - } - int num_incoming() { - //return incoming.size(); - return qlen; - } - - void suicide() { - if (!failed) { - failed = true; - } - } - -}; - -int fakemessenger_do_loop(); -int fakemessenger_do_loop_2(); -void fakemessenger_startthread(); -void fakemessenger_stopthread(); -void fakemessenger_wait(); - -#endif diff --git a/branches/sage/pgs/msg/HostMonitor.cc b/branches/sage/pgs/msg/HostMonitor.cc deleted file mode 100644 index 969edadd424d6..0000000000000 --- a/branches/sage/pgs/msg/HostMonitor.cc +++ /dev/null @@ -1,236 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "HostMonitor.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MFailure.h" -#include "messages/MFailureAck.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#define DBL 10 - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << whoami << " hostmon: " - - -// timer contexts - -class C_HM_InitiateHeartbeat : public Context { - HostMonitor *hm; -public: - C_HM_InitiateHeartbeat(HostMonitor *hm) { - this->hm = hm; - } - void finish(int r) { - //cout << "HEARTBEAT" << endl; - hm->pending_events.erase(this); - hm->initiate_heartbeat(); - } -}; - -class C_HM_CheckHeartbeat : public Context { - HostMonitor *hm; -public: - C_HM_CheckHeartbeat(HostMonitor *hm) { - this->hm = hm; - } - void finish(int r) { - //cout << "CHECK" << endl; - hm->pending_events.erase(this); - hm->check_heartbeat(); - } -}; - - - -// startup/shutdown - -void HostMonitor::init() -{ - dout(DBL) << "init" << endl; - - // hack params for now - heartbeat_interval = 10; - max_ping_time = 2; - max_heartbeat_misses = 3; - notify_retry_interval = 10; - - // schedule first hb - schedule_heartbeat(); -} - - -void HostMonitor::shutdown() -{ - // cancel any events - for (set::iterator it = pending_events.begin(); - it != pending_events.end(); - it++) { - g_timer.cancel_event(*it); - delete *it; - } - pending_events.clear(); -} - - -// schedule next heartbeat - -void HostMonitor::schedule_heartbeat() -{ - dout(DBL) << "schedule_heartbeat" << endl; - Context *e = new C_HM_InitiateHeartbeat(this); - pending_events.insert(e); - g_timer.add_event_after(heartbeat_interval, e); -} - - -// take note of a live host - -void HostMonitor::host_is_alive(entity_name_t host) -{ - if (hosts.count(host)) - status[host].last_heard_from = g_clock.gettime(); -} - - -// do heartbeat - -void HostMonitor::initiate_heartbeat() -{ - time_t now = g_clock.gettime(); - - // send out pings - inflight_pings.clear(); - for (set::iterator it = hosts.begin(); - it != hosts.end(); - it++) { - // have i heard from them recently? - if (now - status[*it].last_heard_from < heartbeat_interval) { - dout(DBL) << "skipping " << *it << ", i heard from them recently" << endl; - } else { - dout(DBL) << "pinging " << *it << endl; - status[*it].last_pinged = now; - inflight_pings.insert(*it); - - messenger->send_message(new MPing(1), *it, 0); - } - } - - // set timer to check results - Context *e = new C_HM_CheckHeartbeat(this); - pending_events.insert(e); - g_timer.add_event_after(max_ping_time, e); - dout(10) << "scheduled check " << e << endl; - - schedule_heartbeat(); // schedule next heartbeat -} - - -// check results - -void HostMonitor::check_heartbeat() -{ - dout(DBL) << "check_heartbeat()" << endl; - - // check inflight pings - for (set::iterator it = inflight_pings.begin(); - it != inflight_pings.end(); - it++) { - status[*it].num_heartbeats_missed++; - - dout(DBL) << "no response from " << *it << " for " << status[*it].num_heartbeats_missed << " beats" << endl; - - if (status[*it].num_heartbeats_missed >= max_heartbeat_misses) { - if (acked_failures.count(*it)) { - dout(DBL) << *it << " is already failed" << endl; - } else { - if (unacked_failures.count(*it)) { - dout(DBL) << *it << " is already failed, but unacked, sending another failure message" << endl; - } else { - dout(DBL) << "failing " << *it << endl; - unacked_failures.insert(*it); - } - - /*if (false) // do this in NewMessenger for now! 
FIXME - for (set::iterator nit = notify.begin(); - nit != notify.end(); - nit++) { - messenger->send_message(new MFailure(*it, messenger->get_inst(*it)), - *nit, notify_port, 0); - } - */ - } - } - } - - // forget about the pings. - inflight_pings.clear(); -} - - -// incoming messages - -void HostMonitor::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_FAILURE_ACK: - handle_failure_ack((MFailureAck*)m); - break; - - } -} - -void HostMonitor::handle_ping_ack(MPingAck *m) -{ - entity_name_t from = m->get_source(); - - dout(DBL) << "ping ack from " << from << endl; - status[from].last_pinged = g_clock.gettime(); - status[from].num_heartbeats_missed = 0; - inflight_pings.erase(from); - - delete m; -} - -void HostMonitor::handle_failure_ack(MFailureAck *m) -{ - - // FIXME: this doesn't handle failed -> alive transitions gracefully at all.. - - // the higher-up's acknowledged our failure notification, we can stop resending it. - entity_name_t failed = m->get_failed(); - dout(DBL) << "handle_failure_ack " << failed << endl; - unacked_failures.erase(failed); - acked_failures.insert(failed); - - delete m; -} - - diff --git a/branches/sage/pgs/msg/HostMonitor.h b/branches/sage/pgs/msg/HostMonitor.h deleted file mode 100644 index 35334b7f6a61f..0000000000000 --- a/branches/sage/pgs/msg/HostMonitor.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __HOSTMONITOR_H -#define __HOSTMONITOR_H - -#include - -#include -#include -using namespace std; - -#include "include/Context.h" -#include "msg/Message.h" - -class Message; -class Messenger; - -typedef struct { - time_t last_heard_from; - time_t last_pinged; - int num_heartbeats_missed; -} monitor_rec_t; - -class HostMonitor { - Messenger *messenger; - string whoami; - - // hosts i monitor - set hosts; - - // who i tell when they fail - set notify; - int notify_port; - - // their status - map status; - - set inflight_pings; // pings we sent that haven't replied yet - - set unacked_failures; // failed hosts that haven't been acked yet. - set acked_failures; // these failures have been acked. 
- - float heartbeat_interval; // how often to do a heartbeat - float max_ping_time; // how long before it's a miss - int max_heartbeat_misses; // how many misses before i tell - float notify_retry_interval; // how often to retry failure notification - - public: - set pending_events; - - private: - void schedule_heartbeat(); - - public: - HostMonitor(Messenger *m, string& whoami) { - this->messenger = m; - this->whoami = whoami; - notify_port = 0; - } - set& get_hosts() { return hosts; } - set& get_notify() { return notify; } - void set_notify_port(int p) { notify_port = p; } - - void remove_host(entity_name_t h) { - hosts.erase(h); - status.erase(h); - unacked_failures.erase(h); - acked_failures.erase(h); - } - - void init(); - void shutdown(); - - void host_is_alive(entity_name_t who); - - void proc_message(Message *m); - void handle_ping_ack(class MPingAck *m); - void handle_failure_ack(class MFailureAck *m); - - void initiate_heartbeat(); - void check_heartbeat(); - -}; - -#endif diff --git a/branches/sage/pgs/msg/Message.cc b/branches/sage/pgs/msg/Message.cc deleted file mode 100644 index d6363a4c2ad11..0000000000000 --- a/branches/sage/pgs/msg/Message.cc +++ /dev/null @@ -1,345 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include -using namespace std; - -#include "include/types.h" - -#include "Message.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" -#include "messages/MMonPaxos.h" - -#include "messages/MMonElection.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -//#include "messages/MFailure.h" -//#include "messages/MFailureAck.h" - -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" -#include "messages/MClientSession.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" -#include "messages/MMDSResolve.h" -#include "messages/MMDSResolveAck.h" -#include "messages/MMDSCacheRejoin.h" -//#include "messages/MMDSCacheRejoinAck.h" - -#include "messages/MDirUpdate.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirCancel.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" -#include "messages/MExportDirWarningAck.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirAck.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MDentryUnlink.h" - -#include "messages/MHeartbeat.h" - -#include "messages/MAnchor.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MCacheExpire.h" 
-#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "messenger: " -#define DEBUGLVL 10 // debug level of output - - - - - - - -Message * -decode_message(msg_envelope_t& env, bufferlist& payload) -{ - // make message - Message *m = 0; - switch(env.type) { - - // -- with payload -- - - case MSG_MON_COMMAND: - m = new MMonCommand; - break; - case MSG_MON_COMMAND_ACK: - m = new MMonCommandAck; - break; - case MSG_MON_PAXOS: - m = new MMonPaxos; - break; - - case MSG_MON_ELECTION: - m = new MMonElection; - break; - - case MSG_PING: - m = new MPing(); - break; - case MSG_PING_ACK: - m = new MPingAck(); - break; - /* - case MSG_FAILURE: - m = new MFailure(); - break; - case MSG_FAILURE_ACK: - m = new MFailureAck(); - break; - */ - - case MSG_OSD_BOOT: - m = new MOSDBoot(); - break; - case MSG_OSD_IN: - m = new MOSDIn(); - break; - case MSG_OSD_OUT: - m = new MOSDOut(); - break; - case MSG_OSD_FAILURE: - m = new MOSDFailure(); - break; - case MSG_OSD_PING: - m = new MOSDPing(); - break; - case MSG_OSD_OP: - m = new MOSDOp(); - break; - case MSG_OSD_OPREPLY: - m = new MOSDOpReply(); - break; - - case MSG_OSD_MAP: - m = new MOSDMap(); - break; - case MSG_OSD_GETMAP: - m = new MOSDGetMap(); - break; - - case MSG_OSD_PG_NOTIFY: - m = new MOSDPGNotify(); - break; - case MSG_OSD_PG_QUERY: - m = new MOSDPGQuery(); - break; - case MSG_OSD_PG_LOG: - m = new MOSDPGLog(); - break; - case MSG_OSD_PG_REMOVE: - m = new MOSDPGRemove(); - break; - - // clients - case MSG_CLIENT_MOUNT: - m = new MClientMount; - break; - case MSG_CLIENT_UNMOUNT: - m = new MClientUnmount; - break; - case MSG_CLIENT_SESSION: - m = new MClientSession; - break; - case MSG_CLIENT_RECONNECT: - m = new MClientReconnect; - break; - case MSG_CLIENT_REQUEST: - m = new MClientRequest; - break; - case MSG_CLIENT_REQUEST_FORWARD: - m = new MClientRequestForward; - break; - case MSG_CLIENT_REPLY: - m = new MClientReply; - break; - case MSG_CLIENT_FILECAPS: - m = new MClientFileCaps; - break; - - // mds - case MSG_MDS_SLAVE_REQUEST: - m = new MMDSSlaveRequest; - break; - - case MSG_MDS_GETMAP: - m = new MMDSGetMap(); - break; - case MSG_MDS_MAP: - m = new MMDSMap(); - break; - case MSG_MDS_BEACON: - m = new MMDSBeacon; - break; - case MSG_MDS_RESOLVE: - m = new MMDSResolve; - break; - case MSG_MDS_RESOLVEACK: - m = new MMDSResolveAck; - break; - case MSG_MDS_CACHEREJOIN: - m = new MMDSCacheRejoin; - break; - /* - case MSG_MDS_CACHEREJOINACK: - m = new MMDSCacheRejoinAck; - break; - */ - - case MSG_MDS_DIRUPDATE: - m = new MDirUpdate(); - break; - - case MSG_MDS_DISCOVER: - m = new MDiscover(); - break; - case MSG_MDS_DISCOVERREPLY: - m = new MDiscoverReply(); - break; - - case MSG_MDS_EXPORTDIRDISCOVER: - m = new MExportDirDiscover(); - break; - case MSG_MDS_EXPORTDIRDISCOVERACK: - m = new MExportDirDiscoverAck(); - break; - case MSG_MDS_EXPORTDIRCANCEL: - m = new MExportDirCancel(); - break; - - case MSG_MDS_EXPORTDIR: - m = new MExportDir; - break; - case MSG_MDS_EXPORTDIRACK: - m = new MExportDirAck; - break; - case MSG_MDS_EXPORTDIRFINISH: - m = new MExportDirFinish; - break; - - case MSG_MDS_EXPORTDIRNOTIFY: - m = new MExportDirNotify(); - break; - - case MSG_MDS_EXPORTDIRNOTIFYACK: - m = new MExportDirNotifyAck(); - break; - - case MSG_MDS_EXPORTDIRPREP: - m = new MExportDirPrep(); - break; - - case MSG_MDS_EXPORTDIRPREPACK: - m = new MExportDirPrepAck(); - break; - - case MSG_MDS_EXPORTDIRWARNING: - m = new MExportDirWarning; - break; - case 
MSG_MDS_EXPORTDIRWARNINGACK: - m = new MExportDirWarningAck; - break; - - - - case MSG_MDS_DENTRYUNLINK: - m = new MDentryUnlink(); - break; - - case MSG_MDS_HEARTBEAT: - m = new MHeartbeat(); - break; - - case MSG_MDS_CACHEEXPIRE: - m = new MCacheExpire(); - break; - - case MSG_MDS_ANCHOR: - m = new MAnchor(); - break; - - /* case MSG_MDS_INODEUPDATE: - m = new MInodeUpdate(); - break; - */ - - case MSG_MDS_INODEFILECAPS: - m = new MInodeFileCaps(); - break; - - case MSG_MDS_LOCK: - m = new MLock(); - break; - - - // -- simple messages without payload -- - - case MSG_CLOSE: - case MSG_SHUTDOWN: - case MSG_MDS_SHUTDOWNSTART: - case MSG_MDS_SHUTDOWNFINISH: - case MSG_OSD_MKFS_ACK: - m = new MGenericMessage(env.type); - break; - - default: - dout(1) << "can't decode unknown message type " << env.type << endl; - assert(0); - } - - // env - m->set_envelope(env); - - // decode - m->set_payload(payload); - m->decode_payload(); - - // done! - return m; -} - - diff --git a/branches/sage/pgs/msg/Message.h b/branches/sage/pgs/msg/Message.h deleted file mode 100644 index 58f5da03f3943..0000000000000 --- a/branches/sage/pgs/msg/Message.h +++ /dev/null @@ -1,259 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MESSAGE_H -#define __MESSAGE_H - -#define MSG_CLOSE 0 - -#define MSG_PING 10 -#define MSG_PING_ACK 11 - -#define MSG_SHUTDOWN 99999 - -#define MSG_MON_COMMAND 13 -#define MSG_MON_COMMAND_ACK 14 - - -#define MSG_MON_ELECTION 15 - -#define MSG_MON_OSDMAP_INFO 20 -#define MSG_MON_OSDMAP_LEASE 21 -#define MSG_MON_OSDMAP_LEASE_ACK 22 -#define MSG_MON_OSDMAP_UPDATE_PREPARE 23 -#define MSG_MON_OSDMAP_UPDATE_ACK 24 -#define MSG_MON_OSDMAP_UPDATE_COMMIT 25 - -#define MSG_MON_PAXOS 30 - -#define MSG_OSD_OP 40 // delete, etc. -#define MSG_OSD_OPREPLY 41 // delete, etc. 
-#define MSG_OSD_PING 42 - -#define MSG_OSD_GETMAP 43 -#define MSG_OSD_MAP 44 - -#define MSG_OSD_BOOT 45 -#define MSG_OSD_MKFS_ACK 46 - -#define MSG_OSD_FAILURE 47 - -#define MSG_OSD_IN 48 -#define MSG_OSD_OUT 49 - - - -#define MSG_OSD_PG_NOTIFY 50 -#define MSG_OSD_PG_QUERY 51 -#define MSG_OSD_PG_SUMMARY 52 -#define MSG_OSD_PG_LOG 53 -#define MSG_OSD_PG_REMOVE 54 - -// -- client -- -// to monitor -#define MSG_CLIENT_MOUNT 60 -#define MSG_CLIENT_UNMOUNT 61 - -// to mds -#define MSG_CLIENT_SESSION 70 // start or stop -#define MSG_CLIENT_RECONNECT 71 - -#define MSG_CLIENT_REQUEST 80 -#define MSG_CLIENT_REQUEST_FORWARD 81 -#define MSG_CLIENT_REPLY 82 -#define MSG_CLIENT_FILECAPS 83 - - - -// *** MDS *** - -#define MSG_MDS_GETMAP 102 -#define MSG_MDS_MAP 103 -#define MSG_MDS_HEARTBEAT 104 // for mds load balancer -#define MSG_MDS_BEACON 105 // to monitor - -#define MSG_MDS_RESOLVE 106 -#define MSG_MDS_RESOLVEACK 107 - -#define MSG_MDS_CACHEREJOIN 108 - -#define MSG_MDS_DISCOVER 110 -#define MSG_MDS_DISCOVERREPLY 111 - -#define MSG_MDS_INODEGETREPLICA 112 -#define MSG_MDS_INODEGETREPLICAACK 113 - -#define MSG_MDS_INODEFILECAPS 115 - -#define MSG_MDS_INODEUPDATE 120 -#define MSG_MDS_DIRUPDATE 121 -#define MSG_MDS_INODEEXPIRE 122 -#define MSG_MDS_DIREXPIRE 123 - -#define MSG_MDS_DIREXPIREREQ 124 - -#define MSG_MDS_CACHEEXPIRE 125 - -#define MSG_MDS_ANCHOR 130 - -#define MSG_MDS_EXPORTDIRDISCOVER 149 -#define MSG_MDS_EXPORTDIRDISCOVERACK 150 -#define MSG_MDS_EXPORTDIRCANCEL 151 -#define MSG_MDS_EXPORTDIRPREP 152 -#define MSG_MDS_EXPORTDIRPREPACK 153 -#define MSG_MDS_EXPORTDIRWARNING 154 -#define MSG_MDS_EXPORTDIRWARNINGACK 155 -#define MSG_MDS_EXPORTDIR 156 -#define MSG_MDS_EXPORTDIRACK 157 -#define MSG_MDS_EXPORTDIRNOTIFY 158 -#define MSG_MDS_EXPORTDIRNOTIFYACK 159 -#define MSG_MDS_EXPORTDIRFINISH 160 - -#define MSG_MDS_SLAVE_REQUEST 170 - -#define MSG_MDS_DENTRYUNLINK 200 - -#define MSG_MDS_LOCK 500 - -#define MSG_MDS_SHUTDOWNSTART 900 -#define MSG_MDS_SHUTDOWNFINISH 901 - - -#include -#include - -#include -#include -using std::list; - -#include - - -#include "include/types.h" -#include "include/buffer.h" -#include "msg_types.h" - - - - -// ====================================================== - -// abstract Message class - - - -typedef struct { - int type; - entity_inst_t src, dst; - int source_port, dest_port; - int nchunks; -} msg_envelope_t; - -#define MSG_ENVELOPE_LEN sizeof(msg_envelope_t) - - -class Message { - private: - - protected: - msg_envelope_t env; // envelope - bufferlist payload; // payload - - friend class Messenger; -public: - - public: - Message() { - env.source_port = env.dest_port = -1; - env.nchunks = 0; - }; - Message(int t) { - env.source_port = env.dest_port = -1; - env.nchunks = 0; - env.type = t; - } - virtual ~Message() { - } - - - // for rpc-type procedural messages (pcid = procedure call id) - virtual long get_pcid() { return 0; } - virtual void set_pcid(long t) { assert(0); } // overload me - - void clear_payload() { payload.clear(); } - bool empty_payload() { return payload.length() == 0; } - bufferlist& get_payload() { - return payload; - } - void set_payload(bufferlist& bl) { - payload.claim(bl); - } - void copy_payload(bufferlist& bl) { - payload = bl; - } - msg_envelope_t& get_envelope() { - return env; - } - void set_envelope(msg_envelope_t& env) { - this->env = env; - } - - - // ENVELOPE ---- - - // type - int get_type() { return env.type; } - void set_type(int t) { env.type = t; } - virtual char *get_type_name() = 0; - - // source/dest - entity_inst_t& 
get_dest_inst() { return env.dst; } - void set_dest_inst(entity_inst_t& inst) { env.dst = inst; } - - entity_inst_t& get_source_inst() { return env.src; } - void set_source_inst(entity_inst_t& inst) { env.src = inst; } - - entity_name_t& get_dest() { return env.dst.name; } - void set_dest(entity_name_t a, int p) { env.dst.name = a; env.dest_port = p; } - int get_dest_port() { return env.dest_port; } - void set_dest_port(int p) { env.dest_port = p; } - - entity_name_t& get_source() { return env.src.name; } - void set_source(entity_name_t a, int p) { env.src.name = a; env.source_port = p; } - int get_source_port() { return env.source_port; } - - entity_addr_t& get_source_addr() { return env.src.addr; } - void set_source_addr(const entity_addr_t &i) { env.src.addr = i; } - - // PAYLOAD ---- - void reset_payload() { - payload.clear(); - } - - virtual void decode_payload() = 0; - virtual void encode_payload() = 0; - - virtual void print(ostream& out) { - out << get_type_name(); - } - -}; - -extern Message *decode_message(msg_envelope_t &env, bufferlist& bl); -inline ostream& operator<<(ostream& out, Message& m) { - m.print(out); - return out; -} - -#endif diff --git a/branches/sage/pgs/msg/Messenger.cc b/branches/sage/pgs/msg/Messenger.cc deleted file mode 100644 index 5af83462b2995..0000000000000 --- a/branches/sage/pgs/msg/Messenger.cc +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "include/types.h" - -#include "Message.h" -#include "Messenger.h" -#include "messages/MGenericMessage.h" - -#include -#include -using namespace std; - - -// --------- -// incoming messages - -void Messenger::dispatch(Message *m) -{ - assert(dispatcher); - dispatcher->dispatch(m); -} - - - diff --git a/branches/sage/pgs/msg/Messenger.h b/branches/sage/pgs/msg/Messenger.h deleted file mode 100644 index be60c5061b086..0000000000000 --- a/branches/sage/pgs/msg/Messenger.h +++ /dev/null @@ -1,88 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MESSENGER_H -#define __MESSENGER_H - -#include -using namespace std; - -#include "Message.h" -#include "Dispatcher.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "include/Context.h" - - - -class MDS; -class Timer; - -class Messenger { - private: - Dispatcher *dispatcher; - entity_name_t _myname; - - public: - Messenger(entity_name_t w) : dispatcher(0), _myname(w) { } - virtual ~Messenger() { } - - // accessors - entity_name_t get_myname() { return _myname; } - void _set_myname(entity_name_t m) { _myname = m; } - - virtual void reset_myname(entity_name_t m) = 0; - - virtual const entity_addr_t &get_myaddr() = 0; - - entity_inst_t get_myinst() { return entity_inst_t(_myname, get_myaddr()); } - - // hrmpf. 
- virtual int get_dispatch_queue_len() { return 0; }; - - // setup - void set_dispatcher(Dispatcher *d) { dispatcher = d; ready(); } - Dispatcher *get_dispatcher() { return dispatcher; } - virtual void ready() { } - bool is_ready() { return dispatcher != 0; } - - // dispatch incoming messages - virtual void dispatch(Message *m) { - assert(dispatcher); - dispatcher->dispatch(m); - } - - // shutdown - virtual int shutdown() = 0; - virtual void suicide() = 0; - - // send message - virtual void prepare_dest(const entity_addr_t& addr) {} - virtual int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0) = 0; - - // make a procedure call - //virtual Message* sendrecv(Message *m, msg_name_t dest, int port=0); - - virtual void mark_down(entity_addr_t a) {} - -}; - - - - - -#endif diff --git a/branches/sage/pgs/msg/RWLock.h b/branches/sage/pgs/msg/RWLock.h deleted file mode 100644 index 14e158a64ab97..0000000000000 --- a/branches/sage/pgs/msg/RWLock.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef _RWLock_Posix_ -#define _RWLock_Posix_ - -#include - -class RWLock -{ - mutable pthread_rwlock_t L; - - public: - - RWLock() { - pthread_rwlock_init(&L, NULL); - } - - virtual ~RWLock() { - pthread_rwlock_unlock(&L); - pthread_rwlock_destroy(&L); - } - - void unlock() { - pthread_rwlock_unlock(&L); - } - void get_read() { - pthread_rwlock_rdlock(&L); - } - void put_read() { unlock(); } - void get_write() { - pthread_rwlock_wrlock(&L); - } - void put_write() { unlock(); } -}; - -#endif // !_Mutex_Posix_ diff --git a/branches/sage/pgs/msg/SerialMessenger.h b/branches/sage/pgs/msg/SerialMessenger.h deleted file mode 100644 index c17553e2fb88d..0000000000000 --- a/branches/sage/pgs/msg/SerialMessenger.h +++ /dev/null @@ -1,29 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __SERIAL_MESSENGER_H -#define __SERIAL_MESSENGER_H - -#include "Dispatcher.h" -#include "Message.h" - -class SerialMessenger : public Dispatcher { - public: - virtual void dispatch(Message *m) = 0; // i receive my messages here - virtual void send(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // doesn't block - virtual Message *sendrecv(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // blocks for matching reply -}; - -#endif diff --git a/branches/sage/pgs/msg/SimpleMessenger.cc b/branches/sage/pgs/msg/SimpleMessenger.cc deleted file mode 100644 index 48fba9791ee7a..0000000000000 --- a/branches/sage/pgs/msg/SimpleMessenger.cc +++ /dev/null @@ -1,1221 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "SimpleMessenger.h" - -#include -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- " << rank.my_addr << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- " << rank.my_addr << " " - - - -#include "tcp.cc" - - -Rank rank; - - -sighandler_t old_sigint_handler; - - -/******************************************** - * Accepter - */ - -void simplemessenger_sigint(int r) -{ - rank.sigint(); - old_sigint_handler(r); -} - -void Rank::sigint() -{ - lock.Lock(); - derr(0) << "got control-c, exiting" << endl; - - // force close listener socket - ::close(accepter.listen_sd); - - // force close all pipe sockets, too - for (hash_map::iterator p = rank_pipe.begin(); - p != rank_pipe.end(); - ++p) - p->second->force_close(); - - lock.Unlock(); -} - - - - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - // use whatever user specified.. - g_my_addr.make_addr(rank.listen_addr); - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - int rc = bind(listen_sd, (struct sockaddr *) &rank.listen_addr, sizeof(rank.listen_addr)); - if (rc < 0) - derr(0) << "accepter.start unable to bind to " << rank.listen_addr << endl; - assert(rc >= 0); - - // what port did we get? - socklen_t llen = sizeof(rank.listen_addr); - getsockname(listen_sd, (sockaddr*)&rank.listen_addr, &llen); - - dout(10) << "accepter.start bound to " << rank.listen_addr << endl; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - // my address is... HELP HELP HELP! - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - // figure out my_addr - if (g_my_addr.port > 0) { - // user specified it, easy peasy. - rank.my_addr = g_my_addr; - } else { - // look up my hostname. blech! this sucks. 
- rank.listen_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &rank.listen_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - - // set up my_addr with a nonce - rank.my_addr.set_addr(rank.listen_addr); - rank.my_addr.nonce = getpid(); // FIXME: pid might not be best choice here. - } - - dout(10) << "accepter.start my addr is " << rank.my_addr << endl; - - // set up signal handler - old_sigint_handler = signal(SIGINT, simplemessenger_sigint); - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - rank.lock.Lock(); - if (!rank.local.empty()) { - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); - } - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" << endl; - break; - } - } - - return 0; -} - - - -/************************************** - * Pipe - */ - -int Rank::Pipe::accept() -{ - // my creater gave me sd via accept() - - // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) { - ::close(sd); - done = true; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_addr, sizeof(peer_addr)); - if (rc < 0) { - dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl; - ::close(sd); - done = true; - return -1; - } - - // create writer thread. - writer_running = true; - writer_thread.create(); - - // register pipe. - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - // install a pipe! - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr << endl; - rank.rank_pipe[peer_addr] = this; - } else { - // low ranks' Pipes "win" - if (peer_addr < rank.my_addr) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, but switching to this new one" << endl; - // switch to this new Pipe - rank.rank_pipe[peer_addr]->close(); // close old one - rank.rank_pipe[peer_addr] = this; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, sticking with it" << endl; - } - } - } - rank.lock.Unlock(); - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - tcpaddr_t tcpaddr; - peer_addr.make_addr(tcpaddr); - rc = ::connect(sd, (sockaddr*)&tcpaddr, sizeof(myAddr)); - if (rc < 0) { - dout(10) << "connect error " << peer_addr - << ", " << errno << ": " << strerror(errno) << endl; - return rc; - } - - // identify peer ..... FIXME - entity_addr_t paddr; - rc = tcp_read(sd, (char*)&paddr, sizeof(paddr)); - if (!rc) { // bool - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect couldn't read peer addr" << endl; - return -1; - } - if (peer_addr != paddr) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect peer identifies itself as " << paddr << ", wrong guy!" 
<< endl; - ::close(sd); - sd = 0; - return -1; - } - - // identify myself - rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) - return -1; - - // register pipe - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect registering pipe" << endl; - rank.rank_pipe[peer_addr] = this; - } else { - // this is normal. - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect pipe already registered." << endl; - } - } - rank.lock.Unlock(); - - // start reader - reader_running = true; - reader_thread.create(); - - return 0; -} - - -void Rank::Pipe::close() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").close" << endl; - - // unreg ourselves - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close unregistering pipe" << endl; - rank.rank_pipe.erase(peer_addr); - } - } - rank.lock.Unlock(); - - // queue close message? - if (!need_to_send_close) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close already closing/closed" << endl; - return; - } - - if (!writer_running) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close not queueing MSG_CLOSE, no writer running" << endl; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close queueing MSG_CLOSE" << endl; - lock.Lock(); - q.push_back(new MGenericMessage(MSG_CLOSE)); - cond.Signal(); - need_to_send_close = false; - lock.Unlock(); - } -} - - -/* read msgs from socket. - * also, server. - * - */ -void Rank::Pipe::reader() -{ - if (server) - accept(); - - // loop. - while (!done) { - Message *m = read_message(); - if (!m || m->get_type() == 0) { - if (m) { - delete m; - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader read MSG_CLOSE message" << endl; - need_to_send_close = false; - } else { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").reader read null message" << endl; - } - - close(); - - done = true; - cond.Signal(); // wake up writer too. - break; - } - - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - entity = rank.find_unnamed(m->get_dest()); - if (!entity) { - if (rank.stopped.count(m->get_dest())) { - // ignore it - } else { - derr(0) << "pipe(" << peer_addr << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - assert(0); // FIXME do this differently - } - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - - // reap? - bool reap = false; - lock.Lock(); - { - reader_running = false; - if (!writer_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -/* write msgs to socket. - * also, client. 
- */ -void Rank::Pipe::writer() -{ - if (!server) { - int rc = connect(); - if (rc < 0) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error connecting, " - << errno << ": " << strerror(errno) - << endl; - done = true; - list out; - fail(out); - } - } - - // loop. - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << *m << endl; - - // stamp. - m->set_source_addr(rank.my_addr); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() - << ", " << errno << ": " << strerror(errno) - << endl; - out.push_front(m); - fail(out); - done = true; - break; - } - - // did i just send a close? - if (m->get_type() == MSG_CLOSE) - done = true; - - // clean up - delete m; - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer finishing" << endl; - - // reap? - bool reap = false; - lock.Lock(); - { - writer_running = false; - if (!reader_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) { - need_to_send_close = false; - return 0; - } - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.src << " dst " << env.dst - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending envelope for " << *m - << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending chunk len for " << *m << " to 
" << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; - - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. - r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - need_to_send_close = false; - return -1; - } - } -#endif - - return 0; -} - - -void Rank::Pipe::fail(list& out) -{ - derr(10) << "pipe(" << peer_addr << ' ' << this << ").fail" << endl; - - // FIXME: possible race before i reclaim lock here? - - // deactivate myself - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) - rank.rank_pipe.erase(peer_addr); - } - rank.lock.Unlock(); - - // what do i do about reader()? FIXME - - // sort my messages by (source) dispatcher, dest. - map > > by_dis; - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - - // sort - while (!q.empty()) { - if (q.front()->get_type() == MSG_CLOSE) { - delete q.front(); - } - else if (rank.local.count(q.front()->get_source())) { - EntityMessenger *mgr = rank.local[q.front()->get_source()]; - Dispatcher *dis = mgr->get_dispatcher(); - if (mgr->is_stopped()) { - // ignore. - dout(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." << endl; - delete q.front(); - } else { - by_dis[dis][q.front()->get_dest()].push_back(q.front()); - } - } - else { - // oh well. sending entity musta just shut down? 
- delete q.front(); - } - q.pop_front(); - } - } - lock.Unlock(); - - // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); - i != by_dis.end(); - ++i) - for (map >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) - for (list::iterator k = j->second.begin(); - k != j->second.end(); - ++k) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << (*k)->get_dest_inst() << endl; - if (i->first) - i->first->ms_handle_failure(*k, (*k)->get_dest_inst()); - } -} - - - - - - -/******************************************** - * Rank - */ - -Rank::Rank() : - single_dispatcher(this), - started(false) { - // default to any listen_addr - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; -} -Rank::~Rank() -{ -} - -/* -void Rank::set_listen_addr(tcpaddr_t& a) -{ - dout(10) << "set_listen_addr " << a << endl; - memcpy((char*)&listen_addr.sin_addr.s_addr, (char*)&a.sin_addr.s_addr, 4); - listen_addr.sin_port = a.sin_port; -} -*/ - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) << m->get_dest() - << " <-- " << m->get_source_inst() - << " ---- " << *m - << " -- " << m - << endl; - - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << endl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_addr() << endl; - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_addr() << endl; - delete p; - } -} - - -int Rank::start_rank() -{ - lock.Lock(); - if (started) { - dout(10) << "start_rank already started" << endl; - lock.Unlock(); - return 0; - } - dout(10) << "start_rank" << endl; - lock.Unlock(); - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? - if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - dout(1) << "start_rank at " << listen_addr << endl; - started = true; - lock.Unlock(); - return 0; -} - - - -/* connect_rank - * NOTE: assumes rank.lock held. 
- */ -Rank::Pipe *Rank::connect_rank(const entity_addr_t& addr) -{ - assert(rank.lock.is_locked()); - assert(addr != rank.my_addr); - - dout(10) << "connect_rank to " << addr << endl; - - // create pipe - Pipe *pipe = new Pipe(addr); - rank.rank_pipe[addr] = pipe; - pipes.insert(pipe); - - return pipe; -} - - - - - - -Rank::EntityMessenger *Rank::find_unnamed(entity_name_t a) -{ - // find an unnamed local entity of the right type - for (map::iterator p = local.begin(); - p != local.end(); - ++p) { - if (p->first.type() == a.type() && p->first.is_new()) - return p->second; - } - return 0; -} - - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t name) -{ - dout(10) << "register_entity " << name << endl; - lock.Lock(); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(name); - - // add to directory - assert(local.count(name) == 0); - local[name] = msgr; - - lock.Unlock(); - return msgr; -} - - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myname() << endl; - - // remove from local directory. - entity_name_t name = msgr->get_myname(); - assert(local.count(name)); - local.erase(name); - - stopped.insert(name); - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_message(Message *m, const entity_addr_t& dest_addr) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - // local? - if (dest_addr == my_addr) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map?" << endl; - //assert(0); // hmpf, this is probably mds->mon beacon from newsyn. - } - } - else { - // remote. - if (rank_pipe.count( dest_addr )) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", already connected." << endl; - // connected. - pipe = rank_pipe[ dest_addr ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", connecting." << endl; - // not connected. - pipe = connect_rank( dest_addr ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - - - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (local.empty()) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } else { - dout(10) << "wait: local still has " << local.size() << " items, waiting" << endl; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. 
- - //dout(10) << "wait: stopping accepter thread" << endl; - //accepter.stop(); - - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << endl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) - (*i)->close(); - - dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." << endl; - dout(1) << "shutdown complete." << endl; -} - - - - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - if (stop) { - dout(1) << "dispatch: stop=true, discarding " << ls.size() - << " messages in dispatch queue" << endl; - break; - } - Message *m = ls.front(); - ls.pop_front(); - dout(1) << m->get_dest() - << " <-- " << m->get_source_inst() - << " ---- " << *m - << " -- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); - - // deregister - rank.unregister_entity(this); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myname())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myname()]); - rank.waiting_for_ready.erase(get_myname()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << endl; - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(10) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(10) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - -void Rank::EntityMessenger::suicide() -{ - dout(10) << "suicide " << get_myaddr() << endl; - shutdown(); - // hmm, or exit(0)? 
-} - -void Rank::EntityMessenger::prepare_dest(const entity_addr_t& addr) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(addr) == 0) - rank.connect_rank(addr); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_inst_t dest, - int port, int fromport) -{ - // set envelope - m->set_source(get_myname(), fromport); - m->set_source_addr(rank.my_addr); - m->set_dest_inst(dest); - m->set_dest_port(port); - - dout(1) << m->get_source() - << " --> " << dest.name << " " << dest.addr - << " -- " << *m - << " -- " << m - << endl; - - rank.submit_message(m, dest.addr); - - return 0; -} - - - -const entity_addr_t &Rank::EntityMessenger::get_myaddr() -{ - return rank.my_addr; -} - - -void Rank::EntityMessenger::reset_myname(entity_name_t newname) -{ - rank.lock.Lock(); - { - entity_name_t oldname = get_myname(); - dout(10) << "reset_myname " << oldname << " to " << newname << endl; - - rank.local.erase(oldname); - rank.local[newname] = this; - - _set_myname(newname); - } - rank.lock.Unlock(); -} - - - - -void Rank::EntityMessenger::mark_down(entity_addr_t a) -{ - rank.mark_down(a); -} - -void Rank::mark_down(entity_addr_t addr) -{ - //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer - lock.Lock(); - /* - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_pipe.count(inst)) { - rank_pipe[inst]->close(); - rank_pipe.erase(inst); - } - } - } - */ - lock.Unlock(); -} - - diff --git a/branches/sage/pgs/msg/SimpleMessenger.h b/branches/sage/pgs/msg/SimpleMessenger.h deleted file mode 100644 index 0f49ce6a88824..0000000000000 --- a/branches/sage/pgs/msg/SimpleMessenger.h +++ /dev/null @@ -1,300 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __SIMPLEMESSENGER_H -#define __SIMPLEMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank { -public: - void sigint(); - -private: - class EntityMessenger; - class Pipe; - - // incoming - class Accepter : public Thread { - public: - bool done; - - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - void sigint(int r); - - - // pipe - class Pipe { - protected: - int sd; - bool done; - entity_addr_t peer_addr; - bool server; - bool need_to_send_close; - - bool reader_running; - bool writer_running; - - list q; - Mutex lock; - Cond cond; - - int accept(); // server handshake - int connect(); // client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - void fail(list& ls); - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { pipe->writer(); return 0; } - } writer_thread; - friend class Writer; - - public: - Pipe(int s) : sd(s), - done(false), server(true), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // server - reader_running = true; - reader_thread.create(); - } - Pipe(const entity_addr_t &pi) : sd(0), - done(false), peer_addr(pi), server(false), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // client - writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_addr_t& pi); - - entity_addr_t& get_peer_addr() { return peer_addr; } - - void close(); - void join() { - if (writer_thread.is_started()) writer_thread.join(); - if (reader_thread.is_started()) reader_thread.join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - void force_close() { - ::close(sd); - } - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - const entity_addr_t &get_myaddr(); - - void reset_myname(entity_name_t m); - - int shutdown(); - void suicide(); - void 
prepare_dest(const entity_addr_t& addr); - int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0); - - void mark_down(entity_addr_t a); - void mark_up(entity_name_t a, entity_addr_t& i); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - bool started; - - // where i listen - tcpaddr_t listen_addr; - entity_addr_t my_addr; - - // local - map local; - set stopped; - //hash_set entity_unstarted; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - Pipe *connect_rank(const entity_addr_t& addr); - - void mark_down(entity_addr_t addr); - //void mark_up(entity_name_t addr, entity_addr_t& i); - - tcpaddr_t get_listen_addr() { return listen_addr; } - - void reaper(); - - EntityMessenger *find_unnamed(entity_name_t a); - -public: - Rank(); - ~Rank(); - - //void set_listen_addr(tcpaddr_t& a); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void rename_entity(EntityMessenger *ms, entity_name_t newaddr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_addr_t& addr); - void prepare_dest(const entity_addr_t& addr); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/branches/sage/pgs/msg/mpistarter.cc b/branches/sage/pgs/msg/mpistarter.cc deleted file mode 100644 index 685c104d8d92d..0000000000000 --- a/branches/sage/pgs/msg/mpistarter.cc +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "TCPMessenger.h" - -/* - * start up TCPMessenger via MPI. - */ - -pair mpi_bootstrap_tcp(int& argc, char**& argv) -{ - tcpmessenger_init(); - tcpmessenger_start(); - - // exchnage addresses with other nodes - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - //dout(1) << "i am " << mpi_rank << " of " << mpi_world << endl; - - // start up directory? 
- tcpaddr_t ta; - if (mpi_rank == 0) { - dout(30) << "i am rank 0, starting ns directory" << endl; - tcpmessenger_start_nameserver(ta); - } else { - memset(&ta, 0, sizeof(ta)); - } - - // distribute tcpaddr - int r = MPI_Bcast(&ta, sizeof(ta), MPI_CHAR, - 0, MPI_COMM_WORLD); - - dout(30) << "r = " << r << " ns tcpaddr is " << ta << endl; - tcpmessenger_start_rankserver(ta); - - MPI_Barrier(MPI_COMM_WORLD); - //g_clock.tare(); - MPI_Finalize(); - - return pair(mpi_rank, mpi_world); -} - - diff --git a/branches/sage/pgs/msg/msg_types.h b/branches/sage/pgs/msg/msg_types.h deleted file mode 100644 index f20ffe8ed3720..0000000000000 --- a/branches/sage/pgs/msg/msg_types.h +++ /dev/null @@ -1,191 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MSG_TYPES_H -#define __MSG_TYPES_H - -#include "include/types.h" -#include "include/blobhash.h" -#include "tcp.h" - -// new typed msg_addr_t way! -class entity_name_t { - int _type; - int _num; - -public: - static const int TYPE_MON = 1; - static const int TYPE_MDS = 2; - static const int TYPE_OSD = 3; - static const int TYPE_CLIENT = 4; - static const int TYPE_ADMIN = 5; - - static const int NEW = -1; - - // cons - entity_name_t() : _type(0), _num(0) {} - entity_name_t(int t, int n=NEW) : _type(t), _num(n) {} - - int num() const { return _num; } - int type() const { return _type; } - const char *type_str() const { - switch (type()) { - case TYPE_MDS: return "mds"; - case TYPE_OSD: return "osd"; - case TYPE_MON: return "mon"; - case TYPE_CLIENT: return "client"; - case TYPE_ADMIN: return "admin"; - default: return "unknown"; - } - } - - bool is_new() const { return num() == NEW; } - - bool is_client() const { return type() == TYPE_CLIENT; } - bool is_mds() const { return type() == TYPE_MDS; } - bool is_osd() const { return type() == TYPE_OSD; } - bool is_mon() const { return type() == TYPE_MON; } - bool is_admin() const { return type() == TYPE_ADMIN; } -}; - -inline bool operator== (const entity_name_t& l, const entity_name_t& r) { - return (l.type() == r.type()) && (l.num() == r.num()); } -inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { - return (l.type() != r.type()) || (l.num() != r.num()); } -inline bool operator< (const entity_name_t& l, const entity_name_t& r) { - return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); } - -inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { - //if (addr.is_namer()) return out << "namer"; - if (addr.is_new() || addr.num() < 0) - return out << addr.type_str() << "?"; - else - return out << addr.type_str() << addr.num(); -} - -namespace __gnu_cxx { - template<> struct hash< entity_name_t > - { - size_t operator()( const entity_name_t m ) const - { - static blobhash H; - return H((const char*)&m, sizeof(m)); - } - }; -} - -// get rid of these -#define MSG_ADDR_MDS(x) entity_name_t(entity_name_t::TYPE_MDS,x) -#define MSG_ADDR_OSD(x) entity_name_t(entity_name_t::TYPE_OSD,x) -#define MSG_ADDR_MON(x) entity_name_t(entity_name_t::TYPE_MON,x) -#define MSG_ADDR_CLIENT(x) entity_name_t(entity_name_t::TYPE_CLIENT,x) - -#define MSG_ADDR_RANK_NEW 
MSG_ADDR_RANK(entity_name_t::NEW) -#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(entity_name_t::NEW) -#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(entity_name_t::NEW) -#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(entity_name_t::NEW) - - -/* - * an entity's network address. - * includes a random value that prevents it from being reused. - * thus identifies a particular process instance. - * ipv4 for now. - */ -struct entity_addr_t { - uint8_t ipq[4]; - uint32_t port; - uint32_t nonce; // bind time, or pid, or something unique! - - entity_addr_t() : port(0), nonce(0) { - ipq[0] = ipq[1] = ipq[2] = ipq[3] = 0; - } - - void set_addr(tcpaddr_t a) { - memcpy((char*)ipq, (char*)&a.sin_addr.s_addr, 4); - port = ntohs(a.sin_port); - } - void make_addr(tcpaddr_t& a) const { - memset(&a, 0, sizeof(a)); - a.sin_family = AF_INET; - memcpy((char*)&a.sin_addr.s_addr, (char*)ipq, 4); - a.sin_port = htons(port); - } -}; - -inline ostream& operator<<(ostream& out, const entity_addr_t &addr) -{ - return out << (int)addr.ipq[0] - << '.' << (int)addr.ipq[1] - << '.' << (int)addr.ipq[2] - << '.' << (int)addr.ipq[3] - << ':' << addr.port - << '.' << addr.nonce; -} - -inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_addr_t > - { - size_t operator()( const entity_addr_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - - -/* - * a particular entity instance - */ -struct entity_inst_t { - entity_name_t name; - entity_addr_t addr; - entity_inst_t() {} - entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} -}; - - -inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_inst_t > - { - size_t operator()( const entity_inst_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - -inline ostream& operator<<(ostream& out, const entity_inst_t &i) -{ - return out << i.name << " " << i.addr; -} - - -#endif diff --git a/branches/sage/pgs/msg/new_mpistarter.cc b/branches/sage/pgs/msg/new_mpistarter.cc deleted file mode 100644 index 72adcf90b5265..0000000000000 --- a/branches/sage/pgs/msg/new_mpistarter.cc +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#include -#include "NewMessenger.h" - -/* - * start up NewMessenger via MPI. - */ - -pair mpi_bootstrap_new(int& argc, char**& argv) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - tcpaddr_t nsaddr; - memset(&nsaddr, 0, sizeof(nsaddr)); - - if (mpi_rank == 0) { - // i am root. - rank.my_rank = 0; - rank.start_rank(nsaddr); - nsaddr = rank.get_listen_addr(); - } - - int r = MPI_Bcast(&nsaddr, sizeof(nsaddr), MPI_CHAR, - 0, MPI_COMM_WORLD); - - dout(30) << "r = " << r << " ns tcpaddr is " << nsaddr << endl; - - if (mpi_rank != 0) { - rank.start_rank(nsaddr); - } - - MPI_Barrier(MPI_COMM_WORLD); - - //g_clock.tare(); - - MPI_Finalize(); - - return pair(mpi_rank, mpi_world); -} diff --git a/branches/sage/pgs/msg/tcp.cc b/branches/sage/pgs/msg/tcp.cc deleted file mode 100644 index 232ee03fa5d09..0000000000000 --- a/branches/sage/pgs/msg/tcp.cc +++ /dev/null @@ -1,89 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "tcp.h" - -/****************** - * tcp crap - */ - -bool tcp_read(int sd, char *buf, int len) -{ - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got == 0) { - dout(18) << "tcp_read socket " << sd << " closed" << endl; - return false; - } - if (got < 0) { - dout(18) << "tcp_read bailing with " << got << endl; - return false; - } - assert(got >= 0); - len -= got; - buf += got; - //dout(DBL) << "tcp_read got " << got << ", " << len << " left" << endl; - } - return true; -} - -int tcp_write(int sd, char *buf, int len) -{ - //dout(DBL) << "tcp_write writing " << len << endl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; - //cerr << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; - } - //assert(did >= 0); - if (did < 0) return did; - len -= did; - buf += did; - //dout(DBL) << "tcp_write did " << did << ", " << len << " left" << endl; - } - return 0; -} - - -int tcp_hostlookup(char *str, tcpaddr_t& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << endl; - - return 0; -} diff --git a/branches/sage/pgs/msg/tcp.h b/branches/sage/pgs/msg/tcp.h deleted file mode 100644 index 7a866af7f9d86..0000000000000 --- a/branches/sage/pgs/msg/tcp.h +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __TCP_H -#define __TCP_H - -#include -#include -#include -#include - -typedef struct sockaddr_in tcpaddr_t; - -using 
std::ostream; - -inline ostream& operator<<(ostream& out, const tcpaddr_t &a) -{ - unsigned char addr[4]; - memcpy((char*)addr, (char*)&a.sin_addr.s_addr, 4); - out << (unsigned)addr[0] << "." - << (unsigned)addr[1] << "." - << (unsigned)addr[2] << "." - << (unsigned)addr[3] << ":" - << ntohs(a.sin_port); - return out; -} - -extern bool tcp_read(int sd, char *buf, int len); -extern int tcp_write(int sd, char *buf, int len); -extern int tcp_hostlookup(char *str, tcpaddr_t& ta); - -inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) == 0; -} -inline bool operator!=(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) != 0; -} - - -#endif diff --git a/branches/sage/pgs/newsyn.cc b/branches/sage/pgs/newsyn.cc deleted file mode 100644 index 5eadd85bfdb6f..0000000000000 --- a/branches/sage/pgs/newsyn.cc +++ /dev/null @@ -1,433 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#define intabs(x) ((x) >= 0 ? (x):(-(x))) - -#include -#include -#include -using namespace std; - -#include - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" -#include "client/SyntheticClient.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; - - -/* - * start up NewMessenger via MPI. - */ -#include - -pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - // first, synchronize clocks. - if (g_conf.clock_tare) { - if (1) { - // use an MPI barrier. probably not terribly precise. - MPI_Barrier(MPI_COMM_WORLD); - g_clock.tare(); - } else { - // use wall clock; assume NTP has all nodes synchronized already. - // FIXME someday: this hangs for some reason. whatever. - utime_t z = g_clock.now(); - MPI_Bcast( &z, sizeof(z), MPI_CHAR, - 0, MPI_COMM_WORLD); - cout << "z is " << z << endl; - g_clock.tare(z); - } - } - - // start up all monitors at known addresses. - entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. - - rank.start_rank(); // bind and listen - - if (mpi_rank < g_conf.num_mon) { - moninst[mpi_rank].addr = rank.my_addr; - moninst[mpi_rank].name = MSG_ADDR_MON(mpi_rank); - - //cerr << mpi_rank << " at " << rank.get_listen_addr() << endl; - } - - MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, - moninst, sizeof(entity_inst_t), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank == 0) { - for (int i=0; imon_inst[i] = moninst[i]; - } - } - - - // distribute monmap - bufferlist bl; - if (mpi_rank == 0) { - monmap->encode(bl); - monmap->write(".ceph_monmap"); - } else { - int l = g_conf.num_mon * 1000; // nice'n big. - bufferptr bp(l); - bl.append(bp); - } - - MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank > 0) { - monmap->decode(bl); - } - - // wait for everyone! 
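// [editor's sketch -- not part of the deleted mpistarter/newsyn sources]
// The mpi_bootstrap_* helpers in this patch all follow one pattern: rank 0
// binds a listening socket, MPI_Bcast()s its address to every rank as an
// opaque byte blob, and all ranks meet at a barrier before any messaging
// starts.  Below is a minimal, self-contained illustration of just that
// pattern; bootstrap_addr() and the hard-coded port are hypothetical, only
// the MPI calls mirror the code above.
#include <mpi.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <cstring>
#include <cstdio>

static sockaddr_in bootstrap_addr(int& argc, char**& argv)
{
  MPI_Init(&argc, &argv);

  int world, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &world);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  sockaddr_in addr;
  memset(&addr, 0, sizeof(addr));
  if (rank == 0) {
    // rank 0 would bind()+listen() here and record the real port.
    addr.sin_family = AF_INET;
    addr.sin_port = htons(12345);
  }

  // every rank receives rank 0's address as raw bytes.
  MPI_Bcast(&addr, sizeof(addr), MPI_CHAR, 0, MPI_COMM_WORLD);

  // nobody proceeds until everyone knows where to connect.
  MPI_Barrier(MPI_COMM_WORLD);
  return addr;
}

int main(int argc, char** argv)
{
  sockaddr_in a = bootstrap_addr(argc, argv);
  printf("rendezvous port %u\n", (unsigned)ntohs(a.sin_port));
  MPI_Finalize();
  return 0;
}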
- MPI_Barrier(MPI_COMM_WORLD); - - return pair(mpi_rank, mpi_world); -} - -utime_t tick_start; -int tick_count = 0; - -class C_Tick : public Context { -public: - void finish(int) { - utime_t now = g_clock.now() - tick_start; - dout(0) << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << endl; - tick_count += g_conf.tick; - utime_t next = tick_start; - next.sec_ref() += tick_count; - g_timer.add_event_at(next, new C_Tick); - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - map kill_osd_after; - if (1) { - vector nargs; - for (unsigned i=0; i 0 ? g_conf.num_mon:0; - int start_mds = g_conf.num_mds > 0 ? g_conf.num_mds:0; - int start_osd = g_conf.num_osd > 0 ? g_conf.num_osd:0; - int start_client = g_conf.num_client > 0 ? g_conf.num_client:0; - - //g_conf.num_mon = intabs(g_conf.num_mon); - g_conf.num_mds = intabs(g_conf.num_mds); - g_conf.num_client = intabs(g_conf.num_client); - g_conf.num_osd = intabs(g_conf.num_osd); - - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.tick) { - tick_start = g_clock.now(); - g_timer.add_event_after(g_conf.tick, new C_Tick); - } - - vector nargs; - for (unsigned i=0; i mpiwho = mpi_bootstrap_new(argc, argv, monmap); - int myrank = mpiwho.first; - int world = mpiwho.second; - - int need = 0; - if (g_conf.ms_skip_rank0) need++; - need += start_mds; - if (g_conf.ms_stripe_osds) - need++; - else - need += start_osd; - if (start_client) { - if (!g_conf.ms_overlay_clients) - need += 1; - } - assert(need <= world); - - if (myrank == 0) - cerr << "nummds " << start_mds << " numosd " << start_osd << " numclient " << start_client << " .. need " << need << ", have " << world << endl; - - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - int started = 0; - - //if (myrank == 0) g_conf.debug = 20; - - // create mon - if (myrank < g_conf.num_mon) { - Monitor *mon = new Monitor(myrank, rank.register_entity(MSG_ADDR_MON(myrank)), monmap); - mon->init(); - } - - - // wait for monitors to start. - MPI_Barrier(MPI_COMM_WORLD); - - // okay, home free! - MPI_Finalize(); - - - // create mds - map mds; - map mdsosd; - for (int i=0; iinit(); - started++; - - if (g_conf.mds_local_osd) { - mdsosd[i] = new OSD(i+10000, rank.register_entity(MSG_ADDR_OSD(i+10000)), monmap); - mdsosd[i]->init(); - } - } - - // create osd - map osd; - int max_osd_nodes = world - start_mds - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. - int osds_per_node = (start_osd-1)/max_osd_nodes + 1; - for (int i=0; iinit(); - started++; - } - - if (g_conf.ms_overlay_clients) sleep(5); - - // create client - int skip_osd = start_osd; - if (g_conf.ms_overlay_clients) - skip_osd = 0; // put clients with osds too! 
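// [editor's sketch -- not part of the deleted newsyn.cc]
// The tcp_read()/tcp_write() helpers removed in the msg/tcp.cc hunk above
// loop until the whole buffer has been transferred, because recv()/send()
// may legally move fewer bytes than requested.  A standalone version of the
// same idiom; read_exact()/write_exact() are hypothetical names, not Ceph
// APIs.
#include <sys/socket.h>

static bool read_exact(int sd, char *buf, int len)
{
  while (len > 0) {
    int got = (int)::recv(sd, buf, len, 0);
    if (got <= 0)        // 0 = peer closed, <0 = error (errno set)
      return false;
    buf += got;          // advance past the bytes we did get
    len -= got;
  }
  return true;
}

static bool write_exact(int sd, const char *buf, int len)
{
  while (len > 0) {
    int did = (int)::send(sd, buf, len, 0);
    if (did < 0)
      return false;
    buf += did;
    len -= did;
  }
  return true;
}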
- int client_nodes = world - start_mds - skip_osd - g_conf.ms_skip_rank0; - int clients_per_node = 1; - if (start_client && client_nodes > 0) clients_per_node = (start_client-1) / client_nodes + 1; - set clientlist; - map client;//[start_client]; - map syn;//[start_client]; - int nclients = 0; - for (int i=0; iinit(); - started++; - - syn[i] = new SyntheticClient(client[i]); - - client[i]->mount(); - nclients++; - } - - if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl; - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - //cerr << "starting synthetic client" << i << " on rank " << myrank << endl; - syn[i]->start_thread(); - - } - if (nclients) { - cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << endl; - } - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - // cout << "waiting for synthetic client" << i << " to finish" << endl; - syn[i]->join_thread(); - delete syn[i]; - - client[i]->unmount(); - //cout << "client" << i << " unmounted" << endl; - client[i]->shutdown(); - - delete client[i]; - } - - - if (myrank && !started) { - //dout(1) << "IDLE" << endl; - cerr << "idle at " << rank.my_addr << " " << hostname << "." << pid << endl; - //rank.stop_rank(); - } - - // wait for everything to finish - rank.wait(); - - if (started) cerr << "newsyn finishing" << endl; - - return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). - - - // cleanup - for (map::iterator i = mds.begin(); i != mds.end(); i++) - delete i->second; - for (map::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) - delete i->second; - for (map::iterator i = osd.begin(); i != osd.end(); i++) - delete i->second; - /* - for (map::iterator i = client.begin(); i != client.end(); i++) - delete i->second; - for (map::iterator i = syn.begin(); i != syn.end(); i++) - delete i->second; - */ - /* - for (int i=0; i - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include -#include -#include "OSBDB.h" -#include "common/Timer.h" - -using namespace std; - -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) cout << "bdbstore(" << device << ")@" << __LINE__ << "." -#undef derr -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) cerr << "bdbstore(" << device << ")@" << __LINE__ << "." - -#define CLEANUP(onsafe) do { \ - dout(6) << "DELETE " << hex << onsafe << dec << endl; \ - delete onsafe; \ - } while (0) -#define COMMIT(onsafe) do { \ - dout(6) << "COMMIT " << hex << onsafe << dec << endl; \ - sync(onsafe); \ - } while (0) - - // Have a lock, already. - -class scoped_lock -{ -private: - Mutex *m; -public: - scoped_lock(Mutex *m) : m(m) { m->Lock(); } - ~scoped_lock() { m->Unlock(); } -}; - - // Utilities. - -// Starting off with my own bsearch; mail reader to follow... - -// Perform a binary search on a sorted array, returning the insertion -// point for key, or key if it is exactly found. In other words, this -// will return a pointer to the element that will come after key if -// key were to be inserted into the sorted array. -// -// Requires that T have < and > operators defined. 
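// [editor's note -- not part of the deleted OSBDB.cc]
// The contract described above -- "return the index of the element that
// would come after key in the sorted array" -- is what std::lower_bound
// computes; the hand-rolled binary_search template follows below.  For
// comparison only, the same insertion-point lookup via the standard
// library (insertion_point() is a hypothetical name):
#include <algorithm>
#include <cstddef>
#include <cstdint>

template<class T>
uint32_t insertion_point(const T *array, size_t size, const T& key)
{
  // first position whose element is not less than key, i.e. where key can
  // be inserted while keeping the array sorted.
  return (uint32_t)(std::lower_bound(array, array + size, key) - array);
}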
-template -uint32_t binary_search (T *array, size_t size, T key) -{ - int low = 0; - int high = size; - int p = (low + high) / 2; - - while (low < high - 1) - { - if (array[p] > key) - { - high = p; - } - else if (array[p] < key) - { - low = p; - } - else - return p; - - p = (low + high) / 2; - } - - if (array[p] < key) - p++; - else if (array[p] > key && p > 0) - p--; - return p; -} - - // Management. - -DbEnv *OSBDB::getenv () -{ - DbEnv *envp = new DbEnv (DB_CXX_NO_EXCEPTIONS); - if (g_conf.debug > 1 || g_conf.debug_bdbstore > 1) - envp->set_error_stream (&std::cerr); - if (g_conf.debug > 2 || g_conf.debug_bdbstore > 2) - envp->set_message_stream (&std::cout); - envp->set_flags (DB_LOG_INMEMORY, 1); - //env->set_flags (DB_DIRECT_DB, 1); - int env_flags = (DB_CREATE - | DB_THREAD - //| DB_INIT_LOCK - | DB_INIT_MPOOL - //| DB_INIT_TXN - //| DB_INIT_LOG - | DB_PRIVATE); - if (envp->open (NULL, env_flags, 0) != 0) - { - std::cerr << "failed to open environment " << std::endl; - assert(0); - } - return envp; -} - -int OSBDB::opendb(DBTYPE type, int flags, bool new_env) -{ - env = getenv(); - db = new Db(env, 0); - db->set_error_stream (&std::cerr); - db->set_message_stream (&std::cout); - db->set_flags (0); - if (!g_conf.bdbstore_btree) - { - if (g_conf.bdbstore_pagesize > 0) - db->set_pagesize (g_conf.bdbstore_pagesize); - if (g_conf.bdbstore_ffactor > 0 && g_conf.bdbstore_nelem > 0) - { - db->set_h_ffactor (g_conf.bdbstore_ffactor); - db->set_h_nelem (g_conf.bdbstore_nelem); - } - } - if (g_conf.bdbstore_cachesize > 0) - { - db->set_cachesize (0, g_conf.bdbstore_cachesize, 0); - } - - flags = flags | DB_THREAD; - if (transactional) - flags = flags | DB_AUTO_COMMIT; - - int ret; - if ((ret = db->open (NULL, device.c_str(), NULL, type, flags, 0)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::endl; - return -EINVAL; - } - opened = true; - return 0; -} - -int OSBDB::mount() -{ - dout(2) << "mount " << device << endl; - - if (mounted) - { - dout(4) << "..already mounted" << endl; - return 0; - } - - if (!opened) - { - int ret; - if ((ret = opendb ()) != 0) - { - dout(4) << "..returns " << ret << endl; - return ret; - } - } - - // XXX Do we want anything else in the superblock? - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - stored_superblock super; - Dbt value (&super, sizeof (super)); - value.set_dlen (sizeof (super)); - value.set_ulen (sizeof (super)); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..get superblock fails" << endl; - return -EINVAL; // XXX how to say "badly formed fs?" 
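// [editor's sketch -- not part of the deleted OSBDB.cc]
// Nearly every lookup in this file uses the Berkeley DB idiom shown for the
// superblock just above: DB_DBT_USERMEM points the Dbt at a caller-owned
// buffer (set_ulen = buffer capacity), and DB_DBT_PARTIAL with
// set_doff/set_dlen fetches only a byte range of the stored record -- the
// same trick later drives the offset reads and writes.  A minimal helper
// under those assumptions; read_range() and its arguments are hypothetical.
#include <db_cxx.h>

static int read_range(Db *db, Dbt &key, void *buf,
                      u_int32_t offset, u_int32_t len)
{
  Dbt val;
  val.set_data(buf);
  val.set_ulen(len);                       // capacity of the caller's buffer
  val.set_doff(offset);                    // start of the byte range...
  val.set_dlen(len);                       // ...and how many bytes to fetch
  val.set_flags(DB_DBT_USERMEM | DB_DBT_PARTIAL);
  int ret = db->get(NULL, &key, &val, 0);  // 0 on success
  return ret == 0 ? (int)val.get_size() : -1;
}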
- } - - dout(3) << ".mount " << super << endl; - - if (super.version != OSBDB_THIS_VERSION) - { - dout(4) << "version mismatch (" << super.version << ")" << endl; - return -EINVAL; - } - - DBTYPE t; - db->get_type (&t); - - if (t == DB_BTREE) - { - u_int32_t minkey; - u_int32_t flags; - db->get_bt_minkey (&minkey); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Btree; " - << "min keys per page: " << minkey << "; flags: " - << hex << flags << dec << endl; - cout << dec; - } - else - { - u_int32_t ffactor; - u_int32_t nelem; - u_int32_t flags; - db->get_h_ffactor (&ffactor); - db->get_h_nelem (&nelem); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Hash; " - << "fill factor: " << ffactor - << " table size: " << nelem << "; flags: " - << hex << flags << dec << endl; - cout << dec; - } - - mounted = true; - dout(4) << "..mounted" << endl; - return 0; -} - -int OSBDB::umount() -{ - if (!mounted) - return -EINVAL; - - dout(2) << "umount" << endl; - - int ret; - if (opened) - { - if (transactional) - { - env->log_flush (NULL); - if ((ret = env->lsn_reset (device.c_str(), 0)) != 0) - { - derr(1) << "lsn_reset: " << db_strerror (ret) << endl; - } - } - - db->sync (0); - - if ((ret = db->close (0)) != 0) - { - derr(1) << "close: " << db_strerror(ret) << endl; - return -EINVAL; - } - delete db; - db = NULL; - - if (env) - { - env->close (0); - delete env; - env = NULL; - } - } - mounted = false; - opened = false; - dout(4) << "..unmounted" << endl; - return 0; -} - -int OSBDB::mkfs() -{ - if (mounted) - return -EINVAL; - - dout(2) << "mkfs" << endl; - - string d = env_dir; - d += device; - unlink (d.c_str()); - - int ret; - if ((ret = opendb((g_conf.bdbstore_btree ? DB_BTREE : DB_HASH), - DB_CREATE, true)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::endl; - return -EINVAL; - } - opened = true; - dout(3) << "..opened " << device << endl; - - uint32_t c; - ret = db->truncate (NULL, &c, 0); - if (ret != 0) - { - derr(1) << "db truncate failed: " << db_strerror (ret) << endl; - return -EIO; // ??? - } - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - struct stored_superblock sb; - sb.version = OSBDB_THIS_VERSION; - Dbt value (&sb, sizeof (sb)); - - dout(3) << "..writing superblock" << endl; - if ((ret = db->put (NULL, &key, &value, 0)) != 0) - { - derr(1) << "failed to write superblock: " << db_strerror (ret) - << endl; - return -EIO; - } - dout(3) << "..wrote superblock" << endl; - dout(4) << "..mkfs done" << endl; - return 0; -} - - // Objects. - -int OSBDB::pick_object_revision_lt(object_t& oid) -{ - // Not really needed. - dout(0) << "pick_object_revision_lt " << oid << endl; - return -ENOSYS; -} - -bool OSBDB::exists(object_t oid) -{ - dout(2) << "exists " << oid << endl; - struct stat st; - bool ret = (stat (oid, &st) == 0); - dout(4) << "..returns " << ret << endl; - return ret; -} - -int OSBDB::statfs (struct statfs *st) -{ - // Hacky? - if (::statfs (device.c_str(), st) != 0) - { - int ret = -errno; - derr(1) << "statfs returns " << ret << endl; - return ret; - } - st->f_type = OSBDB_MAGIC; - dout(4) << "..statfs OK" << endl; - return 0; -} - -int OSBDB::stat(object_t oid, struct stat *st) -{ - if (!mounted) - { - dout(4) << "not mounted!" 
<< endl; - return -EINVAL; - } - - dout(2) << "stat " << oid << endl; - - object_inode_key ikey = new_object_inode_key(oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (sizeof (obj)); - - dout(3) << " lookup " << ikey << endl; - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << " get returned " << ret << endl; - return -ENOENT; - } - - st->st_size = obj.length; - dout(3) << "stat length:" << obj.length << endl; - dout(4) << "..stat OK" << endl; - return 0; -} - -int OSBDB::remove(object_t oid, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - dout(6) << "Context " << hex << onsafe << dec << endl; - scoped_lock __lock(&lock); - dout(2) << "remove " << oid << endl; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - int ret; - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - object_inode_key _ikey = new_object_inode_key (oid); - Dbt ikey (&_ikey, sizeof_object_inode_key()); - if ((ret = db->del (txn, &ikey, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asval; - asval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &askey, &asval, 0) == 0) - { - // We have attributes; remove them. - stored_attrs *sap = (stored_attrs *) asval.get_data(); - auto_ptr sa (sap); - for (unsigned i = 0; i < sap->count; i++) - { - attr_id aid = new_attr_id (oid, sap->names[i].name); - Dbt akey (&aid, sizeof (aid)); - if ((ret = db->del (txn, &akey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - if ((ret = db->del (txn, &askey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - - // XXX check del return value - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..remove OK" << endl; - return 0; -} - -int OSBDB::truncate(object_t oid, off_t size, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - - if (!mounted) - { - derr(1) << "not mounted!" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "truncate " << size << endl; - - if (size > 0xFFFFFFFF) - { - derr(1) << "object size too big!" 
<< endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key ikey = new_object_inode_key (oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_dlen (sizeof (obj)); - value.set_ulen (sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(4) << "..returns -ENOENT" << endl; - return -ENOENT; - } - - if (obj.length < size) - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - char b[] = { '\0' }; - Dbt newVal (b, 1); - newVal.set_doff ((size_t) size); - newVal.set_dlen (1); - newVal.set_ulen (1); - newVal.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &okey, &newVal, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << endl; - return -EIO; - } - - obj.length = size; - value.set_ulen (sizeof (obj)); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << endl; - return -EIO; - } - } - else if (obj.length > size) - { - obj.length = size; - Dbt tval (&obj, sizeof (obj)); - tval.set_ulen (sizeof (obj)); - tval.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &tval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << endl; - return -EIO; - } - if (size == 0) - { - char x[1]; - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - Dbt oval (&x, 0); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << endl; - return -EIO; - } - } - else - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - Dbt oval; - oval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".getting old object failed" << endl; - return -EIO; - } - auto_ptr ovalPtr ((char *) oval.get_data()); - oval.set_size ((size_t) size); - oval.set_ulen ((size_t) size); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new object failed" << endl; - return -EIO; - } - } - } - - if (txn) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..truncate OK" << endl; - return 0; -} - -int OSBDB::read(object_t oid, off_t offset, size_t len, bufferlist& bl) -{ - if (!mounted) - { - derr(1) << "not mounted!" 
<< endl; - return -EINVAL; - } - - dout(2) << "read " << oid << " " << offset << " " - << len << endl; - - if (bl.length() < len) - { - int remain = len - bl.length(); - bufferptr ptr (remain); - bl.push_back(ptr); - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - ival.set_ulen (sizeof(obj)); - - dout(3) << "..get " << _ikey << endl; - int ret; - if ((ret = db->get (txn, &ikey, &ival, 0)) != 0) - { - if (txn) - txn->abort(); - derr(1) << "get returned " << db_strerror (ret) << endl; - return -ENOENT; - } - - dout(3) << "..object has size " << obj.length << endl; - - if (offset == 0 && len >= obj.length) - { - len = obj.length; - dout(3) << "..doing full read of " << len << endl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM); - dout(3) << "..getting " << oid << endl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - } - else - { - if (offset > obj.length) - { - dout(2) << "..offset out of range" << endl; - return 0; - } - if (offset + len > obj.length) - len = obj.length - (size_t) offset; - dout(3) << "..doing partial read of " << len << endl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid)); - Dbt value; - char *data = bl.c_str(); - dout(3) << ".bufferlist c_str returned " << ((void*) data) << endl; - value.set_data (data); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - dout(3) << "..getting " << oid << endl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - } - - if (txn) - txn->commit (0); - dout(4) << "..read OK, returning " << len << endl; - return len; -} - -int OSBDB::write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - derr(1) << "not mounted!" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "write " << oid << " " << offset << " " - << len << endl; - - if (offset > 0xFFFFFFFFL || offset + len > 0xFFFFFFFFL) - { - derr(1) << "object too big" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (txn, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - int ret; - dout(3) << "..getting " << _ikey << endl; - if (db->get (txn, &ikey, &ival, 0) != 0) - { - dout(3) << "..writing new object" << endl; - - // New object. 
- obj.length = (size_t) offset + len; - dout(3) << "..mapping " << _ikey << " => " - << obj << endl; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (offset == 0) // whole object - { - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (len); - } - else - { - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - value.set_ulen (len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - } - dout(3) << "..mapping " << oid << " => (" - << obj.length << " bytes)" << endl; - if ((ret = db->put (txn, &key, &value, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << endl; - return len; - } - - if (offset == 0 && len >= obj.length) - { - if (len != obj.length) - { - obj.length = len; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << " put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << endl; - return -EIO; - } - } - else - { - if (offset + len > obj.length) - { - obj.length = (size_t) offset + len; - if (db->put (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object info failed!" << endl; - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << endl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << endl; - return len; -} - -int OSBDB::clone(object_t oid, object_t noid) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "clone " << oid << ", " << noid << endl; - - if (exists (noid)) - { - dout(4) << "..target exists; returning -EEXIST" << endl; - return -EEXIST; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - object_inode_key _nikey = new_object_inode_key (noid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - Dbt nikey (&_nikey, sizeof_object_inode_key()); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - oid_t id, nid; - mkoid(id, oid); - mkoid(nid, noid); - Dbt key (&id, sizeof (oid_t)); - Dbt nkey (&oid, sizeof (oid_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting object info failed!" 
<< endl; - return -ENOENT; - } - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting original object failed" << endl; - return -ENOENT; - } - auto_ptr valueptr ((char *) value.get_data()); - - if (db->put (txn, &nikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting object info failed" << endl; - return -EIO; - } - if (db->put (txn, &nkey, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting new object failed" << endl; - return -EIO; - } - - if (txn) - txn->commit (0); - - dout(4) << "..clone OK" << endl; - return 0; -} - - // Collections - -int OSBDB::list_collections(list& ls) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "list_collections" << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collections" << endl; - return 0; // no collections. - } - - auto_ptr sc ((stored_colls *) value.get_data()); - stored_colls *scp = sc.get(); - for (uint32_t i = 0; i < sc->count; i++) - ls.push_back (scp->colls[i]); - - dout(4) << "..list_collections returns " << scp->count << endl; - return scp->count; -} - -int OSBDB::create_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - derr(1) << "not mounted" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "create_collection " << hex << c << dec << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - stored_colls *scp = NULL; - size_t sz = 0; - bool created = false; - if (db->get (txn, &key, &value, 0) != 0) - { - sz = sizeof (stored_colls) + sizeof (coll_t); - scp = (stored_colls *) malloc (sz); - scp->count = 0; - created = true; - } - else - { - scp = (stored_colls *) value.get_data(); - sz = value.get_size(); - } - - auto_ptr sc (scp); - int ins = 0; - if (scp->count > 0) - ins = binary_search (scp->colls, scp->count, c); - if (ins < scp->count && scp->colls[ins] == c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " already exists " << endl; - return -EEXIST; - } - - dout(3) << "..insertion point: " << ins << endl; - - // Make room for a new collection ID. - if (!created) - { - sz += sizeof (coll_t); - dout(3) << "..increase size to " << sz << endl; - stored_colls *scp2 = (stored_colls *) realloc (scp, sz); - sc.release (); - sc.reset (scp2); - scp = scp2; - } - - int n = (scp->count - ins) * sizeof (coll_t); - if (n > 0) - { - dout(3) << "..moving " << n << " bytes up" << endl; - memmove (&scp->colls[ins + 1], &scp->colls[ins], n); - } - scp->count++; - scp->colls[ins] = c; - - dout(3) << "..collections: " << scp << endl; - - // Put the modified collection list back. - { - Dbt value2 (scp, sz); - if (db->put (txn, &key, &value2, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collections list failed" << endl; - return -EIO; - } - } - - // Create the new collection. 
- { - stored_coll new_coll; - new_coll.count = 0; - Dbt coll_key (&c, sizeof (coll_t)); - Dbt coll_value (&new_coll, sizeof (stored_coll)); - if (db->put (txn, &coll_key, &coll_value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collection failed" << endl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..create_collection OK" << endl; - return 0; -} - -int OSBDB::destroy_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - derr(1) << "not mounted" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "destroy_collection " << hex << c << dec << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection list doesn't exist" << endl; - return -ENOENT; // XXX - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr valueBuf (scp); - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << endl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << endl; - if (ins >= scp->count || scp->colls[ins] != c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << endl; - return -ENOENT; - } - - dout(4) << "..collections list is " << scp << endl; - - // Move the rest of the list down in memory, if needed. - if (ins < scp->count) - { - size_t n = scp->count - ins - 1; - dout(4) << "..shift list down " << n << endl; - memmove (&scp->colls[ins], &scp->colls[ins + 1], n); - } - - dout(4) << "..collections list is " << scp << endl; - - // Modify the record size to be one less. - Dbt nvalue (scp, value.get_size() - sizeof (coll_t)); - nvalue.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting modified collection list failed" << endl; - return -EIO; - } - - // Delete the collection. 
- Dbt collKey (&c, sizeof (coll_t)); - if (db->del (txn, &collKey, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".deleting collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..destroy_collection OK" << endl; - return 0; -} - -bool OSBDB::collection_exists(coll_t c) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "collection_exists " << hex << c << dec << endl; - - /*Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection list; return false" << endl; - return false; - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr sc (scp); - dout(5) << "..collection list is " << scp << endl; - if (scp->count == 0) - { - dout(4) << "..empty collection list; return false" << endl; - return false; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << endl; - - int ret = (scp->colls[ins] == c); - dout(4) << "..returns " << ret << endl; - return ret;*/ - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection, return false" << endl; - return false; - } - void *val = value.get_data(); - free (val); - dout(4) << "..collection exists; return true" << endl; - return true; -} - -int OSBDB::collection_stat(coll_t c, struct stat *st) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "collection_stat " << c << endl; - // XXX is this needed? - return -ENOSYS; -} - -int OSBDB::collection_add(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - dout(2) << "not mounted" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_add " << hex << c << dec << " " << o << endl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "failed to find collection" << endl; - return -ENOENT; - } - - size_t sz = value.get_size(); - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - // Find the insertion point for the new object ID. - uint32_t ins = 0; - if (scp->count > 0) - { - ins = binary_search (scp->objects, scp->count, o); - // Already there? - if (ins < scp->count && scp->objects[ins] == o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "collection already has object" << endl; - return -EEXIST; - } - } - - // Make room for the new value, and add it. 
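// [editor's sketch -- not part of the deleted OSBDB.cc]
// The step that follows in collection_add(), like the matching code in
// create_collection() and _setattr(), maintains a sorted, count-prefixed
// array inside a single record: find the insertion point, realloc one slot
// larger, memmove the tail up, drop the element in, bump the count.  A
// generic version of that step; sorted_rec, make_rec() and insert_sorted()
// are hypothetical, and a linear scan stands in for the binary_search
// helper to keep the sketch short.
#include <cstdlib>
#include <cstring>
#include <cstdint>

struct sorted_rec {
  uint32_t count;
  uint32_t items[];   // flexible array member (compiler extension), as in stored_coll
};

static sorted_rec *make_rec(size_t &sz)
{
  sz = sizeof(sorted_rec);
  sorted_rec *rec = (sorted_rec *) malloc(sz);
  rec->count = 0;
  return rec;
}

static sorted_rec *insert_sorted(sorted_rec *rec, size_t &sz, uint32_t v)
{
  // find the first slot whose element is >= v (the insertion point)
  uint32_t ins = 0;
  while (ins < rec->count && rec->items[ins] < v)
    ins++;

  sz += sizeof(uint32_t);
  rec = (sorted_rec *) realloc(rec, sz);   // grow the record by one slot

  // open a hole at 'ins' by shifting the tail up one slot
  memmove(&rec->items[ins + 1], &rec->items[ins],
          (rec->count - ins) * sizeof(uint32_t));

  rec->items[ins] = v;
  rec->count++;
  return rec;
}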
- sz += sizeof (object_t); - scp = (stored_coll *) realloc (scp, sz); - sc.release(); - sc.reset (scp); - dout(3) << "..current collection: " << scp << endl; - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins) * sizeof (object_t); - dout(3) << "..move up " << n << " bytes" << endl; - memmove (&scp->objects[ins + 1], &scp->objects[ins], n); - } - scp->count++; - scp->objects[ins] = o; - - dout(3) << "..collection: " << scp << endl; - - Dbt nvalue (scp, sz); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection add OK" << endl; - return 0; -} - -int OSBDB::collection_remove(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - derr(1) << "not mounted" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_remove " << hex << c << dec << " " << o << endl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection doesn't exist" << endl; - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - dout(5) << "..collection is " << scp << endl; - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection is empty" << endl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->objects, scp->count, o); - dout(4) << "..insertion point is " << ins << endl; - if (ins >= scp->count || scp->objects[ins] != o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..object not in collection" << endl; - return -ENOENT; - } - - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins - 1) * sizeof (object_t); - dout(5) << "..moving " << n << " bytes down" << endl; - memmove (&scp->objects[ins], &scp->objects[ins + 1], n); - } - scp->count--; - - dout(3) << "..collection " << scp << endl; - - Dbt nval (scp, value.get_size() - sizeof (object_t)); - if (db->put (txn, &key, &nval, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection remove OK" << endl; - return 0; -} - -int OSBDB::collection_list(coll_t c, list& o) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - for (uint32_t i = 0; i < scp->count; i++) - o.push_back (scp->objects[i]); - - if (txn != NULL) - txn->commit (0); - return 0; -} - - // Attributes - -int OSBDB::_setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe, - DbTxn *txn) -{ - dout(6) << 
"Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long: " << name << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - scoped_lock __lock(&lock); - - // Add name to attribute list, if needed. - attrs_id aids = new_attrs_id (oid); - Dbt attrs_key (&aids, sizeof_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - dout(3) << " getting " << aids << endl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << endl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << "..add to list of " << sap->count << " attrs" << endl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << "..insertion point is " << ins << endl; - if (sap->count == 0 || - (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0)) - { - sz += sizeof (attr_name); - dout(3) << "..realloc " << ((void *) sap) << " to " - << dec << sz << endl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << "..returns " << ((void *) sap) << endl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << "..move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << endl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << "..putting " << aids << endl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - derr(1) << ".writing attributes list failed" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - else - { - dout(3) << "..attribute " << name << " already exists" << endl; - } - - dout(5) << "..attributes list: " << sap << endl; - - // Add the attribute. 
- attr_id aid = new_attr_id (oid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << "..writing attribute key " << aid << endl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - derr(1) << ".writing attribute key failed" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - dout(4) << "..setattr OK" << endl; - if (onsafe != NULL) - COMMIT(onsafe); - return 0; -} - -int OSBDB::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(2) << "setattr " << oid << ":" << name << " => (" - << size << " bytes)" << endl; - int ret = _setattr (oid, name, value, size, onsafe, txn); - if (ret == 0) - { - if (txn != NULL) - txn->commit (0); - } - else - { - if (txn != NULL) - txn->abort(); - } - return ret; -} - -int OSBDB::setattrs(object_t oid, map& aset, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - map::iterator it; - for (it = aset.begin(); it != aset.end(); it++) - { - string name = it->first; - bufferptr value = it->second; - int ret = _setattr (oid, name.c_str(), value.c_str(), - value.length(), onsafe, txn); - if (ret != 0) - { - if (txn != NULL) - txn->abort(); - return ret; - } - } - - if (txn != NULL) - txn->commit (0); - return 0; -} - -int OSBDB::_getattr (object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "_getattr " << oid << " " << name << " " << size << endl; - - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_doff (0); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - int ret; - if ((ret = db->get (NULL, &key, &val, 0)) != 0) - { - derr(1) << ".getting value failed: " << db_strerror (ret) << endl; - return -ENOENT; - } - - dout(4) << ".._getattr OK; returns " << val.get_size() << endl; - return val.get_size(); -} - -int OSBDB::getattr(object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - return _getattr (oid, name, value, size); -} - -int OSBDB::getattrs(object_t oid, map& aset) -{ - if (!mounted) - return -EINVAL; - - for (map::iterator it = aset.begin(); - it != aset.end(); it++) - { - int ret = _getattr (oid, (*it).first.c_str(), - (*it).second.c_str(), - (*it).second.length()); - if (ret < 0) - return ret; - } - return 0; -} - -int OSBDB::rmattr(object_t oid, const char *name, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "rmattr " << oid << " " << name << endl; - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr 
sa (sap); - - dout(5) << "..attributes list " << sap << endl; - - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attribute list" << endl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, _name); - dout(4) << "..insertion point is " << ins << endl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not found in list" << endl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << endl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << endl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..rmattr OK" << endl; - return 0; -} - -int OSBDB::listattr(object_t oid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "listattr " << oid << endl; - - attrs_id aids = new_attrs_id (oid); - Dbt key (&aids, sizeof_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - // XXX Transactions for read atomicity??? - - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << aids << ": " << db_strerror (ret) - << endl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - - dout(4) << "listattr OK" << endl; - return 0; -} - - // Collection attributes. - -int OSBDB::collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_setattr " << hex << cid << dec << " " << name - << " (" << size << " bytes)" << endl; - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - // Add name to attribute list, if needed. 
- coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt attrs_key (&aids, sizeof_coll_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(3) << " getting " << aids << endl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << endl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << " add to list of " << sap->count << " attrs" << endl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << " insertion point is " << ins << endl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - sz += sizeof (attr_name); - dout(3) << " realloc " << hex << ((void *) sap) << " to " - << dec << sz << endl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << " returns " << hex << ((void *) sap) << dec << endl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << " move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << endl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << " putting " << aids << endl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new attributes failed" << endl; - return -EIO; - } - } - else - { - dout(3) << "..attribute " << name << " already exists" << endl; - } - - dout(3) << "..attributes list: " << sap << endl; - - // Add the attribute. 
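The recurring pattern in this file is: begin a DbTxn only when the store is transactional, issue the Db::put/get calls under it, then commit on success or abort on any failure. A compilable sketch of that shape against the Berkeley DB C++ API (db_cxx); the environment directory, database name and key/value contents are placeholders, not anything OSBDB uses:

    #include <db_cxx.h>
    #include <cstring>
    #include <iostream>

    int main()
    {
      try {
        DbEnv env(0);
        // "./bdb-env" is a placeholder and must already exist on disk.
        env.open("./bdb-env", DB_CREATE | DB_INIT_MPOOL | DB_INIT_TXN |
                              DB_INIT_LOCK | DB_INIT_LOG, 0644);

        Db db(&env, 0);
        db.open(NULL, "store.db", NULL, DB_BTREE, DB_CREATE | DB_AUTO_COMMIT, 0644);

        DbTxn *txn = NULL;
        env.txn_begin(NULL, &txn, 0);

        const char *k = "attr-key", *v = "attr-value";   // placeholder contents
        Dbt key((void *)k, strlen(k) + 1);
        Dbt val((void *)v, strlen(v) + 1);

        if (db.put(txn, &key, &val, 0) != 0)
          txn->abort();          // roll the write back on failure
        else
          txn->commit(0);        // make the write durable

        db.close(0);
        env.close(0);
      } catch (DbException &e) {
        std::cerr << e.what() << std::endl;
        return 1;
      }
      return 0;
    }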
- coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << " writing attribute key " << aid << endl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting attribute failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection setattr OK" << endl; - return 0; -} - -int OSBDB::collection_rmattr(coll_t cid, const char *name, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_rmattr " << hex << cid << dec - << " " << name << endl; - - coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt askey (&aids, sizeof_coll_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".no attributes list" << endl; - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << endl; - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attributes list" << endl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, _name); - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not listed" << endl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << endl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << endl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection rmattr OK" << endl; - return 0; -} - -int OSBDB::collection_getattr(coll_t cid, const char *name, - void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_getattr " << hex << cid << dec - << " " << name << endl; - - // XXX transactions/read isolation? 
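The attribute reads here (_getattr above, collection_getattr just below) copy the value straight into the caller's buffer by flagging the Dbt with DB_DBT_USERMEM | DB_DBT_PARTIAL and bounding it with set_ulen/set_doff/set_dlen, so at most size bytes are returned and nothing is malloc'd. A small sketch of just that read shape; it assumes an already-open Db handle and an illustrative helper name:

    #include <db_cxx.h>
    #include <cstring>

    // Read at most 'size' bytes of the value stored under 'name' into 'out'.
    // Returns the number of bytes returned, or -1 if the key is missing.
    static int read_partial(Db &db, const char *name, void *out, size_t size)
    {
      Dbt key((void *)name, strlen(name) + 1);

      Dbt val(out, size);
      val.set_ulen(size);                 // capacity of the user-supplied buffer
      val.set_doff(0);                    // start of the byte range to return
      val.set_dlen(size);                 // length of the byte range to return
      val.set_flags(DB_DBT_USERMEM | DB_DBT_PARTIAL);

      if (db.get(NULL, &key, &val, 0) != 0)
        return -1;
      return (int)val.get_size();         // size of the range actually returned
    }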
- - coll_attr_id caid = new_coll_attr_id (cid, name); - Dbt key (&caid, sizeof (caid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &val, 0) != 0) - { - derr(1) << ".no attribute entry" << endl; - return -ENOENT; - } - - dout(4) << "..collection getattr OK; returns " << val.get_size() << endl; - return val.get_size(); -} - -int OSBDB::collection_listattr(coll_t cid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_listattr " << hex << cid << dec << endl; - - // XXX transactions/read isolation? - - coll_attrs_id caids = new_coll_attrs_id (cid); - Dbt key (&caids, sizeof_coll_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << caids << ": " << db_strerror (ret) - << endl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - return 0; -} - - // Sync. - -void OSBDB::sync (Context *onsync) -{ - if (!mounted) - return; - - sync(); - - if (onsync != NULL) - { - g_timer.add_event_after(0.1, onsync); - } -} - -void OSBDB::sync() -{ - if (!mounted) - return; - - if (transactional) - { - env->log_flush (NULL); - env->lsn_reset (device.c_str(), 0); - } - db->sync(0); -} diff --git a/branches/sage/pgs/osbdb/OSBDB.h b/branches/sage/pgs/osbdb/OSBDB.h deleted file mode 100644 index 8eb2004d3903f..0000000000000 --- a/branches/sage/pgs/osbdb/OSBDB.h +++ /dev/null @@ -1,482 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* OSBDB.h -- ObjectStore on Berkeley DB. -*- c++ -*- - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include "osd/ObjectStore.h" - -#define OSBDB_MAGIC 0x05BDB - -/* - * Maximum length of an attribute name. - */ -#define OSBDB_MAX_ATTR_LEN 256 - -#define OSBDB_THIS_VERSION 1 - -#define OSBDB_SUPERBLOCK_KEY ((void *) "s") - -/* - * The "superblock" of the BDB object store. We store one of these in - * the DB, to store version and other information. We don't record - * anything special here, just the version number the database was - * written with. - * - * In principle, this structure is variable-length, depending on the - * software version writing the superblock. - */ -struct stored_superblock -{ - uint32_t version; -}; - -inline ostream& operator<<(ostream& out, const stored_superblock sb) -{ - out << "osbdb.super(" << sb.version << ")" << endl; - return out; -} - -/** - * An object identifier; we define this so we can have a POD object to - * work with. - */ -struct oid_t // POD -{ - char id[16]; -}; - -inline void mkoid (oid_t& id, object_t& oid) -{ - // XXX byte order? 
- memcpy (id.id, &oid, sizeof (oid_t)); -} - -inline ostream& operator<<(ostream& out, const oid_t id) -{ - for (int i = 0; i < 16; i++) - { - out.fill('0'); - out << setw(2) << hex << (id.id[i] & 0xFF); - if ((i & 3) == 3) - out << ':'; - } - out.unsetf(ios::right); - out << dec; - return out; -} - -/** - * An "inode" key. We map a 'stored_object' struct to this key for - * every object. - */ -struct object_inode_key // POD -{ - oid_t oid; - char tag; -}; - -/** - * "Constructor" for an object_inode_key. - */ -inline object_inode_key new_object_inode_key (object_t& oid) -{ - object_inode_key key; - memset(&key, 0, sizeof (object_inode_key)); - mkoid (key.oid, oid); - key.tag = 'i'; - return key; -} - -/* - * We use this, instead of sizeof(), to try and guarantee that we - * don't include the structure padding, if any. - * - * This *should* return 17: sizeof (oid_t) == 16; sizeof (char) == 1. - */ -inline size_t sizeof_object_inode_key() -{ - return offsetof(object_inode_key, tag) + sizeof (char); -} - - // Frank Poole: Unfortunately, that sounds a little - // like famous last words. - // -- 2001: A Space Odyssey - -inline ostream& operator<<(ostream& out, const object_inode_key o) -{ - out << o.tag << "/" << o.oid; - return out; -} - -/** - * A stored object. This is essentially the "inode" of the object, - * containing things like the object's length. The object itself is - * stored as-is, mapped by the 128-bit object ID. - */ -struct stored_object -{ - uint32_t length; -}; - -inline ostream& operator<<(ostream& out, const stored_object s) -{ - out << "inode(l:" << s.length << ")"; - return out; -} - -/* - * Key referencing the list of attribute names for an object. This is - * simply the object's ID, with an additional character 'a' appended. - */ -struct attrs_id // POD -{ - oid_t oid; - char tag; -}; - -/* - * "Construtor" for attrs_id. - */ -inline struct attrs_id new_attrs_id (object_t& oid) -{ - attrs_id aid; - memset (&aid, 0, sizeof (attrs_id)); - mkoid(aid.oid, oid); - aid.tag = 'a'; - return aid; -} - -/* - * See explanation for sizeof_object_inode_id. - */ -inline size_t sizeof_attrs_id() -{ - return offsetof(struct attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, const attrs_id id) -{ - out << id.tag << "/" << id.oid; - return out; -} - -/* - * Encapsulation of a single attribute name. - */ -struct attr_name // POD -{ - char name[OSBDB_MAX_ATTR_LEN]; -}; - -inline ostream& operator<<(ostream& out, const attr_name n) -{ - out << n.name; - return out; -} - -inline bool operator<(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) < 0); -} - -inline bool operator>(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) > 0); -} - -inline bool operator==(const attr_name n1, const attr_name n2) -{ - std::cerr << n1.name << " == " << n2.name << "?" << endl; - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) == 0); -} - -inline bool operator!=(const attr_name n1, const attr_name n2) -{ - return !(n1 == n2); -} - -inline bool operator>=(const attr_name n1, const attr_name n2) -{ - return !(n1 < n2); -} - -inline bool operator<=(const attr_name n1, const attr_name n2) -{ - return !(n1 > n2); -} - -/* - * A list of an object or collection's attribute names. 
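sizeof_object_inode_key() above deliberately uses offsetof(...) + sizeof(char) instead of sizeof(struct) so that any trailing padding the compiler adds after the one-byte tag never becomes part of the stored key. A self-contained illustration of the difference; the struct layout here is made up to make the padding visible and is not the oid_t layout:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct tagged_key {          // illustrative: 8-byte id followed by a 1-byte tag
      uint64_t id;
      char tag;
    };
    // sizeof(tagged_key) is typically 16 (7 bytes of tail padding),
    // but only the first 9 bytes carry key data.

    static size_t key_bytes()
    { return offsetof(tagged_key, tag) + sizeof(char); }   // 9, padding excluded

    int main()
    {
      tagged_key k;
      memset(&k, 0, sizeof k);   // zero everything, including any padding
      k.id = 42;
      k.tag = 'i';
      printf("sizeof()=%zu  bytes actually used as key=%zu\n",
             sizeof(tagged_key), key_bytes());
      return 0;
    }

Keys built from uninitialized padding bytes would otherwise compare unequal between runs, which is why the structs are always memset before use.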
- */ -struct stored_attrs -{ - uint32_t count; - attr_name names[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, const stored_attrs *sa) -{ - out << sa->count << " [ "; - for (unsigned i = 0; i < sa->count; i++) - out << sa->names[i] << (i == sa->count - 1 ? " " : ", "); - out << "]"; - return out; -} - -/* - * An object attribute key. An object attribute is mapped simply by - * the object ID appended with the attribute name. Attribute names - * may not be empty, and must be less than 256 characters, in this - * implementation. - */ -struct attr_id // POD -{ - oid_t oid; - attr_name name; -}; - -inline attr_id new_attr_id (object_t& oid, const char *name) -{ - attr_id aid; - memset(&aid, 0, sizeof (attr_id)); - mkoid (aid.oid, oid); - strncpy (aid.name.name, name, OSBDB_MAX_ATTR_LEN); - return aid; -} - -inline ostream& operator<<(ostream &out, const attr_id id) -{ - out << id.oid << ":" << id.name; - return out; -} - -/* - * A key for a collection attributes list. - */ -struct coll_attrs_id // POD -{ - coll_t cid; - char tag; -}; - -inline coll_attrs_id new_coll_attrs_id (coll_t cid) -{ - coll_attrs_id catts; - memset(&catts, 0, sizeof (coll_attrs_id)); - catts.cid = cid; - catts.tag = 'C'; - return catts; -} - -inline size_t sizeof_coll_attrs_id() -{ - return offsetof(coll_attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, coll_attrs_id id) -{ - out << id.tag << "/" << id.cid; - return out; -} - -/* - * A collection attribute key. Similar to - */ -struct coll_attr_id // POD -{ - coll_t cid; - attr_name name; -}; - -inline coll_attr_id new_coll_attr_id (coll_t cid, const char *name) -{ - coll_attr_id catt; - memset(&catt, 0, sizeof (coll_attr_id)); - catt.cid = cid; - strncpy (catt.name.name, name, OSBDB_MAX_ATTR_LEN); - return catt; -} - -inline ostream& operator<<(ostream& out, coll_attr_id id) -{ - out << id.cid << ":" << id.name; - return out; -} - -/* - * This is the key we store the master collections list under. - */ -#define COLLECTIONS_KEY ((void *) "c") - -/* - * The master list of collections. There should be one of these per - * OSD. The sole reason for this structure is to have the ability - * to enumerate all collections stored on this OSD. - */ -struct stored_colls -{ - // The number of collections. - uint32_t count; - - // The collection identifiers. This is a sorted list of coll_t - // values. - coll_t colls[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_colls *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << hex << c->colls[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]" << dec; - return out; -} - -/* - * A stored collection (a bag of object IDs). These are referenced by - * the bare collection identifier type, a coll_t (thus, a 32-bit - * integer). Internally this is stored as a sorted list of object IDs. - * - * Note, this structure places all collection items in a single - * record; this may be a memory burden for large collections. - */ -struct stored_coll -{ - // The size of this collection. - uint32_t count; - - // The object IDs in this collection. This is a sorted list of all - // object ID's in this collection. 
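stored_attrs and stored_colls above (and stored_coll just below) are all laid out as a fixed header carrying a count followed by a variable-length tail declared as a zero-length array, and the .cc code sizes them with malloc/realloc as entries come and go. A minimal sketch of that record layout; the element type and helper names are made up, and the zero-length array member is the same compiler extension the original relies on:

    #include <cstdlib>
    #include <cstring>
    #include <cstdio>
    #include <cstdint>

    struct rec_header {
      uint32_t count;
      uint64_t items[0];        // actually variable-length (GNU/MSVC extension)
    };

    static size_t rec_bytes(uint32_t count)
    { return sizeof(rec_header) + count * sizeof(uint64_t); }

    // Append one item, growing the allocation; returns the (possibly moved) record.
    static rec_header *rec_append(rec_header *r, uint64_t item)
    {
      uint32_t n = r ? r->count : 0;
      rec_header *nr = (rec_header *)realloc(r, rec_bytes(n + 1));
      if (!nr) { free(r); return NULL; }
      nr->items[n] = item;
      nr->count = n + 1;
      return nr;
    }

    int main()
    {
      rec_header *r = (rec_header *)malloc(rec_bytes(0));
      r->count = 0;
      for (uint64_t i = 1; i <= 3; i++) r = rec_append(r, i * 100);
      for (uint32_t i = 0; r && i < r->count; i++)
        printf("%llu\n", (unsigned long long)r->items[i]);
      free(r);
      return 0;
    }

Because the whole record is one contiguous allocation, it can be handed to Db::put as a single Dbt, which is exactly how the attribute and collection lists are persisted.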
- object_t objects[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_coll *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << c->objects[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]"; - return out; -} - -class OSBDBException : public std::exception -{ - const char *msg; - -public: - OSBDBException(const char *msg) : msg(msg) { } - const char *what() { return msg; } -}; - -/* - * The object store interface for Berkeley DB. - */ -class OSBDB : public ObjectStore -{ - private: - Mutex lock; - DbEnv *env; - Db *db; - string device; - string env_dir; - bool mounted; - bool opened; - bool transactional; - - public: - - OSBDB(const char *dev) throw(OSBDBException) - : lock(true), env(0), db (0), device (dev), mounted(false), opened(false), - transactional(g_conf.bdbstore_transactional) - { - } - - ~OSBDB() - { - if (mounted) - { - umount(); - } - } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - int pick_object_revision_lt(object_t& oid); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid, Context *onsafe=0); - - int truncate(object_t oid, off_t size, Context *onsafe=0); - - int read(object_t oid, off_t offset, size_t len, - bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe); - - int setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset, - Context *onsafe=0); - int getattr(object_t oid, const char *name, - void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, - Context *onsafe=0); - int listattr(object_t oid, char *attrs, size_t size); - - int clone(object_t oid, object_t noid); - - // Collections. 
- - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - bool collection_exists(coll_t c); - int collection_stat(coll_t c, struct stat *st); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0); - int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0); - int collection_getattr(coll_t cid, const char *name, - void *value, size_t size); - int collection_listattr(coll_t cid, char *attrs, size_t size); - - void sync(Context *onsync); - void sync(); - -private: - int opendb (DBTYPE type=DB_UNKNOWN, int flags=0, bool new_env=false); - - int _setattr(object_t oid, const char *name, const void *value, - size_t size, Context *onsync, DbTxn *txn); - int _getattr(object_t oid, const char *name, void *value, size_t size); - DbEnv *getenv(); -}; diff --git a/branches/sage/pgs/osd/Ager.cc b/branches/sage/pgs/osd/Ager.cc deleted file mode 100644 index 82f035c04d1da..0000000000000 --- a/branches/sage/pgs/osd/Ager.cc +++ /dev/null @@ -1,333 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "include/types.h" - -#include "Ager.h" -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -// ick -#include "ebofs/Ebofs.h" -#include -#include -#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - - -int myrand() -{ - if (0) - return rand(); - else { - static int n = 0; - srand(n++); - return rand(); - } -} - - -object_t Ager::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - object_t last = age_cur_oid; - ++age_cur_oid.bno; - return last; -} - -ssize_t Ager::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (myrand() % 100) * max/200 + 1; -} - -bool start_debug = false; - -uint64_t Ager::age_fill(float pc, utime_t until) { - int max = 1024*1024; - bufferptr bp(max); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - uint64_t wrote = 0; - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - //float a = (float)(st.f_bfree) / (float)st.f_blocks; - //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << endl; - if (free >= pc) { - dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << endl; - break; - } - - // make sure we can write to it.. - if (avail > .98 || - avail - free > .02) - store->sync(); - - object_t oid = age_get_oid(); - - int b = myrand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - wrote += (s + 4095) / 4096; - - - - - dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; - - - if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) { - /* - - - 1005700 -? -1005000 -1005700 - 1005710 - 1005725ULL - 1005750ULL - 1005800 - 1006000 - -// 99 1000500 ? 
1000750 1006000 -*/ - g_conf.debug_ebofs = 30; - g_conf.ebofs_verify = true; - } - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, max); - bufferlist sbl; - sbl.substr_of(bl, 0, t); - store->write(oid, off, t, sbl, false); - off += t; - s -= t; - } - oid.bno++; - } - - return wrote*4; // KB -} - -void Ager::age_empty(float pc) { - int nper = 20; - int n = nper; - - //g_conf.ebofs_verify = true; - - while (1) { - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << endl;//" stopping" << endl; - if (free <= pc) { - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << endl; - break; - } - - int b = myrand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - dout(2) << "age_empty sync" << endl; - //sync(); - //sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << endl; - - store->remove(oid); - age_free_oids.push_back(oid); - } - - g_conf.ebofs_verify = false; -} - -void pfrag(uint64_t written, ObjectStore::FragmentationStat &st) -{ - cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg fr\tnum<2\tsum<2\tnum<4\tsum<4\t..." - << endl; - cout << written - << "\t" << st.total - << "\t" << st.num_extent - << "\t" << st.avg_extent - << "\t" << st.avg_extent_per_object - << "\t" << st.avg_extent_jump - << "\t" << st.total_free - << "\t" << st.num_free_extent - << "\t" << st.avg_free_extent; - - int n = st.num_extent; - for (uint64_t i=1; i <= 30; i += 1) { - cout << "\t" << st.extent_dist[i]; - cout << "\t" << st.extent_dist_sum[i]; - //cout << "\ta " << (st.extent_dist[i] ? 
(st.extent_dist_sum[i] / st.extent_dist[i]):0); - n -= st.extent_dist[i]; - if (n == 0) break; - } - cout << endl; -} - - -void Ager::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - - store->_fake_writes(true); - srand(0); - - utime_t start = g_clock.now(); - utime_t until = start; - until.sec_ref() += time; - - int elapsed = 0; - int freelist_inc = 60; - utime_t nextfl = start; - nextfl.sec_ref() += freelist_inc; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - store->statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = object_t(0,1); - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - ObjectStore::FragmentationStat st; - - uint64_t wrote = 0; - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - //if (c == 7) start_debug = true; - - dout(1) << "#age " << c << "/" << count << " filling to " << high_water << endl; - uint64_t w = age_fill(high_water, until); - //dout(1) << "age wrote " << w << endl; - wrote += w; - //store->sync(); - //store->_get_frag_stat(st); - //pfrag(st); - - - if (c == count) { - dout(1) << "#age final empty to " << final_water << endl; - age_empty(final_water); - } else { - dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << endl; - age_empty(low_water); - } - //store->sync(); - //store->sync(); - - // show frag state - store->_get_frag_stat(st); - pfrag(wrote / (1024ULL*1024ULL) , // GB - st); - - // dump freelist? - if (g_clock.now() > nextfl) { - elapsed += freelist_inc; - save_freelist(elapsed); - nextfl.sec_ref() += freelist_inc; - } - } - - // dump the freelist - save_freelist(0); - exit(0); // hack - - // ok! 
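age_fill and age_empty above steer on two ratios derived from the store's statfs(): the fraction of blocks in use overall (from f_bfree) and the fraction unavailable to unprivileged writers (from f_bavail), stopping or syncing when they cross the requested watermarks. A standalone sketch of computing the same two ratios; it uses POSIX statvfs() on the current directory rather than the ObjectStore statfs passthrough:

    #include <sys/statvfs.h>
    #include <cstdio>

    int main()
    {
      struct statvfs st;
      if (statvfs(".", &st) != 0) {
        perror("statvfs");
        return 1;
      }
      // Fraction of the filesystem already consumed, from the superuser's
      // and the unprivileged writer's point of view respectively.
      double used_free  = 1.0 - (double)st.f_bfree  / (double)st.f_blocks;
      double used_avail = 1.0 - (double)st.f_bavail / (double)st.f_blocks;
      printf("used (free-based)  %.3f\nused (avail-based) %.3f\n",
             used_free, used_avail);
      return 0;
    }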
- store->_fake_writes(false); - store->sync(); - store->sync(); - dout(1) << "age finished" << endl; -} - - -void Ager::load_freelist() -{ - dout(1) << "load_freelist" << endl; - - struct stat st; - - int r = ::stat("ebofs.freelist", &st); - assert(r == 0); - - bufferptr bp(st.st_size); - bufferlist bl; - bl.push_back(bp); - int fd = ::open("ebofs.freelist", O_RDONLY); - ::read(fd, bl.c_str(), st.st_size); - ::close(fd); - - ((Ebofs*)store)->_import_freelist(bl); - store->sync(); - store->sync(); -} - -void Ager::save_freelist(int el) -{ - dout(1) << "save_freelist " << el << endl; - char s[100]; - sprintf(s, "ebofs.freelist.%d", el); - bufferlist bl; - ((Ebofs*)store)->_export_freelist(bl); - ::unlink(s); - int fd = ::open(s, O_CREAT|O_WRONLY); - ::fchmod(fd, 0644); - ::write(fd, bl.c_str(), bl.length()); - ::close(fd); -} diff --git a/branches/sage/pgs/osd/Ager.h b/branches/sage/pgs/osd/Ager.h deleted file mode 100644 index ad160c0e9f9ff..0000000000000 --- a/branches/sage/pgs/osd/Ager.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __AGER_H -#define __AGER_H - -#include "include/types.h" -#include "include/Distribution.h" -#include "ObjectStore.h" -#include "common/Clock.h" - -#include -#include -using namespace std; - -class Ager { - ObjectStore *store; - - private: - list age_free_oids; - object_t age_cur_oid; - vector< list > age_objects; - Distribution file_size_distn; //kb - bool did_distn; - - void age_empty(float pc); - uint64_t age_fill(float pc, utime_t until); - ssize_t age_pick_size(); - object_t age_get_oid(); - - public: - Ager(ObjectStore *s) : store(s), did_distn(false) {} - - void age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb=0); - - void save_freelist(int); - void load_freelist(); -}; - -#endif diff --git a/branches/sage/pgs/osd/BDBMap.h b/branches/sage/pgs/osd/BDBMap.h deleted file mode 100644 index a8e96a8a192f7..0000000000000 --- a/branches/sage/pgs/osd/BDBMap.h +++ /dev/null @@ -1,137 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BERKELEYDB_H -#define __BERKELEYDB_H - -#include -#include - -#include -using namespace std; - - -template -class BDBMap { - private: - DB *dbp; - - public: - BDBMap() : dbp(0) {} - ~BDBMap() { - close(); - } - - bool is_open() { return dbp ? 
true:false; } - - // open/close - int open(const char *fn) { - //cout << "open " << fn << endl; - - int r; - if ((r = db_create(&dbp, NULL, 0)) != 0) { - cerr << "db_create: " << db_strerror(r) << endl; - assert(0); - } - - dbp->set_errfile(dbp, stderr); - dbp->set_errpfx(dbp, "bdbmap"); - - r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644); - if (r != 0) { - dbp->err(dbp, r, "%s", fn); - } - assert(r == 0); - return 0; - } - void close() { - if (dbp) { - dbp->close(dbp,0); - dbp = 0; - } - } - void remove(const char *fn) { - if (!dbp) open(fn); - if (dbp) { - dbp->remove(dbp, fn, 0, 0); - dbp = 0; - } else { - ::unlink(fn); - } - } - - // accessors - int put(K key, - D data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(K); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - return dbp->put(dbp, NULL, &k, &d, 0); - } - - int get(K key, - D& data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - int r = dbp->get(dbp, NULL, &k, &d, 0); - return r; - } - - int del(K key) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - return dbp->del(dbp, NULL, &k, 0); - } - - int list_keys(list& ls) { - DBC *cursor = 0; - int r = dbp->cursor(dbp, NULL, &cursor, 0); - assert(r == 0); - - DBT k,d; - memset(&k, 0, sizeof(k)); - memset(&d, 0, sizeof(d)); - - while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) { - K key; - assert(k.size == sizeof(key)); - memcpy(&key, k.data, k.size); - ls.push_back(key); - } - if (r != DB_NOTFOUND) { - dbp->err(dbp, r, "DBcursor->get"); - assert(r == DB_NOTFOUND); - } - - cursor->c_close(cursor); - return 0; - } - -}; - -#endif diff --git a/branches/sage/pgs/osd/Fake.h b/branches/sage/pgs/osd/Fake.h deleted file mode 100644 index 2155c46de3673..0000000000000 --- a/branches/sage/pgs/osd/Fake.h +++ /dev/null @@ -1,250 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
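BDBMap::list_keys above walks every key in the database with a cursor: DB->cursor() opens it and repeated c_get(..., DB_NEXT) calls advance until DB_NOTFOUND. A standalone sketch of that loop against the Berkeley DB C API of the same era; the file name is a placeholder and the keys are only counted, not decoded:

    #include <db.h>
    #include <cstdio>
    #include <cstring>

    int main()
    {
      DB *dbp;
      DBC *cursor;
      DBT k, d;
      int r;

      if (db_create(&dbp, NULL, 0) != 0)
        return 1;
      if (dbp->open(dbp, NULL, "keys.db", NULL, DB_BTREE, DB_CREATE, 0644) != 0)
        return 1;

      if (dbp->cursor(dbp, NULL, &cursor, 0) != 0)
        return 1;

      memset(&k, 0, sizeof k);
      memset(&d, 0, sizeof d);
      while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0)
        printf("key of %u bytes\n", (unsigned)k.size);

      if (r != DB_NOTFOUND)                 // anything other than end-of-data is an error
        dbp->err(dbp, r, "DBcursor->c_get");

      cursor->c_close(cursor);
      dbp->close(dbp, 0);
      return 0;
    }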
- * - */ - - -#ifndef __FAKE_H -#define __FAKE_H - -#include "include/types.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class FakeStoreCollections { - private: - Mutex faker_lock; - ObjectStore *store; - hash_map > fakecollections; - - public: - FakeStoreCollections(ObjectStore *s) : store(s) {} - - // faked collections - int list_collections(list& ls) { - faker_lock.Lock(); - int r = 0; - for (hash_map< coll_t, set >::iterator p = fakecollections.begin(); - p != fakecollections.end(); - p++) { - r++; - ls.push_back(p->first); - } - faker_lock.Unlock(); - return r; - } - - int create_collection(coll_t c, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].size(); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int destroy_collection(coll_t c, - Context *onsafe=0) { - int r = 0; - faker_lock.Lock(); - if (fakecollections.count(c)) { - fakecollections.erase(c); - //fakecattr.erase(c); - if (onsafe) store->sync(onsafe); - } else - r = -1; - faker_lock.Unlock(); - return r; - } - - int collection_stat(coll_t c, struct stat *st) { - return collection_exists(c) ? 0:-1; - } - - bool collection_exists(coll_t c) { - faker_lock.Lock(); - int r = fakecollections.count(c); - faker_lock.Unlock(); - return r; - } - - int collection_add(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].insert(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_remove(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].erase(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_list(coll_t c, list& o) { - faker_lock.Lock(); - int r = 0; - for (set::iterator p = fakecollections[c].begin(); - p != fakecollections[c].end(); - p++) { - o.push_back(*p); - r++; - } - faker_lock.Unlock(); - return r; - } - -}; - -class FakeStoreAttrs { - private: - - class FakeAttrSet { - public: - map attrs; - - int getattr(const char *name, void *value, size_t size) { - string n = name; - if (attrs.count(n)) { - size_t l = MIN( attrs[n].length(), size ); - bufferlist bl; - bl.append(attrs[n]); - bl.copy(0, l, (char*)value); - return l; - } - return -1; - } - int getattrs(map& aset) { - aset = attrs; - return 0; - } - int setattrs(map& aset) { - attrs = aset; - return 0; - } - - int setattr(const char *name, const void *value, size_t size) { - string n = name; - bufferptr bp = buffer::copy((char*)value, size); - attrs[n] = bp; - return 0; - } - - int listattr(char *attrs, size_t size) { - assert(0); - return 0; - } - - int rmattr(const char *name) { - string n = name; - attrs.erase(n); - return 0; - } - - bool empty() { return attrs.empty(); } - }; - - Mutex faker_lock; - ObjectStore *store; - hash_map fakeoattrs; - hash_map fakecattrs; - - public: - FakeStoreAttrs(ObjectStore *s) : store(s) {} - - int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int setattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int getattr(object_t oid, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int getattrs(object_t oid, 
map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int rmattr(object_t oid, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - - int listattr(object_t oid, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - - int collection_setattr(coll_t c, const char *name, - void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_rmattr(coll_t c, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_getattr(coll_t c, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int collection_listattr(coll_t c, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - -}; - -#endif diff --git a/branches/sage/pgs/osd/FakeStore.cc b/branches/sage/pgs/osd/FakeStore.cc deleted file mode 100644 index 2ee0201bc9d7e..0000000000000 --- a/branches/sage/pgs/osd/FakeStore.cc +++ /dev/null @@ -1,644 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "FakeStore.h" -#include "include/types.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -//#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << g_clock.now() << " osd" << whoami << ".fakestore " -#define derr(l) if (l<=g_conf.debug) cerr << g_clock.now() << " osd" << whoami << ".fakestore " - -#include "include/buffer.h" - -#include -#include -using namespace __gnu_cxx; - -// crap-a-crap hash -#define HASH_DIRS 0x80 -#define HASH_MASK 0x7f -// end crap hash - - - - -int FakeStore::statfs(struct statfs *buf) -{ - return ::statfs(basedir.c_str(), buf); -} - - -/* - * sorry, these are sentitive to the object_t and coll_t typing. 
- */ -void FakeStore::get_oname(object_t oid, char *s) -{ - static hash H; - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/objects/%02lx/%016lx.%016lx", basedir.c_str(), H(oid) & HASH_MASK, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/objects/%02x/%016llx.%016llx", basedir.c_str(), H(oid) & HASH_MASK, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#endif -} - -void FakeStore::get_cdir(coll_t cid, char *s) -{ - assert(sizeof(cid) == 8); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx", basedir.c_str(), - cid); -#else - sprintf(s, "%s/collections/%016llx", basedir.c_str(), - cid); -#endif -} - -void FakeStore::get_coname(coll_t cid, object_t oid, char *s) -{ - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx/%016lx.%016lx", basedir.c_str(), cid, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/collections/%016llx/%016llx.%016llx", basedir.c_str(), cid, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#endif -} - - - - -int FakeStore::mkfs() -{ - char cmd[200]; - if (g_conf.fakestore_dev) { - dout(0) << "mounting" << endl; - sprintf(cmd,"mount %s", g_conf.fakestore_dev); - system(cmd); - } - - dout(1) << "mkfs in " << basedir << endl; - - // wipe - sprintf(cmd, "test -d %s && rm -r %s ; mkdir -p %s/collections && mkdir -p %s/objects", - basedir.c_str(), basedir.c_str(), basedir.c_str(), basedir.c_str()); - - dout(5) << "wipe: " << cmd << endl; - system(cmd); - - // hashed bits too - for (int i=0; i 0) bl.push_back( bptr ); // put it in the target bufferlist - } - ::flock(fd, LOCK_UN); - ::close(fd); - return got; -} - - -int FakeStore::write(object_t oid, - off_t offset, size_t len, - const bufferlist& bl, - Context *onsafe) -{ - dout(20) << "write " << oid << " len " << len << " off " << offset << endl; - - char fn[200]; - get_oname(oid,fn); - - ::mknod(fn, 0644, 0); // in case it doesn't exist yet. 
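FakeStore::read above (and write, continuing below) bracket access to the per-object file with flock(), seek to the requested offset, and then issue plain read()/write() calls. A minimal sketch of that access pattern; the path is a placeholder rather than one of FakeStore's hashed object names:

    #include <sys/file.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>

    int main()
    {
      const char *path = "/tmp/fakestore-demo.obj";   // placeholder object file
      const char data[] = "hello object";

      int fd = open(path, O_WRONLY | O_CREAT, 0644);
      if (fd < 0) { perror("open"); return 1; }
      flock(fd, LOCK_EX);                       // exclusive lock while writing
      if (lseek(fd, 0, SEEK_SET) < 0 || write(fd, data, sizeof data) < 0)
        perror("write");
      flock(fd, LOCK_UN);
      close(fd);

      char buf[64] = {0};
      fd = open(path, O_RDONLY);
      if (fd < 0) { perror("open"); return 1; }
      flock(fd, LOCK_SH);                       // shared lock while reading
      ssize_t got = pread(fd, buf, sizeof buf - 1, 0);
      flock(fd, LOCK_UN);
      close(fd);

      printf("read %zd bytes: %s\n", got, buf);
      return 0;
    }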
- - int flags = O_WRONLY;//|O_CREAT; - int fd = ::open(fn, flags); - if (fd < 0) { - derr(0) << "write couldn't open " << fn << " flags " << flags << " errno " << errno << " " << strerror(errno) << endl; - return fd; - } - ::fchmod(fd, 0664); - ::flock(fd, LOCK_EX); // lock for safety - - // seek - off_t actual = ::lseek(fd, offset, SEEK_SET); - int did = 0; - assert(actual == offset); - - // write buffers - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, (char*)(*it).c_str(), (*it).length()); - if (r > 0) - did += r; - else { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; - } - } - - if (did < 0) { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; - } - - ::flock(fd, LOCK_UN); - - // schedule sync - if (onsafe) sync(onsafe); - - ::close(fd); - - return did; -} - - -class C_FakeSync : public Context { - Context *c; - int *n; - Mutex *lock; - Cond *cond; - -public: - C_FakeSync(Context *c_, int *n_, Mutex *lo, Cond *co) : - c(c_), n(n_), - lock(lo), cond(co) { - lock->Lock(); - ++*n; - lock->Unlock(); - } - void finish(int r) { - c->finish(r); - - lock->Lock(); - --(*n); - if (*n == 0) cond->Signal(); - lock->Unlock(); - } -}; - -void FakeStore::sync() -{ - synclock.Lock(); - while (unsync > 0) { - dout(0) << "sync waiting for " << unsync << " items to (fake) sync" << endl; - synccond.Wait(synclock); - } - synclock.Unlock(); -} - -void FakeStore::sync(Context *onsafe) -{ - if (g_conf.fakestore_fake_sync) { - g_timer.add_event_after((float)g_conf.fakestore_fake_sync, - new C_FakeSync(onsafe, &unsync, &synclock, &synccond)); - - } else { - assert(0); // der..no implemented anymore - } -} - - -// ------------------------------- -// attributes - -// objects - -int FakeStore::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - if (fake_attrs) return attrs.setattr(oid, name, value, size, onsafe); - - char fn[100]; - get_oname(oid, fn); - int r = ::setxattr(fn, name, value, size, 0); - return r; -} - -int FakeStore::setattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.setattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - int r = 0; - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) break; - } - return r; -} - -int FakeStore::getattr(object_t oid, const char *name, - void *value, size_t size) -{ - if (fake_attrs) return attrs.getattr(oid, name, value, size); - char fn[100]; - get_oname(oid, fn); - int r = ::getxattr(fn, name, value, size); - return r; -} - -int FakeStore::getattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.getattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - for (int i=0; i& ls) -{ - if (fake_collections) return collections.list_collections(ls); - - char fn[200]; - sprintf(fn, "%s/collections", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - coll_t c = strtoll(de->d_name, 0, 16); - dout(0) << " got " << c << " errno " << errno << " on " << de->d_name << endl; - if (errno) continue; - ls.push_back(c); - } - - ::closedir(dir); - return 0; -} - -int 
FakeStore::create_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.create_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - - int r = ::mkdir(fn, 0755); - - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::destroy_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.destroy_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - char cmd[200]; - sprintf(cmd, "test -d %s && rm -r %s", fn, fn); - system(cmd); - - if (onsafe) sync(onsafe); - return 0; -} - -int FakeStore::collection_stat(coll_t c, struct stat *st) -{ - if (fake_collections) return collections.collection_stat(c, st); - - char fn[200]; - get_cdir(c, fn); - return ::lstat(fn, st); -} - -bool FakeStore::collection_exists(coll_t c) -{ - if (fake_collections) return collections.collection_exists(c); - - struct stat st; - return collection_stat(c, &st) == 0; -} - - -int FakeStore::collection_add(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_add(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - char of[200]; - get_oname(o, of); - - int r = ::link(of, cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_remove(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_remove(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - - int r = ::unlink(cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_list(coll_t c, list& ls) -{ - if (fake_collections) return collections.collection_list(c, ls); - - char fn[200]; - get_cdir(c, fn); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - object_t o; - assert(sizeof(o) == 16); - *(((uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << endl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - -// eof. diff --git a/branches/sage/pgs/osd/FakeStore.h b/branches/sage/pgs/osd/FakeStore.h deleted file mode 100644 index e88c205315bc0..0000000000000 --- a/branches/sage/pgs/osd/FakeStore.h +++ /dev/null @@ -1,111 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKESTORE_H -#define __FAKESTORE_H - -#include "ObjectStore.h" -#include "common/ThreadPool.h" -#include "common/Mutex.h" - -#include "Fake.h" -//#include "FakeStoreBDBCollections.h" - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// fake attributes in memory, if we need to. - -class FakeStore : public ObjectStore { - string basedir; - int whoami; - - Mutex synclock; - Cond synccond; - int unsync; - - // fake attrs? - FakeStoreAttrs attrs; - bool fake_attrs; - - // fake collections? 
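When fake_attrs is disabled, the setattr/getattr/getattrs methods above map object attributes straight onto filesystem extended attributes on the object's backing file, and listattr walks the names returned by listxattr(). A standalone sketch of that mapping using the Linux xattr API; the file path and attribute name are placeholders, and the target filesystem must have xattrs enabled:

    #include <sys/xattr.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>
    #include <cstring>

    int main()
    {
      const char *fn = "/tmp/fakestore-attr-demo";      // placeholder file
      close(open(fn, O_CREAT | O_WRONLY, 0644));        // make sure it exists

      const char *val = "bar";
      if (setxattr(fn, "user.foo", val, strlen(val), 0) != 0) {
        perror("setxattr");                             // e.g. xattrs not supported
        return 1;
      }

      char buf[64] = {0};
      ssize_t n = getxattr(fn, "user.foo", buf, sizeof buf - 1);
      if (n >= 0)
        printf("user.foo = %.*s\n", (int)n, buf);

      char names[256];
      ssize_t len = listxattr(fn, names, sizeof names); // NUL-separated name list
      for (char *p = names; len > 0 && p < names + len; p += strlen(p) + 1)
        printf("attr: %s\n", p);
      return 0;
    }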
- FakeStoreCollections collections; - bool fake_collections; - - // helper fns - void get_oname(object_t oid, char *s); - void get_cdir(coll_t cid, char *s); - void get_coname(coll_t cid, object_t oid, char *s); - - public: - FakeStore(char *base, int w) : - basedir(base), - whoami(w), - unsync(0), - attrs(this), fake_attrs(false), - collections(this), fake_collections(false) { } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - // ------------------ - // objects - int pick_object_revision_lt(object_t& oid) { - return 0; - } - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - int remove(object_t oid, Context *onsafe); - int truncate(object_t oid, off_t size, Context *onsafe); - int read(object_t oid, off_t offset, size_t len, bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, const bufferlist& bl, Context *onsafe); - - void sync(); - void sync(Context *onsafe); - - // attrs - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - //int listattr(object_t oid, char *attrs, size_t size); - int collection_setattr(coll_t c, const char *name, void *value, size_t size, Context *onsafe=0); - int collection_rmattr(coll_t c, const char *name, Context *onsafe=0); - int collection_getattr(coll_t c, const char *name, void *value, size_t size); - //int collection_listattr(coll_t c, char *attrs, size_t size); - - - // collections - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - int collection_stat(coll_t c, struct stat *st); - bool collection_exists(coll_t c); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - -}; - -#endif diff --git a/branches/sage/pgs/osd/FakeStoreBDBCollections.h b/branches/sage/pgs/osd/FakeStoreBDBCollections.h deleted file mode 100644 index a779a2a57972c..0000000000000 --- a/branches/sage/pgs/osd/FakeStoreBDBCollections.h +++ /dev/null @@ -1,169 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __FAKESTOREBDBCOLLECTIONS_H -#define __FAKESTOREBDBCOLLECTIONS_H - -#include "BDBMap.h" -#include "ObjectStore.h" -#include "common/Mutex.h" - -#define BDBHASH_DIRS 128LL -#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS) - -class FakeStoreBDBCollections { - private: - int whoami; - string basedir; - - Mutex bdblock; - - // collection dbs - BDBMap collections; - map*> collection_map; - - // dirs - void get_dir(string& dir) { - char s[30]; - sprintf(s, "%d", whoami); - dir = basedir + "/" + s; - } - void get_collfn(coll_t c, string &fn) { - char s[100]; - sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c); - fn = basedir + "/" + s; - } - - void open_collections() { - string cfn; - get_dir(cfn); - cfn += "/collections"; - collections.open(cfn.c_str()); - list ls; - collections.list_keys(ls); - } - void close_collections() { - if (collections.is_open()) - collections.close(); - - for (map*>::iterator it = collection_map.begin(); - it != collection_map.end(); - it++) { - it->second->close(); - } - collection_map.clear(); - } - - int open_collection(coll_t c) { - if (collection_map.count(c)) - return 0; // already open. - - string fn; - get_collfn(c,fn); - collection_map[c] = new BDBMap; - int r = collection_map[c]->open(fn.c_str()); - if (r != 0) - collection_map.erase(c); // failed - return r; - } - - public: - FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {} - ~FakeStoreBDBCollections() { - close_collections(); - } - - int list_collections(list& ls) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - ls.clear(); - collections.list_keys(ls); - bdblock.Unlock(); - return 0; - } - int create_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.put(c, 1); - open_collection(c); - bdblock.Unlock(); - return 0; - } - int destroy_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.del(c); - - open_collection(c); - collection_map[c]->close(); - - string fn; - get_collfn(c,fn); - collection_map[c]->remove(fn.c_str()); - delete collection_map[c]; - collection_map.erase(c); - bdblock.Unlock(); - return 0; - } - int collection_stat(coll_t c, struct stat *st) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - string fn; - get_collfn(c,fn); - int r = ::stat(fn.c_str(), st); - bdblock.Unlock(); - return r; - } - bool collection_exists(coll_t c) { - bdblock.Lock(); - struct stat st; - int r = collection_stat(c, &st) == 0; - bdblock.Unlock(); - return r; - } - int collection_add(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->put(o,1); - bdblock.Unlock(); - return 0; - } - int collection_remove(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->del(o); - bdblock.Unlock(); - return 0; - } - int collection_list(coll_t c, list& o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->list_keys(o); - bdblock.Unlock(); - return 0; - } -}; - -#endif diff --git a/branches/sage/pgs/osd/OBFSStore.cc b/branches/sage/pgs/osd/OBFSStore.cc deleted file mode 100644 index e679c0aedf611..0000000000000 --- a/branches/sage/pgs/osd/OBFSStore.cc +++ /dev/null @@ -1,245 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "OBFSStore.h" - -extern "C" { -#include "../../uofs/uofs.h" -} - -#include "common/Timer.h" - -#include "include/types.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".obfs " - -OBFSStore::OBFSStore(int whoami, char *param, char *dev) -{ - this->whoami = whoami; - this->mounted = -1; - this->bdev_id = -1; - this->param[0] = 0; - this->dev[0] = 0; - if (dev) - strcpy(this->dev, dev); - if (param) - strcpy(this->param, param); -} - -int OBFSStore::mount(void) -{ - dout(0) << "OBFS init!" << endl; - if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) { - dout(0) << "device open FAILED on " << this->dev << ", errno " << errno << endl; - return -1; - } - - this->mkfs(); - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - switch (this->mounted) { - case -1: - this->mkfs(); - //retry to mount - dout(0) << "remount the OBFS" << endl; - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - assert(this->mounted >= 0); - break; - case -2: - //fsck - dout(0) << "Need fsck! Simply formatted for now!" << endl; - this->mkfs(); - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - assert(this->mounted >= 0); - break; - case 0: - //success - break; - default: - break; - } - - if (this->mounted >= 0) - dout(0) << "successfully mounted!" << endl; - else - dout(0) << "error in mounting obfsstore!" << endl; - - return 0; -} - -int OBFSStore::mkfs(void) -{ - /*int donode_size_byte = 1024, - bd_ratio = 10, - reg_size_mb = 256, - sb_size_kb = 4, - lb_size_kb = 1024, - nr_hash_table_buckets = 1023, - delay_allocation = 1, - flush_interval = 5; - FILE *param; - */ - - - if (this->mounted >= 0) - return 0; - - dout(0) << "OBFS.mkfs!" << endl; - /* - if (strlen(this->param) > 0) { - param = fopen(this->param, "r"); - if (param) { - //fscanf(param, "Block Device: %s\n", this->dev); - fscanf(param, "Donode Size: %d\n", &donode_size_byte); - fscanf(param, "Block vs Donode Ratio: %d\n", &bd_ratio); - fscanf(param, "Region Size: %d MB\n", ®_size_mb); - fscanf(param, "Small Block Size: %d KB\n", &sb_size_kb); - fscanf(param, "Large Block Size: %d KB\n", &lb_size_kb); - fscanf(param, "Hash Table Buckets: %d\n", &nr_hash_table_buckets); - fscanf(param, "Delayed Allocation: %d\n", &delay_allocation); - } else { - dout(0) << "read open FAILED on "<< this->param <<", errno " << errno << endl; - dout(0) << "use default parameters" << endl; - } - } else - dout(0) << "use default parameters" << endl; - */ - - if (this->bdev_id <= 0) - if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) { - dout(0) << "device open FAILED on "<< this->dev <<", errno " << errno << endl; - return -1; - } - - dout(0) << "start formating!" 
<< endl; - - uofs_format(this->bdev_id, - g_conf.uofs_onode_size, - g_conf.uofs_block_meta_ratio, - g_conf.uofs_segment_size, - g_conf.uofs_small_block_size, - g_conf.uofs_large_block_size, - g_conf.uofs_nr_hash_buckets, - g_conf.uofs_delay_allocation, - 0,//g_conf.uofs_dev_force_size, - g_conf.uofs_flush_interval, - 0); - - dout(0) << "formatting complete!" << endl; - return 0; -} - -int OBFSStore::umount(void) -{ - uofs_shutdown(); - close(this->bdev_id); - - return 0; -} - -int OBFSStore::statfs(struct statfs *sfs) -{ - return 0; -} - -bool OBFSStore::exists(object_t oid) -{ - //dout(0) << "calling function exists!" << endl; - return uofs_exist(oid); -} - -int OBFSStore::stat(object_t oid, struct stat *st) -{ - dout(0) << "calling function stat!" << endl; - if (uofs_exist(oid)) return 0; - return -1; -} - -int OBFSStore::remove(object_t oid) -{ - dout(0) << "calling remove function!" << endl; - return uofs_del(oid); -} - -int OBFSStore::truncate(object_t oid, off_t size) -{ - dout(0) << "calling truncate function!" << endl; - //return uofs_truncate(oid, size); - return -1; -} - -int OBFSStore::read(object_t oid, size_t len, - off_t offset, bufferlist &bl) -{ - //dout(0) << "calling read function!" << endl; - //dout(0) << oid << " 0 " << len << " " << offset << " 100" << endl; - - // FIXME: page-align this and we can avoid a memcpy... - bl.push_back(new buffer(len)); - return uofs_read(oid, bl.c_str(), offset, len); -} - -int OBFSStore::write(object_t oid, size_t len, - off_t offset, bufferlist& bl, bool fsync) -{ - int ret = 0; - - //dout(0) << "calling write function!" << endl; - //if (whoami == 0) - // dout(0) << oid << " 0 " << len << " " << offset << " 101" << endl; - - for (list::iterator p = bl.buffers().begin(); - p != bl.buffers().end(); - p++) { - ret += uofs_write(oid, (*p).c_str(), offset, len, 0); - } - - if (fsync) - ret += uofs_sync(oid); - - return ret; -} - - -int OBFSStore::write(object_t oid, size_t len, - off_t offset, bufferlist& bl, Context *onflush) -{ - int r = write(oid, len, offset, bl, false); - g_timer.add_event_after((float)g_conf.uofs_fake_sync, onflush); - return r; -} diff --git a/branches/sage/pgs/osd/OBFSStore.h b/branches/sage/pgs/osd/OBFSStore.h deleted file mode 100644 index aff7e96e7245b..0000000000000 --- a/branches/sage/pgs/osd/OBFSStore.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
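Both backends write a bufferlist by iterating over its component buffers; against a flat offset/length interface, the file offset has to advance by each buffer's own length so successive pieces land back to back. A generic sketch of that loop using POSIX pwrite() and a plain vector of strings standing in for Ceph's bufferlist; the path and helper name are illustrative:

    #include <fcntl.h>
    #include <unistd.h>
    #include <string>
    #include <vector>
    #include <cstdio>

    // Write the pieces back to back starting at 'offset'; returns bytes written.
    static ssize_t write_pieces(int fd, off_t offset,
                                const std::vector<std::string> &pieces)
    {
      ssize_t total = 0;
      for (size_t i = 0; i < pieces.size(); i++) {
        ssize_t r = pwrite(fd, pieces[i].data(), pieces[i].size(), offset + total);
        if (r < 0)
          return r;
        total += r;            // advance by what this piece actually wrote
      }
      return total;
    }

    int main()
    {
      int fd = open("/tmp/bufferlist-demo", O_CREAT | O_WRONLY | O_TRUNC, 0644);
      if (fd < 0) { perror("open"); return 1; }
      std::vector<std::string> bl;
      bl.push_back("hello ");
      bl.push_back("object ");
      bl.push_back("store\n");
      printf("wrote %zd bytes\n", write_pieces(fd, 0, bl));
      close(fd);
      return 0;
    }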
- * - */ - - -#ifndef _OBFSSTORE_H_ -#define _OBFSSTORE_H_ - -#include "ObjectStore.h" -#include "Fake.h" - -class OBFSStore : public ObjectStore, - public FakeStoreAttrs, - public FakeStoreCollections { - int whoami; - int bdev_id; - int mounted; - char dev[128]; - char param[128]; - - public: - OBFSStore(int whoami, char *param, char *dev); - - int mount(void); - int umount(void); - int mkfs(void); - - int statfs(struct statfs *); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid); - int truncate(object_t oid, off_t size); - - int read(object_t oid, size_t len, - off_t offset, bufferlist& bl); - int write(object_t oid, size_t len, - off_t offset, bufferlist& bl, - bool fsync); - int write(object_t oid, size_t len, - off_t offset, bufferlist& bl, - Context *onflush); - -}; - -#endif diff --git a/branches/sage/pgs/osd/OSD.cc b/branches/sage/pgs/osd/OSD.cc deleted file mode 100644 index 94b3e63f2d8f4..0000000000000 --- a/branches/sage/pgs/osd/OSD.cc +++ /dev/null @@ -1,2276 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "include/types.h" - -#include "OSD.h" -#include "OSDMap.h" - -#ifdef USE_OBFS -# include "OBFSStore.h" -#else -# include "FakeStore.h" -#endif - -#include "ebofs/Ebofs.h" - -#ifdef USE_OSBDB -#include "osbdb/OSBDB.h" -#endif // USE_OSBDB - - -#include "ReplicatedPG.h" -#include "RAID4PG.h" - -#include "Ager.h" - - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MGenericMessage.h" -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#include "common/Logger.h" -#include "common/LogType.h" -#include "common/Timer.h" -#include "common/ThreadPool.h" - -#include -#include -#include -#include - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? 
osdmap->get_epoch():0) << " " - -char *osd_base_path = "./osddata"; -char *ebofs_base_path = "./dev"; - -object_t SUPERBLOCK_OBJECT(0,0); - - -// force remount hack for performance testing FakeStore -class C_Remount : public Context { - OSD *osd; -public: - C_Remount(OSD *o) : osd(o) {} - void finish(int) { - osd->force_remount(); - } -}; - -void OSD::force_remount() -{ - dout(0) << "forcing remount" << dendl; - osd_lock.Lock(); - { - store->umount(); - store->mount(); - } - osd_lock.Unlock(); - dout(0) << "finished remount" << dendl; -} -// - - -// cons/des - -LogType osd_logtype; - -OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : - timer(osd_lock), - load_calc(g_conf.osd_max_opq<1?1:g_conf.osd_max_opq), - iat_averager(g_conf.osd_flash_crowd_iat_alpha) -{ - whoami = id; - messenger = m; - monmap = mm; - - osdmap = 0; - boot_epoch = 0; - - last_tid = 0; - num_pulling = 0; - - state = STATE_BOOTING; - - hb_stat_ops = 0; - hb_stat_qlen = 0; - - pending_ops = 0; - waiting_for_no_ops = false; - - if (g_conf.osd_remount_at) - timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); - - - // init object store - // try in this order: - // dev/osd$num - // dev/osd.$hostname - // dev/osd.all - - if (dev) { - strcpy(dev_path,dev); - } else { - char hostname[100]; - hostname[0] = 0; - gethostname(hostname,100); - - sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami); - - struct stat sta; - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname); - - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.all", ebofs_base_path); - } - - if (g_conf.ebofs) { - store = new Ebofs(dev_path); - //store->_fake_writes(true); - } -#ifdef USE_OBFS - else if (g_conf.uofs) { - store = new OBFSStore(whoami, NULL, dev_path); - } -#endif -#ifdef USE_OSBDB - else if (g_conf.bdbstore) { - store = new OSBDB(dev_path); - } -#endif // USE_OSBDB - else { - sprintf(dev_path, "osddata/osd%d", whoami); - store = new FakeStore(dev_path, whoami); - } - -} - -OSD::~OSD() -{ - if (threadpool) { delete threadpool; threadpool = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - //if (monitor) { delete monitor; monitor = 0; } - if (messenger) { delete messenger; messenger = 0; } - if (logger) { delete logger; logger = 0; } - if (store) { delete store; store = 0; } -} - -int OSD::init() -{ - osd_lock.Lock(); - { - // mkfs? - if (g_conf.osd_mkfs) { - dout(2) << "mkfs" << dendl; - store->mkfs(); - - // make up a superblock - //superblock.fsid = ???; - superblock.whoami = whoami; - } - - // mount. - dout(2) << "mounting " << dev_path << dendl; - int r = store->mount(); - assert(r>=0); - - if (g_conf.osd_mkfs) { - // age? 
- if (g_conf.osd_age_time != 0) { - dout(2) << "age" << dendl; - Ager ager(store); - if (g_conf.osd_age_time < 0) - ager.load_freelist(); - else - ager.age(g_conf.osd_age_time, - g_conf.osd_age, - g_conf.osd_age - .05, - 50000, - g_conf.osd_age - .05); - } - } - else { - dout(2) << "boot" << dendl; - - // read superblock - read_superblock(); - - // load up pgs (as they previously existed) - load_pgs(); - - dout(2) << "superblock: i am osd" << superblock.whoami << dendl; - assert(whoami == superblock.whoami); - } - - - // log - char name[80]; - sprintf(name, "osd%02d", whoami); - logger = new Logger(name, (LogType*)&osd_logtype); - osd_logtype.add_set("opq"); - osd_logtype.add_inc("op"); - osd_logtype.add_inc("c_rd"); - osd_logtype.add_inc("c_rdb"); - osd_logtype.add_inc("c_wr"); - osd_logtype.add_inc("c_wrb"); - - osd_logtype.add_inc("r_push"); - osd_logtype.add_inc("r_pushb"); - osd_logtype.add_inc("r_wr"); - osd_logtype.add_inc("r_wrb"); - - osd_logtype.add_inc("rlnum"); - - osd_logtype.add_set("numpg"); - osd_logtype.add_set("pingset"); - - osd_logtype.add_set("buf"); - - osd_logtype.add_inc("map"); - osd_logtype.add_inc("mapi"); - osd_logtype.add_inc("mapidup"); - osd_logtype.add_inc("mapf"); - osd_logtype.add_inc("mapfdup"); - - // request thread pool - { - char name[80]; - sprintf(name,"osd%d.threadpool", whoami); - threadpool = new ThreadPool(name, g_conf.osd_maxthreads, - static_dequeueop, - this); - } - - // i'm ready! - messenger->set_dispatcher(this); - - // announce to monitor i exist and have booted. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDBoot(messenger->get_myinst(), superblock), monmap->get_inst(mon)); - - // start the heart - timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this)); - } - osd_lock.Unlock(); - - //dout(0) << "osd_rep " << g_conf.osd_rep << dendl; - - return 0; -} - -int OSD::shutdown() -{ - dout(1) << "shutdown" << dendl; - - state = STATE_STOPPING; - - // cancel timers - timer.cancel_all(); - timer.join(); - - // finish ops - wait_for_no_ops(); - - // stop threads - delete threadpool; - threadpool = 0; - - // close pgs - for (hash_map::iterator p = pg_map.begin(); - p != pg_map.end(); - p++) { - delete p->second; - } - pg_map.clear(); - - // shut everything else down - //monitor->shutdown(); - messenger->shutdown(); - - osd_lock.Unlock(); - int r = store->umount(); - osd_lock.Lock(); - return r; -} - - - -void OSD::write_superblock(ObjectStore::Transaction& t) -{ - dout(10) << "write_superblock " << superblock << dendl; - - bufferlist bl; - bl.append((char*)&superblock, sizeof(superblock)); - t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); -} - -int OSD::read_superblock() -{ - bufferlist bl; - int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); - if (bl.length() != sizeof(superblock)) { - dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << dendl; - return -1; - } - - bl.copy(0, sizeof(superblock), (char*)&superblock); - - dout(10) << "read_superblock " << superblock << dendl; - - // load up "current" osdmap - assert(!osdmap); - osdmap = new OSDMap; - bl.clear(); - get_map_bl(superblock.current_epoch, bl); - osdmap->decode(bl); - - assert(whoami == superblock.whoami); // fixme! 
- return 0; -} - - - - - -// ====================================================== -// PG's - -PG *OSD::_create_lock_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - dout(10) << "_create_lock_pg " << pgid << dendl; - - if (pg_map.count(pgid)) - dout(0) << "_create_lock_pg on " << pgid << ", already have " << *pg_map[pgid] << dendl; - - // create - PG *pg; - if (pgid.is_rep()) - pg = new ReplicatedPG(this, pgid); - else if (pgid.is_raid4()) - pg = new RAID4PG(this, pgid); - else - assert(0); - - assert(pg_map.count(pgid) == 0); - pg_map[pgid] = pg; - - // lock - pg->lock(); - pg_lock.insert(pgid); - - pg->get(); // because it's in pg_map - pg->get(); // because we're locking it - - // create collection - assert(!store->collection_exists(pgid)); - t.create_collection(pgid); - - return pg; -} - -bool OSD::_have_pg(pg_t pgid) -{ - return pg_map.count(pgid); -} - -PG *OSD::_lock_pg(pg_t pgid) -{ - assert(pg_map.count(pgid)); - - // wait? - if (pg_lock.count(pgid)) { - Cond c; - dout(15) << "lock_pg " << pgid << " waiting as " << &c << dendl; - //cerr << "lock_pg " << pgid << " waiting as " << &c << dendl; - - list& ls = pg_lock_waiters[pgid]; // this is commit, right? - ls.push_back(&c); - - while (pg_lock.count(pgid) || - ls.front() != &c) - c.Wait(osd_lock); - - assert(ls.front() == &c); - ls.pop_front(); - if (ls.empty()) - pg_lock_waiters.erase(pgid); - } - - dout(15) << "lock_pg " << pgid << dendl; - pg_lock.insert(pgid); - - PG *pg = pg_map[pgid]; - pg->lock(); - pg->get(); // because we're "locking" it and returning a pointer copy. - return pg; -} - -void OSD::_unlock_pg(pg_t pgid) -{ - // unlock - assert(pg_lock.count(pgid)); - pg_lock.erase(pgid); - - pg_map[pgid]->put_unlock(); - - if (pg_lock_waiters.count(pgid)) { - // someone is in line - Cond *c = pg_lock_waiters[pgid].front(); - assert(c); - dout(15) << "unlock_pg " << pgid << " waking up next guy " << c << dendl; - c->Signal(); - } else { - // nobody waiting - dout(15) << "unlock_pg " << pgid << dendl; - } -} - -void OSD::_remove_unlock_pg(PG *pg) -{ - pg_t pgid = pg->info.pgid; - - dout(10) << "_remove_unlock_pg " << pgid << dendl; - - // there shouldn't be any waiters, since we're a stray, and pg is presumably clean0. 
- assert(pg_lock_waiters.count(pgid) == 0); - - // remove from store - list olist; - store->collection_list(pgid, olist); - - ObjectStore::Transaction t; - { - for (list::iterator p = olist.begin(); - p != olist.end(); - p++) - t.remove(*p); - t.remove_collection(pgid); - t.remove(pgid.to_object()); // log too - } - store->apply_transaction(t); - - // mark deleted - pg->mark_deleted(); - - // unlock - pg_lock.erase(pgid); - pg->put(); - - // remove from map - pg_map.erase(pgid); - pg->put_unlock(); // will delete, if last reference -} - - - -void OSD::load_pgs() -{ - dout(10) << "load_pgs" << dendl; - assert(pg_map.empty()); - - list ls; - store->list_collections(ls); - - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - pg_t pgid = *it; - - PG *pg = 0; - if (pgid.is_rep()) - new ReplicatedPG(this, pgid); - else if (pgid.is_raid4()) - new RAID4PG(this, pgid); - else - assert(0); - pg_map[pgid] = pg; - pg->get(); - - // read pg info - store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info)); - - // read pg log - pg->read_log(store); - - // generate state for current mapping - int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting); - int role = osdmap->calc_pg_role(whoami, pg->acting, nrep); - pg->set_role(role); - - dout(10) << "load_pgs loaded " << *pg << " " << pg->log << dendl; - } -} - - - -/** - * check epochs starting from start to verify the pg acting set hasn't changed - * up until now - */ -void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from) -{ - dout(15) << "project_pg_history " << pgid - << " from " << from << " to " << osdmap->get_epoch() - << ", start " << h - << dendl; - - vector last; - osdmap->pg_to_acting_osds(pgid, last); - - for (epoch_t e = osdmap->get_epoch()-1; - e >= from; - e--) { - // verify during intermediate epoch - OSDMap oldmap; - get_map(e, oldmap); - - vector acting; - oldmap.pg_to_acting_osds(pgid, acting); - - // acting set change? - if (acting != last && - e <= h.same_since) { - dout(15) << "project_pg_history " << pgid << " changed in " << e+1 - << " from " << acting << " -> " << last << dendl; - h.same_since = e+1; - } - - // primary change? - if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) && - e <= h.same_primary_since) { - dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << dendl; - h.same_primary_since = e+1; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) - h.same_acker_since = h.same_primary_since; - } - - // acker change? - if (g_conf.osd_rep != OSD_REP_PRIMARY) { - if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) && - e <= h.same_acker_since) { - dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << dendl; - h.same_acker_since = e+1; - } - } - - if (h.same_since > e && - h.same_primary_since > e && - h.same_acker_since > e) break; - } - - dout(15) << "project_pg_history end " << h << dendl; -} - -void OSD::activate_pg(pg_t pgid, epoch_t epoch) -{ - osd_lock.Lock(); - { - if (pg_map.count(pgid)) { - PG *pg = _lock_pg(pgid); - if (pg->is_crashed() && - pg->is_replay() && - pg->get_role() == 0 && - pg->info.history.same_primary_since <= epoch) { - ObjectStore::Transaction t; - pg->activate(t); - store->apply_transaction(t); - } - _unlock_pg(pgid); - } - } - - // finishers? 
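// Note on the block that follows: pending "finished" messages are spliced into a
// local list while finished_lock is held, then both finished_lock and osd_lock are
// released before each message is re-dispatched. dispatch() re-acquires osd_lock
// itself, so dropping the locks first is what keeps the re-dispatch from deadlocking.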
- finished_lock.Lock(); - if (finished.empty()) { - finished_lock.Unlock(); - osd_lock.Unlock(); - } else { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - } -} - - -// ------------------------------------- - -void OSD::heartbeat() -{ - utime_t now = g_clock.now(); - utime_t since = now; - since.sec_ref() -= g_conf.osd_heartbeat_interval; - - // calc my stats - float avg_qlen = 0; - if (hb_stat_ops) avg_qlen = (float)hb_stat_qlen / (float)hb_stat_ops; - - double read_mean_time = load_calc.get_average(); - - dout(5) << "heartbeat " << now - << ": ops " << hb_stat_ops - << ", avg qlen " << avg_qlen - << ", mean read time " << read_mean_time - << dendl; - - // reset until next time around - hb_stat_ops = 0; - hb_stat_qlen = 0; - - // send pings - set pingset; - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - PG *pg = i->second; - - // we want to ping the primary. - if (pg->get_role() <= 0) continue; - if (pg->acting.size() < 1) continue; - - if (pg->last_heartbeat < since) { - pg->last_heartbeat = now; - pingset.insert(pg->acting[0]); - } - } - for (set::iterator i = pingset.begin(); - i != pingset.end(); - i++) { - _share_map_outgoing( osdmap->get_inst(*i) ); - messenger->send_message(new MOSDPing(osdmap->get_epoch(), - avg_qlen, - read_mean_time ), - osdmap->get_inst(*i)); - } - - if (logger) logger->set("pingset", pingset.size()); - - // hack: fake reorg? - if (osdmap && g_conf.fake_osdmap_updates) { - int mon = monmap->pick_mon(); - if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - /* - if (osdmap->is_out(whoami)) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - else if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //messenger->send_message(new MOSDOut(osdmap->get_epoch()), - //MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - } - */ - } - - // schedule next! randomly. - float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval; - timer.add_event_after(wait, new C_Heartbeat(this)); -} - - - -// -------------------------------------- -// dispatch - -bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) -{ - bool shared = false; - - // does client have old map? - if (inst.name.is_client()) { - if (epoch < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - shared = true; - } - } - - // does peer have old map? - if (inst.name.is_osd()) { - // remember - if (peer_map_epoch[inst.name] < epoch) - peer_map_epoch[inst.name] = epoch; - - // older? - if (peer_map_epoch[inst.name] < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again. - shared = true; - } - } - - return shared; -} - - -void OSD::_share_map_outgoing(const entity_inst_t& inst) -{ - assert(inst.name.is_osd()); - - if (inst.name.is_osd()) { - // send map? 
- if (peer_map_epoch.count(inst.name)) { - epoch_t pe = peer_map_epoch[inst.name]; - if (pe < osdmap->get_epoch()) { - send_incremental_map(pe, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); - } - } else { - // no idea about peer's epoch. - // ??? send recent ??? - // do nothing. - } - } -} - - - -void OSD::dispatch(Message *m) -{ - // lock! - osd_lock.Lock(); - - switch (m->get_type()) { - - // -- don't need lock -- - case MSG_PING: - dout(10) << "ping from " << m->get_source() << dendl; - delete m; - break; - - // -- don't need OSDMap -- - - // map and replication - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - // osd - case MSG_SHUTDOWN: - shutdown(); - delete m; - break; - - - - // -- need OSDMap -- - - default: - { - // no map? starting up? - if (!osdmap) { - dout(7) << "no OSDMap, not booted" << dendl; - waiting_for_osdmap.push_back(m); - break; - } - - // down? - if (osdmap->is_down(whoami)) { - dout(7) << "i am marked down, dropping " << *m << dendl; - delete m; - break; - } - - - - - // need OSDMap - switch (m->get_type()) { - - case MSG_OSD_PING: - // take note. - handle_osd_ping((MOSDPing*)m); - break; - - case MSG_OSD_PG_NOTIFY: - handle_pg_notify((MOSDPGNotify*)m); - break; - case MSG_OSD_PG_QUERY: - handle_pg_query((MOSDPGQuery*)m); - break; - case MSG_OSD_PG_LOG: - handle_pg_log((MOSDPGLog*)m); - break; - case MSG_OSD_PG_REMOVE: - handle_pg_remove((MOSDPGRemove*)m); - break; - - case MSG_OSD_OP: - handle_op((MOSDOp*)m); - break; - - // for replication etc. - case MSG_OSD_OPREPLY: - handle_op_reply((MOSDOpReply*)m); - break; - - - default: - dout(1) << " got unknown message " << m->get_type() << dendl; - assert(0); - } - } - } - - // finishers? - finished_lock.Lock(); - if (!finished.empty()) { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - return; - } - - finished_lock.Unlock(); - osd_lock.Unlock(); -} - - -void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (g_conf.ms_die_on_failure) { - dout(0) << "ms_handle_failure " << inst << " on " << *m << dendl; - exit(0); - } - - if (dest.is_osd()) { - // failed osd. drop message, report to mon. - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << inst - << ", dropping and reporting to mon" << mon - << " " << *m - << dendl; - messenger->send_message(new MOSDFailure(messenger->get_myinst(), inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << inst - << ", resending to mon" << mon - << " " << *m - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else { - // client? 
- dout(0) << "ms_handle_failure " << inst - << ", dropping " << *m << dendl; - delete m; - } -} - - - - -void OSD::handle_osd_ping(MOSDPing *m) -{ - dout(20) << "osdping from " << m->get_source() << dendl; - - _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch); - - int from = m->get_source().num(); - peer_qlen[from] = m->avg_qlen; - peer_read_time[from] = m->read_mean_time; - - //if (!m->ack) - //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true), - //m->get_source()); - - delete m; -} - - - - -// ===================================================== -// MAP - -void OSD::wait_for_new_map(Message *m) -{ - // ask - if (waiting_for_osdmap.empty()) { - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - - waiting_for_osdmap.push_back(m); -} - - -/** update_map - * assimilate new OSDMap(s). scan pgs, etc. - */ -void OSD::handle_osd_map(MOSDMap *m) -{ - wait_for_no_ops(); - - assert(osd_lock.is_locked()); - - ObjectStore::Transaction t; - - if (osdmap) { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have " << osdmap->get_epoch() - << dendl; - } else { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have none" - << dendl; - osdmap = new OSDMap; - boot_epoch = m->get_last(); // hrm...? - } - - logger->inc("mapmsg"); - - // store them? - for (map::iterator p = m->maps.begin(); - p != m->maps.end(); - p++) { - object_t oid = get_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had full map epoch " << p->first << dendl; - logger->inc("mapfdup"); - bufferlist bl; - get_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got full map epoch " << p->first << dendl; - //t.write(oid, 0, p->second.length(), p->second); - store->write(oid, 0, p->second.length(), p->second, 0); - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapf"); - } - for (map::iterator p = m->incremental_maps.begin(); - p != m->incremental_maps.end(); - p++) { - object_t oid = get_inc_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had incremental map epoch " << p->first << dendl; - logger->inc("mapidup"); - bufferlist bl; - get_inc_map_bl(p->first, bl); - dout(10) << " .. 
it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got incremental map epoch " << p->first << dendl; - //t.write(oid, 0, p->second.length(), p->second); - store->write(oid, 0, p->second.length(), p->second, 0); - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapi"); - } - - // advance if we can - bool advanced = false; - - if (m->get_source().is_mon() && is_booting()) - advanced = true; - - epoch_t cur = superblock.current_epoch; - while (cur < superblock.newest_map) { - bufferlist bl; - if (m->incremental_maps.count(cur+1) || - store->exists(get_inc_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << dendl; - - bufferlist bl; - if (m->incremental_maps.count(cur+1)) { - dout(10) << " using provided inc map" << endl; - bl = m->incremental_maps[cur+1]; - } else { - dout(10) << " using my locally stored inc map" << endl; - get_inc_map_bl(cur+1, bl); - } - - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - - osdmap->apply_incremental(inc); - - // archive the full map - bl.clear(); - osdmap->encode(bl); - t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl); - - // notify messenger - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - int osd = i->first; - if (osd == whoami) continue; - messenger->mark_down(i->second.addr); - peer_map_epoch.erase(MSG_ADDR_OSD(osd)); - - // kick any replica ops - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - PG *pg = it->second; - - _lock_pg(pg->info.pgid); - pg->note_failed_osd(osd); - _unlock_pg(pg->info.pgid); - } - } - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - if (i->first == whoami) continue; - peer_map_epoch.erase(MSG_ADDR_OSD(i->first)); - } - } - else if (m->maps.count(cur+1) || - store->exists(get_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << dendl; - bufferlist bl; - if (m->maps.count(cur+1)) - bl = m->maps[cur+1]; - else - get_map_bl(cur+1, bl); - osdmap->decode(bl); - - // FIXME BUG: need to notify messenger of ups/downs!! - } - else { - dout(10) << "handle_osd_map missing epoch " << cur+1 << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(cur), monmap->get_inst(mon)); - break; - } - - cur++; - superblock.current_epoch = cur; - advance_map(t); - advanced = true; - } - - // all the way? - if (advanced && cur == superblock.newest_map) { - // yay! - activate_map(t); - - // process waiters - take_waiters(waiting_for_osdmap); - } - - // write updated pg state to store - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG *pg = i->second; - t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info)); - } - - // superblock and commit - write_superblock(t); - store->apply_transaction(t); - - //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah - - delete m; -} - - -/** - * scan placement groups, initiate any replication - * activities. 
- */ -void OSD::advance_map(ObjectStore::Transaction& t) -{ - dout(7) << "advance_map epoch " << osdmap->get_epoch() - << " " << pg_map.size() << " pgs" - << dendl; - - if (osdmap->is_mkfs()) { - ps_t numps = osdmap->get_pg_num(); - ps_t numlps = osdmap->get_localized_pg_num(); - dout(1) << "mkfs on " << numps << " normal, " << numlps << " localized pg sets" << dendl; - int minrep = 1; - int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep); - int minraid = g_conf.osd_min_raid_width; - int maxraid = g_conf.osd_max_raid_width; - dout(1) << "mkfs " << minrep << ".." << maxrep << " replicas, " - << minraid << ".." << maxraid << " osd raid groups" << dendl; - - //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl; - logger->set_start( osdmap->get_ctime() ); - - assert(g_conf.osd_mkfs); // make sure we did a mkfs! - - // create PGs - // replicated - for (int nrep = 1; - nrep <= maxrep; // for low osd counts.. hackish bleh - nrep++) { - for (ps_t ps = 0; ps < numps; ++ps) { - vector acting; - pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, -1); - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - if (role < 0) continue; - - PG *pg = _create_lock_pg(pgid, t); - pg->set_role(role); - pg->acting.swap(acting); - pg->last_epoch_started_any = - pg->info.last_epoch_started = - pg->info.history.same_since = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = osdmap->get_epoch(); - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - _unlock_pg(pgid); - } - - for (ps_t ps = 0; ps < numlps; ++ps) { - // local PG too - vector acting; - pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, whoami); - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - PG *pg = _create_lock_pg(pgid, t); - pg->acting.swap(acting); - pg->set_role(role); - pg->last_epoch_started_any = - pg->info.last_epoch_started = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = - pg->info.history.same_since = osdmap->get_epoch(); - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - _unlock_pg(pgid); - } - } - - // raided - for (int size = minraid; - size <= maxraid; - size++) { - for (ps_t ps = 0; ps < numps; ++ps) { - vector acting; - pg_t pgid = pg_t(pg_t::TYPE_RAID4, size, ps, -1); - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - if (role < 0) continue; - - PG *pg = _create_lock_pg(pgid, t); - pg->set_role(role); - pg->acting.swap(acting); - pg->last_epoch_started_any = - pg->info.last_epoch_started = - pg->info.history.same_since = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = osdmap->get_epoch(); - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - _unlock_pg(pgid); - } - - for (ps_t ps = 0; ps < numlps; ++ps) { - // local PG too - vector acting; - pg_t pgid = pg_t(pg_t::TYPE_RAID4, size, ps, whoami); - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - PG *pg = _create_lock_pg(pgid, t); - pg->acting.swap(acting); - pg->set_role(role); - pg->last_epoch_started_any = - pg->info.last_epoch_started = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = - pg->info.history.same_since = osdmap->get_epoch(); - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - _unlock_pg(pgid); - } - } - dout(1) << "mkfs done, created " << 
pg_map.size() << " pgs" << dendl; - - } else { - // scan existing pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - pg_t pgid = it->first; - PG *pg = it->second; - - // did i finish this epoch? - if (pg->is_active()) { - pg->info.last_epoch_finished = osdmap->get_epoch()-1; - } - - // get new acting set - vector tacting; - int nrep = osdmap->pg_to_acting_osds(pgid, tacting); - int role = osdmap->calc_pg_role(whoami, tacting, nrep); - - // no change? - if (tacting == pg->acting) - continue; - - // -- there was a change! -- - _lock_pg(pgid); - - int oldrole = pg->get_role(); - int oldprimary = pg->get_primary(); - int oldacker = pg->get_acker(); - vector oldacting = pg->acting; - - // update PG - pg->acting.swap(tacting); - pg->set_role(role); - - // did primary|acker change? - pg->info.history.same_since = osdmap->get_epoch(); - if (oldprimary != pg->get_primary()) { - pg->info.history.same_primary_since = osdmap->get_epoch(); - pg->cancel_recovery(); - } - if (oldacker != pg->get_acker()) { - pg->info.history.same_acker_since = osdmap->get_epoch(); - } - - // deactivate. - pg->state_clear(PG::STATE_ACTIVE); - - // reset primary state? - if (oldrole == 0 || pg->get_role() == 0) - pg->clear_primary_state(); - - // apply any repops in progress. - if (oldacker == whoami) { - pg->on_acker_change(); - } - - if (role != oldrole) { - // old primary? - if (oldrole == 0) { - pg->state_clear(PG::STATE_CLEAN); - - // take replay queue waiters - list ls; - for (map::iterator it = pg->replay_queue.begin(); - it != pg->replay_queue.end(); - it++) - ls.push_back(it->second); - pg->replay_queue.clear(); - take_waiters(ls); - - // take active waiters - take_waiters(pg->waiting_for_active); - - pg->on_role_change(); - } - - // new primary? - if (role == 0) { - // i am new primary - pg->state_clear(PG::STATE_STRAY); - } else { - // i am now replica|stray. we need to send a notify. - pg->state_set(PG::STATE_STRAY); - - if (nrep == 0) { - pg->state_set(PG::STATE_CRASHED); - dout(1) << *pg << " is crashed" << dendl; - } - } - - // my role changed. - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", role " << oldrole << " -> " << role << dendl; - - } else { - // no role change. - // did primary change? - if (pg->get_primary() != oldprimary) { - // we need to announce - pg->state_set(PG::STATE_STRAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", acting primary " - << oldprimary << " -> " << pg->get_primary() - << dendl; - } else { - // primary is the same. - if (role == 0) { - // i am (still) primary. but my replica set changed. 
- pg->state_clear(PG::STATE_CLEAN); - pg->state_clear(PG::STATE_REPLAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", replicas changed" << dendl; - } - } - } - - - _unlock_pg(pgid); - } - } -} - -void OSD::activate_map(ObjectStore::Transaction& t) -{ - dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; - - map< int, list > notify_list; // primary -> list - map< int, map > query_map; // peer -> PG -> get_summary_since - - // scan pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - //pg_t pgid = it->first; - PG *pg = it->second; - - if (pg->is_active()) { - // update started counter - pg->info.last_epoch_started = osdmap->get_epoch(); - } - else if (pg->get_role() == 0 && !pg->is_active()) { - // i am (inactive) primary - pg->build_prior(); - pg->peer(t, query_map); - } - else if (pg->is_stray() && - pg->get_primary() >= 0) { - // i am residual|replica - notify_list[pg->get_primary()].push_back(pg->info); - } - - } - - if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs - return; - - // notify? (residual|replica) - do_notifies(notify_list); - - // do queries. - do_queries(query_map); - - logger->set("numpg", pg_map.size()); -} - - -void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full) -{ - dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() - << " to " << inst << dendl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap->get_epoch(); - e > since; - e--) { - bufferlist bl; - if (get_inc_map_bl(e,bl)) { - m->incremental_maps[e].claim(bl); - } else if (get_map_bl(e,bl)) { - m->maps[e].claim(bl); - if (!full) break; - } - else { - assert(0); // we should have all maps. - } - } - - messenger->send_message(m, inst); -} - -bool OSD::get_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -void OSD::get_map(epoch_t epoch, OSDMap &m) -{ - // find a complete map - list incs; - epoch_t e; - for (e = epoch; e > 0; e--) { - bufferlist bl; - if (get_map_bl(e, bl)) { - //dout(10) << "get_map " << epoch << " full " << e << dendl; - m.decode(bl); - break; - } else { - OSDMap::Incremental inc; - bool got = get_inc_map(e, inc); - assert(got); - incs.push_front(inc); - } - } - assert(e >= 0); - - // apply incrementals - for (e++; e <= epoch; e++) { - //dout(10) << "get_map " << epoch << " inc " << e << dendl; - m.apply_incremental( incs.front() ); - incs.pop_front(); - } -} - - -bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc) -{ - bufferlist bl; - if (!get_inc_map_bl(e, bl)) - return false; - int off = 0; - inc.decode(bl, off); - return true; -} - - - - - -bool OSD::require_current_map(Message *m, epoch_t ep) -{ - // older map? - if (ep < osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << dendl; - delete m; // discard and ignore. - return false; - } - - // newer map? - if (ep > osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << dendl; - wait_for_new_map(m); - return false; - } - - assert(ep == osdmap->get_epoch()); - return true; -} - - -/* - * require that we have same (or newer) map, and that - * the source is the pg primary. 
- */ -bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch) -{ - dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << dendl; - - // newer map? - if (epoch > osdmap->get_epoch()) { - dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << dendl; - wait_for_new_map(m); - return false; - } - - if (epoch < boot_epoch) { - dout(7) << "from pre-boot epoch " << epoch << " < " << boot_epoch << dendl; - delete m; - return false; - } - - return true; -} - - - - - -/** do_notifies - * Send an MOSDPGNotify to a primary, with a list of PGs that I have - * content for, and they are primary for. - */ - -void OSD::do_notifies(map< int, list >& notify_list) -{ - for (map< int, list >::iterator it = notify_list.begin(); - it != notify_list.end(); - it++) { - if (it->first == whoami) { - dout(7) << "do_notify osd" << it->first << " is self, skipping" << dendl; - continue; - } - dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << dendl; - MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second); - _share_map_outgoing(osdmap->get_inst(it->first)); - messenger->send_message(m, osdmap->get_inst(it->first)); - } -} - - -/** do_queries - * send out pending queries for info | summaries - */ -void OSD::do_queries(map< int, map >& query_map) -{ - for (map< int, map >::iterator pit = query_map.begin(); - pit != query_map.end(); - pit++) { - int who = pit->first; - dout(7) << "do_queries querying osd" << who - << " on " << pit->second.size() << " PGs" << dendl; - - MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(), - pit->second); - _share_map_outgoing(osdmap->get_inst(who)); - messenger->send_message(m, osdmap->get_inst(who)); - } -} - - - - -/** PGNotify - * from non-primary to primary - * includes PG::Info. - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_notify(MOSDPGNotify *m) -{ - dout(7) << "handle_pg_notify from " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - ObjectStore::Transaction t; - - // look for unknown PGs i'm primary for - map< int, map > query_map; - - for (list::iterator it = m->get_pg_list().begin(); - it != m->get_pg_list().end(); - it++) { - pg_t pgid = it->pgid; - PG *pg; - - if (pg_map.count(pgid) == 0) { - // same primary? - PG::Info::History history = it->history; - project_pg_history(pgid, history, m->get_epoch()); - - if (m->get_epoch() < history.same_primary_since) { - dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - // ok, create PG! - pg = _create_lock_pg(pgid, t); - osdmap->pg_to_acting_osds(pgid, pg->acting); - pg->set_role(0); - pg->info.history = history; - - pg->last_epoch_started_any = it->last_epoch_started; - pg->build_prior(); - - t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); - - dout(10) << *pg << " is new" << dendl; - - // kick any waiters - if (waiting_for_pg.count(pgid)) { - take_waiters(waiting_for_pg[pgid]); - waiting_for_pg.erase(pgid); - } - } else { - // already had it. am i (still) the primary? - pg = _lock_pg(pgid); - if (m->get_epoch() < pg->info.history.same_primary_since) { - dout(10) << *pg << " handle_pg_notify primary changed in " - << pg->info.history.same_primary_since - << " (msg from " << m->get_epoch() << ")" << dendl; - _unlock_pg(pgid); - continue; - } - } - - // ok! 
- - // stray? - bool acting = pg->is_acting(from); - if (!acting && (*it).last_epoch_started > 0) { - dout(10) << *pg << " osd" << from << " has stray content: " << *it << dendl; - pg->stray_set.insert(from); - pg->state_clear(PG::STATE_CLEAN); - } - - // save info. - bool had = pg->peer_info.count(from); - pg->peer_info[from] = *it; - - if (had) { - if (pg->is_active() && - (*it).is_clean() && acting) { - pg->clean_set.insert(from); - dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set - << "): " << *it << dendl; - if (pg->is_all_clean()) { - dout(10) << *pg << " now clean on all replicas" << dendl; - pg->state_set(PG::STATE_CLEAN); - pg->clean_replicas(); - } - } else { - // hmm, maybe keep an eye out for cases where we see this, but peer should happen. - dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << dendl; - } - } else { - // adjust prior? - if (it->last_epoch_started > pg->last_epoch_started_any) - pg->adjust_prior(); - - // peer - pg->peer(t, query_map); - } - - _unlock_pg(pgid); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - do_queries(query_map); - - delete m; -} - - - -/** PGLog - * from non-primary to primary - * includes log and info - * from primary to non-primary - * includes log for use in recovery - * NOTE: called with opqueue active. - */ - -void OSD::handle_pg_log(MOSDPGLog *m) -{ - int from = m->get_source().num(); - const pg_t pgid = m->get_pgid(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - if (pg_map.count(pgid) == 0) { - dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << dendl; - assert(m->get_epoch() < osdmap->get_epoch()); - delete m; - return; - } - - PG *pg = _lock_pg(pgid); - assert(pg); - - if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << "handle_pg_log " << *pg - << " from " << m->get_source() - << " is old, discarding" - << dendl; - delete m; - return; - } - - dout(7) << "handle_pg_log " << *pg - << " got " << m->log << " " << m->missing - << " from " << m->get_source() << dendl; - - //m->log.print(cout); - - ObjectStore::Transaction t; - - if (pg->is_primary()) { - // i am PRIMARY - assert(pg->peer_log_requested.count(from) || - pg->peer_summary_requested.count(from)); - - pg->proc_replica_log(m->log, m->missing, from); - - // peer - map< int, map > query_map; - pg->peer(t, query_map); - do_queries(query_map); - - } else { - // i am REPLICA - dout(10) << *pg << " got " << m->log << " " << m->missing << dendl; - - // merge log - pg->merge_log(m->log, m->missing, from); - pg->proc_missing(m->log, m->missing, from); - assert(pg->missing.num_lost() == 0); - - // ok activate! - pg->activate(t); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - _unlock_pg(pgid); - - delete m; -} - - -/** PGQuery - * from primary to replica | stray - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_query(MOSDPGQuery *m) -{ - dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - map< int, list > notify_list; - - for (map::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = it->first; - PG *pg = 0; - - if (pg_map.count(pgid) == 0) { - // same primary? 
- PG::Info::History history = it->second.history; - project_pg_history(pgid, history, m->get_epoch()); - - if (m->get_epoch() < history.same_since) { - dout(10) << " pg " << pgid << " dne, and pg has changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - // get active crush mapping - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - if (role < 0) { - dout(10) << " pg " << pgid << " dne, and i am not an active replica" << dendl; - PG::Info empty(pgid); - notify_list[from].push_back(empty); - continue; - } - assert(role > 0); - - ObjectStore::Transaction t; - pg = _create_lock_pg(pgid, t); - pg->acting.swap( acting ); - pg->set_role(role); - pg->info.history = history; - - t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); - store->apply_transaction(t); - - dout(10) << *pg << " dne (before), but i am role " << role << dendl; - } else { - pg = _lock_pg(pgid); - - // same primary? - if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << *pg << " handle_pg_query primary changed in " - << pg->info.history.same_since - << " (msg from " << m->get_epoch() << ")" << dendl; - _unlock_pg(pgid); - continue; - } - } - - // ok, process query! - assert(!pg->acting.empty()); - assert(from == pg->acting[0]); - - if (it->second.type == PG::Query::INFO) { - // info - dout(10) << *pg << " sending info" << dendl; - notify_list[from].push_back(pg->info); - } else { - MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid()); - m->info = pg->info; - m->missing = pg->missing; - - if (it->second.type == PG::Query::LOG) { - dout(10) << *pg << " sending info+missing+log since split " << it->second.split - << " from floor " << it->second.floor - << dendl; - if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) { - dout(10) << *pg << " divergent, sending backlog" << dendl; - it->second.type = PG::Query::BACKLOG; - } - } - - if (it->second.type == PG::Query::BACKLOG) { - dout(10) << *pg << " sending info+missing+backlog" << dendl; - if (pg->log.backlog) { - m->log = pg->log; - } else { - pg->generate_backlog(); - m->log = pg->log; - pg->drop_backlog(); - } - } - else if (it->second.type == PG::Query::FULLLOG) { - dout(10) << *pg << " sending info+missing+full log" << dendl; - m->log.copy_non_backlog(pg->log); - } - - dout(10) << *pg << " sending " << m->log << " " << m->missing << dendl; - //m->log.print(cout); - - _share_map_outgoing(osdmap->get_inst(from)); - messenger->send_message(m, osdmap->get_inst(from)); - } - - _unlock_pg(pgid); - } - - do_notifies(notify_list); - - delete m; -} - - -void OSD::handle_pg_remove(MOSDPGRemove *m) -{ - dout(7) << "handle_pg_remove from " << m->get_source() << dendl; - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - for (set::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = *it; - PG *pg; - - if (pg_map.count(pgid) == 0) { - dout(10) << " don't have pg " << pgid << dendl; - continue; - } - - pg = _lock_pg(pgid); - - dout(10) << *pg << " removing." << dendl; - assert(pg->get_role() == -1); - - _remove_unlock_pg(pg); - } - - delete m; -} - - - - - - -// ========================================================= -// OPS - -void OSD::handle_op(MOSDOp *op) -{ - // get and lock *pg. - const pg_t pgid = op->get_pg(); - PG *pg = _have_pg(pgid) ? 
_lock_pg(pgid):0; - - logger->set("buf", buffer_total_alloc); - - // mark the read request received time for finding the - // read througput load. - op->set_received_time(g_clock.now()); - - // update qlen stats - hb_stat_ops++; - hb_stat_qlen += pending_ops; - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) { - if (pg) _unlock_pg(pgid); - return; - } - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - if (!op->get_source().is_osd()) { - // REGULAR OP (non-replication) - - // note original source - op->set_client_inst( op->get_source_inst() ); - op->clear_payload(); // and hose encoded payload (in case we forward) - - // have pg? - if (!pg) { - dout(7) << "hit non-existent pg " - << pgid - << ", waiting" << dendl; - waiting_for_pg[pgid].push_back(op); - return; - } - - // pg must be same-ish... - if (op->is_read()) { - // read - if (!pg->same_for_read_since(op->get_map_epoch())) { - dout(7) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - _unlock_pg(pgid); - delete op; - return; - } - - /* - if (read && op->get_oid().rev > 0) { - // versioned read. hrm. - // are we missing a revision that we might need? - object_t moid = op->get_oid(); - if (pick_missing_object_rev(moid, pg)) { - // is there a local revision we might use instead? - object_t loid = op->get_oid(); - if (store->pick_object_revision_lt(loid) && - moid <= loid) { - // we need moid. pull it. - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", but need missing " << moid - << ", pulling" << dendl; - pull(pg, moid); - pg->waiting_for_missing_object[moid].push_back(op); - return; - } - - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", don't need missing " << moid - << dendl; - } - } else { - // live revision. easy. - if (op->get_op() != OSD_OP_PUSH && - waitfor_missing_object(op, pg)) return; - } - */ - - } else { - // modify - if ((pg->get_primary() != whoami || - !pg->same_for_modify_since(op->get_map_epoch()))) { - dout(7) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - _unlock_pg(pgid); - delete op; - return; - } - } - - // pg must be active. - if (!pg->is_active()) { - // replay? - if (op->get_version().version > 0) { - if (op->get_version() > pg->info.last_update) { - dout(7) << *pg << " queueing replay at " << op->get_version() - << " for " << *op << dendl; - pg->replay_queue[op->get_version()] = op; - _unlock_pg(pgid); - return; - } else { - dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update - << " for " << *op - << ", will queue for WRNOOP" << dendl; - } - } - - dout(7) << *pg << " not active (yet)" << dendl; - pg->waiting_for_active.push_back(op); - _unlock_pg(pgid); - return; - } - - // missing object? - if (pg->is_missing_object(op->get_oid())) { - pg->wait_for_missing_object(op->get_oid(), op); - _unlock_pg(pgid); - return; - } - - dout(10) << "handle_op " << *op << " in " << *pg << dendl; - - } else { - // REPLICATION OP (it's from another OSD) - - // have pg? - if (!pg) { - derr(-7) << "handle_rep_op " << *op - << " pgid " << pgid << " dne" << dendl; - delete op; - //assert(0); // wtf, shouldn't happen. - return; - } - - // check osd map: same set, or primary+acker? 
- if (!pg->same_for_rep_modify_since(op->get_map_epoch())) { - dout(10) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - _unlock_pg(pgid); - delete op; - return; - } - - assert(pg->get_role() >= 0); - dout(7) << "handle_rep_op " << op << " in " << *pg << dendl; - } - - // proprocess op? - if (pg->preprocess_op(op)) { - _unlock_pg(pgid); - return; - } - - if (g_conf.osd_maxthreads < 1) { - // do it now. - if (op->get_type() == MSG_OSD_OP) - pg->do_op((MOSDOp*)op); - else if (op->get_type() == MSG_OSD_OPREPLY) - pg->do_op_reply((MOSDOpReply*)op); - else - assert(0); - - _unlock_pg(pgid); - } else { - // queue for worker threads - _unlock_pg(pgid); - enqueue_op(pgid, op); - } -} - - -void OSD::handle_op_reply(MOSDOpReply *op) -{ - if (op->get_map_epoch() < boot_epoch) { - dout(3) << "replica op reply from before boot" << dendl; - delete op; - return; - } - - // must be a rep op. - assert(op->get_source().is_osd()); - - // make sure we have the pg - const pg_t pgid = op->get_pg(); - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) return; - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - if (!_have_pg(pgid)) { - // hmm. - delete op; - return; - } - - if (g_conf.osd_maxthreads < 1) { - PG *pg = _lock_pg(pgid); - pg->do_op_reply(op); // do it now - _unlock_pg(pgid); - } else { - enqueue_op(pgid, op); // queue for worker threads - } -} - - -/* - * enqueue called with osd_lock held - */ -void OSD::enqueue_op(pg_t pgid, Message *op) -{ - while (pending_ops > g_conf.osd_max_opq) { - dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << dendl; - op_queue_cond.Wait(osd_lock); - } - - op_queue[pgid].push_back(op); - pending_ops++; - logger->set("opq", pending_ops); - - threadpool->put_op(pgid); -} - -/* - * NOTE: dequeue called in worker thread, without osd_lock - */ -void OSD::dequeue_op(pg_t pgid) -{ - Message *op = 0; - PG *pg = 0; - - osd_lock.Lock(); - { - if (pgid) { - // lock pg - pg = _lock_pg(pgid); - } - - // get pending op - list &ls = op_queue[pgid]; - assert(!ls.empty()); - op = ls.front(); - ls.pop_front(); - - if (pgid) { - dout(10) << "dequeue_op " << op << " write pg " << pgid - << ls.size() << " / " << (pending_ops-1) << " more pending" << dendl; - } else { - dout(10) << "dequeue_op " << op << " read " - << ls.size() << " / " << (pending_ops-1) << " more pending" << dendl; - } - - if (ls.empty()) - op_queue.erase(pgid); - } - osd_lock.Unlock(); - - // do it - if (op->get_type() == MSG_OSD_OP) - pg->do_op((MOSDOp*)op); // do it now - else if (op->get_type() == MSG_OSD_OPREPLY) - pg->do_op_reply((MOSDOpReply*)op); - else - assert(0); - - // finish - osd_lock.Lock(); - { - if (pgid) { - // unlock pg - _unlock_pg(pgid); - } - - dout(10) << "dequeue_op " << op << " finish" << dendl; - assert(pending_ops > 0); - - if (pending_ops > g_conf.osd_max_opq) - op_queue_cond.Signal(); - - pending_ops--; - logger->set("opq", pending_ops); - if (pending_ops == 0 && waiting_for_no_ops) - no_pending_ops.Signal(); - } - osd_lock.Unlock(); -} - - - - -void OSD::wait_for_no_ops() -{ - if (pending_ops > 0) { - dout(7) << "wait_for_no_ops - waiting for " << pending_ops << dendl; - waiting_for_no_ops = true; - while (pending_ops > 0) - no_pending_ops.Wait(osd_lock); - waiting_for_no_ops = false; - assert(pending_ops == 0); - } - dout(7) << "wait_for_no_ops - none" << dendl; -} 
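The _lock_pg()/_unlock_pg() calls used throughout the removed OSD.cc above serialize per-PG work on top of the global osd_lock: a set of currently locked pgids plus, per pgid, a FIFO of condition variables that _unlock_pg() signals. Below is a minimal standalone sketch of that pattern, using std::mutex/std::condition_variable rather than the removed Mutex/Cond wrappers; PerIdLock, lock_id and unlock_id are illustrative names, not part of the original code.

#include <condition_variable>
#include <list>
#include <map>
#include <mutex>
#include <set>

// Sketch of the per-id locking pattern: callers already hold `global`
// (osd_lock in the removed code); an id is "locked" by inserting it into
// `locked`, and later lockers queue a condition variable and wait their
// turn in FIFO order.
struct PerIdLock {
  std::mutex global;                                    // stands in for osd_lock
  std::set<int> locked;                                 // ids currently held
  std::map<int, std::list<std::condition_variable*>> waiters;

  void lock_id(std::unique_lock<std::mutex>& g, int id) {
    if (locked.count(id)) {
      std::condition_variable c;
      auto& q = waiters[id];
      q.push_back(&c);
      // wait until the id is free *and* we are at the head of the queue
      c.wait(g, [&] { return !locked.count(id) && q.front() == &c; });
      q.pop_front();
      if (q.empty()) waiters.erase(id);
    }
    locked.insert(id);
  }

  void unlock_id(int id) {                 // caller still holds `global`
    locked.erase(id);
    auto it = waiters.find(id);
    if (it != waiters.end())
      it->second.front()->notify_one();    // wake the next waiter in line
  }
};

// usage: std::unique_lock<std::mutex> g(l.global); l.lock_id(g, pgid); ...; l.unlock_id(pgid);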
- - - - diff --git a/branches/sage/pgs/osd/OSD.h b/branches/sage/pgs/osd/OSD.h deleted file mode 100644 index a63c0ab7a3e0d..0000000000000 --- a/branches/sage/pgs/osd/OSD.h +++ /dev/null @@ -1,319 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OSD_H -#define __OSD_H - -#include "msg/Dispatcher.h" - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" - -#include "ObjectStore.h" -#include "PG.h" - - -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - - -class Messenger; -class Message; -class Logger; -class ObjectStore; -class OSDMap; - -class OSD : public Dispatcher { -public: - // -- states -- - static const int STATE_BOOTING = 1; - static const int STATE_ACTIVE = 2; - static const int STATE_STOPPING = 3; - - - // load calculation - //current implementation is moving averges. - class LoadCalculator { - private: - deque m_Data ; - unsigned m_Size ; - double m_Total ; - - public: - LoadCalculator( unsigned size ) : m_Size(0), m_Total(0) { } - - void add( double element ) { - // add item - m_Data.push_back(element); - m_Total += element; - - // trim - while (m_Data.size() > m_Size) { - m_Total -= m_Data.front(); - m_Data.pop_front(); - } - } - - double get_average() { - if (m_Data.empty()) - return -1; - return m_Total / (double)m_Data.size(); - } - }; - - class IATAverager { - public: - struct iat_data { - double last_req_stamp; - double average_iat; - iat_data() : last_req_stamp(0), average_iat(0) {} - }; - private: - double alpha; - hash_map iat_map; - - public: - IATAverager(double a) : alpha(a) {} - - void add_sample(object_t oid, double now) { - iat_data &r = iat_map[oid]; - double iat = now - r.last_req_stamp; - r.last_req_stamp = now; - r.average_iat = r.average_iat*(1.0-alpha) + iat*alpha; - } - - bool have(object_t oid) const { - return iat_map.count(oid); - } - - double get_average_iat(object_t oid) const { - hash_map::const_iterator p = iat_map.find(oid); - assert(p != iat_map.end()); - return p->second.average_iat; - } - - bool is_flash_crowd_candidate(object_t oid) const { - return get_average_iat(oid) <= g_conf.osd_flash_crowd_iat_threshold; - } - }; - - - /** OSD **/ -protected: - Mutex osd_lock; // global lock - SafeTimer timer; // safe timer - - Messenger *messenger; - Logger *logger; - ObjectStore *store; - MonMap *monmap; - - LoadCalculator load_calc; - IATAverager iat_averager; - - int whoami; - char dev_path[100]; - -public: - int get_nodeid() { return whoami; } - -private: - /** superblock **/ - OSDSuperblock superblock; - epoch_t boot_epoch; - - object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); } - object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); } - - void write_superblock(); - void write_superblock(ObjectStore::Transaction& t); - int read_superblock(); - - - // -- state -- - int state; - -public: - bool is_booting() { return state == STATE_BOOTING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_stopping() { return state == STATE_STOPPING; } - -private: - - // heartbeat - void heartbeat(); - - class 
C_Heartbeat : public Context { - OSD *osd; - public: - C_Heartbeat(OSD *o) : osd(o) {} - void finish(int r) { - osd->heartbeat(); - } - }; - - - // -- stats -- - int hb_stat_ops; // ops since last heartbeat - int hb_stat_qlen; // cumulative queue length since last hb - - hash_map peer_qlen; - hash_map peer_read_time; - - - // -- waiters -- - list finished; - Mutex finished_lock; - - void take_waiters(list& ls) { - finished_lock.Lock(); - finished.splice(finished.end(), ls); - finished_lock.Unlock(); - } - - // -- op queue -- - class ThreadPool *threadpool; - hash_map > op_queue; - int pending_ops; - bool waiting_for_no_ops; - Cond no_pending_ops; - Cond op_queue_cond; - - void wait_for_no_ops(); - - void enqueue_op(pg_t pgid, Message *op); - void dequeue_op(pg_t pgid); - static void static_dequeueop(OSD *o, pg_t pgid) { - o->dequeue_op(pgid); - }; - - - friend class PG; - friend class ReplicatedPG; - friend class RAID4PG; - - - protected: - - // -- osd map -- - OSDMap *osdmap; - list waiting_for_osdmap; - - hash_map peer_map_epoch; // FIXME types - bool _share_map_incoming(const entity_inst_t& inst, epoch_t epoch); - void _share_map_outgoing(const entity_inst_t& inst); - - void wait_for_new_map(Message *m); - void handle_osd_map(class MOSDMap *m); - - void advance_map(ObjectStore::Transaction& t); - void activate_map(ObjectStore::Transaction& t); - - void get_map(epoch_t e, OSDMap &m); - bool get_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map(epoch_t e, OSDMap::Incremental &inc); - - void send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full); - - - - // -- placement groups -- - hash_map pg_map; - hash_map > waiting_for_pg; - - // per-pg locking (serializes AND acquired pg lock) - hash_set pg_lock; - hash_map > pg_lock_waiters; - - PG *_lock_pg(pg_t pgid); - void _unlock_pg(pg_t pgid); - - PG *_create_lock_pg(pg_t pg, ObjectStore::Transaction& t); // create new PG - bool _have_pg(pg_t pgid); - void _remove_unlock_pg(PG *pg); // remove from store and memory - - void load_pgs(); - void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from); - void activate_pg(pg_t pgid, epoch_t epoch); - - class C_Activate : public Context { - OSD *osd; - pg_t pgid; - epoch_t epoch; - public: - C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) {} - void finish(int r) { - osd->activate_pg(pgid, epoch); - } - }; - - - // -- tids -- - // for ops i issue - tid_t last_tid; - - Mutex tid_lock; - tid_t get_tid() { - tid_t t; - tid_lock.Lock(); - t = ++last_tid; - tid_lock.Unlock(); - return t; - } - - - // -- generic pg recovery -- - int num_pulling; - - void do_notifies(map< int, list >& notify_list); - void do_queries(map< int, map >& query_map); - void repeer(PG *pg, map< int, map >& query_map); - - bool require_current_map(Message *m, epoch_t v); - bool require_same_or_newer_map(Message *m, epoch_t e); - - void handle_pg_query(class MOSDPGQuery *m); - void handle_pg_notify(class MOSDPGNotify *m); - void handle_pg_log(class MOSDPGLog *m); - void handle_pg_remove(class MOSDPGRemove *m); - - - public: - OSD(int id, Messenger *m, MonMap *mm, char *dev = 0); - ~OSD(); - - // startup/shutdown - int init(); - int shutdown(); - - // messages - virtual void dispatch(Message *m); - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst); - - void handle_osd_ping(class MOSDPing *m); - void handle_op(class MOSDOp *m); - void handle_op_reply(class MOSDOpReply *m); - - void force_remount(); -}; - -#endif diff 
--git a/branches/sage/pgs/osd/OSDMap.h b/branches/sage/pgs/osd/OSDMap.h deleted file mode 100644 index f0e0ff301f813..0000000000000 --- a/branches/sage/pgs/osd/OSDMap.h +++ /dev/null @@ -1,515 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OSDMAP_H -#define __OSDMAP_H - -/* - * describe properties of the OSD cluster. - * disks, disk groups, total # osds, - * - */ -#include "config.h" -#include "include/types.h" -#include "osd_types.h" -#include "msg/Message.h" -#include "common/Mutex.h" -#include "common/Clock.h" - -#include "crush/crush.h" -using namespace crush; - -#include -#include -#include -#include -using namespace std; - - -/* - * some system constants - */ - -// from LSB to MSB, -#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG -#define PG_REP_BITS 6 // up to 64 replicas -#define PG_TYPE_BITS 2 -#define PG_PS_MASK ((1LL<>1)); -} - -inline int calc_bits_of(int t) { - int b = 0; - while (t) { - t = t >> 1; - b++; - } - return b; -} - - - -/** OSDMap - */ -class OSDMap { - -public: - class Incremental { - public: - epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; - - // full (rare) - bufferlist fullmap; // in leiu of below. - - // incremental - map new_up; - map new_down; - list new_in; - list new_out; - map new_overload; // updated overload value - list old_overload; // no longer overload - - void encode(bufferlist& bl) { - ::_encode(epoch, bl); - ::_encode(mon_epoch, bl); - ::_encode(ctime, bl); - ::_encode(new_up, bl); - ::_encode(new_down, bl); - ::_encode(new_in, bl); - ::_encode(new_out, bl); - ::_encode(new_overload, bl); - ::_encode(fullmap, bl); - } - void decode(bufferlist& bl, int& off) { - ::_decode(epoch, bl, off); - ::_decode(mon_epoch, bl, off); - ::_decode(ctime, bl, off); - ::_decode(new_up, bl, off); - ::_decode(new_down, bl, off); - ::_decode(new_in, bl, off); - ::_decode(new_out, bl, off); - ::_decode(new_overload, bl, off); - ::_decode(fullmap, bl, off); - } - - Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {} - }; - -private: - epoch_t epoch; // what epoch of the osd cluster descriptor is this - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; // epoch start time - int pg_num; // placement group count - int pg_num_mask; // bitmask for above - int localized_pg_num; // localized place group count - int localized_pg_num_mask; // ditto - - set osds; // all osds - set down_osds; // list of down disks - set out_osds; // list of unmapped disks - map overload_osds; - map osd_inst; - - public: - Crush crush; // hierarchical map - - friend class OSDMonitor; - friend class MDS; - - public: - OSDMap() : epoch(0), mon_epoch(0), - pg_num(1<<5), - localized_pg_num(1<<3) { - calc_pg_masks(); - } - - // map info - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - void calc_pg_masks() { - pg_num_mask = (1 << calc_bits_of(pg_num-1)) - 1; - localized_pg_num_mask = (1 << calc_bits_of(localized_pg_num-1)) - 1; - } - - int get_pg_num() const { return pg_num; } - void set_pg_num(int m) { pg_num = m; calc_pg_masks(); } - int 
get_localized_pg_num() const { return localized_pg_num; } - - const utime_t& get_ctime() const { return ctime; } - - bool is_mkfs() const { return epoch == 2; } - bool post_mkfs() const { return epoch > 2; } - - /***** cluster state *****/ - int num_osds() { return osds.size(); } - void get_all_osds(set& ls) { ls = osds; } - - const set& get_osds() { return osds; } - const set& get_down_osds() { return down_osds; } - const set& get_out_osds() { return out_osds; } - const map& get_overload_osds() { return overload_osds; } - - bool exists(int osd) { return osds.count(osd); } - bool is_down(int osd) { return down_osds.count(osd); } - bool is_up(int osd) { return exists(osd) && !is_down(osd); } - bool is_out(int osd) { return out_osds.count(osd); } - bool is_in(int osd) { return exists(osd) && !is_out(osd); } - - bool have_inst(int osd) { - return osd_inst.count(osd); - } - const entity_inst_t& get_inst(int osd) { - assert(osd_inst.count(osd)); - return osd_inst[osd]; - } - bool get_inst(int osd, entity_inst_t& inst) { - if (osd_inst.count(osd)) { - inst = osd_inst[osd]; - return true; - } - return false; - } - - void mark_down(int o) { down_osds.insert(o); } - void mark_up(int o) { down_osds.erase(o); } - void mark_out(int o) { out_osds.insert(o); } - void mark_in(int o) { out_osds.erase(o); } - - - void apply_incremental(Incremental &inc) { - assert(inc.epoch == epoch+1); - epoch++; - mon_epoch = inc.mon_epoch; - ctime = inc.ctime; - - // full map? - if (inc.fullmap.length()) { - decode(inc.fullmap); - return; - } - - // nope, incremental. - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - assert(down_osds.count(i->first) == 0); - down_osds.insert(i->first); - assert(osd_inst.count(i->first) == 0 || - osd_inst[i->first] == i->second); - osd_inst.erase(i->first); - //cout << "epoch " << epoch << " down osd" << i->first << endl; - } - for (list::iterator i = inc.new_out.begin(); - i != inc.new_out.end(); - i++) { - assert(out_osds.count(*i) == 0); - out_osds.insert(*i); - //cout << "epoch " << epoch << " out osd" << *i << endl; - } - for (list::iterator i = inc.old_overload.begin(); - i != inc.old_overload.end(); - i++) { - assert(overload_osds.count(*i)); - overload_osds.erase(*i); - } - - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - assert(down_osds.count(i->first)); - down_osds.erase(i->first); - assert(osd_inst.count(i->first) == 0); - osd_inst[i->first] = i->second; - //cout << "epoch " << epoch << " up osd" << i->first << endl; - } - for (list::iterator i = inc.new_in.begin(); - i != inc.new_in.end(); - i++) { - assert(out_osds.count(*i)); - out_osds.erase(*i); - //cout << "epoch " << epoch << " in osd" << *i << endl; - } - for (map::iterator i = inc.new_overload.begin(); - i != inc.new_overload.end(); - i++) { - overload_osds[i->first] = i->second; - } - } - - // serialize, unserialize - void encode(bufferlist& blist) { - ::_encode(epoch, blist); - ::_encode(mon_epoch, blist); - ::_encode(ctime, blist); - ::_encode(pg_num, blist); - ::_encode(localized_pg_num, blist); - - ::_encode(osds, blist); - ::_encode(down_osds, blist); - ::_encode(out_osds, blist); - ::_encode(overload_osds, blist); - ::_encode(osd_inst, blist); - - crush._encode(blist); - } - - void decode(bufferlist& blist) { - int off = 0; - ::_decode(epoch, blist, off); - ::_decode(mon_epoch, blist, off); - ::_decode(ctime, blist, off); - ::_decode(pg_num, blist, off); - ::_decode(localized_pg_num, blist, off); - calc_pg_masks(); - - ::_decode(osds, blist, 
off); - ::_decode(down_osds, blist, off); - ::_decode(out_osds, blist, off); - ::_decode(overload_osds, blist, off); - ::_decode(osd_inst, blist, off); - - crush._decode(blist, off); - } - - - - - /**** mapping facilities ****/ - - // oid -> pg - ObjectLayout file_to_object_layout(object_t oid, FileLayout& layout) { - return make_object_layout(oid, layout.pg_type, layout.pg_size, layout.preferred, layout.object_stripe_unit); - } - - ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) { - static crush::Hash H(777); - - // calculate ps (placement seed) - ps_t ps; - switch (g_conf.osd_object_layout) { - case OBJECT_LAYOUT_LINEAR: - ps = stable_mod(oid.bno + oid.ino, pg_num, pg_num_mask); - break; - - case OBJECT_LAYOUT_HASHINO: - ps = stable_mod(oid.bno + H(oid.ino), pg_num, pg_num_mask); - break; - - case OBJECT_LAYOUT_HASH: - ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), pg_num, pg_num_mask); - break; - - default: - assert(0); - } - - // construct object layout - return ObjectLayout(pg_t(pg_type, pg_size, ps, preferred), - object_stripe_unit); - } - - - // pg -> (osd list) - int pg_to_osds(pg_t pg, - vector& osds) { // list of osd addr's - // map to osds[] - switch (g_conf.osd_pg_layout) { - case PG_LAYOUT_CRUSH: - { - // what crush rule? - int rule; - if (pg.is_rep()) rule = CRUSH_REP_RULE(pg.size()); - else if (pg.is_raid4()) rule = CRUSH_RAID_RULE(pg.size()); - else assert(0); - - // forcefeed? - int forcefeed = -1; - if (pg.preferred() >= 0 && - out_osds.count(pg.preferred()) == 0) - forcefeed = pg.preferred(); - crush.do_rule(crush.rules[rule], - pg.ps(), - osds, - out_osds, overload_osds, - forcefeed); - } - break; - - case PG_LAYOUT_LINEAR: - for (int i=0; i= 0 && - g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) { - int osd = pg.preferred(); - - // already in there? - if (osds.empty()) { - osds.push_back(osd); - } else { - assert(pg.size() > 0); - for (int i=1; i (up osd list) - int pg_to_acting_osds(pg_t pg, - vector& osds) { // list of osd addr's - // get rush list - vector raw; - pg_to_osds(pg, raw); - - osds.clear(); - for (unsigned i=0; i primary osd - int get_pg_primary(pg_t pg) { - vector group; - int nrep = pg_to_osds(pg, group); - if (nrep) - return group[0]; - return -1; // we fail! - } - - // pg -> acting primary osd - int get_pg_acting_primary(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[0]; - return -1; // we fail! - } - int get_pg_acting_tail(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[group.size()-1]; - return -1; // we fail! - } - - - /* what replica # is a given osd? 0 primary, -1 for none. */ - int calc_pg_rank(int osd, vector& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - for (int i=0; i& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - int rank = calc_pg_rank(osd, acting, nrep); - - if (rank < 0) return PG_ROLE_STRAY; - else if (rank == 0) return PG_ROLE_HEAD; - else if (rank == 1) return PG_ROLE_ACKER; - else return PG_ROLE_MIDDLE; - } - - int get_pg_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - /* rank is -1 (stray), 0 (primary), 1,2,3,... 
(replica) */ - int get_pg_acting_rank(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_rank(osd, group, nrep); - } - /* role is -1 (stray), 0 (primary), 1 (replica) */ - int get_pg_acting_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - - - -}; - - -#endif diff --git a/branches/sage/pgs/osd/ObjectStore.cc b/branches/sage/pgs/osd/ObjectStore.cc deleted file mode 100644 index ac81c6a3b4ea0..0000000000000 --- a/branches/sage/pgs/osd/ObjectStore.cc +++ /dev/null @@ -1,151 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - - -object_t ObjectStore::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - return age_cur_oid++; - } - - ssize_t ObjectStore::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (rand() % 100) * max/200 + 1; - } - - void ObjectStore::age_fill(float pc, utime_t until) { - bufferptr bp(1024*1024); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a >= pc) { - dout(10) << "age_fill at " << a << " / " << pc << " stopping" << endl; - break; - } - - object_t oid = age_get_oid(); - - int b = rand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - - dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, 1024*1024); - write(oid, t, off, bl, false); - off += t; - s -= t; - } - oid++; - } - } - - void ObjectStore::age_empty(float pc) { - int nper = 20; - int n = nper; - while (1) { - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a <= pc) { - dout(10) << "age_empty at " << a << " / " << pc << " stopping" << endl; - break; - } - - int b = rand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - dout(10) << "age_empty sync" << endl; - //sync(); - sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << endl; - - remove(oid); - age_free_oids.push_back(oid); - } - } - - - void ObjectStore::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - utime_t until = g_clock.now(); - until.sec_ref() += time; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = 1; - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - 
file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - dout(1) << "age " << c << "/" << count << " filling to " << high_water << endl; - age_fill(high_water, until); - if (c == count) { - dout(1) << "age final empty to " << final_water << endl; - age_empty(final_water); - } else { - dout(1) << "age " << c << "/" << count << " emptying to " << low_water << endl; - age_empty(low_water); - } - } - dout(1) << "age finished" << endl; - } - diff --git a/branches/sage/pgs/osd/ObjectStore.h b/branches/sage/pgs/osd/ObjectStore.h deleted file mode 100644 index c373ba32899b9..0000000000000 --- a/branches/sage/pgs/osd/ObjectStore.h +++ /dev/null @@ -1,539 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OBJECTSTORE_H -#define __OBJECTSTORE_H - -#include "include/types.h" -#include "osd_types.h" -#include "include/Context.h" -#include "include/buffer.h" - -#include "include/Distribution.h" - -#include - -#ifdef DARWIN -#include -#else -#include /* or */ -#endif /* DARWIN */ - -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? 
(a):(b)) -#endif - -/* - * low-level interface to the local OSD file system - */ - - - -class ObjectStore { -public: - - - class FragmentationStat { - public: - int total; - int num_extent; - int avg_extent; - map extent_dist; // powers of two - map extent_dist_sum; // powers of two - - float avg_extent_per_object; - int avg_extent_jump; // avg distance bweteen consecutive extents - - int total_free; - int num_free_extent; - int avg_free_extent; - map free_extent_dist; // powers of two - map free_extent_dist_sum; // powers of two - }; - - - - /********************************* - * transaction - */ - class Transaction { - public: - static const int OP_READ = 1; // oid, offset, len, pbl - static const int OP_STAT = 2; // oid, pstat - static const int OP_GETATTR = 3; // oid, attrname, pattrval - static const int OP_GETATTRS = 4; // oid, pattrset - - static const int OP_WRITE = 10; // oid, offset, len, bl - static const int OP_TRUNCATE = 11; // oid, len - static const int OP_REMOVE = 13; // oid - static const int OP_SETATTR = 14; // oid, attrname, attrval - static const int OP_SETATTRS = 15; // oid, attrset - static const int OP_RMATTR = 16; // oid, attrname - static const int OP_CLONE = 17; // oid, newoid - - static const int OP_TRIMCACHE = 18; // oid, offset, len - - static const int OP_MKCOLL = 20; // cid - static const int OP_RMCOLL = 21; // cid - static const int OP_COLL_ADD = 22; // cid, oid - static const int OP_COLL_REMOVE = 23; // cid, oid - static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval - static const int OP_COLL_RMATTR = 25; // cid, attrname - - list ops; - list bls; - list oids; - list cids; - list offsets; - list lengths; - list attrnames; - list attrnames2; - //list< pair > attrvals; - list attrbls; - - // for reads only (not encoded) - list pbls; - list psts; - list< pair > pattrvals; - list< map* > pattrsets; - - const char *get_attrname() { - if (attrnames.empty()) - return attrnames2.front().c_str(); - else - return attrnames.front(); - } - void pop_attrname() { - if (attrnames.empty()) - attrnames2.pop_front(); - else - attrnames.pop_front(); - } - - void read(object_t oid, off_t off, size_t len, bufferlist *pbl) { - int op = OP_READ; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - pbls.push_back(pbl); - } - void stat(object_t oid, struct stat *st) { - int op = OP_STAT; - ops.push_back(op); - oids.push_back(oid); - psts.push_back(st); - } - void getattr(object_t oid, const char* name, void* val, int *plen) { - int op = OP_GETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - pattrvals.push_back(pair(val,plen)); - } - void getattrs(object_t oid, map& aset) { - int op = OP_GETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&aset); - } - - void write(object_t oid, off_t off, size_t len, const bufferlist& bl) { - int op = OP_WRITE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - bls.push_back(bl); - } - void trim_from_cache(object_t oid, off_t off, size_t len) { - int op = OP_TRIMCACHE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - } - void truncate(object_t oid, off_t off) { - int op = OP_TRUNCATE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - } - void remove(object_t oid) { - int op = OP_REMOVE; - ops.push_back(op); - oids.push_back(oid); - } - void setattr(object_t oid, const char* name, const void* val, int len) { - int op = 
OP_SETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val,len); - attrbls.push_back(bl); - } - void setattrs(object_t oid, map& attrset) { - int op = OP_SETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&attrset); - } - void rmattr(object_t oid, const char* name) { - int op = OP_RMATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - } - void clone(object_t oid, object_t noid) { - int op = OP_CLONE; - ops.push_back(op); - oids.push_back(oid); - oids.push_back(noid); - } - void create_collection(coll_t cid) { - int op = OP_MKCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void remove_collection(coll_t cid) { - int op = OP_RMCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void collection_add(coll_t cid, object_t oid) { - int op = OP_COLL_ADD; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_remove(coll_t cid, object_t oid) { - int op = OP_COLL_REMOVE; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_setattr(coll_t cid, const char* name, const void* val, int len) { - int op = OP_COLL_SETATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val, len); - attrbls.push_back(bl); - } - void collection_rmattr(coll_t cid, const char* name) { - int op = OP_COLL_RMATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - } - - // etc. - - void _encode(bufferlist& bl) { - ::_encode(ops, bl); - ::_encode(bls, bl); - ::_encode(oids, bl); - ::_encode(cids, bl); - ::_encode(offsets, bl); - ::_encode(lengths, bl); - ::_encode(attrnames, bl); - ::_encode(attrbls, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(ops, bl, off); - ::_decode(bls, bl, off); - ::_decode(oids, bl, off); - ::_decode(cids, bl, off); - ::_decode(offsets, bl, off); - ::_decode(lengths, bl, off); - ::_decode(attrnames2, bl, off); - ::_decode(attrbls, bl, off); - } - }; - - - - /* this implementation is here only for naive ObjectStores that - * do not do atomic transactions natively. it is not atomic. 
- */ - virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { - // non-atomic implementation - for (list::iterator p = t.ops.begin(); - p != t.ops.end(); - p++) { - switch (*p) { - case Transaction::OP_READ: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); - read(oid, offset, len, *pbl); - } - break; - case Transaction::OP_STAT: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - struct stat *st = t.psts.front(); t.psts.pop_front(); - stat(oid, st); - } - break; - case Transaction::OP_GETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); - *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second); - } - break; - case Transaction::OP_GETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); - getattrs(oid, *pset); - } - break; - - case Transaction::OP_WRITE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist bl = t.bls.front(); t.bls.pop_front(); - write(oid, offset, len, bl, 0); - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t len = t.offsets.front(); t.offsets.pop_front(); - truncate(oid, len, 0); - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - remove(oid, 0); - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - setattr(oid, attrname, bl.c_str(), bl.length(), 0); - } - break; - case Transaction::OP_SETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); - setattrs(oid, *pattrset, 0); - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - rmattr(oid, attrname, 0); - } - break; - - case Transaction::OP_CLONE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - object_t noid = t.oids.front(); t.oids.pop_front(); - clone(oid, noid); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - create_collection(cid, 0); - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - destroy_collection(cid, 0); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - collection_add(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - 
collection_remove(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0); - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - collection_rmattr(cid, attrname, 0); - } - break; - - - default: - cerr << "bad op " << *p << endl; - assert(0); - } - } - - if (onsafe) sync(onsafe); - - return 0; // FIXME count errors - } - - /*********************************************/ - - - - public: - ObjectStore() {} - virtual ~ObjectStore() {} - - // mgmt - virtual int mount() = 0; - virtual int umount() = 0; - virtual int mkfs() = 0; // wipe - - virtual int statfs(struct statfs *buf) = 0; - - // objects - virtual int pick_object_revision_lt(object_t& oid) = 0; - - virtual bool exists(object_t oid) = 0; // useful? - virtual int stat(object_t oid, struct stat *st) = 0; // struct stat? - - virtual int remove(object_t oid, - Context *onsafe=0) = 0; - - virtual int truncate(object_t oid, off_t size, - Context *onsafe=0) = 0; - - virtual int read(object_t oid, - off_t offset, size_t len, - bufferlist& bl) = 0; - virtual int write(object_t oid, - off_t offset, size_t len, - const bufferlist& bl, - Context *onsafe) = 0;//{ return -1; } - virtual void trim_from_cache(object_t oid, - off_t offset, size_t len) { } - virtual int is_cached(object_t oid, - off_t offset, - size_t len) { return -1; } - - virtual int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int setattrs(object_t oid, map& aset, - Context *onsafe=0) {return 0;} //= 0; - virtual int getattr(object_t oid, const char *name, - void *value, size_t size) {return 0;} //= 0; - virtual int getattrs(object_t oid, map& aset) {return 0;}; - - virtual int rmattr(object_t oid, const char *name, - Context *onsafe=0) {return 0;} - - virtual int clone(object_t oid, object_t noid) { - return -1; - } - - //virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; - - // collections - virtual int list_collections(list& ls) {return 0;}//= 0; - virtual int create_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual int destroy_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual bool collection_exists(coll_t c) {return 0;} - virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0; - virtual int collection_add(coll_t c, object_t o, - Context *onsafe=0) {return 0;}//= 0; - virtual int collection_remove(coll_t c, object_t o, - Context *onsafe=0) {return 0;}// = 0; - virtual int collection_list(coll_t c, list& o) {return 0;}//= 0; - - virtual int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_getattr(coll_t cid, const char *name, - void *value, size_t size) {return 0;} //= 0; - //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; - - virtual void sync(Context *onsync) {} - virtual void sync() {} - - - virtual void _fake_writes(bool b) {}; - - virtual void 
_get_frag_stat(FragmentationStat& st) {}; - -}; - - -#endif diff --git a/branches/sage/pgs/osd/PG.cc b/branches/sage/pgs/osd/PG.cc deleted file mode 100644 index c2d1290102e8b..0000000000000 --- a/branches/sage/pgs/osd/PG.cc +++ /dev/null @@ -1,1229 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "PG.h" -#include "config.h" -#include "OSD.h" - -#include "common/Timer.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - - -/******* PGLog ********/ - -void PG::Log::copy_after(const Log &other, eversion_t v) -{ - assert(v >= other.bottom); - top = bottom = other.top; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - if (i->version == v) break; - assert(i->version > v); - log.push_front(*i); - } - bottom = v; -} - -bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor) -{ - assert(split >= other.bottom); - assert(floor >= other.bottom); - assert(floor <= split); - top = bottom = other.top; - - /* runs on replica. split is primary's log.top. floor is how much they want. - split tell us if the primary is divergent.. e.g.: - -> i am A, B is primary, split is 2'6, floor is 2'2. -A B C -2'2 2'2 -2'3 2'3 2'3 -2'4 2'4 2'4 -3'5 | 2'5 2'5 -3'6 | 2'6 -3'7 | -3'8 | -3'9 | - -> i return full backlog. - */ - - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - // is primary divergent? - // e.g. my 3'6 vs their 2'6 split - if (i->version.version == split.version && i->version.epoch > split.epoch) { - clear(); - return false; // divergent! - } - if (i->version == floor) break; - assert(i->version > floor); - - // e.g. my 2'23 > '12 - log.push_front(*i); - } - bottom = floor; - return true; -} - -void PG::Log::copy_non_backlog(const Log &other) -{ - if (other.backlog) { - top = other.top; - bottom = other.bottom; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) - if (i->version > bottom) - log.push_front(*i); - else - break; - } else { - *this = other; - } -} - - - -void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) -{ - if (backlog && s < bottom) - s = bottom; - - while (!log.empty()) { - Entry &e = *log.begin(); - - if (e.version > s) break; - - assert(complete_to != log.begin()); - assert(requested_to != log.begin()); - - // remove from index, - unindex(e); - - // from log - log.pop_front(); - } - - // raise bottom? 
- if (backlog) backlog = false; - if (bottom < s) bottom = s; -} - - -void PG::IndexedLog::trim_write_ahead(eversion_t last_update) -{ - while (!log.empty() && - log.rbegin()->version > last_update) { - // remove from index - unindex(*log.rbegin()); - - // remove - log.pop_back(); - } -} - -void PG::trim_write_ahead() -{ - if (info.last_update < log.top) { - dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << dendl; - log.trim_write_ahead(info.last_update); - } else { - assert(info.last_update == log.top); - dout(10) << "trim_write_ahead last_update=top=" << info.last_update << dendl; - } - -} - -void PG::proc_replica_log(Log &olog, Missing& omissing, int from) -{ - dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << dendl; - assert(!is_active()); - - if (!have_master_log) { - // i'm building master log. - // note peer's missing. - peer_missing[from] = omissing; - - // merge log into our own log - merge_log(olog, omissing, from); - proc_missing(olog, omissing, from); - } else { - // i'm just building missing lists. - peer_missing[from] = omissing; - - // iterate over peer log. in reverse. - list::reverse_iterator pp = olog.log.rbegin(); - eversion_t lu = peer_info[from].last_update; - while (pp != olog.log.rend()) { - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " not in our log, generating backlog" << dendl; - generate_backlog(); - } - - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << dendl; - ++pp; - continue; - } - - if (log.objects[pp->oid]->version == pp->version) { - break; // we're no longer divergent. - //++pp; - //continue; - } - - if (log.objects[pp->oid]->version > pp->version) { - dout(10) << " divergent " << *pp - << " superceded by " << log.objects[pp->oid] - << ", ignoring" << dendl; - } else { - dout(10) << " divergent " << *pp << ", adding to missing" << dendl; - peer_missing[from].add(pp->oid, pp->version); - } - - ++pp; - if (pp != olog.log.rend()) - lu = pp->version; - else - lu = olog.bottom; - } - - if (lu < peer_info[from].last_update) { - dout(10) << " peer osd" << from << " last_update now " << lu << dendl; - peer_info[from].last_update = lu; - if (lu < oldest_update) { - dout(10) << " oldest_update now " << lu << dendl; - oldest_update = lu; - } - } - - proc_missing(olog, peer_missing[from], from); - } -} - -void PG::merge_log(Log &olog, Missing &omissing, int fromosd) -{ - dout(10) << "merge_log " << olog << " from osd" << fromosd - << " into " << log << dendl; - - //cout << "log" << dendl; - //log.print(cout); - //cout << "olog" << dendl; - //olog.print(cout); - - if (log.empty() || - (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog) - - // swap and index - log.log.swap(olog.log); - log.index(); - - // find split point (old log.top) in new log - // add new items to missing along the way. - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (p->version <= log.top) { - // ok, p is at split point. - - // was our old log divergent? 
- if (log.top > p->version) { - dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << dendl; - if (p->version < oldest_update) - oldest_update = p->version; - - while (!olog.log.empty() && - olog.log.rbegin()->version > p->version) { - Log::Entry &oe = *olog.log.rbegin(); // old entry (possibly divergent) - if (log.objects.count(oe.oid)) { - if (log.objects[oe.oid]->version < oe.version) { - dout(10) << "merge_log divergent entry " << oe - << " not superceded by " << *log.objects[oe.oid] - << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } else { - dout(10) << "merge_log divergent entry " << oe - << " superceded by " << *log.objects[oe.oid] - << ", ignoring" << dendl; - } - } else { - dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } - olog.log.pop_back(); // discard divergent entry - } - } - break; - } - - if (p->is_delete()) { - dout(10) << "merge_log merging " << *p << ", not missing" << dendl; - missing.rm(p->oid, p->version); - } else { - dout(10) << "merge_log merging " << *p << ", now missing" << dendl; - missing.add(p->oid, p->version); - } - } - - info.last_update = log.top = olog.top; - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - else { - // i can merge the two logs! - - // extend on bottom? - // FIXME: what if we have backlog, but they have lower bottom? - if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) { - dout(10) << "merge_log extending bottom to " << olog.bottom - << (olog.backlog ? " +backlog":"") - << dendl; - - // ok - list::iterator from = olog.log.begin(); - list::iterator to; - for (to = from; - to != olog.log.end(); - to++) { - if (to->version > log.bottom) break; - - // update our index while we're here - log.index(*to); - - dout(15) << *to << dendl; - - // new missing object? - if (to->version > info.last_complete) { - if (to->is_update()) - missing.add(to->oid, to->version); - else - missing.rm(to->oid, to->version); - } - } - assert(to != olog.log.end()); - - // splice into our log. - log.log.splice(log.log.begin(), - olog.log, from, to); - - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - // extend on top? - if (olog.top > log.top && - olog.bottom <= log.top) { - dout(10) << "merge_log extending top to " << olog.top << dendl; - - list::iterator to = olog.log.end(); - list::iterator from = olog.log.end(); - while (1) { - if (from == olog.log.begin()) break; - from--; - //dout(0) << "? " << *from << dendl; - if (from->version < log.top) { - from++; - break; - } - - log.index(*from); - dout(10) << "merge_log " << *from << dendl; - - // add to missing - if (from->is_update()) { - missing.add(from->oid, from->version); - } else - missing.rm(from->oid, from->version); - } - - // remove divergent items - while (1) { - Log::Entry *oldtail = &(*log.log.rbegin()); - if (oldtail->version.version+1 == from->version.version) break; - - // divergent! - assert(oldtail->version.version >= from->version.version); - - if (log.objects[oldtail->oid]->version == oldtail->version) { - // and significant. 
- dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << dendl; - //missing.add(oldtail->oid); - assert(0); - } else { - dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << dendl; - assert(missing.is_missing(oldtail->oid)); - } - log.log.pop_back(); - } - - // splice - log.log.splice(log.log.end(), - olog.log, from, to); - - info.last_update = log.top = olog.top; - } - } - - dout(10) << "merge_log result " << log << " " << missing << dendl; - //log.print(cout); - -} - -void PG::proc_missing(Log &olog, Missing &omissing, int fromosd) -{ - // found items? - for (map::iterator p = missing.missing.begin(); - p != missing.missing.end(); - p++) { - if (omissing.is_missing(p->first)) { - assert(omissing.is_missing(p->first, p->second)); - if (omissing.loc.count(p->first)) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on osd" << omissing.loc[p->first] << dendl; - missing.loc[p->first] = omissing.loc[p->first]; - } else { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " also LOST on source, osd" << fromosd << dendl; - } - } - else if (p->second <= olog.top) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on source, osd" << fromosd << dendl; - missing.loc[p->first] = fromosd; - } else { - dout(10) << "proc_missing " << p->first << " " << p->second - << " > olog.top " << olog.top << ", not found...." - << dendl; - } - } - - dout(10) << "proc_missing missing " << missing.missing << dendl; -} - - - -void PG::generate_backlog() -{ - dout(10) << "generate_backlog to " << log << dendl; - assert(!log.backlog); - log.backlog = true; - - list olist; - osd->store->collection_list(info.pgid, olist); - - int local = 0; - map add; - for (list::iterator it = olist.begin(); - it != olist.end(); - it++) { - local++; - - if (log.logged_object(*it)) continue; // already have it logged. - - // add entry - Log::Entry e; - e.op = Log::Entry::MODIFY; // FIXME when we do smarter op codes! - e.oid = *it; - osd->store->getattr(*it, - "version", - &e.version, sizeof(e.version)); - add[e.version] = e; - dout(10) << "generate_backlog found " << e << dendl; - } - - for (map::reverse_iterator i = add.rbegin(); - i != add.rend(); - i++) { - log.log.push_front(i->second); - log.index( *log.log.begin() ); // index - } - - dout(10) << local << " local objects, " - << add.size() << " objects added to backlog, " - << log.objects.size() << " in pg" << dendl; - - //log.print(cout); -} - -void PG::drop_backlog() -{ - dout(10) << "drop_backlog for " << log << dendl; - //log.print(cout); - - assert(log.backlog); - log.backlog = false; - - while (!log.log.empty()) { - Log::Entry &e = *log.log.begin(); - if (e.version > log.bottom) break; - - dout(15) << "drop_backlog trimming " << e.version << dendl; - log.unindex(e); - log.log.pop_front(); - } -} - - - - - -ostream& PG::Log::print(ostream& out) const -{ - out << *this << dendl; - for (list::const_iterator p = log.begin(); - p != log.end(); - p++) - out << *p << dendl; - return out; -} - - - - - -/******* PG ***********/ -void PG::build_prior() -{ - // build prior set. 
- prior_set.clear(); - - // current - for (unsigned i=1; iosdmap->get_epoch(); - epoch++) { - OSDMap omap; - osd->get_map(epoch, omap); - - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - - for (unsigned i=0; iosdmap->is_up(acting[i]) && // is up now - acting[i] != osd->whoami) // and is not me - prior_set.insert(acting[i]); - } - } - - dout(10) << "build_prior built " << prior_set << dendl; -} - -void PG::adjust_prior() -{ - assert(!prior_set.empty()); - - // raise last_epoch_started_any - epoch_t max = 0; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_epoch_started > max) - max = it->second.last_epoch_started; - } - - dout(10) << "adjust_prior last_epoch_started_any " - << last_epoch_started_any << " -> " << max << dendl; - assert(max > last_epoch_started_any); - last_epoch_started_any = max; - - // rebuild prior set - build_prior(); -} - - -void PG::clear_primary_state() -{ - dout(10) << "clear_primary_state" << dendl; - - // clear peering state - have_master_log = false; - prior_set.clear(); - stray_set.clear(); - clean_set.clear(); - peer_info_requested.clear(); - peer_log_requested.clear(); - peer_info.clear(); - peer_missing.clear(); - - last_epoch_started_any = info.last_epoch_started; -} - -void PG::peer(ObjectStore::Transaction& t, - map< int, map >& query_map) -{ - dout(10) << "peer. acting is " << acting - << ", prior_set is " << prior_set << dendl; - - - /** GET ALL PG::Info *********/ - - // -- query info from everyone in prior_set. - bool missing_info = false; - for (set::iterator it = prior_set.begin(); - it != prior_set.end(); - it++) { - if (peer_info.count(*it)) { - dout(10) << " have info from osd" << *it - << ": " << peer_info[*it] - << dendl; - continue; - } - missing_info = true; - - if (peer_info_requested.count(*it)) { - dout(10) << " waiting for osd" << *it << dendl; - continue; - } - - dout(10) << " querying info from osd" << *it << dendl; - query_map[*it][info.pgid] = Query(Query::INFO, info.history); - peer_info_requested.insert(*it); - } - if (missing_info) return; - - - // -- ok, we have all (prior_set) info. (and maybe others.) - - // did we crash? - dout(10) << " last_epoch_started_any " << last_epoch_started_any << dendl; - if (last_epoch_started_any) { - OSDMap omap; - osd->get_map(last_epoch_started_any, omap); - - // start with the last active set of replicas - set last_started; - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - for (unsigned i=0; iosdmap->get_epoch(); - e++) { - OSDMap omap; - osd->get_map(e, omap); - - set still_up; - - for (set::iterator i = last_started.begin(); - i != last_started.end(); - i++) { - //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << dendl; - if (omap.is_up(*i)) - still_up.insert(*i); - } - - last_started.swap(still_up); - //dout(10) << " still active as of epoch " << e << ": " << last_started << dendl; - } - - if (last_started.empty()) { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } else { - dout(10) << " still active from last started: " << last_started << dendl; - } - } else if (osd->osdmap->post_mkfs()) { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } - - dout(10) << " peers_complete_thru " << peers_complete_thru << dendl; - - - - - /** CREATE THE MASTER PG::Log *********/ - - // who (of all priors and active) has the latest PG version? 
- eversion_t newest_update = info.last_update; - int newest_update_osd = osd->whoami; - - oldest_update = info.last_update; // only of acting (current) osd set. - peers_complete_thru = info.last_complete; - - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_update > newest_update) { - newest_update = it->second.last_update; - newest_update_osd = it->first; - } - if (is_acting(it->first)) { - if (it->second.last_update < oldest_update) - oldest_update = it->second.last_update; - if (it->second.last_complete < peers_complete_thru) - peers_complete_thru = it->second.last_complete; - } - } - - // gather log(+missing) from that person! - if (newest_update_osd != osd->whoami) { - if (peer_log_requested.count(newest_update_osd) || - peer_summary_requested.count(newest_update_osd)) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", already queried" - << dendl; - } else { - // we'd like it back to oldest_update, but will settle for log_bottom - eversion_t since = MAX(peer_info[newest_update_osd].log_bottom, - oldest_update); - if (peer_info[newest_update_osd].log_bottom < log.top) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying since " << since - << dendl; - query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history); - peer_log_requested.insert(newest_update_osd); - } else { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying entire summary/backlog" - << dendl; - assert((peer_info[newest_update_osd].last_complete >= - peer_info[newest_update_osd].log_bottom) || - peer_info[newest_update_osd].log_backlog); // or else we're in trouble. - query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(newest_update_osd); - } - } - return; - } else { - dout(10) << " newest_update " << info.last_update << " (me)" << dendl; - } - - dout(10) << " oldest_update " << oldest_update << dendl; - - have_master_log = true; - - - // -- do i need to generate backlog for any of my peers? - if (oldest_update < log.bottom && !log.backlog) { - dout(10) << "generating backlog for some peers, bottom " - << log.bottom << " > " << oldest_update - << dendl; - generate_backlog(); - } - - - /** COLLECT MISSING+LOG FROM PEERS **********/ - /* - we also detect divergent replicas here by pulling the full log - from everyone. - */ - - // gather missing from peers - for (unsigned i=1; i 0) { - dout(10) << "there are still " << missing.num_lost() << " lost objects" << dendl; - - // ***** - // FIXME: i don't think this actually accomplishes anything! - // ***** - - // ok, let's get more summaries! - bool waiting = false; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - int peer = it->first; - - if (peer_summary_requested.count(peer)) { - dout(10) << " already requested summary/backlog from osd" << peer << dendl; - waiting = true; - continue; - } - - dout(10) << " requesting summary/backlog from osd" << peer << dendl; - query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(peer); - waiting = true; - } - - if (!waiting) { - dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" 
<< dendl; - } - return; - } - - // sanity check - assert(missing.num_lost() == 0); - assert(info.last_complete >= log.bottom || log.backlog); - - - // -- crash recovery? - if (is_crashed()) { - dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << dendl; - state_set(STATE_REPLAY); - osd->timer.add_event_after(g_conf.osd_replay_window, - new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); - } - else if (!is_active()) { - // -- ok, activate! - activate(t); - } -} - - -void PG::activate(ObjectStore::Transaction& t) -{ - assert(!is_active()); - - // twiddle pg state - state_set(STATE_ACTIVE); - state_clear(STATE_STRAY); - if (is_crashed()) { - //assert(is_replay()); // HELP.. not on replica? - state_clear(STATE_CRASHED); - state_clear(STATE_REPLAY); - } - info.last_epoch_started = osd->osdmap->get_epoch(); - - if (role == 0) { // primary state - peers_complete_thru = 0; // we don't know (yet)! - } - - assert(info.last_complete >= log.bottom || log.backlog); - - // write pg info - t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info)); - - // write log - write_log(t); - - // clean up stray objects - clean_up_local(t); - - // init complete pointer - if (info.last_complete == info.last_update) { - dout(10) << "activate - complete" << dendl; - log.complete_to == log.log.end(); - log.requested_to = log.log.end(); - } - //else if (is_primary()) { - else if (true) { - dout(10) << "activate - not complete, " << missing << ", starting recovery" << dendl; - - // init complete_to - log.complete_to = log.log.begin(); - while (log.complete_to->version < info.last_complete) { - log.complete_to++; - assert(log.complete_to != log.log.end()); - } - - // start recovery - log.requested_to = log.complete_to; - do_recovery(); - } else { - dout(10) << "activate - not complete, " << missing << dendl; - } - - - // if primary.. - if (role == 0 && - osd->osdmap->post_mkfs()) { - // who is clean? - clean_set.clear(); - if (info.is_clean()) - clean_set.insert(osd->whoami); - - // start up replicas - for (unsigned i=1; iosdmap->get_epoch(), - info.pgid); - m->info = info; - - if (peer_info[peer].last_update == info.last_update) { - // empty log - } - else if (peer_info[peer].last_update < log.bottom) { - // summary/backlog - assert(log.backlog); - m->log = log; - } - else { - // incremental log - assert(peer_info[peer].last_update < info.last_update); - m->log.copy_after(log, peer_info[peer].last_update); - } - - // update local version of peer's missing list! - { - eversion_t plu = peer_info[peer].last_update; - Missing& pm = peer_missing[peer]; - for (list::iterator p = m->log.log.begin(); - p != m->log.log.end(); - p++) - if (p->version > plu) - pm.add(p->oid, p->version); - } - - dout(10) << "activate sending " << m->log << " " << m->missing - << " to osd" << peer << dendl; - //m->log.print(cout); - osd->messenger->send_message(m, osd->osdmap->get_inst(peer)); - - // update our missing - if (peer_missing[peer].num_missing() == 0) { - dout(10) << "activate peer osd" << peer << " already clean, " << peer_info[peer] << dendl; - assert(peer_info[peer].last_complete == info.last_update); - clean_set.insert(peer); - } else { - dout(10) << "activate peer osd" << peer << " " << peer_info[peer] - << " missing " << peer_missing[peer] << dendl; - } - - } - - // discard unneeded peering state - //peer_log.clear(); // actually, do this carefully, in case peer() is called again. - - // all clean? 
- if (is_all_clean()) { - state_set(STATE_CLEAN); - dout(10) << "activate all replicas clean" << dendl; - clean_replicas(); - } - } - - - // replay (queue them _before_ other waiting ops!) - if (!replay_queue.empty()) { - eversion_t c = info.last_update; - list replay; - for (map::iterator p = replay_queue.begin(); - p != replay_queue.end(); - p++) { - if (p->first <= info.last_update) { - dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - continue; - } - if (p->first.version != c.version+1) { - dout(10) << "activate replay " << p->first - << " skipping " << c.version+1 - p->first.version - << " ops" - << dendl; - } - dout(10) << "activate replay " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - c = p->first; - } - replay_queue.clear(); - osd->take_waiters(replay); - } - - // waiters - osd->take_waiters(waiting_for_active); -} - - - - - -void PG::write_log(ObjectStore::Transaction& t) -{ - dout(10) << "write_log" << dendl; - - // assemble buffer - bufferlist bl; - - // build buffer - ondisklog.bottom = 0; - ondisklog.block_map.clear(); - for (list::iterator p = log.log.begin(); - p != log.log.end(); - p++) { - if (bl.length() % 4096 == 0) - ondisklog.block_map[bl.length()] = p->version; - bl.append((char*)&(*p), sizeof(*p)); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. - bufferptr bp(4096 - sizeof(*p)); - bl.push_back(bp); - } - } - ondisklog.top = bl.length(); - - // write it - t.remove( info.pgid.to_object() ); - t.write( info.pgid.to_object() , 0, bl.length(), bl); - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); -} - -void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) -{ - dout(15) << " trim_ondisk_log_to v " << v << dendl; - - map::iterator p = ondisklog.block_map.begin(); - while (p != ondisklog.block_map.end()) { - dout(15) << " " << p->first << " -> " << p->second << dendl; - p++; - if (p == ondisklog.block_map.end() || - p->second > v) { // too far! - p--; // back up - break; - } - } - dout(15) << " * " << p->first << " -> " << p->second << dendl; - if (p == ondisklog.block_map.begin()) - return; // can't trim anything! - - // we can trim! - off_t trim = p->first; - dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - ondisklog.bottom = trim; - - // adjust block_map - while (p != ondisklog.block_map.begin()) - ondisklog.block_map.erase(ondisklog.block_map.begin()); - - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); -} - - -void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, - eversion_t trim_to) -{ - dout(10) << "append_log " << ondisklog.top << " " << logentry << dendl; - - // write entry on disk - bufferlist bl; - bl.append( (char*)&logentry, sizeof(logentry) ); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. - bufferptr bp(4096 - sizeof(logentry)); - bl.push_back(bp); - } - t.write( info.pgid.to_object(), ondisklog.top, bl.length(), bl ); - - // update block map? 
- if (ondisklog.top % 4096 == 0) - ondisklog.block_map[ondisklog.top] = logentry.version; - - ondisklog.top += bl.length(); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - // trim? - if (trim_to > log.bottom) { - dout(10) << " trimming " << log << " to " << trim_to << dendl; - log.trim(t, trim_to); - info.log_bottom = log.bottom; - info.log_backlog = log.backlog; - trim_ondisklog_to(t, trim_to); - } - dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; -} - -void PG::read_log(ObjectStore *store) -{ - int r; - // load bounds - ondisklog.bottom = ondisklog.top = 0; - r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - assert(r == sizeof(ondisklog.bottom)); - r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - assert(r == sizeof(ondisklog.top)); - - dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - log.backlog = info.log_backlog; - log.bottom = info.log_bottom; - - if (ondisklog.top > 0) { - // read - bufferlist bl; - store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); - - PG::Log::Entry e; - off_t pos = ondisklog.bottom; - assert(log.log.empty()); - while (pos < ondisklog.top) { - bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e); - dout(10) << "read_log " << pos << " " << e << dendl; - - if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom - if (pos % 4096 == 0) - ondisklog.block_map[pos] = e.version; - log.log.push_back(e); - } else { - dout(10) << "read_log ignoring entry at " << pos << dendl; - } - - if (g_conf.osd_pad_pg_log) // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- pos += 4096; - else - pos += sizeof(e); - } - } - log.top = info.last_update; - log.index(); - - // build missing - set did; - for (list::reverse_iterator i = log.log.rbegin(); - i != log.log.rend(); - i++) { - if (i->version <= info.last_complete) break; - if (did.count(i->oid)) continue; - did.insert(i->oid); - - if (i->is_delete()) continue; - - eversion_t v; - int r = osd->store->getattr(i->oid, "version", &v, sizeof(v)); - if (r < 0 || v < i->version) - missing.add(i->oid, i->version); - } -} - - - - -// ============================== -// Object locking - -// -// If the target object of the operation op is locked for writing by another client, the function puts op to the waiting queue waiting_for_wr_unlock -// returns true if object was locked, otherwise returns false -// -bool PG::block_if_wrlocked(MOSDOp* op) -{ - object_t oid = op->get_oid(); - - entity_name_t source; - int len = osd->store->getattr(oid, "wrlock", &source, sizeof(entity_name_t)); - //cout << "getattr returns " << len << " on " << oid << dendl; - - if (len == sizeof(source) && - source != op->get_client()) { - //the object is locked for writing by someone else -- add the op to the waiting queue - waiting_for_wr_unlock[oid].push_back(op); - return true; - } - - return false; //the object wasn't locked, so the operation can be handled right away -} - - - - -// ======================= -// revisions - - -/* -int OSD::list_missing_revs(object_t oid, set& revs, PG *pg) -{ - int c = 0; - oid.rev = 0; - - map::iterator p = pg->missing.missing.lower_bound(oid); - if (p == pg->missing.missing.end()) - return 0; // clearly not - - while (p->first.ino == oid.ino && - p->first.bno == oid.bno) { - revs.insert(p->first); - c++; - } - return c; -}*/ - -bool PG::pick_missing_object_rev(object_t& oid) -{ - map::iterator p = missing.missing.upper_bound(oid); - if (p == missing.missing.end()) - return false; // clearly no candidate - - if (p->first.ino == oid.ino && p->first.bno == oid.bno) { - oid = p->first; // yes! it's an upper bound revision for me. - return true; - } - return false; -} - -bool PG::pick_object_rev(object_t& oid) -{ - object_t t = oid; - - if (!osd->store->pick_object_revision_lt(t)) - return false; // we have no revisions of this object! - - objectrev_t crev; - int r = osd->store->getattr(t, "crev", &crev, sizeof(crev)); - assert(r >= 0); - if (crev <= oid.rev) { - dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << dendl; - oid = t; - return true; - } - - return false; -} - - - - - diff --git a/branches/sage/pgs/osd/PG.h b/branches/sage/pgs/osd/PG.h deleted file mode 100644 index e591bd3f457f0..0000000000000 --- a/branches/sage/pgs/osd/PG.h +++ /dev/null @@ -1,712 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __PG_H -#define __PG_H - - -#include "include/types.h" -#include "osd_types.h" -#include "include/buffer.h" - -#include "OSDMap.h" -#include "ObjectStore.h" -#include "msg/Messenger.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -class OSD; -class MOSDOp; -class MOSDOpReply; - - -/** PG - Replica Placement Group - * - */ - -class PG { -public: - - /* - * PG::Info - summary of PG statistics. - * - * some notes: - * - last_complete implies we have all objects that existed as of that - * stamp, OR a newer object, OR have already applied a later delete. - * - if last_complete >= log.bottom, then we know pg contents thru log.top. - * otherwise, we have no idea what the pg is supposed to contain. - */ - struct Info { - pg_t pgid; - eversion_t last_update; // last object version applied to store. - eversion_t last_complete; // last version pg was complete through. - - eversion_t log_bottom; // oldest log entry. - bool log_backlog; // do we store a complete log? - - epoch_t last_epoch_started; // last epoch started. - epoch_t last_epoch_finished; // last epoch finished. - - struct History { - epoch_t same_since; // same acting set since - epoch_t same_primary_since; // same primary at least back through this epoch. - epoch_t same_acker_since; // same acker at least back through this epoch. - History() : same_since(0), same_primary_since(0), same_acker_since(0) {} - } history; - - Info(pg_t p=0) : pgid(p), - log_backlog(false), - last_epoch_started(0), last_epoch_finished(0) {} - bool is_clean() const { return last_update == last_complete; } - bool is_empty() const { return last_update.version == 0; } - }; - - - /** - * Query - used to ask a peer for information about a pg. - * - * note: if version=0, type=LOG, then we just provide our full log. - * only if type=BACKLOG do we generate a backlog and provide that too. - */ - struct Query { - const static int INFO = 0; - const static int LOG = 1; - const static int BACKLOG = 2; - const static int FULLLOG = 3; - - int type; - eversion_t split, floor; - Info::History history; - - Query() : type(-1) {} - Query(int t, Info::History& h) : - type(t), history(h) { assert(t != LOG); } - Query(int t, eversion_t s, eversion_t f, Info::History& h) : - type(t), split(s), floor(f), history(h) { assert(t == LOG); } - }; - - - /* - * Missing - summary of missing objects. - * kept in memory, as a supplement to Log. - * also used to pass missing info in messages. - */ - class Missing { - public: - map missing; // oid -> v - map rmissing; // v -> oid - - map loc; // where i think i can get them. - - int num_lost() const { return missing.size() - loc.size(); } - int num_missing() const { return missing.size(); } - - bool is_missing(object_t oid) { - return missing.count(oid); - } - bool is_missing(object_t oid, eversion_t v) { - return missing.count(oid) && missing[oid] <= v; - } - void add(object_t oid) { - eversion_t z; - add(oid,z); - } - void add(object_t oid, eversion_t v) { - if (missing.count(oid)) { - if (missing[oid] > v) return; // already missing newer. 
- rmissing.erase(missing[oid]); - } - missing[oid] = v; - rmissing[v] = oid; - } - void rm(object_t oid, eversion_t when) { - if (missing.count(oid) && missing[oid] < when) { - rmissing.erase(missing[oid]); - missing.erase(oid); - loc.erase(oid); - } - } - void got(object_t oid, eversion_t v) { - assert(missing.count(oid)); - assert(missing[oid] <= v); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - void got(object_t oid) { - assert(missing.count(oid)); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - - void _encode(bufferlist& blist) { - ::_encode(missing, blist); - ::_encode(loc, blist); - } - void _decode(bufferlist& blist, int& off) { - ::_decode(missing, blist, off); - ::_decode(loc, blist, off); - - for (map::iterator it = missing.begin(); - it != missing.end(); - it++) - rmissing[it->second] = it->first; - } - }; - - - /* - * Log - incremental log of recent pg changes. - * also, serves as a recovery queue. - * - * when backlog is true, - * objects with versions <= bottom are in log. - * we do not have any deletion info before that time, however. - * log is a "summary" in that it contains all objects in the PG. - */ - class Log { - public: - /** top, bottom - * top - newest entry (update|delete) - * bottom - entry previous to oldest (update|delete) for which we have - * complete negative information. - * i.e. we can infer pg contents for any store whose last_update >= bottom. - */ - eversion_t top; // newest entry (update|delete) - eversion_t bottom; // version prior to oldest (update|delete) - - /** backlog - true if log is a complete summary of pg contents. - * updated will include all items in pg, but deleted will not include - * negative entries for items deleted prior to 'bottom'. - */ - bool backlog; - - /** Entry - * mapped from the eversion_t, so don't include that. - */ - class Entry { - public: - const static int LOST = 0; - const static int MODIFY = 1; - const static int CLONE = 2; - const static int DELETE = 3; - - int op; // write, zero, trunc, remove - object_t oid; - eversion_t version; - objectrev_t rev; - - osdreqid_t reqid; // caller+tid to uniquely identify request - - Entry() : op(0) {} - Entry(int _op, object_t _oid, const eversion_t& v, - const osdreqid_t& rid) : - op(_op), oid(_oid), version(v), reqid(rid) {} - - bool is_delete() const { return op == DELETE; } - bool is_clone() const { return op == CLONE; } - bool is_modify() const { return op == MODIFY; } - bool is_update() const { return is_clone() || is_modify(); } - }; - - list log; // the actual log. 
- - Log() : backlog(false) {} - - void clear() { - eversion_t z; - top = bottom = z; - backlog = false; - log.clear(); - } - bool empty() const { - return top.version == 0 && top.epoch == 0; - } - - void _encode(bufferlist& blist) const { - blist.append((char*)&top, sizeof(top)); - blist.append((char*)&bottom, sizeof(bottom)); - blist.append((char*)&backlog, sizeof(backlog)); - ::_encode(log, blist); - } - void _decode(bufferlist& blist, int& off) { - blist.copy(off, sizeof(top), (char*)&top); - off += sizeof(top); - blist.copy(off, sizeof(bottom), (char*)&bottom); - off += sizeof(bottom); - blist.copy(off, sizeof(backlog), (char*)&backlog); - off += sizeof(backlog); - - ::_decode(log, blist, off); - } - - void copy_after(const Log &other, eversion_t v); - bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor); - void copy_non_backlog(const Log &other); - ostream& print(ostream& out) const; - }; - - /** - * IndexLog - adds in-memory index of the log, by oid. - * plus some methods to manipulate it all. - */ - class IndexedLog : public Log { - public: - hash_map objects; // ptrs into log. be careful! - hash_set caller_ops; - - // recovery pointers - list::iterator requested_to; // not inclusive of referenced item - list::iterator complete_to; // not inclusive of referenced item - - /****/ - IndexedLog() {} - - void clear() { - assert(0); - unindex(); - Log::clear(); - } - - bool logged_object(object_t oid) { - return objects.count(oid); - } - bool logged_req(const osdreqid_t &r) { - return caller_ops.count(r); - } - - void index() { - objects.clear(); - caller_ops.clear(); - for (list::iterator i = log.begin(); - i != log.end(); - i++) { - objects[i->oid] = &(*i); - caller_ops.insert(i->reqid); - } - } - - void index(Entry& e) { - if (objects.count(e.oid) == 0 || - objects[e.oid]->version < e.version) - objects[e.oid] = &e; - caller_ops.insert(e.reqid); - } - void unindex() { - objects.clear(); - caller_ops.clear(); - } - void unindex(Entry& e) { - // NOTE: this only works if we remove from the _bottom_ of the log! - assert(objects.count(e.oid)); - if (objects[e.oid]->version == e.version) - objects.erase(e.oid); - caller_ops.erase(e.reqid); - } - - - // accessors - Entry *is_updated(object_t oid) { - if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; - return 0; - } - Entry *is_deleted(object_t oid) { - if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; - return 0; - } - - // actors - void add(Entry& e) { - // add to log - log.push_back(e); - assert(e.version > top); - assert(top.version == 0 || e.version.version > top.version); - top = e.version; - - // to our index - objects[e.oid] = &(log.back()); - caller_ops.insert(e.reqid); - } - - void trim(ObjectStore::Transaction &t, eversion_t s); - void trim_write_ahead(eversion_t last_update); - }; - - - /** - * OndiskLog - some info about how we store the log on disk. - */ - class OndiskLog { - public: - // ok - off_t bottom; // first byte of log. - off_t top; // byte following end of log. - map block_map; // block -> first stamp logged there - - OndiskLog() : bottom(0), top(0) {} - - bool trim_to(eversion_t v, ObjectStore::Transaction& t); - }; - - - /*** PG ****/ -public: - // any - static const int STATE_ACTIVE = 1; // i am active. (primary: replicas too) - - // primary - static const int STATE_CLEAN = 2; // peers are complete, clean of stray replicas. - static const int STATE_CRASHED = 4; // all replicas went down. 
- static const int STATE_REPLAY = 8; // crashed, waiting for replay - - // non-primary - static const int STATE_STRAY = 16; // i must notify the primary i exist. - - -protected: - OSD *osd; - - /** locking and reference counting. - * I destroy myself when the reference count hits zero. - * lock() should be called before doing anything. - * get() should be called on pointer copy (to another thread, etc.). - * put() should be called on destruction of some previously copied pointer. - * put_unlock() when done with the current pointer (_most common_). - */ - Mutex _lock; - int ref; - bool deleted; - -public: - void lock() { - //cout << info.pgid << " lock" << endl; - _lock.Lock(); - } - void get() { - //cout << info.pgid << " get " << ref << endl; - assert(_lock.is_locked()); - ++ref; - } - void put() { - //cout << info.pgid << " put " << ref << endl; - assert(_lock.is_locked()); - --ref; - assert(ref > 0); // last put must be a put_unlock. - } - void put_unlock() { - //cout << info.pgid << " put_unlock " << ref << endl; - assert(_lock.is_locked()); - --ref; - _lock.Unlock(); - if (ref == 0) delete this; - } - - void mark_deleted() { deleted = true; } - bool is_deleted() { return deleted; } - -public: - // pg state - Info info; - IndexedLog log; - OndiskLog ondisklog; - Missing missing; - utime_t last_heartbeat; // - -protected: - int role; // 0 = primary, 1 = replica, -1=none. - int state; // see bit defns above - - // primary state - public: - vector acting; - epoch_t last_epoch_started_any; - eversion_t last_complete_commit; - - // [primary only] content recovery state - eversion_t peers_complete_thru; - bool have_master_log; - protected: - set prior_set; // current+prior OSDs, as defined by last_epoch_started_any. - set stray_set; // non-acting osds that have PG data. 
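// ---- [editor's illustrative sketch; not part of the removed source] --------
// The lock()/get()/put()/put_unlock() protocol documented above ties the PG's
// lifetime to a reference count that is only touched under the PG mutex, with
// the last holder deleting the object inside put_unlock().  A compilable
// analogue using a stand-in RefCounted type (not the real PG class):

#include <mutex>

struct RefCounted {
  std::mutex m;
  int ref = 0;

  void lock() { m.lock(); }
  void get()  { ++ref; }          // call while locked, once per pointer copy
  void put_unlock() {             // call while locked, when done with a copy
    bool last = (--ref == 0);
    m.unlock();
    if (last)
      delete this;                // destroy ourselves on the last reference
  }
};

int main() {
  RefCounted *p = new RefCounted;
  p->lock();
  p->get();                       // reference held by this thread
  p->get();                       // extra reference handed to a callback
  p->put_unlock();                // this thread is done; object survives

  p->lock();                      // later, the callback runs...
  p->put_unlock();                // ...and drops the last reference: deleted
  return 0;
}
// ---- [end editor's sketch] -------------------------------------------------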
- set clean_set; // current OSDs that are clean - eversion_t oldest_update; // lowest (valid) last_update in active set - map peer_info; // info from peers (stray or prior) - set peer_info_requested; - map peer_missing; - set peer_log_requested; // logs i've requested (and start stamps) - set peer_summary_requested; - friend class OSD; - - - // pg waiters - list waiting_for_active; - hash_map > waiting_for_missing_object; - map replay_queue; - - hash_map > waiting_for_wr_unlock; - - bool block_if_wrlocked(MOSDOp* op); - - - // recovery - map objects_pulling; // which objects are currently being pulled - -public: - void clear_primary_state(); - - public: - bool is_acting(int osd) const { - for (unsigned i=0; i peers_complete_thru) { - peers_complete_thru = t; - return true; - } - return false; - } - - void proc_replica_log(Log &olog, Missing& omissing, int from); - void merge_log(Log &olog, Missing& omissing, int from); - void proc_missing(Log &olog, Missing &omissing, int fromosd); - - void generate_backlog(); - void drop_backlog(); - - void trim_write_ahead(); - - void peer(ObjectStore::Transaction& t, map< int, map >& query_map); - - void activate(ObjectStore::Transaction& t); - - virtual void clean_up_local(ObjectStore::Transaction& t) = 0; - - virtual void cancel_recovery() = 0; - virtual bool do_recovery() = 0; - virtual void clean_replicas() = 0; - - off_t get_log_write_pos() { - return 0; - } - - friend class C_OSD_RepModify_Commit; - - public: - PG(OSD *o, pg_t p) : - osd(o), - ref(0), deleted(false), - info(p), - role(0), - state(0), - last_epoch_started_any(0), - last_complete_commit(0), - peers_complete_thru(0), - have_master_log(true) - { } - virtual ~PG() { } - - pg_t get_pgid() const { return info.pgid; } - int get_nrep() const { return acting.size(); } - - int get_primary() { return acting.empty() ? -1:acting[0]; } - //int get_tail() { return acting.empty() ? -1:acting[ acting.size()-1 ]; } - //int get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? 
get_primary():get_tail(); } - int get_acker() { - if (g_conf.osd_rep == OSD_REP_PRIMARY || - acting.size() <= 1) - return get_primary(); - return acting[1]; - } - - int get_role() const { return role; } - void set_role(int r) { role = r; } - - bool is_primary() const { return role == PG_ROLE_HEAD; } - bool is_acker() const { - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return is_primary(); - else - return role == PG_ROLE_ACKER; - } - bool is_head() const { return role == PG_ROLE_HEAD; } - bool is_middle() const { return role == PG_ROLE_MIDDLE; } - bool is_residual() const { return role == PG_ROLE_STRAY; } - - //int get_state() const { return state; } - bool state_test(int m) const { return (state & m) != 0; } - void state_set(int m) { state |= m; } - void state_clear(int m) { state &= ~m; } - - bool is_complete() const { return info.last_complete == info.last_update; } - - bool is_active() const { return state_test(STATE_ACTIVE); } - bool is_crashed() const { return state_test(STATE_CRASHED); } - bool is_replay() const { return state_test(STATE_REPLAY); } - //bool is_complete() { return state_test(STATE_COMPLETE); } - bool is_clean() const { return state_test(STATE_CLEAN); } - bool is_stray() const { return state_test(STATE_STRAY); } - - bool is_empty() const { return info.last_update == 0; } - - int num_active_ops() const { - return objects_pulling.size(); - } - - // pg on-disk state - void write_log(ObjectStore::Transaction& t); - void append_log(ObjectStore::Transaction& t, - PG::Log::Entry& logentry, - eversion_t trim_to); - void read_log(ObjectStore *store); - void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v); - - - bool is_dup(osdreqid_t rid) { - return log.logged_req(rid); - } - - - bool pick_missing_object_rev(object_t& oid); - bool pick_object_rev(object_t& oid); - - - - // abstract bits - virtual bool preprocess_op(MOSDOp *op) { return false; } - virtual void do_op(MOSDOp *op) = 0; - virtual void do_op_reply(MOSDOpReply *op) = 0; - - virtual bool same_for_read_since(epoch_t e) = 0; - virtual bool same_for_modify_since(epoch_t e) = 0; - virtual bool same_for_rep_modify_since(epoch_t e) = 0; - - virtual bool is_missing_object(object_t oid) = 0; - virtual void wait_for_missing_object(object_t oid, MOSDOp *op) = 0; - - virtual void note_failed_osd(int osd) = 0; - - virtual void on_acker_change() = 0; - virtual void on_role_change() = 0; -}; - - - -inline ostream& operator<<(ostream& out, const PG::Info::History& h) -{ - return out << h.same_since << "/" << h.same_primary_since << "/" << h.same_acker_since; -} - -inline ostream& operator<<(ostream& out, const PG::Info& pgi) -{ - out << "pginfo(" << pgi.pgid; - if (pgi.is_empty()) - out << " empty"; - else - out << " v " << pgi.last_update << "/" << pgi.last_complete - << " (" << pgi.log_bottom << "," << pgi.last_update << "]" - << (pgi.log_backlog ? "+backlog":""); - out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished - << " " << pgi.history - << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Log::Entry& e) -{ - return out << " " << e.version - << (e.is_delete() ? " - ": - (e.is_clone() ? " c ": - (e.is_modify() ? " m ": - " ? 
"))) - << e.oid << " by " << e.reqid; -} - -inline ostream& operator<<(ostream& out, const PG::Log& log) -{ - out << "log(" << log.bottom << "," << log.top << "]"; - if (log.backlog) out << "+backlog"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Missing& missing) -{ - out << "missing(" << missing.num_missing(); - if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; - out << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG& pg) -{ - out << "pg[" << pg.info - << " r=" << pg.get_role(); - - if (pg.log.bottom != pg.info.log_bottom) - out << " (info mismatch, " << pg.log << ")"; - - if (pg.log.log.empty()) { - // shoudl it be? - if (pg.log.top.version - pg.log.bottom.version != 0) { - out << " (log bound mismatch, empty)"; - } - } else { - if (((pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) && - !pg.log.backlog) || - (pg.log.log.rbegin()->version.version != pg.log.top.version)) { - out << " (log bound mismatch, actual=[" - << pg.log.log.begin()->version << "," - << pg.log.log.rbegin()->version << "] len=" << pg.log.log.size() << ")"; - } - } - - if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru; - if (!pg.have_master_log) out << " !hml"; - if (pg.is_active()) out << " active"; - if (pg.is_crashed()) out << " crashed"; - if (pg.is_replay()) out << " replay"; - if (pg.is_clean()) out << " clean"; - if (pg.is_stray()) out << " stray"; - //out << " (" << pg.log.bottom << "," << pg.log.top << "]"; - if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing(); - if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost(); - out << "]"; - - - return out; -} - - - -#endif diff --git a/branches/sage/pgs/osd/RAID4PG.cc b/branches/sage/pgs/osd/RAID4PG.cc deleted file mode 100644 index 48aa519a4ffbe..0000000000000 --- a/branches/sage/pgs/osd/RAID4PG.cc +++ /dev/null @@ -1,124 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "RAID4PG.h" -#include "OSD.h" - -#include "common/Logger.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGRemove.h" - -#include "config.h" - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? 
osd->osdmap->get_epoch():0) << " " << *this << " " - -#include -#include - - - - -void RAID4PG::do_op(MOSDOp *op) -{ - - -} - - - -void RAID4PG::do_op_reply(MOSDOpReply *reply) -{ - -} - - - -// ----------------- -// pg changes - -bool RAID4PG::same_for_read_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - -bool RAID4PG::same_for_modify_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - -bool RAID4PG::same_for_rep_modify_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - - -// ----------------- -// RECOVERY - -bool RAID4PG::is_missing_object(object_t oid) -{ - return false; -} - -void RAID4PG::wait_for_missing_object(object_t oid, MOSDOp *op) -{ - assert(0); -} - -void RAID4PG::note_failed_osd(int o) -{ - dout(10) << "note_failed_osd osd" << o << dendl; - assert(0); -} - -void RAID4PG::on_acker_change() -{ - dout(10) << "on_acker_change" << dendl; - assert(0); -} - - -void RAID4PG::on_role_change() -{ - dout(10) << "on_role_change" << dendl; - assert(0); -} - - -void RAID4PG::clean_up_local(ObjectStore::Transaction&) -{ -} - -void RAID4PG::cancel_recovery() -{ - assert(0); -} - -bool RAID4PG::do_recovery() -{ - assert(0); - return false; -} - -void RAID4PG::clean_replicas() -{ - assert(0); -} - - - diff --git a/branches/sage/pgs/osd/RAID4PG.h b/branches/sage/pgs/osd/RAID4PG.h deleted file mode 100644 index 9c75118c06069..0000000000000 --- a/branches/sage/pgs/osd/RAID4PG.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __RAID4PG_H -#define __RAID4PG_H - - -#include "PG.h" - -#include "messages/MOSDOp.h" - - -class RAID4PG : public PG { -public: - -protected: - - void prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev); - - void op_stat(MOSDOp *op); - int op_read(MOSDOp *op); - void op_modify(MOSDOp *op); - void op_rep_modify(MOSDOp *op); - void op_push(MOSDOp *op); - void op_pull(MOSDOp *op); - - - -public: - RAID4PG(OSD *o, pg_t p) : PG(o,p) { } - - void do_op(MOSDOp *op); - void do_op_reply(MOSDOpReply *r); - - bool same_for_read_since(epoch_t e); - bool same_for_modify_since(epoch_t e); - bool same_for_rep_modify_since(epoch_t e); - - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, MOSDOp *op); - - void note_failed_osd(int osd); - - void on_acker_change(); - void on_role_change(); - - void clean_up_local(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - - void clean_replicas(); - - -}; - - -#endif diff --git a/branches/sage/pgs/osd/ReplicatedPG.cc b/branches/sage/pgs/osd/ReplicatedPG.cc deleted file mode 100644 index c802c00f1ac83..0000000000000 --- a/branches/sage/pgs/osd/ReplicatedPG.cc +++ /dev/null @@ -1,1807 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ReplicatedPG.h" -#include "OSD.h" - -#include "common/Logger.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGRemove.h" - -#include "config.h" - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - -#include -#include - -static const int LOAD_LATENCY = 1; -static const int LOAD_QUEUE_SIZE = 2; -static const int LOAD_HYBRID = 3; - - -// ======================= -// pg changes - -bool ReplicatedPG::same_for_read_since(epoch_t e) -{ - return (e >= info.history.same_acker_since); -} - -bool ReplicatedPG::same_for_modify_since(epoch_t e) -{ - return (e >= info.history.same_primary_since); -} - -bool ReplicatedPG::same_for_rep_modify_since(epoch_t e) -{ - // check osd map: same set, or primary+acker? - - if (g_conf.osd_rep == OSD_REP_CHAIN) { - return e >= info.history.same_since; // whole pg set same - } else { - // primary, splay - return (e >= info.history.same_primary_since && - e >= info.history.same_acker_since); - } -} - -// ==================== -// missing objects - -bool ReplicatedPG::is_missing_object(object_t oid) -{ - return missing.missing.count(oid); -} - - -void ReplicatedPG::wait_for_missing_object(object_t oid, MOSDOp *op) -{ - assert(is_missing_object(oid)); - - // we don't have it (yet). 
- eversion_t v = missing.missing[oid]; - if (objects_pulling.count(oid)) { - dout(7) << "missing " - << oid - << " v " << v - << ", already pulling" - << dendl; - } else { - dout(7) << "missing " - << oid - << " v " << v - << ", pulling" - << dendl; - pull(oid); - } - waiting_for_missing_object[oid].push_back(op); -} - - - - -/** preprocess_op - preprocess an op (before it gets queued). - * fasttrack read - */ -bool ReplicatedPG::preprocess_op(MOSDOp *op) -{ - // we only care about reads here on out.. - if (!op->is_read()) - return false; - - - // -- load balance reads -- - if (g_conf.osd_balance_reads && - is_primary() && - g_conf.osd_rep == OSD_REP_PRIMARY) { - // -- read on primary+acker --- - - // test - if (false) { - if (acting.size() > 1) { - int peer = acting[1]; - dout(-10) << "preprocess_op fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - return true; - } - } - - // -- flash crowd? - if (!op->get_source().is_osd() && - is_primary()) { - // add sample - osd->iat_averager.add_sample( op->get_oid(), (double)g_clock.now() ); - - // candidate? - bool is_flash_crowd_candidate = osd->iat_averager.is_flash_crowd_candidate( op->get_oid() ); - bool is_balanced = false; - bool b; - if (osd->store->getattr(op->get_oid(), "balance-reads", &b, 1) >= 0) - is_balanced = true; - - if (!is_balanced && is_flash_crowd_candidate && - balancing_reads.count(op->get_oid()) == 0) { - dout(-10) << "preprocess_op balance-reads on " << op->get_oid() << dendl; - balancing_reads.insert(op->get_oid()); - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - op->get_oid(), - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_BALANCEREADS); - do_op(pop); - } - if (is_balanced && !is_flash_crowd_candidate && - !unbalancing_reads.count(op->get_oid()) == 0) { - dout(-10) << "preprocess_op unbalance-reads on " << op->get_oid() << dendl; - unbalancing_reads.insert(op->get_oid()); - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - op->get_oid(), - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_UNBALANCEREADS); - do_op(pop); - } - } - - - // check my load. - // TODO xxx we must also compare with our own load - // if i am x percentage higher than replica , - // redirect the read - - if ( g_conf.osd_balance_reads == LOAD_LATENCY) { - double mean_read_time = osd->load_calc.get_average(); - - if ( mean_read_time != -1 ) { - - for (unsigned i=1; - ipeer_read_time[peer] - << " of peer" << peer << dendl; - - if ( osd->peer_read_time.count(peer) && - ( (osd->peer_read_time[peer]*100/mean_read_time) < - (100 - g_conf.osd_load_diff_percent))) { - dout(10) << " forwarding to peer osd" << peer << dendl; - - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - return true; - } - } - } - } - else if ( g_conf.osd_balance_reads == LOAD_QUEUE_SIZE ) { - // am i above my average? - float my_avg = osd->hb_stat_qlen / osd->hb_stat_ops; - - if (osd->pending_ops > my_avg) { - // is there a peer who is below my average? - for (unsigned i=1; ipeer_qlen.count(peer) && - osd->peer_qlen[peer] < my_avg) { - // calculate a probability that we should redirect - float p = (my_avg - osd->peer_qlen[peer]) / my_avg; // this is dumb. 
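// ---- [editor's illustrative sketch; not part of the removed source] --------
// The LOAD_QUEUE_SIZE branch above sheds reads only when this osd is above its
// own average queue length, and then redirects to a lighter replica with
// probability proportional to how much lighter that replica is.  A compact
// stand-in (pick_read_target and its parameters are hypothetical names):

#include <cstdlib>   // drand48
#include <map>

// returns the osd id to forward the read to, or -1 to serve it locally
inline int pick_read_target(double my_pending_ops, double my_avg_qlen,
                            const std::map<int, double>& peer_qlen)
{
  if (my_pending_ops <= my_avg_qlen)
    return -1;                                   // not overloaded: keep it
  for (const auto& pq : peer_qlen) {
    if (pq.second < my_avg_qlen) {
      double p = (my_avg_qlen - pq.second) / my_avg_qlen;
      if (drand48() <= p)
        return pq.first;                         // redirect to this replica
    }
  }
  return -1;
}
// ---- [end editor's sketch] -------------------------------------------------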
- - if (drand48() <= p) { - // take the first one - dout(10) << "my qlen " << osd->pending_ops << " > my_avg " << my_avg - << ", p=" << p - << ", fwd to peer w/ qlen " << osd->peer_qlen[peer] - << " osd" << peer - << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - return true; - } - } - } - } - } - - else if ( g_conf.osd_balance_reads == LOAD_HYBRID ) { - // am i above my average? - float my_avg = osd->hb_stat_qlen / osd->hb_stat_ops; - - if (osd->pending_ops > my_avg) { - // is there a peer who is below my average? - for (unsigned i=1; ipeer_qlen.count(peer) && - osd->peer_qlen[peer] < my_avg) { - // calculate a probability that we should redirect - //float p = (my_avg - peer_qlen[peer]) / my_avg; // this is dumb. - - double mean_read_time = osd->load_calc.get_average(); - - if ( mean_read_time != -1 && - osd->peer_read_time.count(peer) && - ( (osd->peer_read_time[peer]*100/mean_read_time) < - ( 100 - g_conf.osd_load_diff_percent) ) ) - //if (drand48() <= p) { - // take the first one - dout(10) << "using hybrid :my qlen " << osd->pending_ops << " > my_avg " << my_avg - << "my read time "<< mean_read_time - << "peer read time " << osd->peer_read_time[peer] - << ", fwd to peer w/ qlen " << osd->peer_qlen[peer] - << " osd" << peer - << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - return true; - } - } - } - } - } // endif balance reads - - - // -- fastpath read? - // if this is a read and the data is in the cache, do an immediate read.. - if ( g_conf.osd_immediate_read_from_cache ) { - if (osd->store->is_cached( op->get_oid() , - op->get_offset(), - op->get_length() ) == 0) { - if (!is_primary()) { - // am i allowed? - bool v; - if (osd->store->getattr(op->get_oid(), "balance-reads", &v, 1) < 0) { - dout(10) << "preprocess_op in-cache but no balance-reads on " << op->get_oid() - << ", fwd to primary" << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(get_primary())); - return true; - } - } - - // do it now - dout(-10) << "preprocess_op data is in cache, reading from cache" << *op << dendl; - do_op(op); - return true; - } - } - - return false; -} - - -/** do_op - do an op - * pg lock will be held (if multithreaded) - * osd_lock NOT held. - */ -void ReplicatedPG::do_op(MOSDOp *op) -{ - //dout(15) << "do_op " << *op << dendl; - - osd->logger->inc("op"); - - switch (op->get_op()) { - - // reads - case OSD_OP_READ: - case OSD_OP_STAT: - op_read(op); - break; - - // rep stuff - case OSD_OP_PULL: - op_pull(op); - break; - case OSD_OP_PUSH: - op_push(op); - break; - - // writes - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_TRUNCATE: - case OSD_OP_WRLOCK: - case OSD_OP_WRUNLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - case OSD_OP_BALANCEREADS: - case OSD_OP_UNBALANCEREADS: - if (op->get_source().is_osd()) { - op_rep_modify(op); - } else { - // go go gadget pg - op_modify(op); - - if (op->get_op() == OSD_OP_WRITE) { - osd->logger->inc("c_wr"); - osd->logger->inc("c_wrb", op->get_length()); - } - } - break; - - default: - assert(0); - } -} - -void ReplicatedPG::do_op_reply(MOSDOpReply *r) -{ - // must be replication. - tid_t rep_tid = r->get_rep_tid(); - - if (rep_gather.count(rep_tid)) { - // oh, good. - int fromosd = r->get_source().num(); - repop_ack(rep_gather[rep_tid], - r->get_result(), r->get_commit(), - fromosd, - r->get_pg_complete_thru()); - delete r; - } else { - // early ack. 
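// ---- [editor's illustrative sketch; not part of the removed source] --------
// do_op_reply here parks replies that arrive before the originating op has
// created its rep-gather entry ("early acks"), and new_rep_gather later
// replays them.  A minimal stand-in of that park-and-replay pattern (Reply,
// EarlyAckBuffer and process_ack are hypothetical names):

#include <list>
#include <map>

struct Reply { long rep_tid; };

struct EarlyAckBuffer {
  std::map<long, bool> rep_gather;             // rep_tid -> entry exists
  std::map<long, std::list<Reply*> > waiting;  // parked early replies

  void on_reply(Reply *r, void (*process_ack)(Reply*)) {
    if (rep_gather.count(r->rep_tid))
      process_ack(r);                          // normal path
    else
      waiting[r->rep_tid].push_back(r);        // too early: park it
  }

  void on_gather_created(long rep_tid, void (*process_ack)(Reply*)) {
    rep_gather[rep_tid] = true;
    auto p = waiting.find(rep_tid);
    if (p == waiting.end())
      return;
    for (Reply *r : p->second)
      process_ack(r);                          // replay parked replies
    waiting.erase(p);
  }
};
// ---- [end editor's sketch] -------------------------------------------------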
- waiting_for_repop[rep_tid].push_back(r); - } -} - - - - -// ======================================================================== -// READS - -void ReplicatedPG::op_read(MOSDOp *op) -{ - object_t oid = op->get_oid(); - - dout(10) << "op_read " << MOSDOp::get_opname(op->get_op()) - << " " << oid - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // wrlocked? - if (block_if_wrlocked(op)) - return; - - // !primary and unbalanced? - // (ignore ops forwarded from the primary) - if (!is_primary() && - !(op->get_source().is_osd() && - op->get_source().num() == get_primary())) { - // make sure i exist and am balanced, otherwise fw back to acker. - bool b; - if (!osd->store->exists(oid) || - osd->store->getattr(oid, "balance-reads", &b, 1) < 0) { - dout(-10) << "read on replica, object " << oid - << " dne or no balance-reads, fw back to primary" << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(get_acker())); - return; - } - } - - - // set up reply - MOSDOpReply *reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - long r = 0; - - // do it. - if (oid.rev && !pick_object_rev(oid)) { - // we have no revision for this request. - r = -EEXIST; - } else { - switch (op->get_op()) { - case OSD_OP_READ: - { - // read into a buffer - bufferlist bl; - r = osd->store->read(oid, - op->get_offset(), op->get_length(), - bl); - reply->set_data(bl); - reply->set_length(r); - dout(15) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << dendl; - } - break; - - case OSD_OP_STAT: - { - struct stat st; - memset(&st, sizeof(st), 0); - r = osd->store->stat(oid, &st); - if (r >= 0) - reply->set_object_size(st.st_size); - } - break; - - default: - assert(0); - } - } - - if (r >= 0) { - reply->set_result(0); - - dout(10) << "READ TIME DIFF" - << (double)g_clock.now()-op->get_received_time() - << dendl; - osd->load_calc.add((double)g_clock.now() - op->get_received_time()); - - } else { - reply->set_result(r); // error - } - - - // send it - osd->messenger->send_message(reply, op->get_client_inst()); - - delete op; -} - - - - - - -// ======================================================================== -// MODIFY - -void ReplicatedPG::prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to) -{ - const object_t oid = op->get_oid(); - - // clone entry? - if (crev && rev && rev > crev) { - eversion_t cv = version; - cv.version--; - Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); - log.add(cloneentry); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << cloneentry - << dendl; - } - - // actual op - int opcode = Log::Entry::MODIFY; - if (op->get_op() == OSD_OP_DELETE) opcode = Log::Entry::DELETE; - Log::Entry logentry(opcode, oid, version, op->get_reqid()); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << logentry - << dendl; - - // append to log - assert(version > log.top); - log.add(logentry); - assert(log.top == version); - dout(10) << "prepare_log_transaction appended" << dendl; - - // write to pg log on disk - append_log(t, logentry, trim_to); -} - - -/** prepare_op_transaction - * apply an op to the store wrapped in a transaction. 
- */ -void ReplicatedPG::prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev) -{ - const object_t oid = op->get_oid(); - const pg_t pgid = op->get_pg(); - - bool did_clone = false; - - dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() ) - << " " << oid - << " v " << version - << " crev " << crev - << " rev " << rev - << dendl; - - // WRNOOP does nothing. - if (op->get_op() == OSD_OP_WRNOOP) - return; - - // raise last_complete? - if (info.last_complete == info.last_update) - info.last_complete = version; - - // raise last_update. - assert(version > info.last_update); - info.last_update = version; - - // write pg info - t.collection_setattr(pgid, "info", &info, sizeof(info)); - - // clone? - if (crev && rev && rev > crev) { - object_t noid = oid; - noid.rev = rev; - dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << dendl; - t.clone(oid, noid); - did_clone = true; - } - - // apply the op - switch (op->get_op()) { - - // -- locking -- - - case OSD_OP_WRLOCK: - { // lock object - t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t)); - } - break; - case OSD_OP_WRUNLOCK: - { // unlock objects - t.rmattr(oid, "wrlock"); - } - break; - - case OSD_OP_MININCLOCK: - { - uint32_t mininc = op->get_length(); - t.setattr(oid, "mininclock", &mininc, sizeof(mininc)); - } - break; - - case OSD_OP_BALANCEREADS: - { - bool bal = true; - t.setattr(oid, "balance-reads", &bal, sizeof(bal)); - } - break; - case OSD_OP_UNBALANCEREADS: - { - t.rmattr(oid, "balance-reads"); - } - break; - - - // -- modify -- - - case OSD_OP_WRITE: - { // write - assert(op->get_data().length() == op->get_length()); - bufferlist bl; - bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time! - - //if (oid < 100000000000000ULL) // hack hack-- don't write client data - t.write( oid, op->get_offset(), op->get_length(), bl ); - } - break; - - case OSD_OP_ZERO: - { - assert(0); // are you sure this is what you want? - // zero, remove, or truncate? - struct stat st; - int r = osd->store->stat(oid, &st); - if (r >= 0) { - if (op->get_offset() + (off_t)op->get_length() >= (off_t)st.st_size) { - if (op->get_offset()) - t.truncate(oid, op->get_length() + op->get_offset()); - else - t.remove(oid); - } else { - // zero. the dumb way. FIXME. - bufferptr bp(op->get_length()); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - t.write(oid, op->get_offset(), op->get_length(), bl); - } - } else { - // noop? - dout(10) << "apply_transaction zero on " << oid << ", but dne? 
stat returns " << r << dendl; - } - } - break; - - case OSD_OP_TRUNCATE: - { // truncate - t.truncate(oid, op->get_length() ); - } - break; - - case OSD_OP_DELETE: - { // delete - t.remove(oid); - } - break; - - default: - assert(0); - } - - // object collection, version - if (op->get_op() == OSD_OP_DELETE) { - // remove object from c - t.collection_remove(pgid, oid); - } else { - // add object to c - t.collection_add(pgid, oid); - - // object version - t.setattr(oid, "version", &version, sizeof(version)); - - // set object crev - if (crev == 0 || // new object - did_clone) // we cloned - t.setattr(oid, "crev", &rev, sizeof(rev)); - } -} - - - -// ======================================================================== -// rep op gather - -class C_OSD_ModifyCommit : public Context { -public: - ReplicatedPG *pg; - tid_t rep_tid; - eversion_t pg_last_complete; - C_OSD_ModifyCommit(ReplicatedPG *p, tid_t rt, eversion_t lc) : pg(p), rep_tid(rt), pg_last_complete(lc) { - pg->get(); // we're copying the pointer - } - void finish(int r) { - pg->lock(); - if (!pg->is_deleted()) - pg->op_modify_commit(rep_tid, pg_last_complete); - pg->put_unlock(); - } -}; - - -void ReplicatedPG::get_rep_gather(RepGather *repop) -{ - //repop->lock.Lock(); - dout(10) << "get_repop " << *repop << dendl; -} - -void ReplicatedPG::apply_repop(RepGather *repop) -{ - dout(10) << "apply_repop applying update on " << *repop << dendl; - assert(!repop->applied); - - Context *oncommit = new C_OSD_ModifyCommit(this, repop->rep_tid, repop->pg_local_last_complete); - unsigned r = osd->store->apply_transaction(repop->t, oncommit); - if (r) - dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << dendl; - - // discard my reference to the buffer - repop->op->get_data().clear(); - - repop->applied = true; - - - // any completion stuff to do here? - object_t oid = repop->op->get_oid(); - - switch (repop->op->get_op()) { - case OSD_OP_UNBALANCEREADS: - dout(-10) << "apply_repop completed unbalance-reads on " << oid << dendl; - unbalancing_reads.erase(oid); - if (waiting_for_unbalanced_reads.count(oid)) { - osd->take_waiters(waiting_for_unbalanced_reads[oid]); - waiting_for_unbalanced_reads.erase(oid); - } - break; - - case OSD_OP_BALANCEREADS: - dout(-10) << "apply_repop completed balance-reads on " << oid << dendl; - /* - if (waiting_for_balanced_reads.count(oid)) { - osd->take_waiters(waiting_for_balanced_reads[oid]); - waiting_for_balanced_reads.erase(oid); - } - */ - break; - - case OSD_OP_WRUNLOCK: - dout(-10) << "apply_repop completed wrunlock on " << oid << dendl; - if (waiting_for_wr_unlock.count(oid)) { - osd->take_waiters(waiting_for_wr_unlock[oid]); - waiting_for_wr_unlock.erase(oid); - } - break; - } - - -} - -void ReplicatedPG::put_rep_gather(RepGather *repop) -{ - dout(10) << "put_repop " << *repop << dendl; - - // commit? - if (repop->can_send_commit() && - repop->op->wants_commit()) { - // send commit. - if (repop->op->wants_reply()) { - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), true); - dout(10) << "put_repop sending commit on " << *repop << " " << reply << dendl; - osd->messenger->send_message(reply, repop->op->get_client_inst()); - } - repop->sent_commit = true; - } - - // ack? 
- else if (repop->can_send_ack() && - repop->op->wants_ack()) { - // apply - apply_repop(repop); - - // send ack - if (repop->op->wants_reply()) { - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), false); - dout(10) << "put_repop sending ack on " << *repop << " " << reply << dendl; - osd->messenger->send_message(reply, repop->op->get_client_inst()); - } - repop->sent_ack = true; - - utime_t now = g_clock.now(); - now -= repop->start; - osd->logger->finc("rlsum", now); - osd->logger->inc("rlnum", 1); - } - - // done. - if (repop->can_delete()) { - // adjust peers_complete_thru - if (!repop->pg_complete_thru.empty()) { - eversion_t min = info.last_complete; // hrm.... - for (unsigned i=0; ipg_complete_thru[acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want. - min = repop->pg_complete_thru[acting[i]]; - } - - if (min > peers_complete_thru) { - dout(10) << "put_repop peers_complete_thru " - << peers_complete_thru << " -> " << min - << dendl; - peers_complete_thru = min; - } - } - - dout(10) << "put_repop deleting " << *repop << dendl; - //repop->lock.Unlock(); - - assert(rep_gather.count(repop->rep_tid)); - rep_gather.erase(repop->rep_tid); - - delete repop->op; - delete repop; - - } else { - //repop->lock.Unlock(); - } -} - - -void ReplicatedPG::issue_repop(MOSDOp *op, int dest) -{ - object_t oid = op->get_oid(); - - dout(7) << " issue_repop rep_tid " << op->get_rep_tid() - << " o " << oid - << " to osd" << dest - << dendl; - - // forward the write/update/whatever - MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - op->get_op()); - wr->get_data() = op->get_data(); // _copy_ bufferlist - wr->set_length(op->get_length()); - wr->set_offset(op->get_offset()); - wr->set_version(op->get_version()); - - wr->set_rep_tid(op->get_rep_tid()); - wr->set_pg_trim_to(peers_complete_thru); - - osd->messenger->send_message(wr, osd->osdmap->get_inst(dest)); -} - -ReplicatedPG::RepGather *ReplicatedPG::new_rep_gather(MOSDOp *op) -{ - dout(10) << "new_rep_gather rep_tid " << op->get_rep_tid() << " on " << *op << dendl; - int whoami = osd->get_nodeid(); - - RepGather *repop = new RepGather(op, op->get_rep_tid(), - op->get_version(), - info.last_complete); - - // osds. commits all come to me. - for (unsigned i=0; iosds.insert(osd); - repop->waitfor_commit.insert(osd); - } - - // acks vary: - if (g_conf.osd_rep == OSD_REP_CHAIN) { - // chain rep. - // there's my local ack... - repop->osds.insert(whoami); - repop->waitfor_ack.insert(whoami); - repop->waitfor_commit.insert(whoami); - - // also, the previous guy will ack to me - int myrank = osd->osdmap->calc_pg_rank(whoami, acting); - if (myrank > 0) { - int osd = acting[ myrank-1 ]; - repop->osds.insert(osd); - repop->waitfor_ack.insert(osd); - repop->waitfor_commit.insert(osd); - } - } else { - // primary, splay. all osds ack to me. - for (unsigned i=0; iwaitfor_ack.insert(osd); - } - } - - repop->start = g_clock.now(); - - rep_gather[ repop->rep_tid ] = repop; - - // anyone waiting? 
(acks that got here before the op did) - if (waiting_for_repop.count(repop->rep_tid)) { - osd->take_waiters(waiting_for_repop[repop->rep_tid]); - waiting_for_repop.erase(repop->rep_tid); - } - - return repop; -} - - -void ReplicatedPG::repop_ack(RepGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru) -{ - MOSDOp *op = repop->op; - - dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op - << " result " << result << " commit " << commit << " from osd" << fromosd - << dendl; - - get_rep_gather(repop); - { - if (commit) { - // commit - assert(repop->waitfor_commit.count(fromosd)); - repop->waitfor_commit.erase(fromosd); - repop->waitfor_ack.erase(fromosd); - repop->pg_complete_thru[fromosd] = pg_complete_thru; - } else { - // ack - repop->waitfor_ack.erase(fromosd); - } - } - put_rep_gather(repop); -} - - - - - - - - - - - - - - - - - - - - - - - -/** op_modify_commit - * transaction commit on the acker. - */ -void ReplicatedPG::op_modify_commit(tid_t rep_tid, eversion_t pg_complete_thru) -{ - if (rep_gather.count(rep_tid)) { - RepGather *repop = rep_gather[rep_tid]; - - dout(10) << "op_modify_commit " << *repop->op << dendl; - get_rep_gather(repop); - { - assert(repop->waitfor_commit.count(osd->get_nodeid())); - repop->waitfor_commit.erase(osd->get_nodeid()); - repop->pg_complete_thru[osd->get_nodeid()] = pg_complete_thru; - } - put_rep_gather(repop); - dout(10) << "op_modify_commit done on " << repop << dendl; - } else { - dout(10) << "op_modify_commit rep_tid " << rep_tid << " dne" << dendl; - } -} - - - -objectrev_t ReplicatedPG::assign_version(MOSDOp *op) -{ - object_t oid = op->get_oid(); - - // check crev - objectrev_t crev = 0; - osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - // assign version - eversion_t clone_version; - eversion_t nv = log.top; - if (op->get_op() != OSD_OP_WRNOOP) { - nv.epoch = osd->osdmap->get_epoch(); - nv.version++; - assert(nv > info.last_update); - assert(nv > log.top); - - // will clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - clone_version = nv; - nv.version++; - } - - if (op->get_version().version) { - // replay! - if (nv.version < op->get_version().version) { - nv.version = op->get_version().version; - - // clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - // backstep clone - clone_version = nv; - clone_version.version--; - } - } - } - } - - // set version in op, for benefit of client and our eventual reply - op->set_version(nv); - - return crev; -} - - -// commit (to disk) callback -class C_OSD_RepModifyCommit : public Context { -public: - ReplicatedPG *pg; - MOSDOp *op; - int destosd; - - eversion_t pg_last_complete; - - Mutex lock; - Cond cond; - bool acked; - bool waiting; - - C_OSD_RepModifyCommit(ReplicatedPG *p, MOSDOp *oo, int dosd, eversion_t lc) : - pg(p), op(oo), destosd(dosd), pg_last_complete(lc), - acked(false), waiting(false) { - pg->get(); // we're copying the pointer. 
- } - void finish(int r) { - lock.Lock(); - assert(!waiting); - while (!acked) { - waiting = true; - cond.Wait(lock); - } - assert(acked); - lock.Unlock(); - - pg->lock(); - pg->op_rep_modify_commit(op, destosd, pg_last_complete); - pg->put_unlock(); - } - void ack() { - lock.Lock(); - assert(!acked); - acked = true; - if (waiting) cond.Signal(); - - // discard my reference to buffer - op->get_data().clear(); - - lock.Unlock(); - } -}; - - -void ReplicatedPG::op_modify(MOSDOp *op) -{ - int whoami = osd->get_nodeid(); - object_t oid = op->get_oid(); - const char *opname = MOSDOp::get_opname(op->get_op()); - - // --- locking --- - - // wrlock? - if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush - block_if_wrlocked(op)) - return; // op will be handled later, after the object unlocks - - // balance-reads set? - char v; - if ((op->get_op() != OSD_OP_BALANCEREADS && op->get_op() != OSD_OP_UNBALANCEREADS) && - (osd->store->getattr(op->get_oid(), "balance-reads", &v, 1) >= 0 || - balancing_reads.count(op->get_oid()))) { - - if (!unbalancing_reads.count(op->get_oid())) { - // unbalance - dout(-10) << "preprocess_op unbalancing-reads on " << op->get_oid() << dendl; - unbalancing_reads.insert(op->get_oid()); - - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - op->get_oid(), - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_UNBALANCEREADS); - do_op(pop); - } - - // add to wait queue - dout(-10) << "preprocess_op waiting for unbalance-reads on " << op->get_oid() << dendl; - waiting_for_unbalanced_reads[op->get_oid()].push_back(op); - return; - } - - - // share latest osd map with rest of pg? - osd->osd_lock.Lock(); - { - for (unsigned i=1; i_share_map_outgoing( osd->osdmap->get_inst(acting[i]) ); - } - } - osd->osd_lock.Unlock(); - - - // dup op? - if (is_dup(op->get_reqid())) { - dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid() - << ", doing WRNOOP" << dendl; - op->set_op(OSD_OP_WRNOOP); - opname = MOSDOp::get_opname(op->get_op()); - } - - // assign the op a version - objectrev_t crev = assign_version(op); - eversion_t nv = op->get_version(); - - // are any peers missing this? - for (unsigned i=1; iget_rev() - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // issue replica writes - RepGather *repop = 0; - bool alone = (acting.size() == 1); - tid_t rep_tid = osd->get_tid(); - op->set_rep_tid(rep_tid); - - if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) { - // chain rep. send to #2 only. - int next = acting[1]; - if (acting.size() > 2) - next = acting[2]; - issue_repop(op, next); - } - else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) { - // splay rep. send to rest. - for (unsigned i=1; i=1; --i) - issue_repop(op, acting[i]); - } else { - // primary rep, or alone. - repop = new_rep_gather(op); - - // send to rest. - if (!alone) - for (unsigned i=1; iget_op() != OSD_OP_WRNOOP) { - // log and update later. - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), peers_complete_thru); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev()); - } - - // (logical) local ack. - // (if alone, this will apply the update.) - get_rep_gather(repop); - { - assert(repop->waitfor_ack.count(whoami)); - repop->waitfor_ack.erase(whoami); - } - put_rep_gather(repop); - - } else { - // not acker. - // chain or splay. apply. 
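// ---- [editor's illustrative sketch; not part of the removed source] --------
// C_OSD_RepModifyCommit above deliberately blocks its finish() until ack() has
// run, so the commit a replica reports to the acker can never overtake the
// corresponding ack.  The same handshake with stand-in types
// (AckThenCommitGate is hypothetical, not the removed class):

#include <condition_variable>
#include <mutex>

struct AckThenCommitGate {
  std::mutex m;
  std::condition_variable cv;
  bool acked = false;

  void ack() {                       // called once the op has been ack'ed
    std::lock_guard<std::mutex> l(m);
    acked = true;
    cv.notify_one();
  }

  void wait_then_commit() {          // called from the disk-commit callback
    std::unique_lock<std::mutex> l(m);
    cv.wait(l, [this] { return acked; });
    // ...now it is safe to send the commit message to the acker...
  }
};
// ---- [end editor's sketch] -------------------------------------------------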
- ObjectStore::Transaction t; - prepare_log_transaction(t, op, nv, crev, op->get_rev(), peers_complete_thru); - prepare_op_transaction(t, op, nv, crev, op->get_rev()); - - C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, get_acker(), - info.last_complete); - unsigned r = osd->store->apply_transaction(t, oncommit); - if (r != 0 && // no errors - r != 2) { // or error on collection_add - cerr << "error applying transaction: r = " << r << dendl; - assert(r == 0); - } - - // lets evict the data from our cache to maintain a total large cache size - if (g_conf.osd_exclusive_caching) - osd->store->trim_from_cache(op->get_oid(), op->get_offset(), op->get_length()); - - oncommit->ack(); - } - - - -} - - - -// replicated - - - - -void ReplicatedPG::op_rep_modify(MOSDOp *op) -{ - object_t oid = op->get_oid(); - eversion_t nv = op->get_version(); - - const char *opname = MOSDOp::get_opname(op->get_op()); - - // check crev - objectrev_t crev = 0; - osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - dout(10) << "op_rep_modify " << opname - << " " << oid - << " v " << nv - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // we better not be missing this. - assert(!missing.is_missing(oid)); - - // prepare our transaction - ObjectStore::Transaction t; - - // am i acker? - RepGather *repop = 0; - int ackerosd = acting[0]; - - if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) { - ackerosd = get_acker(); - - if (is_acker()) { - // i am tail acker. - if (rep_gather.count(op->get_rep_tid())) { - repop = rep_gather[ op->get_rep_tid() ]; - } else { - repop = new_rep_gather(op); - } - - // infer ack from source - int fromosd = op->get_source().num(); - get_rep_gather(repop); - { - //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice. - repop->waitfor_ack.erase(fromosd); - } - put_rep_gather(repop); - - // prepare dest socket - //messenger->prepare_send_message(op->get_client()); - } - - // chain? forward? - if (g_conf.osd_rep == OSD_REP_CHAIN && !is_acker()) { - // chain rep, not at the tail yet. - int myrank = osd->osdmap->calc_pg_rank(osd->get_nodeid(), acting); - int next = myrank+1; - if (next == (int)acting.size()) - next = 1; - issue_repop(op, acting[next]); - } - } - - // do op? - C_OSD_RepModifyCommit *oncommit = 0; - - osd->logger->inc("r_wr"); - osd->logger->inc("r_wrb", op->get_length()); - - if (repop) { - // acker. we'll apply later. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), op->get_pg_trim_to()); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev()); - } - } else { - // middle|replica. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(t, op, nv, crev, op->get_rev(), op->get_pg_trim_to()); - prepare_op_transaction(t, op, nv, crev, op->get_rev()); - } - - oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, info.last_complete); - - // apply log update. and possibly update itself. - unsigned tr = osd->store->apply_transaction(t, oncommit); - if (tr != 0 && // no errors - tr != 2) { // or error on collection_add - cerr << "error applying transaction: r = " << tr << dendl; - assert(tr == 0); - } - } - - // ack? - if (repop) { - // (logical) local ack. this may induce the actual update. - get_rep_gather(repop); - { - assert(repop->waitfor_ack.count(osd->get_nodeid())); - repop->waitfor_ack.erase(osd->get_nodeid()); - } - put_rep_gather(repop); - } - else { - // send ack to acker? 
- if (g_conf.osd_rep != OSD_REP_CHAIN) { - MOSDOpReply *ack = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), false); - osd->messenger->send_message(ack, osd->osdmap->get_inst(ackerosd)); - } - - // ack myself. - assert(oncommit); - oncommit->ack(); - } - -} - - -void ReplicatedPG::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete) -{ - // send commit. - dout(10) << "rep_modify_commit on op " << *op - << ", sending commit to osd" << ackerosd - << dendl; - MOSDOpReply *commit = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - commit->set_pg_complete_thru(last_complete); - osd->messenger->send_message(commit, osd->osdmap->get_inst(ackerosd)); - delete op; -} - - - - - - - - - - -// =========================================================== - -/** pull - request object from a peer - */ -void ReplicatedPG::pull(object_t oid) -{ - assert(missing.loc.count(oid)); - eversion_t v = missing.missing[oid]; - int fromosd = missing.loc[oid]; - - dout(7) << "pull " << oid - << " v " << v - << " from osd" << fromosd - << dendl; - - // send op - tid_t tid = osd->get_tid(); - MOSDOp *op = new MOSDOp(osd->messenger->get_myinst(), 0, tid, - oid, info.pgid, - osd->osdmap->get_epoch(), - OSD_OP_PULL); - op->set_version(v); - osd->messenger->send_message(op, osd->osdmap->get_inst(fromosd)); - - // take note - assert(objects_pulling.count(oid) == 0); - num_pulling++; - objects_pulling[oid] = v; -} - - -/** push - send object to a peer - */ -void ReplicatedPG::push(object_t oid, int dest) -{ - // read data+attrs - bufferlist bl; - eversion_t v; - int vlen = sizeof(v); - map attrset; - - ObjectStore::Transaction t; - t.read(oid, 0, 0, &bl); - t.getattr(oid, "version", &v, &vlen); - t.getattrs(oid, attrset); - unsigned tr = osd->store->apply_transaction(t); - - assert(tr == 0); // !!! - - // ok - dout(7) << "push " << oid << " v " << v - << " size " << bl.length() - << " to osd" << dest - << dendl; - - osd->logger->inc("r_push"); - osd->logger->inc("r_pushb", bl.length()); - - // send - MOSDOp *op = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, info.pgid, osd->osdmap->get_epoch(), - OSD_OP_PUSH); - op->set_offset(0); - op->set_length(bl.length()); - op->set_data(bl); // note: claims bl, set length above here! - op->set_version(v); - op->set_attrset(attrset); - - osd->messenger->send_message(op, osd->osdmap->get_inst(dest)); -} - - - -/** op_pull - * process request to pull an entire object. - * NOTE: called from opqueue. - */ -void ReplicatedPG::op_pull(MOSDOp *op) -{ - const object_t oid = op->get_oid(); - const eversion_t v = op->get_version(); - int from = op->get_source().num(); - - dout(7) << "op_pull " << oid << " v " << op->get_version() - << " from " << op->get_source() - << dendl; - - // is a replica asking? are they missing it? - if (is_primary()) { - // primary - assert(peer_missing.count(from)); // we had better know this, from the peering process. - - if (!peer_missing[from].is_missing(oid)) { - dout(7) << "op_pull replica isn't actually missing it, we must have already pushed to them" << dendl; - delete op; - return; - } - - // do we have it yet? - if (is_missing_object(oid)) { - wait_for_missing_object(oid, op); - return; - } - } else { - // non-primary - if (missing.is_missing(oid)) { - dout(7) << "op_pull not primary, and missing " << oid << ", ignoring" << dendl; - delete op; - return; - } - } - - // push it back! - push(oid, op->get_source().num()); -} - - -/** op_push - * NOTE: called from opqueue. 
- */ -void ReplicatedPG::op_push(MOSDOp *op) -{ - object_t oid = op->get_oid(); - eversion_t v = op->get_version(); - - if (!is_missing_object(oid)) { - dout(7) << "op_push not missing " << oid << dendl; - return; - } - - dout(7) << "op_push " - << oid - << " v " << v - << " size " << op->get_length() << " " << op->get_data().length() - << dendl; - - assert(op->get_data().length() == op->get_length()); - - // write object and add it to the PG - ObjectStore::Transaction t; - t.remove(oid); // in case old version exists - t.write(oid, 0, op->get_length(), op->get_data()); - t.setattrs(oid, op->get_attrset()); - t.collection_add(info.pgid, oid); - - // close out pull op? - num_pulling--; - if (objects_pulling.count(oid)) - objects_pulling.erase(oid); - missing.got(oid, v); - - - // raise last_complete? - assert(log.complete_to != log.log.end()); - while (log.complete_to != log.log.end()) { - if (missing.missing.count(log.complete_to->oid)) break; - if (info.last_complete < log.complete_to->version) - info.last_complete = log.complete_to->version; - log.complete_to++; - } - dout(10) << "last_complete now " << info.last_complete << dendl; - - - // apply to disk! - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); - unsigned r = osd->store->apply_transaction(t); - assert(r == 0); - - - - // am i primary? are others missing this too? - if (is_primary()) { - for (unsigned i=1; itake_waiters(waiting_for_missing_object[oid]); - waiting_for_missing_object.erase(oid); - } - - delete op; -} - - - - - - -void ReplicatedPG::note_failed_osd(int o) -{ - dout(10) << "note_failed_osd " << o << dendl; - // do async; repop_ack() may modify pg->repop_gather - list ls; - for (map::iterator p = rep_gather.begin(); - p != rep_gather.end(); - p++) { - //dout(-1) << "checking repop tid " << p->first << dendl; - if (p->second->waitfor_ack.count(o) || - p->second->waitfor_commit.count(o)) - ls.push_back(p->second); - } - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) - repop_ack(*p, -1, true, o); -} - - -void ReplicatedPG::on_acker_change() -{ - dout(10) << "on_acker_change" << dendl; - - // apply repops - for (map::iterator p = rep_gather.begin(); - p != rep_gather.end(); - p++) { - if (!p->second->applied) - apply_repop(p->second); - delete p->second->op; - delete p->second; - } - rep_gather.clear(); - - // and repop waiters - for (map >::iterator p = waiting_for_repop.begin(); - p != waiting_for_repop.end(); - p++) - for (list::iterator pm = p->second.begin(); - pm != p->second.end(); - pm++) - delete *pm; - waiting_for_repop.clear(); -} - - -void ReplicatedPG::on_role_change() -{ - dout(10) << "on_role_change" << dendl; - - // take object waiters - for (hash_map >::iterator it = waiting_for_missing_object.begin(); - it != waiting_for_missing_object.end(); - it++) - osd->take_waiters(it->second); - waiting_for_missing_object.clear(); -} - - - - - - - - - -/** clean_up_local - * remove any objects that we're storing but shouldn't. - * as determined by log. - */ -void ReplicatedPG::clean_up_local(ObjectStore::Transaction& t) -{ - dout(10) << "clean_up_local" << dendl; - - assert(info.last_update >= log.bottom); // otherwise we need some help! - - if (log.backlog) { - // be thorough. 
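The last_complete bookkeeping in op_push above walks the log's complete_to cursor forward past every entry whose object is no longer missing, raising info.last_complete as it goes. The same walk over plain containers, as a sketch (Entry and the version type are simplified stand-ins for the real log types):

#include <list>
#include <set>
#include <string>

struct Entry { std::string oid; unsigned version; };   // simplified stand-in for a log entry

unsigned advance_last_complete(const std::list<Entry>& log,
                               std::list<Entry>::const_iterator& complete_to,
                               const std::set<std::string>& missing,
                               unsigned last_complete)
{
  while (complete_to != log.end()) {
    if (missing.count(complete_to->oid))
      break;                                           // stop at the first still-missing object
    if (last_complete < complete_to->version)
      last_complete = complete_to->version;            // everything up to here is now complete
    ++complete_to;
  }
  return last_complete;
}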
- list ls; - osd->store->collection_list(info.pgid, ls); - set s; - - for (list::iterator i = ls.begin(); - i != ls.end(); - i++) - s.insert(*i); - - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - if (s.count(p->oid)) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } - s.erase(p->oid); - } else { - // just leave old objects.. they're missing or whatever - s.erase(p->oid); - } - } - - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - dout(10) << " deleting stray " << *i << dendl; - t.remove(*i); - } - - } else { - // just scan the log. - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } else { - // keep old(+missing) objects, just for kicks. - } - } - } -} - - - -void ReplicatedPG::cancel_recovery() -{ - // forget about where missing items are, or anything we're pulling - missing.loc.clear(); - osd->num_pulling -= objects_pulling.size(); - objects_pulling.clear(); -} - -/** - * do one recovery op. - * return true if done, false if nothing left to do. - */ -bool ReplicatedPG::do_recovery() -{ - dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, " - << osd->num_pulling << "/" << g_conf.osd_max_pull << " total" - << dendl; - dout(10) << "do_recovery " << missing << dendl; - - // can we slow down on this PG? - if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) { - dout(-10) << "do_recovery already pulling max, waiting" << dendl; - return true; - } - - // look at log! - Log::Entry *latest = 0; - - while (log.requested_to != log.log.end()) { - assert(log.objects.count(log.requested_to->oid)); - latest = log.objects[log.requested_to->oid]; - assert(latest); - - dout(10) << "do_recovery " - << *log.requested_to - << (objects_pulling.count(latest->oid) ? " (pulling)":"") - << dendl; - - if (latest->is_update() && - !objects_pulling.count(latest->oid) && - missing.is_missing(latest->oid)) { - pull(latest->oid); - return true; - } - - log.requested_to++; - } - - if (!objects_pulling.empty()) { - dout(7) << "do_recovery requested everything, still waiting" << dendl; - return false; - } - - // done? - assert(missing.num_missing() == 0); - assert(info.last_complete == info.last_update); - - if (is_primary()) { - // i am primary - dout(7) << "do_recovery complete, cleaning strays" << dendl; - clean_set.insert(osd->whoami); - if (is_all_clean()) { - state_set(PG::STATE_CLEAN); - clean_replicas(); - } - } else { - // tell primary - dout(7) << "do_recovery complete, telling primary" << dendl; - list ls; - ls.push_back(info); - osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(), - ls), - osd->osdmap->get_inst(get_primary())); - } - - return false; -} - -void ReplicatedPG::do_peer_recovery() -{ - dout(10) << "do_peer_recovery" << dendl; - - for (unsigned i=0; isecond; - eversion_t v = peer_missing[peer].rmissing.begin()->first; - - push(oid, peer); - - // do other peers need it too? 
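The backlog branch of clean_up_local above reduces to a set computation: scan the log newest-to-oldest so only the most recent entry per object counts, remove on-disk objects whose latest entry is a delete, and treat anything on disk the log never mentions as a stray. A sketch of that computation with simplified types (not the real PG::Log or collection listing):

#include <list>
#include <set>
#include <string>

struct LogEntry { std::string oid; bool is_delete; };  // simplified stand-in

std::set<std::string> objects_to_remove(const std::set<std::string>& on_disk,
                                        const std::list<LogEntry>& log /* oldest..newest */)
{
  std::set<std::string> unreferenced = on_disk, doomed, seen;
  for (std::list<LogEntry>::const_reverse_iterator p = log.rbegin(); p != log.rend(); ++p) {
    if (!seen.insert(p->oid).second)
      continue;                                        // only the newest entry per object counts
    if (p->is_delete && unreferenced.count(p->oid))
      doomed.insert(p->oid);                           // latest entry deletes it: remove from disk
    unreferenced.erase(p->oid);                        // the log knows about it either way
  }
  doomed.insert(unreferenced.begin(), unreferenced.end());  // leftovers are strays
  return doomed;
}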
- for (i++; i::iterator p = stray_set.begin(); - p != stray_set.end(); - p++) { - dout(10) << "sending PGRemove to osd" << *p << dendl; - set ls; - ls.insert(info.pgid); - MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls); - osd->messenger->send_message(m, osd->osdmap->get_inst(*p)); - } - - stray_set.clear(); -} - diff --git a/branches/sage/pgs/osd/ReplicatedPG.h b/branches/sage/pgs/osd/ReplicatedPG.h deleted file mode 100644 index 3ee6e48039ff8..0000000000000 --- a/branches/sage/pgs/osd/ReplicatedPG.h +++ /dev/null @@ -1,169 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __REPLICATEDPG_H -#define __REPLICATEDPG_H - - -#include "PG.h" - -#include "messages/MOSDOp.h" - - -class ReplicatedPG : public PG { -public: - /* - * gather state on the primary/head while replicating an osd op. - */ - class RepGather { - public: - class MOSDOp *op; - tid_t rep_tid; - - ObjectStore::Transaction t; - bool applied; - - set waitfor_ack; - set waitfor_commit; - - utime_t start; - - bool sent_ack, sent_commit; - - set osds; - eversion_t new_version; - - eversion_t pg_local_last_complete; - map pg_complete_thru; - - RepGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc) : - op(o), rep_tid(rt), - applied(false), - sent_ack(false), sent_commit(false), - new_version(nv), - pg_local_last_complete(lc) { } - - bool can_send_ack() { - return !sent_ack && !sent_commit && - waitfor_ack.empty(); - } - bool can_send_commit() { - return !sent_commit && - waitfor_ack.empty() && waitfor_commit.empty(); - } - bool can_delete() { - return waitfor_ack.empty() && waitfor_commit.empty(); - } - }; - -protected: - // replica ops - // [primary|tail] - map rep_gather; - map > waiting_for_repop; - - // load balancing - set balancing_reads; - set unbalancing_reads; - hash_map > waiting_for_unbalanced_reads; // i.e. 
primary-lock - - void get_rep_gather(RepGather*); - void apply_repop(RepGather *repop); - void put_rep_gather(RepGather*); - void issue_repop(MOSDOp *op, int osd); - RepGather *new_rep_gather(MOSDOp *op); - void repop_ack(RepGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru=0); - - // push/pull - int num_pulling; - - void push(object_t oid, int dest); - void pull(object_t oid); - - // modify - objectrev_t assign_version(MOSDOp *op); - void op_modify_commit(tid_t rep_tid, eversion_t pg_complete_thru); - void op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete); - - void prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev); - - friend class C_OSD_ModifyCommit; - friend class C_OSD_RepModifyCommit; - - - // pg on-disk content - void clean_up_local(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - void do_peer_recovery(); - - void clean_replicas(); - - - void op_read(MOSDOp *op); - void op_modify(MOSDOp *op); - void op_rep_modify(MOSDOp *op); - void op_push(MOSDOp *op); - void op_pull(MOSDOp *op); - - - - -public: - ReplicatedPG(OSD *o, pg_t p) : - PG(o,p), - num_pulling(0) - { } - ~ReplicatedPG() {} - - bool preprocess_op(MOSDOp *op); - void do_op(MOSDOp *op); - void do_op_reply(MOSDOpReply *r); - - bool same_for_read_since(epoch_t e); - bool same_for_modify_since(epoch_t e); - bool same_for_rep_modify_since(epoch_t e); - - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, MOSDOp *op); - - void note_failed_osd(int o); - void on_acker_change(); - void on_role_change(); - -}; - - -inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& repop) -{ - out << "repgather(" << &repop << " rep_tid=" << repop.rep_tid - << " wfack=" << repop.waitfor_ack - << " wfcommit=" << repop.waitfor_commit; - out << " pct=" << repop.pg_complete_thru; - out << " op=" << *(repop.op); - out << " repop=" << &repop; - out << ")"; - return out; -} - - -#endif diff --git a/branches/sage/pgs/osd/osd_types.h b/branches/sage/pgs/osd/osd_types.h deleted file mode 100644 index 5bd13902ab721..0000000000000 --- a/branches/sage/pgs/osd/osd_types.h +++ /dev/null @@ -1,276 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OSD_TYPES_H -#define __OSD_TYPES_H - -#include "msg/msg_types.h" -#include "include/types.h" - -/* osdreqid_t - caller name + incarnation# + tid to unique identify this request - * use for metadata and osd ops. - */ -class osdreqid_t { -public: - entity_name_t name; // who - int inc; // incarnation - tid_t tid; - osdreqid_t() : inc(0), tid(0) {} - osdreqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const osdreqid_t& r) { - return out << r.name << "." 
<< r.inc << ":" << r.tid; -} - -inline bool operator==(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); -} -inline bool operator!=(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); -} -inline bool operator<(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid < r.tid); -} -inline bool operator<=(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); -} -inline bool operator>(const osdreqid_t& l, const osdreqid_t& r) { return !(l <= r); } -inline bool operator>=(const osdreqid_t& l, const osdreqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const osdreqid_t &r) const { - static blobhash H; - return H((const char*)&r, sizeof(r)); - } - }; -} - - - -// osd types -typedef uint64_t coll_t; // collection id - -// pg stuff - -#define PG_INO 1 - -typedef uint16_t ps_t; -typedef uint8_t pruleset_t; - - -// crush rule ids -#define CRUSH_REP_RULE(nrep) (100+nrep) // replication -#define CRUSH_RAID_RULE(num) (200+num) // raid - - - -// placement group id -struct pg_t { -public: - static const int TYPE_REP = 1; - static const int TYPE_RAID4 = 2; - -private: - union { - struct { - int32_t preferred; - uint8_t type; - uint8_t size; - uint16_t ps; - } fields; - uint64_t val; // 64 - } u; - -public: - pg_t() { u.val = 0; } - pg_t(const pg_t& o) { u.val = o.u.val; } - pg_t(int type, int size, ps_t seed, int pref) {//, pruleset_t r=0) { - u.fields.type = type; - u.fields.size = size; - u.fields.ps = seed; - u.fields.preferred = pref; // hack: avoid negative. - //u.fields.ruleset = r; - assert(sizeof(u.fields) == sizeof(u.val)); - } - pg_t(uint64_t v) { u.val = v; } - - int type() { return u.fields.type; } - bool is_rep() { return type() == TYPE_REP; } - bool is_raid4() { return type() == TYPE_RAID4; } - - int size() { return u.fields.size; } - ps_t ps() { return u.fields.ps; } - //pruleset_t ruleset() { return u.fields.ruleset; } - int preferred() { return u.fields.preferred; } // hack: avoid negative. - - /* - pg_t operator=(uint64_t v) { u.val = v; return *this; } - pg_t operator&=(uint64_t v) { u.val &= v; return *this; } - pg_t operator+=(pg_t o) { u.val += o.val; return *this; } - pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } - pg_t operator++() { ++u.val; return *this; } - */ - operator uint64_t() const { return u.val; } - - object_t to_object() const { return object_t(PG_INO, u.val >> 32, u.val & 0xffffffff); } -}; - -inline ostream& operator<<(ostream& out, pg_t pg) -{ - if (pg.is_rep()) - out << pg.size() << 'x'; - else if (pg.is_raid4()) - out << pg.size() << 'r'; - else - out << pg.size() << '?'; - - //if (pg.ruleset()) - //out << (int)pg.ruleset() << 's'; - - if (pg.preferred() >= 0) - out << pg.preferred() << 'p'; - out << hex << pg.ps() << dec; - - //out << "=" << hex << (__uint64_t)pg << dec; - return out; -} - -namespace __gnu_cxx { - template<> struct hash< pg_t > - { - size_t operator()( const pg_t& x ) const - { - static hash H; - return H(x); - } - }; -} - - -/** ObjectLayout - * - * describes an object's placement and layout in the storage cluster. - * most importatly, which pg it belongs to. - * if that pg is raided, it also specifies the object's stripe_unit. 
- */ -struct ObjectLayout { - pg_t pgid; // what pg do i belong to - int stripe_unit; // for object raid in raid pgs - - ObjectLayout() : pgid(0), stripe_unit(0) { } - ObjectLayout(pg_t p, int su=0) : pgid(p), stripe_unit(su) { } -}; - -inline ostream& operator<<(ostream& out, const ObjectLayout &ol) -{ - out << "pg" << ol.pgid; - if (ol.stripe_unit) - out << ".su=" << ol.stripe_unit; - return out; -} - - - -// compound rados version type -class eversion_t { -public: - epoch_t epoch; - version_t version; - eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} -}; - -inline bool operator==(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) && (l.version == r.version); -} -inline bool operator!=(const eversion_t& l, const eversion_t& r) { - return (l.epoch != r.epoch) || (l.version != r.version); -} -inline bool operator<(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); -} -inline bool operator<=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); -} -inline bool operator>(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); -} -inline bool operator>=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); -} -inline ostream& operator<<(ostream& out, const eversion_t e) { - return out << e.epoch << "'" << e.version; -} - - - - - -// ----------------------------------------- - -class ObjectExtent { - public: - object_t oid; // object id - off_t start; // in object - size_t length; // in object - - objectrev_t rev; // which revision? - - ObjectLayout layout; // object layout (pgid, etc.) - - map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) - - ObjectExtent() : start(0), length(0), rev(0) {} - ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0) { } -}; - -inline ostream& operator<<(ostream& out, ObjectExtent &ex) -{ - return out << "extent(" - << ex.oid << " in " << ex.layout - << " " << ex.start << "~" << ex.length - << ")"; -} - - - -// --------------------------------------- - -class OSDSuperblock { -public: - const static uint64_t MAGIC = 0xeb0f505dULL; - uint64_t magic; - uint64_t fsid; // unique fs id (random number) - int whoami; // my role in this fs. - epoch_t current_epoch; // most recent epoch - epoch_t oldest_map, newest_map; // oldest/newest maps we have. 
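pg_t above packs a placement group's identity (replication type, size, placement seed, preferred osd) into a single 64-bit value, which is what lets it be hashed and used as a map key directly. A short usage sketch, assuming the pg_t defined earlier in this header is in scope:

#include <cassert>

void pg_t_example()
{
  pg_t pg(pg_t::TYPE_REP, 2, 0x1b, 0);       // 2-way replicated, ps 0x1b, preferred osd0 (made up)
  assert(pg.is_rep());
  assert(pg.size() == 2 && pg.ps() == 0x1b && pg.preferred() == 0);

  uint64_t raw = pg;                         // implicit conversion to the packed 64-bit value
  pg_t back(raw);                            // and back again
  assert(back.type() == pg.type() && back.ps() == pg.ps());

  // streaming pg with the operator<< above prints something like "2x0p1b"
  // (size, then preferred osd, then the placement seed in hex)
}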
- OSDSuperblock(uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), - current_epoch(0), oldest_map(0), newest_map(0) {} -}; - -inline ostream& operator<<(ostream& out, OSDSuperblock& sb) -{ - return out << "sb(fsid " << sb.fsid - << " osd" << sb.whoami - << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map - << "])"; -} - - -#endif diff --git a/branches/sage/pgs/osd/rush.cc b/branches/sage/pgs/osd/rush.cc deleted file mode 100644 index 733d71aa4b322..0000000000000 --- a/branches/sage/pgs/osd/rush.cc +++ /dev/null @@ -1,231 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -// -// -// rush.cc -// -// $Id$ -// - -#include -#include -#include -#include "rush.h" - - -static -unsigned int -myhash (unsigned int n) -{ - unsigned int v = (n ^ 0xdead1234) * (884811920 * 3 + 1); - return (v); -} - -Rush::Rush () -{ - nClusters = 0; - totalServers = 0; -} - -//---------------------------------------------------------------------- -// -// Rush::AddCluster -// -// Add a cluster. The number of servers in the cluster and -// the weight of each server is passed. The current number of -// clusters is returned. -// -//---------------------------------------------------------------------- -int -Rush::AddCluster (int nServers, double weight) -{ - clusterSize[nClusters] = nServers; - clusterWeight[nClusters] = weight; - if (nClusters == 0) { - serversInPrevious[0] = 0; - totalWeightBefore[0] = 0.0; - } else { - serversInPrevious[nClusters] = serversInPrevious[nClusters-1] + - clusterSize[nClusters-1]; - totalWeightBefore[nClusters] = - totalWeightBefore[nClusters-1] + (double)clusterSize[nClusters-1] * - clusterWeight[nClusters-1]; - } - nClusters += 1; - totalServers += nServers; -#if 0 - for (int i = 0; i < nClusters; i++) { - fprintf (stderr, "size=%-3d prev=%-3d weight=%-6.2f prevWeight=%-8.2f\n", - clusterSize[i], serversInPrevious[i], clusterWeight[i], - totalWeightBefore[i]); - } -#endif - return (nClusters); -} - - -//---------------------------------------------------------------------- -// -// Rush::GetServersByKey -// -// This function returns a list of servers on which an object -// should be placed. The servers array must be large enough to -// contain the list. -// -//---------------------------------------------------------------------- -void -Rush::GetServersByKey (int key, int nReplicas, int servers[]) -{ - int replicasLeft = nReplicas; - int cluster; - int mustAssign, numberAssigned; - int i, toDraw; - int *srv = servers; - double myWeight; - RushRNG rng; - - // There may not be more replicas than servers! 
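Rush::AddCluster and Rush::GetServersByKey form the whole public placement interface of this class: clusters are registered with a size and a per-server weight, and replica locations for a key are then drawn cluster by cluster from newest to oldest. A small usage sketch against the interface declared in rush.h below; the cluster sizes, weights and key are made up for illustration:

#include <cstdio>
#include "rush.h"

int main()
{
  Rush rush;
  rush.AddCluster(4, 1.0);                   // 4 servers, weight 1.0 each   (made-up numbers)
  rush.AddCluster(8, 2.0);                   // a newer cluster of 8 heavier-weighted servers
  int servers[3];
  rush.GetServersByKey(42, 3, servers);      // where should 3 replicas of key 42 live?
  printf("key 42 -> servers %d %d %d (cluster count %d, %d servers total)\n",
         servers[0], servers[1], servers[2], rush.Clusters(), rush.Servers());
  return 0;
}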
- assert (nReplicas <= totalServers); - - for (cluster = nClusters-1; (cluster >= 0) && (replicasLeft > 0); cluster--) { - if (serversInPrevious[cluster] < replicasLeft) { - mustAssign = replicasLeft - serversInPrevious[cluster]; - } else { - mustAssign = 0; - } - toDraw = replicasLeft - mustAssign; - if (toDraw > (clusterSize[cluster] - mustAssign)) { - toDraw = clusterSize[cluster] - mustAssign; - } - myWeight = (double)clusterSize[cluster] * clusterWeight[cluster]; - rng.Seed (myhash (key)^cluster, cluster^0xb90738); - numberAssigned = mustAssign + - rng.HyperGeometricWeighted (toDraw, myWeight, - totalWeightBefore[cluster] + myWeight, - clusterWeight[cluster]); - if (numberAssigned > 0) { - rng.Seed (myhash (key)^cluster ^ 11, cluster^0xfea937); - rng.DrawKofN (srv, numberAssigned, clusterSize[cluster]); - for (i = 0; i < numberAssigned; i++) { - srv[i] += serversInPrevious[cluster]; - } - replicasLeft -= numberAssigned; - srv += numberAssigned; - } - } -} - - - -//---------------------------------------------------------------------- -// -// RushRNG::HyperGeometricWeighted -// -// Use an iterative method to generate a hypergeometric random -// variable. This approach guarantees that, if the number of draws -// is reduced, the number of successes must be as well as long as -// the seed for the RNG is the same. -// -//---------------------------------------------------------------------- -int -RushRNG::HyperGeometricWeighted (int nDraws, double yesWeighted, - double totalWeighted, double weightOne) -{ - int positives = 0, i; - double curRand; - - // If the weight is too small (or is negative), choose zero objects. - if (weightOne <= 1e-9 || nDraws == 0) { - return (0); - } - - // Draw nDraws items from the "bag". For each positive, subtract off - // the weight of an object from the weight of positives remaining. For - // each draw, subtract off the weight of an object from the total weight - // remaining. - for (i = 0; i < nDraws; i++) { - curRand = RandomDouble (); - if (curRand < (yesWeighted / totalWeighted)) { - positives += 1; - yesWeighted -= weightOne; - } - totalWeighted -= weightOne; - } - return (positives); -} - -//---------------------------------------------------------------------- -// -// RushRNG::DrawKofN -// -//---------------------------------------------------------------------- -void -RushRNG::DrawKofN (int vals[], int nToDraw, int setSize) -{ - int deck[setSize]; - int i, pick; - - assert(nToDraw <= setSize); - - for (i = 0; i < setSize; i++) { - deck[i] = i; - } - - for (i = 0; i < nToDraw; i++) { - pick = (int)(RandomDouble () * (double)(setSize - i)); - if (pick >= setSize-i) pick = setSize-i-1; // in case - // assert(i >= 0 && i < nToDraw); - // assert(pick >= 0 && pick < setSize); - vals[i] = deck[pick]; - deck[pick] = deck[setSize-i-1]; - } -} - -#define SEED_X 521288629 -#define SEED_Y 362436069 -RushRNG::RushRNG () -{ - Seed (0, 0); -} - -void -RushRNG::Seed (unsigned int seed1, unsigned int seed2) -{ - state1 = ((seed1 == 0) ? SEED_X : seed1); - state2 = ((seed2 == 0) ? 
SEED_Y : seed2); -} - -unsigned int -RushRNG::RandomInt () -{ - const unsigned int a = 18000; - const unsigned int b = 18879; - unsigned int rndValue; - - state1 = a * (state1 & 0xffff) + (state1 >> 16); - state2 = b * (state2 & 0xffff) + (state2 >> 16); - rndValue = (state1 << 16) + (state2 & 0xffff); - return (rndValue); -} - -double -RushRNG::RandomDouble () -{ - double v; - - v = (double)RandomInt() / (65536.0*65536.0); - return (v); -} diff --git a/branches/sage/pgs/osd/rush.h b/branches/sage/pgs/osd/rush.h deleted file mode 100644 index 4b43e1a9a1160..0000000000000 --- a/branches/sage/pgs/osd/rush.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -// -// -// rush.h -// -// Classes and definitions for the RUSH algorithm. -// -// $Id$ -// -// - -#ifndef _rush_h_ -#define _rush_h_ - -#define RUSH_MAX_CLUSTERS 100 - -class RushRNG { -public: - unsigned int RandomInt (); - double RandomDouble (); - void Seed (unsigned int a, unsigned int b); - int HyperGeometricWeighted (int nDraws, double yesWeighted, - double totalWeighted, double weightOne); - void DrawKofN (int vals[], int nToDraw, int setSize); - RushRNG(); -private: - unsigned int state1, state2; -}; - -class Rush { -public: - void GetServersByKey (int key, int nReplicas, int servers[]); - int AddCluster (int nServers, double weight); - int Clusters () {return (nClusters);} - int Servers () {return (totalServers);} - Rush (); -private: - int DrawKofN (int *servers, int n, int clusterSize, RushRNG *g); - int nClusters; - int totalServers; - int clusterSize[RUSH_MAX_CLUSTERS]; - int serversInPrevious[RUSH_MAX_CLUSTERS]; - double clusterWeight[RUSH_MAX_CLUSTERS]; - double totalWeightBefore[RUSH_MAX_CLUSTERS]; -}; - -#endif /* _rush_h_ */ diff --git a/branches/sage/pgs/osd/tp.cc b/branches/sage/pgs/osd/tp.cc deleted file mode 100644 index b52e9a69df050..0000000000000 --- a/branches/sage/pgs/osd/tp.cc +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include - -using namespace std; - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -// #include - -class Op { - int i; - -public: - - Op(int i) - { - this->i = i; - } - - int get() - { - return i; - } -}; - -void foop(class TP *t, class Op *o); - -class TP { -public: - - void foo(Op *o) - { - cout << "Thread "<< pthread_self() << ": " << o->get() << "\n"; - usleep(1); - - // sched_yield(); - } - - int main(int argc, char *argv) - { - ThreadPool *t = new ThreadPool(10, (void (*)(TP*, Op*))foop, this); - - for(int i = 0; i < 100; i++) { - Op *o = new Op(i); - t->put_op(o); - } - - sleep(1); - - delete(t); - - return 0; - } -}; - -void foop(class TP *t, class Op *o) { - t->foo(o); -} - -int main(int argc, char *argv) { - TP t; - - t.main(argc,argv); -} - diff --git a/branches/sage/pgs/osdc/Blinker.h b/branches/sage/pgs/osdc/Blinker.h deleted file mode 100644 index e59c9629725ce..0000000000000 --- a/branches/sage/pgs/osdc/Blinker.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BLINKER_H -#define __BLINKER_H - -class Blinker { - - public: - - class Op { - int op; - static const int LOOKUP = 1; - static const int INSERT = 2; - static const int REMOVE = 3; - static const int CLEAR = 4; - Op(int o) : op(o) {} - }; - - class OpLookup : public Op { - public: - bufferptr key; - OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {} - }; - - class OpInsert : public Op { - bufferptr key; - bufferlist val; - OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {} - }; - - class OpRemove : public Op { - public: - bufferptr key; - OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {} - }; - - class OpClear : public Op { - public: - OpClear() : Op(Op::CLEAR) {} - }; - - - -private: - Objecter *objecter; - - // in-flight operations. - - - // cache information about tree structure. - - - -public: - // public interface - - // simple accessors - void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish); - - // simple modifiers - void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe); - void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe); - void clear(inode_t& inode, Context *onack, Context *onsafe); - - // these are dangerous: the table may be large. - void listkeys(inode_t& inode, list* pkeys, Context *onfinish); - void listvals(inode_t& inode, list* pkeys, list* pvals, Context *onfinish); - - // fetch *at least* key, but also anything else that is convenient. - // include lexical bounds for which this is a complete result. 
- // (if *start and *end are empty, it's the entire table) - void prefetch(inode_t& inode, bufferptr& key, - list* pkeys, list* pvals, - bufferptr *start, bufferptr *end, - Context *onfinish); - - -}; - -#endif diff --git a/branches/sage/pgs/osdc/Filer.cc b/branches/sage/pgs/osdc/Filer.cc deleted file mode 100644 index 85f09aa0e99b4..0000000000000 --- a/branches/sage/pgs/osdc/Filer.cc +++ /dev/null @@ -1,236 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include - -#include "Filer.h" -#include "osd/OSDMap.h" - -//#include "messages/MOSDRead.h" -//#include "messages/MOSDReadReply.h" -//#include "messages/MOSDWrite.h" -//#include "messages/MOSDWriteReply.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" - -#include "msg/Messenger.h" - -#include "include/Context.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".filer " - - -class Filer::C_Probe : public Context { -public: - Filer *filer; - Probe *probe; - object_t oid; - off_t size; - C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), size(0) {} - void finish(int r) { - filer->_probed(probe, oid, size); - } -}; - -int Filer::probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish) -{ - dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << endl; - - Probe *probe = new Probe(inode, start_from, end, onfinish); - - // period (bytes before we jump unto a new set of object(s)) - off_t period = inode.layout.period(); - - // start with 1+ periods. - probe->probing_len = period; - if (start_from % period) - probe->probing_len += period - (start_from % period); - - _probe(probe); - return 0; -} - -void Filer::_probe(Probe *probe) -{ - dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << endl; - - // map range onto objects - file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing); - - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - dout(10) << "_probe probing " << p->oid << endl; - C_Probe *c = new C_Probe(this, probe, p->oid); - probe->ops[p->oid] = objecter->stat(p->oid, &c->size, p->layout, c); - } -} - -void Filer::_probed(Probe *probe, object_t oid, off_t size) -{ - dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << endl; - - probe->known[oid] = size; - assert(probe->ops.count(oid)); - probe->ops.erase(oid); - - if (!probe->ops.empty()) - return; // waiting for more! - - // analyze! - off_t end = 0; - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - off_t shouldbe = p->length+p->start; - dout(10) << "_probed " << probe->inode.ino << " object " << hex << p->oid << dec - << " should be " << shouldbe - << ", actual is " << probe->known[p->oid] - << endl; - - if (probe->known[p->oid] < 0) { end = -1; break; } // error! 
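file_to_extents just below maps a (file offset, length) range onto per-object extents from three layout parameters: stripe_unit, stripe_count and object_size. A worked sketch of just that index arithmetic, using made-up layout numbers (1 MB stripe unit, 2-wide stripes, 4 MB objects):

#include <cstdio>
#include <stdint.h>

int main()
{
  const int64_t stripe_unit  = 1 << 20;               // 1 MB stripe unit     (made-up value)
  const int64_t stripe_count = 2;                     // 2 objects per stripe (made-up value)
  const int64_t object_size  = 4 << 20;               // 4 MB objects         (made-up value)
  const int64_t stripes_per_object = object_size / stripe_unit;   // 4

  int64_t cur = 5 * stripe_unit + 1234;               // some byte offset into the file

  int64_t blockno     = cur / stripe_unit;            // stripe-unit block index (here 5)
  int64_t stripeno    = blockno / stripe_count;       // which horizontal stripe (Y)
  int64_t stripepos   = blockno % stripe_count;       // which object in the object set (X)
  int64_t objectsetno = stripeno / stripes_per_object;
  int64_t objectno    = objectsetno * stripe_count + stripepos;
  int64_t off_in_obj  = (stripeno % stripes_per_object) * stripe_unit + cur % stripe_unit;

  printf("file offset %lld -> object %lld, offset %lld within it\n",
         (long long)cur, (long long)objectno, (long long)off_in_obj);
  return 0;
}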
- - assert(probe->known[p->oid] <= shouldbe); - if (shouldbe == probe->known[p->oid]) continue; // keep going - - // aha, we found the end! - // calc offset into buffer_extent to get distance from probe->from. - off_t oleft = probe->known[p->oid] - p->start; - for (map::iterator i = p->buffer_extents.begin(); - i != p->buffer_extents.end(); - i++) { - if (oleft <= (off_t)i->second) { - end = probe->from + i->first + oleft; - dout(10) << "_probed end is in buffer_extent " << i->first << "~" << i->second << " off " << oleft - << ", from was " << probe->from << ", end is " << end - << endl; - break; - } - oleft -= i->second; - } - break; - } - - if (end == 0) { - // keep probing! - dout(10) << "_probed didn't find end, probing further" << endl; - off_t period = probe->inode.layout.object_size * probe->inode.layout.stripe_count; - probe->from += probe->probing_len; - probe->probing_len = period; - _probe(probe); - return; - } - - if (end < 0) { - dout(10) << "_probed encountered an error while probing" << endl; - *probe->end = -1; - } else { - // hooray! - dout(10) << "_probed found end at " << end << endl; - *probe->end = end; - } - - // done! finish and clean up. - probe->onfinish->finish(end > 0 ? 0:-1); - delete probe->onfinish; - delete probe; -} - - -void Filer::file_to_extents(inode_t inode, - off_t offset, size_t len, - list& extents, - objectrev_t rev) -{ - dout(10) << "file_to_extents " << offset << "~" << len - << " on " << hex << inode.ino << dec - << endl; - - /* we want only one extent per object! - * this means that each extent we read may map into different bits of the - * final read buffer.. hence OSDExtent.buffer_extents - */ - map< object_t, ObjectExtent > object_extents; - - assert(inode.layout.object_size >= inode.layout.stripe_unit); - off_t stripes_per_object = inode.layout.object_size / inode.layout.stripe_unit; - dout(20) << " stripes_per_object " << stripes_per_object << endl; - - off_t cur = offset; - off_t left = len; - while (left > 0) { - // layout into objects - off_t blockno = cur / inode.layout.stripe_unit; // which block - off_t stripeno = blockno / inode.layout.stripe_count; // which horizontal stripe (Y) - off_t stripepos = blockno % inode.layout.stripe_count; // which object in the object set (X) - off_t objectsetno = stripeno / stripes_per_object; // which object set - off_t objectno = objectsetno * inode.layout.stripe_count + stripepos; // object id - - // find oid, extent - ObjectExtent *ex = 0; - object_t oid( inode.ino, objectno ); - if (object_extents.count(oid)) - ex = &object_extents[oid]; - else { - ex = &object_extents[oid]; - ex->oid = oid; - ex->rev = rev; - ex->layout = objecter->osdmap->file_to_object_layout( oid, inode.layout ); - } - - // map range into object - off_t block_start = (stripeno % stripes_per_object)*inode.layout.stripe_unit; - off_t block_off = cur % inode.layout.stripe_unit; - off_t max = inode.layout.stripe_unit - block_off; - - off_t x_offset = block_start + block_off; - off_t x_len; - if (left > max) - x_len = max; - else - x_len = left; - - if (ex->start + (off_t)ex->length == x_offset) { - // add to extent - ex->length += x_len; - } else { - // new extent - assert(ex->length == 0); - assert(ex->start == 0); - ex->start = x_offset; - ex->length = x_len; - } - ex->buffer_extents[cur-offset] = x_len; - - dout(15) << "file_to_extents " << *ex << " in " << ex->layout << endl; - //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... 
left " << left << endl; - - left -= x_len; - cur += x_len; - } - - // make final list - for (map::iterator it = object_extents.begin(); - it != object_extents.end(); - it++) { - extents.push_back(it->second); - } -} diff --git a/branches/sage/pgs/osdc/Filer.h b/branches/sage/pgs/osdc/Filer.h deleted file mode 100644 index 0679a9b6ffef3..0000000000000 --- a/branches/sage/pgs/osdc/Filer.h +++ /dev/null @@ -1,165 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILER_H -#define __FILER_H - -/*** Filer - * - * stripe file ranges onto objects. - * build list for the objecter or objectcacher. - * - * also, provide convenience methods that call objecter for you. - * - * "files" are identified by ino. - */ - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/types.h" - -#include "osd/OSDMap.h" -#include "Objecter.h" - -class Context; -class Messenger; -class OSDMap; - - -/**** Filer interface ***/ - -class Filer { - Objecter *objecter; - - // probes - struct Probe { - inode_t inode; - off_t from; - off_t *end; - Context *onfinish; - - list probing; - off_t probing_len; - - map known; - map ops; - - Probe(inode_t &i, off_t f, off_t *e, Context *c) : - inode(i), from(f), end(e), onfinish(c), probing_len(0) {} - }; - - class C_Probe; - //friend class C_Probe; - - void _probe(Probe *p); - void _probed(Probe *p, object_t oid, off_t size); - - public: - Filer(Objecter *o) : objecter(o) {} - ~Filer() {} - - bool is_active() { - return objecter->is_active(); // || (oc && oc->is_active()); - } - - /*** async file interface ***/ - Objecter::OSDRead *prepare_read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - file_to_extents(inode, offset, len, rd->extents); - return rd; - } - int read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl, // ptr to data - Context *onfinish) { - Objecter::OSDRead *rd = prepare_read(inode, offset, len, bl); - return objecter->readx(rd, onfinish) > 0 ? 0:-1; - } - - int write(inode_t& inode, - off_t offset, - size_t len, - bufferlist& bl, - int flags, - Context *onack, - Context *oncommit, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - file_to_extents(inode, offset, len, wr->extents, rev); - return objecter->modifyx(wr, onack, oncommit) > 0 ? 0:-1; - } - - int zero(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_ZERO); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 0:-1; - } - - int remove(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_DELETE); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 
0:-1; - } - - int probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish); - - - /***** mapping *****/ - - /* map (ino, ono) to an object name - (to be used on any osd in the proper replica group) */ - /*object_t file_to_object(inodeno_t ino, - size_t _ono) { - uint64_t ono = _ono; - assert(ino < (1ULL<& extents, - objectrev_t rev=0); - -}; - - - -#endif diff --git a/branches/sage/pgs/osdc/Journaler.cc b/branches/sage/pgs/osdc/Journaler.cc deleted file mode 100644 index 788188c84aea4..0000000000000 --- a/branches/sage/pgs/osdc/Journaler.cc +++ /dev/null @@ -1,620 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Journaler.h" - -#include "include/Context.h" -#include "common/Logger.h" -#include "msg/Messenger.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " - - - -void Journaler::reset() -{ - dout(1) << "reset to blank journal" << endl; - state = STATE_ACTIVE; - write_pos = flush_pos = ack_pos = - read_pos = requested_pos = received_pos = - expire_pos = trimming_pos = trimmed_pos = inode.layout.period(); -} - - -/***************** HEADER *******************/ - -ostream& operator<<(ostream& out, Journaler::Header &h) -{ - return out << "loghead(trim " << h.trimmed_pos - << ", expire " << h.expire_pos - << ", read " << h.read_pos - << ", write " << h.write_pos - << ")"; -} - -class Journaler::C_ReadHead : public Context { - Journaler *ls; -public: - bufferlist bl; - C_ReadHead(Journaler *l) : ls(l) {} - void finish(int r) { - ls->_finish_read_head(r, bl); - } -}; - -class Journaler::C_ProbeEnd : public Context { - Journaler *ls; -public: - off_t end; - C_ProbeEnd(Journaler *l) : ls(l), end(-1) {} - void finish(int r) { - ls->_finish_probe_end(r, end); - } -}; - -void Journaler::recover(Context *onread) -{ - assert(state != STATE_ACTIVE); - - if (onread) - waitfor_recover.push_back(onread); - - if (state != STATE_UNDEF) { - dout(1) << "recover - already recoverying" << endl; - return; - } - - dout(1) << "read_head" << endl; - state = STATE_READHEAD; - C_ReadHead *fin = new C_ReadHead(this); - filer.read(inode, 0, sizeof(Header), &fin->bl, fin); -} - -void Journaler::_finish_read_head(int r, bufferlist& bl) -{ - assert(state == STATE_READHEAD); - - if (bl.length() == 0) { - dout(1) << "_finish_read_head r=" << r << " read 0 bytes, assuming empty log" << endl; - state = STATE_ACTIVE; - list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); - return; - } - - // unpack header - Header h; - assert(bl.length() == sizeof(h)); - bl.copy(0, sizeof(h), (char*)&h); - - write_pos = flush_pos = ack_pos = h.write_pos; - read_pos = requested_pos = received_pos = h.read_pos; - expire_pos = h.expire_pos; - trimmed_pos = trimming_pos = h.trimmed_pos; - - dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." 
<< endl; - - // probe the log - state = STATE_PROBING; - C_ProbeEnd *fin = new C_ProbeEnd(this); - filer.probe_fwd(inode, h.write_pos, &fin->end, fin); -} - -void Journaler::_finish_probe_end(int r, off_t end) -{ - assert(state == STATE_PROBING); - - if (end == -1) { - end = write_pos; - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). log was empty. recovered." - << endl; - assert(0); // hrm. - } else { - assert(end >= write_pos); - assert(r >= 0); - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). recovered." - << endl; - } - - write_pos = flush_pos = ack_pos = end; - - // done. - list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); -} - - -// WRITING - -class Journaler::C_WriteHead : public Context { -public: - Journaler *ls; - Header h; - Context *oncommit; - C_WriteHead(Journaler *l, Header& h_, Context *c) : ls(l), h(h_), oncommit(c) {} - void finish(int r) { - ls->_finish_write_head(h, oncommit); - } -}; - -void Journaler::write_head(Context *oncommit) -{ - assert(state == STATE_ACTIVE); - last_written.trimmed_pos = trimmed_pos; - last_written.expire_pos = expire_pos; - last_written.read_pos = read_pos; - last_written.write_pos = ack_pos; //write_pos; - dout(10) << "write_head " << last_written << endl; - - last_wrote_head = g_clock.now(); - - bufferlist bl; - bl.append((char*)&last_written, sizeof(last_written)); - filer.write(inode, 0, bl.length(), bl, 0, - 0, - new C_WriteHead(this, last_written, oncommit)); -} - -void Journaler::_finish_write_head(Header &wrote, Context *oncommit) -{ - dout(10) << "_finish_write_head " << wrote << endl; - last_committed = wrote; - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - trim(); // trim? -} - - -/***************** WRITING *******************/ - -class Journaler::C_Flush : public Context { - Journaler *ls; - off_t start; -public: - C_Flush(Journaler *l, off_t s) : ls(l), start(s) {} - void finish(int r) { ls->_finish_flush(r, start); } -}; - -void Journaler::_finish_flush(int r, off_t start) -{ - assert(r>=0); - - assert(start >= ack_pos); - assert(start < flush_pos); - assert(pending_flush.count(start)); - - // calc latency? - if (logger) { - utime_t lat = g_clock.now(); - lat -= pending_flush[start]; - logger->finc("lsum", lat); - logger->inc("lnum"); - } - - pending_flush.erase(start); - - // adjust ack_pos - if (pending_flush.empty()) - ack_pos = flush_pos; - else - ack_pos = pending_flush.begin()->first; - - dout(10) << "_finish_flush from " << start - << ", pending_flush now " << pending_flush - << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos - << endl; - - // kick waiters <= ack_pos - while (!waitfor_flush.empty()) { - if (waitfor_flush.begin()->first > ack_pos) break; - finish_contexts(waitfor_flush.begin()->second); - waitfor_flush.erase(waitfor_flush.begin()); - } -} - - -off_t Journaler::append_entry(bufferlist& bl, Context *onsync) -{ - size_t s = bl.length(); - - if (!g_conf.journaler_allow_split_entries) { - // will we span a stripe boundary? - int p = inode.layout.stripe_unit; - if (write_pos / p != (write_pos + (off_t)(bl.length() + sizeof(s))) / p) { - // yes. - // move write_pos forward. - off_t owp = write_pos; - write_pos += p; - write_pos -= (write_pos % p); - - // pad with zeros. - bufferptr bp(write_pos - owp); - bp.zero(); - assert(bp.length() >= 4); - write_buf.push_back(bp); - - // now flush. 
- flush(); - - dout(12) << "append_entry skipped " << (write_pos-owp) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << endl; - } - } - - dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(size_t)) << endl; - - // append - write_buf.append((char*)&s, sizeof(s)); - write_buf.append(bl); - write_pos += sizeof(s) + s; - - // flush now? - if (onsync) - flush(onsync); - - return write_pos; -} - - -void Journaler::flush(Context *onsync) -{ - // all flushed and acked? - if (write_pos == ack_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl; - if (onsync) { - onsync->finish(0); - delete onsync; - } - return; - } - - if (write_pos == flush_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl; - } else { - // flush - unsigned len = write_pos - flush_pos; - assert(len == write_buf.length()); - dout(10) << "flush flushing " << flush_pos << "~" << len << endl; - - // submit write for anything pending - // flush _start_ pos to _finish_flush - filer.write(inode, flush_pos, len, write_buf, 0, - g_conf.journaler_safe ? 0:new C_Flush(this, flush_pos), // on ACK - g_conf.journaler_safe ? new C_Flush(this, flush_pos):0); // on COMMIT - pending_flush[flush_pos] = g_clock.now(); - - // adjust pointers - flush_pos = write_pos; - write_buf.clear(); - - dout(10) << "flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl; - } - - // queue waiter (at _new_ write_pos; will go when reached by ack_pos) - if (onsync) - waitfor_flush[write_pos].push_back(onsync); - - // write head? - if (last_wrote_head.sec() + g_conf.journaler_write_head_interval < g_clock.now().sec()) { - write_head(); - } -} - - - -/***************** READING *******************/ - - -class Journaler::C_Read : public Context { - Journaler *ls; -public: - C_Read(Journaler *l) : ls(l) {} - void finish(int r) { ls->_finish_read(r); } -}; - -class Journaler::C_RetryRead : public Context { - Journaler *ls; -public: - C_RetryRead(Journaler *l) : ls(l) {} - void finish(int r) { ls->is_readable(); } // this'll kickstart. -}; - -void Journaler::_finish_read(int r) -{ - assert(r>=0); - - dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << endl; - received_pos += reading_buf.length(); - read_buf.claim_append(reading_buf); - assert(received_pos <= requested_pos); - dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length() - << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos - << endl; - - if (is_readable()) { // NOTE: this check may read more - // readable! - dout(10) << "_finish_read now readable" << endl; - if (on_readable) { - Context *f = on_readable; - on_readable = 0; - f->finish(0); - delete f; - } - - if (read_bl) { - bool r = try_read_entry(*read_bl); - assert(r); // this should have worked. - - // clear state - Context *f = on_read_finish; - on_read_finish = 0; - read_bl = 0; - - // do callback - f->finish(0); - delete f; - } - } - - // prefetch? - _prefetch(); -} - -/* NOTE: this could be slightly smarter... we could allow - * multiple reads to be in progress. e.g., if we prefetch, but - * then discover we need even more for an especially large entry. - * i don't think that circumstance will arise particularly often. 
- */ -void Journaler::_issue_read(off_t len) -{ - if (_is_reading()) { - dout(10) << "_issue_read " << len << " waiting, already reading " - << received_pos << "~" << (requested_pos-received_pos) << endl; - return; - } - assert(requested_pos == received_pos); - - // stuck at ack_pos? - assert(requested_pos <= ack_pos); - if (requested_pos == ack_pos) { - dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << endl; - assert(write_pos > requested_pos); - if (flush_pos == ack_pos) - flush(); - assert(flush_pos > ack_pos); - waitfor_flush[flush_pos].push_back(new C_RetryRead(this)); - return; - } - - // don't read too much - if (requested_pos + len > ack_pos) { - len = ack_pos - requested_pos; - dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << endl; - } - - // go. - dout(10) << "_issue_read reading " << requested_pos << "~" << len - << ", read pointers " << read_pos << "/" << received_pos << "/" << (requested_pos+len) - << endl; - - filer.read(inode, requested_pos, len, &reading_buf, - new C_Read(this)); - requested_pos += len; -} - -void Journaler::_prefetch() -{ - // prefetch? - off_t left = requested_pos - read_pos; - if (left <= prefetch_from && // should read more, - !_is_reading() && // and not reading anything right now - write_pos > requested_pos) { // there's something more to read... - dout(10) << "_prefetch only " << left << " < " << prefetch_from - << ", prefetching " << endl; - _issue_read(fetch_len); - } -} - - -void Journaler::read_entry(bufferlist *bl, Context *onfinish) -{ - // only one read at a time! - assert(read_bl == 0); - assert(on_read_finish == 0); - - if (is_readable()) { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", readable now" << endl; - - // nice, just do it now. - bool r = try_read_entry(*bl); - assert(r); - - // callback - onfinish->finish(0); - delete onfinish; - } else { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", not readable now" << endl; - - bl->clear(); - - // set it up - read_bl = bl; - on_read_finish = onfinish; - - // is_readable() will have already initiated a read (if it was possible) - } -} - - -/* is_readable() - * return true if next entry is ready. - * kickstart read as necessary. - */ -bool Journaler::is_readable() -{ - // anything to read? - if (read_pos == write_pos) return false; - - // have enough for entry size? - size_t s = 0; - if (read_buf.length() >= sizeof(s)) - read_buf.copy(0, sizeof(s), (char*)&s); - - // entry and payload? - if (read_buf.length() >= sizeof(s) && - read_buf.length() >= sizeof(s) + s) - return true; // yep, next entry is ready. - - // darn it! - - // partial fragment at the end? - if (received_pos == write_pos) { - dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << endl; - write_pos = flush_pos = ack_pos = read_pos; - assert(write_buf.length() == 0); - - // truncate? - // FIXME: how much? - - return false; - } - - // start reading some more? - if (!_is_reading()) { - if (s) - fetch_len = MAX(fetch_len, (off_t)(sizeof(s)+s-read_buf.length())); - _issue_read(fetch_len); - } - - return false; -} - - -/* try_read_entry(bl) - * read entry into bl if it's ready. - * otherwise, do nothing. (well, we'll start fetching it for good measure.) - */ -bool Journaler::try_read_entry(bufferlist& bl) -{ - if (!is_readable()) { // this may start a read. 
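Journal entries are framed as a size_t length followed by that many payload bytes: append_entry writes the prefix, and is_readable/try_read_entry only consider the next entry decodable once the buffered bytes cover both header and payload. A sketch of that readiness check over a plain byte buffer (std::string stands in for the bufferlist):

#include <cstring>
#include <string>

bool entry_ready(const std::string& buf)     // buf stands in for the journal's read_buf
{
  size_t s = 0;
  if (buf.size() < sizeof(s))
    return false;                            // not even the length header is buffered yet
  memcpy(&s, buf.data(), sizeof(s));         // framing: size_t length, then payload
  return buf.size() >= sizeof(s) + s;        // readable once header plus full payload are present
}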
- dout(10) << "try_read_entry at " << read_pos << " not readable" << endl; - return false; - } - - size_t s; - assert(read_buf.length() >= sizeof(s)); - read_buf.copy(0, sizeof(s), (char*)&s); - assert(read_buf.length() >= sizeof(s) + s); - - dout(10) << "try_read_entry at " << read_pos << " reading " - << read_pos << "~" << (sizeof(s)+s) << endl; - - // do it - assert(bl.length() == 0); - read_buf.splice(0, sizeof(s)); - read_buf.splice(0, s, &bl); - read_pos += sizeof(s) + s; - - // prefetch? - _prefetch(); - return true; -} - -void Journaler::wait_for_readable(Context *onreadable) -{ - dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << endl; - assert(!is_readable()); - assert(on_readable == 0); - on_readable = onreadable; -} - - - - -/***************** TRIMMING *******************/ - - -class Journaler::C_Trim : public Context { - Journaler *ls; - off_t to; -public: - C_Trim(Journaler *l, off_t t) : ls(l), to(t) {} - void finish(int r) { - ls->_trim_finish(r, to); - } -}; - -void Journaler::trim() -{ - off_t trim_to = last_committed.expire_pos; - trim_to -= trim_to % inode.layout.period(); - dout(10) << "trim last_commited head was " << last_committed - << ", can trim to " << trim_to - << endl; - if (trim_to == 0 || trim_to == trimming_pos) { - dout(10) << "trim already trimmed/trimming to " - << trimmed_pos << "/" << trimming_pos << endl; - return; - } - - // trim - assert(trim_to <= write_pos); - assert(trim_to > trimming_pos); - dout(10) << "trim trimming to " << trim_to - << ", trimmed/trimming/expire are " - << trimmed_pos << "/" << trimming_pos << "/" << expire_pos - << endl; - - filer.remove(inode, trimming_pos, trim_to-trimming_pos, - 0, new C_Trim(this, trim_to)); - trimming_pos = trim_to; -} - -void Journaler::_trim_finish(int r, off_t to) -{ - dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos - << ", trimmed/trimming/expire now " - << to << "/" << trimming_pos << "/" << expire_pos - << endl; - assert(r >= 0); - - assert(to <= trimming_pos); - assert(to > trimmed_pos); - trimmed_pos = to; - - // finishers? - while (!waitfor_trim.empty() && - waitfor_trim.begin()->first <= trimmed_pos) { - finish_contexts(waitfor_trim.begin()->second, 0); - waitfor_trim.erase(waitfor_trim.begin()); - } -} - - -// eof. diff --git a/branches/sage/pgs/osdc/Journaler.h b/branches/sage/pgs/osdc/Journaler.h deleted file mode 100644 index 094f740054d69..0000000000000 --- a/branches/sage/pgs/osdc/Journaler.h +++ /dev/null @@ -1,219 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* Journaler - * - * This class stripes a serial log over objects on the store. Four logical pointers: - * - * write_pos - where we're writing new entries - * read_pos - where we're reading old entires - * expire_pos - what is deemed "old" by user - * trimmed_pos - where we're expiring old items - * - * trimmed_pos <= expire_pos <= read_pos <= write_pos. - * - * Often, read_pos <= write_pos (as with MDS log). During recovery, write_pos is undefined - * until the end of the log is discovered. - * - * A "head" struct at the beginning of the log is used to store metadata at - * regular intervals. 
The basic invariants include: - * - * head.read_pos <= read_pos -- the head may "lag", since it's updated lazily. - * head.write_pos <= write_pos - * head.expire_pos <= expire_pos - * head.trimmed_pos <= trimmed_pos - * - * More significantly, - * - * head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log - * as last recorded, before it is trimmed. trimming will - * block until a sufficiently current expire_pos is committed. - * - * To recover log state, we simply start at the last write_pos in the head, and probe the - * object sequence sizes until we read the end. - * - * Head struct is stored in the first object. Actual journal starts after layout.period() bytes. - * - */ - -#ifndef __JOURNALER_H -#define __JOURNALER_H - -#include "Objecter.h" -#include "Filer.h" - -#include -#include - -class Context; -class Logger; - -class Journaler { - - // this goes at the head of the log "file". - struct Header { - off_t trimmed_pos; - off_t expire_pos; - off_t read_pos; - off_t write_pos; - Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {} - } last_written, last_committed; - - friend ostream& operator<<(ostream& out, Header &h); - - - // me - inode_t inode; - Objecter *objecter; - Filer filer; - - Logger *logger; - - // my state - static const int STATE_UNDEF = 0; - static const int STATE_READHEAD = 1; - static const int STATE_PROBING = 2; - static const int STATE_ACTIVE = 2; - - int state; - - // header - utime_t last_wrote_head; - void _finish_write_head(Header &wrote, Context *oncommit); - class C_WriteHead; - friend class C_WriteHead; - - list waitfor_recover; - void _finish_read_head(int r, bufferlist& bl); - void _finish_probe_end(int r, off_t end); - class C_ReadHead; - friend class C_ReadHead; - class C_ProbeEnd; - friend class C_ProbeEnd; - - - - // writer - off_t write_pos; // logical write position, where next entry will go - off_t flush_pos; // where we will flush. if write_pos>flush_pos, we're buffering writes. - off_t ack_pos; // what has been acked. - bufferlist write_buf; // write buffer. flush_pos + write_buf.length() == write_pos. - - std::map pending_flush; // start offsets and times for pending flushes - std::map > waitfor_flush; // when flushed through given offset - - void _finish_flush(int r, off_t start); - class C_Flush; - friend class C_Flush; - - // reader - off_t read_pos; // logical read position, where next entry starts. - off_t requested_pos; // what we've requested from OSD. - off_t received_pos; // what we've received from OSD. - bufferlist read_buf; // read buffer. read_pos + read_buf.length() == prefetch_pos. 
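The invariants spelled out in the comment above reduce to a simple consistency predicate between a lazily written head and the live pointers. The following is a hypothetical stand-alone check, not the actual Header type or any Journaler method; names such as HeadSnapshot and head_is_consistent are made up for illustration.

#include <cstdint>
#include <iostream>

// Hypothetical stand-ins for the head/pointer fields; not the real Header.
struct HeadSnapshot {            // what was last written to the head object
  uint64_t trimmed_pos, expire_pos, read_pos, write_pos;
};

struct LivePointers {            // current in-memory positions
  uint64_t trimmed_pos, expire_pos, read_pos, write_pos;
};

// True if the snapshot could legitimately have been produced by this log:
// the live pointers are ordered, the head only lags them, and the head's
// expire_pos is not behind what has already been trimmed.
bool head_is_consistent(const HeadSnapshot& h, const LivePointers& p) {
  bool ordered = p.trimmed_pos <= p.expire_pos &&
                 p.expire_pos <= p.read_pos &&
                 p.read_pos   <= p.write_pos;
  bool lags = h.trimmed_pos <= p.trimmed_pos &&
              h.expire_pos  <= p.expire_pos &&
              h.read_pos    <= p.read_pos &&
              h.write_pos   <= p.write_pos;
  bool trim_safe = h.expire_pos >= p.trimmed_pos;  // trimming waits for this
  return ordered && lags && trim_safe;
}

int main() {
  HeadSnapshot h{0, 100, 150, 200};
  LivePointers p{100, 150, 200, 300};
  std::cout << (head_is_consistent(h, p) ? "ok" : "inconsistent") << "\n";
}

The last term is the rule the comment calls out: trimming must not run ahead of the expire_pos recorded in the head, or recovery could lose the recorded beginning of the log.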
- bufferlist reading_buf; // what i'm reading into - - off_t fetch_len; // how much to read at a time - off_t prefetch_from; // how far from end do we read next chunk - - // for read_entry() in-progress read - bufferlist *read_bl; - Context *on_read_finish; - // for wait_for_readable() - Context *on_readable; - - bool _is_reading() { - return requested_pos > received_pos; - } - void _finish_read(int r); // we just read some (read completion callback) - void _issue_read(off_t len); // read some more - void _prefetch(); // maybe read ahead - class C_Read; - friend class C_Read; - class C_RetryRead; - friend class C_RetryRead; - - // trimmer - off_t expire_pos; // what we're allowed to trim to - off_t trimming_pos; // what we've requested to trim through - off_t trimmed_pos; // what has been trimmed - map > waitfor_trim; - - void _trim_finish(int r, off_t to); - class C_Trim; - friend class C_Trim; - -public: - Journaler(inode_t& inode_, Objecter *obj, Logger *l, off_t fl=0, off_t pff=0) : - inode(inode_), objecter(obj), filer(objecter), logger(l), - state(STATE_UNDEF), - write_pos(0), flush_pos(0), ack_pos(0), - read_pos(0), requested_pos(0), received_pos(0), - fetch_len(fl), prefetch_from(pff), - read_bl(0), on_read_finish(0), on_readable(0), - expire_pos(0), trimming_pos(0), trimmed_pos(0) - { - // prefetch intelligently. - // (watch out, this is big if you use big objects or weird striping) - if (!fetch_len) - fetch_len = inode.layout.object_size*inode.layout.stripe_count; - if (!prefetch_from) - prefetch_from = fetch_len / 2; - } - - // me - //void open(Context *onopen); - //void claim(Context *onclaim, msg_addr_t from); - - /* reset - * NOTE: we assume the caller knows/has ensured that any objects - * in our sequence do not exist.. e.g. after a MKFS. this is _not_ - * an "erase" method. - */ - void reset(); - void recover(Context *onfinish); - void write_head(Context *onsave=0); - - bool is_active() { return state == STATE_ACTIVE; } - - off_t get_write_pos() const { return write_pos; } - off_t get_read_pos() const { return read_pos; } - off_t get_expire_pos() const { return expire_pos; } - off_t get_trimmed_pos() const { return trimmed_pos; } - - // write - off_t append_entry(bufferlist& bl, Context *onsync = 0); - void flush(Context *onsync = 0); - - // read - void set_read_pos(off_t p) { - assert(requested_pos == received_pos); // we can't cope w/ in-progress read right now. - assert(read_bl == 0); // ... 
- read_pos = requested_pos = received_pos = p; - read_buf.clear(); - } - bool is_readable(); - bool try_read_entry(bufferlist& bl); - void wait_for_readable(Context *onfinish); - void read_entry(bufferlist* bl, Context *onfinish); - - // trim - void set_expire_pos(off_t ep) { expire_pos = ep; } - void trim(); - //bool is_trimmable() { return trimming_pos < expire_pos; } - //void trim(off_t trim_to=0, Context *c=0); -}; - - -#endif diff --git a/branches/sage/pgs/osdc/ObjectCacher.cc b/branches/sage/pgs/osdc/ObjectCacher.cc deleted file mode 100644 index f8d7d970c453e..0000000000000 --- a/branches/sage/pgs/osdc/ObjectCacher.cc +++ /dev/null @@ -1,1557 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "msg/Messenger.h" -#include "ObjectCacher.h" -#include "Objecter.h" - - - -/*** ObjectCacher::BufferHead ***/ - - -/*** ObjectCacher::Object ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".objectcacher.object(" << oid << ") " - - -ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *bh, off_t off) -{ - dout(20) << "split " << *bh << " at " << off << endl; - - // split off right - ObjectCacher::BufferHead *right = new BufferHead(this); - right->last_write_tid = bh->last_write_tid; - right->set_state(bh->get_state()); - - off_t newleftlen = off - bh->start(); - right->set_start( off ); - right->set_length( bh->length() - newleftlen ); - - // shorten left - oc->bh_stat_sub(bh); - bh->set_length( newleftlen ); - oc->bh_stat_add(bh); - - // add right - oc->bh_add(this, right); - - // split buffers too - bufferlist bl; - bl.claim(bh->bl); - if (bl.length()) { - assert(bl.length() == (bh->length() + right->length())); - right->bl.substr_of(bl, bh->length(), right->length()); - bh->bl.substr_of(bl, 0, bh->length()); - } - - // move read waiters - if (!bh->waitfor_read.empty()) { - map >::iterator o, p = bh->waitfor_read.end(); - p--; - while (p != bh->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at byte " << p->first << " to right bh" << endl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - bh->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *bh << endl; - dout(20) << "split right is " << *right << endl; - return right; -} - - -void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right) -{ - assert(left->end() == right->start()); - assert(left->get_state() == right->get_state()); - - dout(10) << "merge_left " << *left << " + " << *right << endl; - oc->bh_remove(this, right); - oc->bh_stat_sub(left); - left->set_length( left->length() + right->length()); - oc->bh_stat_add(left); - - // data - left->bl.claim_append(right->bl); - - // version - // note: this is sorta busted, but should only be used for dirty buffers - left->last_write_tid = MAX( left->last_write_tid, right->last_write_tid ); - left->last_write = MAX( left->last_write, right->last_write ); - - // waiters - for (map >::iterator p = right->waitfor_read.begin(); - p != right->waitfor_read.end(); - p++) - left->waitfor_read[p->first].splice( left->waitfor_read[p->first].begin(), - p->second ); - - // hose right - delete right; - - dout(10) << "merge_left result " << *left << endl; -} - - - -/* - * map a range of bytes into buffer_heads. - * - create missing buffer_heads as necessary. 
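The comment above summarises map_read(); as a stand-alone model of that walk, the sketch below chops a requested byte range into cached pieces (hits) and gaps (misses) against a map keyed by start offset. All types and names here are stand-ins; the real code additionally tracks rx/tx states and allocates BufferHeads for the gaps.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <map>
#include <vector>

// Cached piece of an object, keyed in the map below by its start offset.
struct Piece { uint64_t start; uint64_t len; uint64_t end() const { return start + len; } };

struct Range { uint64_t start; uint64_t len; };

// Classify a requested range against a cache of non-overlapping pieces:
// covered parts go to 'hits', gaps go to 'missing'. This mirrors the shape
// of map_read()'s walk, minus the rx state and bufferhead creation.
void classify(const std::map<uint64_t, Piece>& cache,
              uint64_t cur, uint64_t left,
              std::vector<Range>& hits, std::vector<Range>& missing) {
  auto p = cache.lower_bound(cur);            // first piece with start >= cur
  if (p != cache.begin()) {
    auto prev = std::prev(p);
    if (prev->second.end() > cur) p = prev;   // previous piece overlaps cur
  }
  while (left > 0) {
    if (p == cache.end()) {                   // nothing cached past here
      missing.push_back({cur, left});
      return;
    }
    if (p->first <= cur) {                    // inside a cached piece
      uint64_t take = std::min(p->second.end() - cur, left);
      hits.push_back({cur, take});
      cur += take; left -= take; ++p;
    } else {                                  // gap before the next piece
      uint64_t take = std::min(p->first - cur, left);
      missing.push_back({cur, take});
      cur += take; left -= take;
    }
  }
}

int main() {
  std::map<uint64_t, Piece> cache{{0, {0, 100}}, {200, {200, 50}}};
  std::vector<Range> hits, missing;
  classify(cache, 50, 300, hits, missing);
  for (auto& h : hits)    std::cout << "hit  " << h.start << "~" << h.len << "\n";
  for (auto& m : missing) std::cout << "miss " << m.start << "~" << m.len << "\n";
}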
- */ -int ObjectCacher::Object::map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx) -{ - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_read " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << endl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( left ); - oc->bh_add(this, n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << endl; - cur += left; - left -= left; - assert(left == 0); - assert(cur == ex_it->start + (off_t)ex_it->length); - break; // no more. - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << endl; - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << endl; - } - else assert(0); - - off_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - - } else if (p->first > cur) { - // gap.. miss - off_t next = p->first; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( MIN(next - cur, left) ); - oc->bh_add(this,n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << endl; - continue; // more? - } - else - assert(0); - } - } - return(0); -} - -/* - * map a range of extents on an object's buffer cache. - * - combine any bh's we're writing into one - * - break up bufferheads that don't fall completely within the range - * //no! - return a bh that includes the write. may also include other dirty data to left and/or right. - */ -ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr) -{ - BufferHead *final = 0; - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_write oex " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << endl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap or butt up! - - /*// dirty and butts up? - if (p->first + p->second->length() == cur && - p->second->is_dirty()) { - dout(10) << "map_write will append to tail of " << *p->second << endl; - final = p->second; - } - */ - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - off_t max = left; - - // at end ? 
- if (p == data.end()) { - if (final == NULL) { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( max ); - oc->bh_add(this, final); - dout(10) << "map_write adding trailing bh " << *final << endl; - } else { - final->set_length( final->length() + max ); - } - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << endl; - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << endl; - - /*if (bh->is_dirty()) { - // already dirty, let's use it. - final = bh; - } else { - */ - if (p->first < cur) { - assert(final == 0); - if (cur + max >= p->first + p->second->length()) { - // we want right bit (one splice) - final = split(bh, cur); // just split it, take right half. - p++; - assert(p->second == final); - } else { - // we want middle bit (two splices) - final = split(bh, cur); - p++; - assert(p->second == final); - split(final, cur+max); - } - } else if (p->first == cur) { - /*if (bh->is_dirty()) { - // already dirty, use it. - } - else*/ - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - split(bh, cur + max); // just split - } - if (final) - merge_left(final,bh); - else - final = bh; - } - - // keep going. - off_t lenfromcur = final->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - off_t next = p->first; - off_t glen = MIN(next - cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << endl; - if (final) { - final->set_length( final->length() + glen ); - } else { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( glen ); - oc->bh_add(this, final); - } - - cur += glen; - left -= glen; - continue; // more? - } - } - } - - // set versoin - assert(final); - dout(10) << "map_write final is " << *final << endl; - - return final; -} - - -void ObjectCacher::Object::truncate(off_t s) -{ - dout(10) << "truncate to " << s << endl; - - while (!data.empty()) { - BufferHead *bh = data.rbegin()->second; - if (bh->end() <= s) - break; - - // split bh at truncation point? - if (bh->start() < s) { - split(bh, s); - continue; - } - - // remove bh entirely - assert(bh->start() >= s); - oc->bh_remove(this, bh); - delete bh; - } -} - - -/*** ObjectCacher ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".objectcacher " - - - -/* private */ - -void ObjectCacher::close_object(Object *ob) -{ - dout(10) << "close_object " << *ob << endl; - assert(ob->can_close()); - - // ok! - objects.erase(ob->get_oid()); - objects_by_ino[ob->get_ino()].erase(ob); - if (objects_by_ino[ob->get_ino()].empty()) - objects_by_ino.erase(ob->get_ino()); - delete ob; -} - - - - -void ObjectCacher::bh_read(BufferHead *bh) -{ - dout(7) << "bh_read on " << *bh << endl; - - mark_rx(bh); - - // finisher - C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - objecter->read(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), &onfinish->bl, - onfinish); -} - -void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl) -{ - //lock.Lock(); - dout(7) << "bh_read_finish " - << oid - << " " << start << "~" << length - << endl; - - if (objects.count(oid) == 0) { - dout(7) << "bh_read_finish no object cache" << endl; - } else { - Object *ob = objects[oid]; - - // apply to bh's! 
- off_t opos = start; - map::iterator p = ob->data.lower_bound(opos); - - while (p != ob->data.end() && - opos < start+(off_t)length) { - BufferHead *bh = p->second; - - if (bh->start() > opos) { - dout(1) << "weirdness: gap when applying read results, " - << opos << "~" << bh->start() - opos - << endl; - opos = bh->start(); - continue; - } - - if (!bh->is_rx()) { - dout(10) << "bh_read_finish skipping non-rx " << *bh << endl; - opos = bh->end(); - p++; - continue; - } - - assert(opos >= bh->start()); - assert(bh->start() == opos); // we don't merge rx bh's... yet! - assert(bh->length() <= start+(off_t)length-opos); - - bh->bl.substr_of(bl, - opos-bh->start(), - bh->length()); - mark_clean(bh); - dout(10) << "bh_read_finish read " << *bh << endl; - - opos = bh->end(); - p++; - - // finishers? - // called with lock held. - list ls; - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) - ls.splice(ls.end(), p->second); - bh->waitfor_read.clear(); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - - -void ObjectCacher::bh_write(BufferHead *bh) -{ - dout(7) << "bh_write " << *bh << endl; - - // finishers - C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length()); - C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - tid_t tid = objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), bh->bl, - onack, oncommit); - - // set bh last_write_tid - onack->tid = tid; - oncommit->tid = tid; - bh->ob->last_write_tid = tid; - bh->last_write_tid = tid; - - mark_tx(bh); -} - -void ObjectCacher::lock_ack(list& oids, tid_t tid) -{ - for (list::iterator i = oids.begin(); - i != oids.end(); - i++) { - object_t oid = *i; - - if (objects.count(oid) == 0) { - dout(7) << "lock_ack no object cache" << endl; - assert(0); - } - - Object *ob = objects[oid]; - - list ls; - - assert(tid <= ob->last_write_tid); - if (ob->last_write_tid == tid) { - dout(10) << "lock_ack " << *ob - << " tid " << tid << endl; - - switch (ob->lock_state) { - case Object::LOCK_RDUNLOCKING: - case Object::LOCK_WRUNLOCKING: - ob->lock_state = Object::LOCK_NONE; - break; - case Object::LOCK_RDLOCKING: - case Object::LOCK_DOWNGRADING: - ob->lock_state = Object::LOCK_RDLOCK; - ls.splice(ls.begin(), ob->waitfor_rd); - break; - case Object::LOCK_UPGRADING: - case Object::LOCK_WRLOCKING: - ob->lock_state = Object::LOCK_WRLOCK; - ls.splice(ls.begin(), ob->waitfor_wr); - ls.splice(ls.begin(), ob->waitfor_rd); - break; - - default: - assert(0); - } - - ob->last_ack_tid = tid; - - if (ob->can_close()) - close_object(ob); - } else { - dout(10) << "lock_ack " << *ob - << " tid " << tid << " obsolete" << endl; - } - - // waiters? - if (ob->waitfor_ack.count(tid)) { - ls.splice(ls.end(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - } - - finish_contexts(ls); - - } -} - -void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - dout(7) << "bh_write_ack " - << oid - << " tid " << tid - << " " << start << "~" << length - << endl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_ack no object cache" << endl; - assert(0); - } else { - Object *ob = objects[oid]; - - // apply to bh's! 
- for (map::iterator p = ob->data.lower_bound(start); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - - if (bh->start() > start+(off_t)length) break; - - if (bh->start() < start && - bh->end() > start+(off_t)length) { - dout(20) << "bh_write_ack skipping " << *bh << endl; - continue; - } - - // make sure bh is tx - if (!bh->is_tx()) { - dout(10) << "bh_write_ack skipping non-tx " << *bh << endl; - continue; - } - - // make sure bh tid matches - if (bh->last_write_tid != tid) { - assert(bh->last_write_tid > tid); - dout(10) << "bh_write_ack newer tid on " << *bh << endl; - continue; - } - - // ok! mark bh clean. - mark_clean(bh); - dout(10) << "bh_write_ack clean " << *bh << endl; - } - - // update object last_ack. - assert(ob->last_ack_tid < tid); - ob->last_ack_tid = tid; - - // waiters? - if (ob->waitfor_ack.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - -void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - // update object last_commit - dout(7) << "bh_write_commit " - << oid - << " tid " << tid - << " " << start << "~" << length - << endl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_commit no object cache" << endl; - //assert(0); - } else { - Object *ob = objects[oid]; - - // update last_commit. - ob->last_commit_tid = tid; - - // waiters? - if (ob->waitfor_commit.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_commit[tid]); - ob->waitfor_commit.erase(tid); - finish_contexts(ls); - } - } - - // lock.Unlock(); -} - - -void ObjectCacher::flush(off_t amount) -{ - utime_t cutoff = g_clock.now(); - //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age; - - dout(10) << "flush " << amount << endl; - - /* - * NOTE: we aren't actually pulling things off the LRU here, just looking at the - * tail item. Then we call bh_write, which moves it to the other LRU, so that we - * can call lru_dirty.lru_get_next_expire() again. - */ - off_t did = 0; - while (amount == 0 || did < amount) { - BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire(); - if (!bh) break; - if (bh->last_write > cutoff) break; - - did += bh->length(); - bh_write(bh); - } -} - - -void ObjectCacher::trim(off_t max) -{ - if (max < 0) - max = g_conf.client_oc_size; - - dout(10) << "trim start: max " << max - << " clean " << get_stat_clean() - << endl; - - while (get_stat_clean() > max) { - BufferHead *bh = (BufferHead*) lru_rest.lru_expire(); - if (!bh) break; - - dout(10) << "trim trimming " << *bh << endl; - assert(bh->is_clean()); - - Object *ob = bh->ob; - bh_remove(ob, bh); - delete bh; - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } - } - - dout(10) << "trim finish: max " << max - << " clean " << get_stat_clean() - << endl; -} - - - -/* public */ - -/* - * returns # bytes read (if in cache). 
onfinish is untouched (caller must delete it) - * returns 0 if doing async read - */ -int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) -{ - bool success = true; - list hit_ls; - map stripe_map; // final buffer offset -> substring - - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - dout(10) << "readx " << *ex_it << endl; - - // get Object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); - - // map extent into bufferheads - map hits, missing, rx; - o->map_read(rd, hits, missing, rx); - - if (!missing.empty() || !rx.empty()) { - // read missing - for (map::iterator bh_it = missing.begin(); - bh_it != missing.end(); - bh_it++) { - bh_read(bh_it->second); - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << endl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - - // bump rx - for (map::iterator bh_it = rx.begin(); - bh_it != rx.end(); - bh_it++) { - touch_bh(bh_it->second); // bump in lru, so we don't lose it. - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << endl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - } else { - assert(!hits.empty()); - - // make a plain list - for (map::iterator bh_it = hits.begin(); - bh_it != hits.end(); - bh_it++) { - dout(10) << "readx hit bh " << *bh_it->second << endl; - hit_ls.push_back(bh_it->second); - } - - // create reverse map of buffer offset -> object for the eventual result. - // this is over a single ObjectExtent, so we know that - // - the bh's are contiguous - // - the buffer frags need not be (and almost certainly aren't) - off_t opos = ex_it->start; - map::iterator bh_it = hits.begin(); - assert(bh_it->second->start() <= opos); - size_t bhoff = opos - bh_it->second->start(); - map::iterator f_it = ex_it->buffer_extents.begin(); - size_t foff = 0; - while (1) { - BufferHead *bh = bh_it->second; - assert(opos == (off_t)(bh->start() + bhoff)); - - dout(10) << "readx rmap opos " << opos - << ": " << *bh << " +" << bhoff - << " frag " << f_it->first << "~" << f_it->second << " +" << foff - << endl; - - size_t len = MIN(f_it->second - foff, - bh->length() - bhoff); - stripe_map[f_it->first].substr_of(bh->bl, - opos - bh->start(), - len); - opos += len; - bhoff += len; - foff += len; - if (opos == bh->end()) { - bh_it++; - bhoff = 0; - } - if (foff == f_it->second) { - f_it++; - foff = 0; - } - if (bh_it == hits.end()) break; - if (f_it == ex_it->buffer_extents.end()) break; - } - assert(f_it == ex_it->buffer_extents.end()); - assert(opos == ex_it->start + (off_t)ex_it->length); - } - } - - // bump hits in lru - for (list::iterator bhit = hit_ls.begin(); - bhit != hit_ls.end(); - bhit++) - touch_bh(*bhit); - - if (!success) return 0; // wait! - - // no misses... success! do the read. - assert(!hit_ls.empty()); - dout(10) << "readx has all buffers" << endl; - - // ok, assemble into result buffer. - rd->bl->clear(); - size_t pos = 0; - for (map::iterator i = stripe_map.begin(); - i != stripe_map.end(); - i++) { - assert(pos == i->first); - dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << endl; - pos += i->second.length(); - rd->bl->claim_append(i->second); - } - dout(10) << "readx result is " << rd->bl->length() << endl; - - // done with read. 
- delete rd; - - trim(); - - return pos; -} - - -int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) -{ - utime_t now = g_clock.now(); - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - // get object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); - - // map it all into a single bufferhead. - BufferHead *bh = o->map_write(wr); - - // adjust buffer pointers (ie "copy" data into my cache) - // this is over a single ObjectExtent, so we know that - // - there is one contiguous bh - // - the buffer frags need not be (and almost certainly aren't) - // note: i assume striping is monotonic... no jumps backwards, ever! - off_t opos = ex_it->start; - for (map::iterator f_it = ex_it->buffer_extents.begin(); - f_it != ex_it->buffer_extents.end(); - f_it++) { - dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << endl; - size_t bhoff = bh->start() - opos; - assert(f_it->second <= bh->length() - bhoff); - - bufferlist frag; - frag.substr_of(wr->bl, - f_it->first, f_it->second); - - bh->bl.claim_append(frag); - opos += f_it->second; - } - - // it's dirty. - mark_dirty(bh); - touch_bh(bh); - bh->last_write = now; - - // recombine with left? - map::iterator p = o->data.find(bh->start()); - if (p != o->data.begin()) { - p--; - if (p->second->is_dirty()) { - o->merge_left(p->second,bh); - bh = p->second; - } - } - // right? - p = o->data.find(bh->start()); - p++; - if (p != o->data.end() && - p->second->is_dirty()) - o->merge_left(p->second,bh); - } - - delete wr; - - trim(); - return 0; -} - - -// blocking wait for write. -void ObjectCacher::wait_for_write(size_t len, Mutex& lock) -{ - while (get_stat_dirty() > g_conf.client_oc_max_dirty) { - dout(10) << "wait_for_write waiting" << endl; - flusher_cond.Signal(); - stat_waiter++; - stat_cond.Wait(lock); - stat_waiter--; - dout(10) << "wait_for_write woke up" << endl; - } -} - -void ObjectCacher::flusher_entry() -{ - dout(10) << "flusher start" << endl; - lock.Lock(); - while (!flusher_stop) { - while (!flusher_stop) { - off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty(); - dout(11) << "flusher " - << all << " / " << g_conf.client_oc_size << ": " - << get_stat_tx() << " tx, " - << get_stat_rx() << " rx, " - << get_stat_clean() << " clean, " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty" - << endl; - if (get_stat_dirty() > g_conf.client_oc_max_dirty) { - // flush some dirty pages - dout(10) << "flusher " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty," - << " flushing some dirty bhs" << endl; - flush(get_stat_dirty() - g_conf.client_oc_max_dirty); - } - else { - // check tail of lru for old dirty items - utime_t cutoff = g_clock.now(); - cutoff.sec_ref()--; - BufferHead *bh = 0; - while ((bh = (BufferHead*)lru_dirty.lru_get_next_expire()) != 0 && - bh->last_write < cutoff) { - dout(10) << "flusher flushing aged dirty bh " << *bh << endl; - bh_write(bh); - } - break; - } - } - if (flusher_stop) break; - flusher_cond.WaitInterval(lock, utime_t(1,0)); - } - lock.Unlock(); - dout(10) << "flusher finish" << endl; -} - - - -// blocking. atomic+sync. -int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_readx " << rd - << " in " << ino - << endl; - - if (rd->extents.size() == 1) { - // single object. - // just write synchronously. 
- Cond cond; - bool done = false; - objecter->readx(rd, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - } else { - // spans multiple objects, or is big. - - // sort by object... - map by_oid; - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // lock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino, i->second.layout); - rdlock(o); - } - - // readx will hose rd - list extents = rd->extents; - - // do the read, into our cache - Cond cond; - bool done = false; - readx(rd, ino, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - - // release the locks - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - rdunlock(o); - } - } - - return 0; -} - -int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << endl; - - if (wr->extents.size() == 1 && - wr->extents.front().length <= g_conf.client_oc_max_sync_write) { - // single object. - - // make sure we aren't already locking/locked... - object_t oid = wr->extents.front().oid; - Object *o = 0; - if (objects.count(oid)) o = get_object(oid, ino, wr->extents.front().layout); - if (!o || - (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING)) { - // just write synchronously. - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << " doing sync write" - << endl; - - Cond cond; - bool done = false; - objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0); - - // block - while (!done) cond.Wait(lock); - return 0; - } - } - - // spans multiple objects, or is big. - // sort by object... - map by_oid; - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // wrlock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino, i->second.layout); - wrlock(o); - } - - // writex will hose wr - list extents = wr->extents; - - // do the write, into our cache - writex(wr, ino); - - // flush - // ...and release the locks? - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - - wrunlock(o); - } - - return 0; -} - - - -// locking ----------------------------- - -void ObjectCacher::rdlock(Object *o) -{ - // lock? - if (o->lock_state == Object::LOCK_NONE || - o->lock_state == Object::LOCK_RDUNLOCKING || - o->lock_state == Object::LOCK_WRUNLOCKING) { - dout(10) << "rdlock rdlock " << *o << endl; - - o->lock_state = Object::LOCK_RDLOCKING; - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDLOCK, o->get_oid(), o->get_layout(), ack, commit); - } - - // stake our claim. - o->rdlock_ref++; - - // wait? 
- if (o->lock_state == Object::LOCK_RDLOCKING || - o->lock_state == Object::LOCK_WRLOCKING) { - dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << endl; - Cond cond; - bool done = false; - o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); -} - -void ObjectCacher::wrlock(Object *o) -{ - // lock? - if (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING) { - dout(10) << "wrlock wrlock " << *o << endl; - - int op = 0; - if (o->lock_state == Object::LOCK_RDLOCK) { - o->lock_state = Object::LOCK_UPGRADING; - op = OSD_OP_UPLOCK; - } else { - o->lock_state = Object::LOCK_WRLOCKING; - op = OSD_OP_WRLOCK; - } - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), o->get_layout(), ack, commit); - } - - // stake our claim. - o->wrlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_WRLOCKING || - o->lock_state == Object::LOCK_UPGRADING) { - dout(10) << "wrlock waiting for wrlock on " << *o << endl; - Cond cond; - bool done = false; - o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_WRLOCK); -} - - -void ObjectCacher::rdunlock(Object *o) -{ - dout(10) << "rdunlock " << *o << endl; - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); - - assert(o->rdlock_ref > 0); - o->rdlock_ref--; - if (o->rdlock_ref > 0 || - o->wrlock_ref > 0) { - dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << endl; - return; - } - - release(o); // release first - - o->lock_state = Object::LOCK_RDUNLOCKING; - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), o->get_layout(), lockack, commit); -} - -void ObjectCacher::wrunlock(Object *o) -{ - dout(10) << "wrunlock " << *o << endl; - assert(o->lock_state == Object::LOCK_WRLOCK); - - assert(o->wrlock_ref > 0); - o->wrlock_ref--; - if (o->wrlock_ref > 0) { - dout(10) << "wrunlock " << *o << " still has wrlock refs" << endl; - return; - } - - flush(o); // flush first - - int op = 0; - if (o->rdlock_ref > 0) { - dout(10) << "wrunlock rdlock " << *o << endl; - op = OSD_OP_DNLOCK; - o->lock_state = Object::LOCK_DOWNGRADING; - } else { - dout(10) << "wrunlock wrunlock " << *o << endl; - op = OSD_OP_WRUNLOCK; - o->lock_state = Object::LOCK_WRUNLOCKING; - } - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), o->get_layout(), lockack, commit); -} - - -// ------------------------------------------------- - - -bool ObjectCacher::set_is_cached(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - if 
(!ob->data.empty()) return true; - } - - return false; -} - -bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_dirty() || bh->is_tx()) - return true; - } - } - - return false; -} - - -// purge. non-blocking. violently removes dirty buffers from cache. -void ObjectCacher::purge(Object *ob) -{ - dout(10) << "purge " << *ob << endl; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (!bh->is_clean()) - dout(0) << "purge forcibly removing " << *ob << " " << *bh << endl; - bh_remove(ob, bh); - delete bh; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } -} - -// flush. non-blocking. no callback. -// true if clean, already flushed. -// false if we wrote something. -bool ObjectCacher::flush(Object *ob) -{ - bool clean = true; - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_tx()) { - clean = false; - continue; - } - if (!bh->is_dirty()) continue; - - bh_write(bh); - clean = false; - } - return clean; -} - -// flush. non-blocking, takes callback. -// returns true if already flushed -bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "flush_set on " << ino << " dne" << endl; - return true; - } - - dout(10) << "flush_set " << ino << endl; - - C_Gather *gather = 0; // we'll need to wait for all objects to flush! - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - if (!flush(ob)) { - // we'll need to gather... - if (!gather && onfinish) - gather = new C_Gather(onfinish); - safe = false; - - dout(10) << "flush_set " << ino << " will wait for ack tid " - << ob->last_write_tid - << " on " << *ob - << endl; - if (gather) - ob->waitfor_ack[ob->last_write_tid].push_back(gather->new_sub()); - } - } - - if (safe) { - dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << endl; - return true; - } - return false; -} - - -// commit. non-blocking, takes callback. -// return true if already flushed. -bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish) -{ - assert(onfinish); // doesn't make any sense otherwise. - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "commit_set on " << ino << " dne" << endl; - return true; - } - - dout(10) << "commit_set " << ino << endl; - - C_Gather *gather = 0; // we'll need to wait for all objects to commit - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - // make sure it's flushing. 
- flush_set(ino); - - if (ob->last_write_tid > ob->last_commit_tid) { - dout(10) << "commit_set " << ino << " " << *ob - << " will finish on commit tid " << ob->last_write_tid - << endl; - if (!gather && onfinish) gather = new C_Gather(onfinish); - safe = false; - if (gather) - ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() ); - } - } - - if (safe) { - dout(10) << "commit_set " << ino << " all committed" << endl; - return true; - } - return false; -} - -void ObjectCacher::purge_set(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "purge_set on " << ino << " dne" << endl; - return; - } - - dout(10) << "purge_set " << ino << endl; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - purge(ob); - } -} - - -off_t ObjectCacher::release(Object *ob) -{ - list clean; - off_t o_unclean = 0; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_clean()) - clean.push_back(bh); - else - o_unclean += bh->length(); - } - - for (list::iterator p = clean.begin(); - p != clean.end(); - p++) { - bh_remove(ob, *p); - delete *p; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } - - return o_unclean; -} - -off_t ObjectCacher::release_set(inodeno_t ino) -{ - // return # bytes not clean (and thus not released). - off_t unclean = 0; - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "release_set on " << ino << " dne" << endl; - return 0; - } - - dout(10) << "release_set " << ino << endl; - - set s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - off_t o_unclean = release(ob); - unclean += o_unclean; - - if (o_unclean) - dout(10) << "release_set " << ino << " " << *ob - << " has " << o_unclean << " bytes left" - << endl; - - } - - if (unclean) { - dout(10) << "release_set " << ino - << ", " << unclean << " bytes left" << endl; - } - - return unclean; -} - -void ObjectCacher::truncate_set(inodeno_t ino, list& exls) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "truncate_set on " << ino << " dne" << endl; - return; - } - - dout(10) << "truncate_set " << ino << endl; - - for (list::iterator p = exls.begin(); - p != exls.end(); - ++p) { - ObjectExtent &ex = *p; - if (objects.count(ex.oid) == 0) continue; - Object *ob = objects[ex.oid]; - - // purge or truncate? 
- if (ex.start == 0) { - dout(10) << "truncate_set purging " << *ob << endl; - purge(ob); - } else { - // hrm, truncate object - dout(10) << "truncate_set truncating " << *ob << " at " << ex.start << endl; - ob->truncate(ex.start); - - if (ob->can_close()) { - dout(10) << "truncate_set trimming " << *ob << endl; - close_object(ob); - } - } - } -} - - -void ObjectCacher::kick_sync_writers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_writers on " << ino << " dne" << endl; - return; - } - - dout(10) << "kick_sync_writers on " << ino << endl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_wr); - } - - finish_contexts(ls); -} - -void ObjectCacher::kick_sync_readers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_readers on " << ino << " dne" << endl; - return; - } - - dout(10) << "kick_sync_readers on " << ino << endl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_rd); - } - - finish_contexts(ls); -} - - - diff --git a/branches/sage/pgs/osdc/ObjectCacher.h b/branches/sage/pgs/osdc/ObjectCacher.h deleted file mode 100644 index 15109ab782167..0000000000000 --- a/branches/sage/pgs/osdc/ObjectCacher.h +++ /dev/null @@ -1,564 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __OBJECTCACHER_H_ -#define __OBJECTCACHER_H_ - -#include "include/types.h" -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Objecter.h" -#include "Filer.h" - -class Objecter; -class Objecter::OSDRead; -class Objecter::OSDWrite; - -class ObjectCacher { - public: - - class Object; - - // ******* BufferHead ********* - class BufferHead : public LRUObject { - public: - // states - static const int STATE_MISSING = 0; - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_RX = 3; - static const int STATE_TX = 4; - - private: - // my fields - int state; - int ref; - struct { - off_t start, length; // bh extent in object - } ex; - - public: - Object *ob; - bufferlist bl; - tid_t last_write_tid; // version of bh (if non-zero) - utime_t last_write; - - map< off_t, list > waitfor_read; - - public: - // cons - BufferHead(Object *o) : - state(STATE_MISSING), - ref(0), - ob(o), - last_write_tid(0) {} - - // extent - off_t start() { return ex.start; } - void set_start(off_t s) { ex.start = s; } - off_t length() { return ex.length; } - void set_length(off_t l) { ex.length = l; } - off_t end() { return ex.start + ex.length; } - off_t last() { return end() - 1; } - - // states - void set_state(int s) { - if (s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_RX || state == STATE_TX) put(); - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - - // reference counting - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - }; - - - // ******* Object ********* - class Object { - private: - // 
ObjectCacher::Object fields - ObjectCacher *oc; - object_t oid; // this _always_ is oid.rev=0 - inodeno_t ino; - objectrev_t rev; // last rev we're written - ObjectLayout layout; - - public: - map data; - - tid_t last_write_tid; // version of bh (if non-zero) - tid_t last_ack_tid; // last update acked. - tid_t last_commit_tid; // last update commited. - - map< tid_t, list > waitfor_ack; - map< tid_t, list > waitfor_commit; - list waitfor_rd; - list waitfor_wr; - - // lock - static const int LOCK_NONE = 0; - static const int LOCK_WRLOCKING = 1; - static const int LOCK_WRLOCK = 2; - static const int LOCK_WRUNLOCKING = 3; - static const int LOCK_RDLOCKING = 4; - static const int LOCK_RDLOCK = 5; - static const int LOCK_RDUNLOCKING = 6; - static const int LOCK_UPGRADING = 7; // rd -> wr - static const int LOCK_DOWNGRADING = 8; // wr -> rd - int lock_state; - int wrlock_ref; // how many ppl want or are using a WRITE lock - int rdlock_ref; // how many ppl want or are using a READ lock - - public: - Object(ObjectCacher *_oc, object_t o, inodeno_t i, ObjectLayout& l) : - oc(_oc), - oid(o), ino(i), layout(l), - last_write_tid(0), last_ack_tid(0), last_commit_tid(0), - lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0) - {} - ~Object() { - assert(data.empty()); - } - - object_t get_oid() { return oid; } - inodeno_t get_ino() { return ino; } - - ObjectLayout& get_layout() { return layout; } - void set_layout(ObjectLayout& l) { layout = l; } - - bool can_close() { - return data.empty() && lock_state == LOCK_NONE && - waitfor_ack.empty() && waitfor_commit.empty() && - waitfor_rd.empty() && waitfor_wr.empty(); - } - - // bh - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - // mid-level - BufferHead *split(BufferHead *bh, off_t off); - void merge_left(BufferHead *left, BufferHead *right); - void merge_right(BufferHead *left, BufferHead *right); - - int map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx); - BufferHead *map_write(Objecter::OSDWrite *wr); - - void truncate(off_t s); - }; - - // ******* ObjectCacher ********* - // ObjectCacher fields - public: - Objecter *objecter; - Filer filer; - - private: - Mutex& lock; - - hash_map objects; - hash_map > objects_by_ino; - - set dirty_bh; - LRU lru_dirty, lru_rest; - - Cond flusher_cond; - bool flusher_stop; - void flusher_entry(); - class FlusherThread : public Thread { - ObjectCacher *oc; - public: - FlusherThread(ObjectCacher *o) : oc(o) {} - void *entry() { - oc->flusher_entry(); - return 0; - } - } flusher_thread; - - - // objects - Object *get_object(object_t oid, inodeno_t ino, ObjectLayout &l) { - // have it? - if (objects.count(oid)) - return objects[oid]; - - // create it. 
- Object *o = new Object(this, oid, ino, l); - objects[oid] = o; - objects_by_ino[ino].insert(o); - return o; - } - void close_object(Object *ob); - - // bh stats - Cond stat_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_missing; - - void bh_stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void bh_stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - - void touch_bh(BufferHead *bh) { - if (bh->is_dirty()) - lru_dirty.lru_touch(bh); - else - lru_rest.lru_touch(bh); - } - - // bh states - void bh_set_state(BufferHead *bh, int s) { - // move between lru lists? - if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - bh_stat_sub(bh); - bh->set_state(s); - bh_stat_add(bh); - } - - void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { - bh_set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); }; - void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - bh_set_state(bh, BufferHead::STATE_DIRTY); - lru_dirty.lru_touch(bh); - //bh->set_dirty_stamp(g_clock.now()); - }; - - void bh_add(Object *ob, BufferHead *bh) { - ob->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } else { - lru_rest.lru_insert_top(bh); - } - bh_stat_add(bh); - } - void bh_remove(Object *ob, BufferHead *bh) { - ob->remove_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else { - lru_rest.lru_remove(bh); - } - bh_stat_sub(bh); - } - - // io - void bh_read(BufferHead *bh); - void bh_write(BufferHead *bh); - - void trim(off_t max=-1); - void flush(off_t amount=0); - - bool flush(Object *o); - off_t release(Object *o); - void purge(Object *o); - - void rdlock(Object *o); - void rdunlock(Object *o); - void wrlock(Object *o); - void wrunlock(Object *o); - - public: - void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl); - void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t); - void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t); - void lock_ack(list& oids, tid_t tid); 
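The bh_set_state()/bh_stat_* helpers above keep two pieces of bookkeeping in lock step: per-state byte counters and placement on either the dirty LRU or the rest LRU. Below is a compact, self-contained imitation of that pattern; the structs and names are stand-ins, not the ObjectCacher types.

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <list>

// Illustrative per-state byte accounting plus dirty/rest LRU placement,
// mirroring the bh_set_state() pattern: subtract under the old state,
// switch, add under the new state, and move LRU lists when the dirty
// property flips.
enum class State { Missing, Clean, Dirty, Rx, Tx };

struct Buf { State state = State::Missing; uint64_t len = 0; };

struct Cache {
  std::array<uint64_t, 5> bytes{};     // bytes per state
  std::list<Buf*> lru_dirty, lru_rest; // two LRUs, as in the original

  void stat_add(Buf* b) { bytes[size_t(b->state)] += b->len; }
  void stat_sub(Buf* b) { bytes[size_t(b->state)] -= b->len; }

  void set_state(Buf* b, State s) {
    bool was_dirty = (b->state == State::Dirty);
    bool now_dirty = (s == State::Dirty);
    if (!was_dirty && now_dirty) {
      lru_rest.remove(b);
      lru_dirty.push_front(b);
    } else if (was_dirty && !now_dirty) {
      lru_dirty.remove(b);
      lru_rest.push_front(b);
    }
    stat_sub(b);
    b->state = s;
    stat_add(b);
  }
};

int main() {
  Cache c;
  Buf b{State::Clean, 4096};
  c.lru_rest.push_front(&b);
  c.stat_add(&b);
  c.set_state(&b, State::Dirty);
  assert(c.bytes[size_t(State::Dirty)] == 4096 && c.lru_dirty.size() == 1);
}

Keeping the subtract/assign/add order fixed is what lets the counters stay exact across any state pair, which is the same reason bh_set_state() brackets the assignment with bh_stat_sub() and bh_stat_add().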
- - class C_ReadFinish : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - bufferlist bl; - C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_read_finish(oid, start, length, bl); - } - }; - - class C_WriteAck : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_ack(oid, start, length, tid); - } - }; - class C_WriteCommit : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_commit(oid, start, length, tid); - } - }; - - class C_LockAck : public Context { - ObjectCacher *oc; - public: - list oids; - tid_t tid; - C_LockAck(ObjectCacher *c, object_t o) : oc(c) { - oids.push_back(o); - } - void finish(int r) { - oc->lock_ack(oids, tid); - } - }; - - - - public: - ObjectCacher(Objecter *o, Mutex& l) : - objecter(o), filer(o), lock(l), - flusher_stop(false), flusher_thread(this), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) { - flusher_thread.create(); - } - ~ObjectCacher() { - // we should be empty. - assert(objects.empty()); - assert(lru_rest.lru_get_size() == 0); - assert(lru_dirty.lru_get_size() == 0); - assert(dirty_bh.empty()); - - assert(flusher_thread.is_started()); - lock.Lock(); // hmm.. watch out for deadlock! - flusher_stop = true; - flusher_cond.Signal(); - lock.Unlock(); - flusher_thread.join(); - } - - - class C_RetryRead : public Context { - ObjectCacher *oc; - Objecter::OSDRead *rd; - inodeno_t ino; - Context *onfinish; - public: - C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c) : oc(_oc), rd(r), ino(i), onfinish(c) {} - void finish(int) { - int r = oc->readx(rd, ino, onfinish); - if (r > 0) { - onfinish->finish(r); - delete onfinish; - } - } - }; - - // non-blocking. async. - int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish); - int writex(Objecter::OSDWrite *wr, inodeno_t ino); - - // write blocking - void wait_for_write(size_t len, Mutex& lock); - - // blocking. atomic+sync. 
- int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock); - int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock); - - bool set_is_cached(inodeno_t ino); - bool set_is_dirty_or_committing(inodeno_t ino); - - bool flush_set(inodeno_t ino, Context *onfinish=0); - void flush_all(Context *onfinish=0); - - bool commit_set(inodeno_t ino, Context *oncommit); - void commit_all(Context *oncommit=0); - - void purge_set(inodeno_t ino); - - off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean) - - void truncate_set(inodeno_t ino, list& ex); - - void kick_sync_writers(inodeno_t ino); - void kick_sync_readers(inodeno_t ino); - - - // file functions - - /*** async+caching (non-blocking) file interface ***/ - int file_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Context *onfinish) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return readx(rd, inode.ino, onfinish); - } - - int file_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return writex(wr, inode.ino); - } - - - - /*** sync+blocking file interface ***/ - - int file_atomic_sync_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Mutex &lock) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return atomic_sync_readx(rd, inode.ino, lock); - } - - int file_atomic_sync_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - Mutex &lock, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return atomic_sync_writex(wr, inode.ino, lock); - } - -}; - - -inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) -{ - out << "bh[" - << bh.start() << "~" << bh.length() - << " (" << bh.bl.length() << ")" - << " v " << bh.last_write_tid; - if (bh.is_tx()) out << " tx"; - if (bh.is_rx()) out << " rx"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_missing()) out << " missing"; - out << "]"; - return out; -} - -inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob) -{ - out << "object[" - << hex << ob.get_oid() << " ino " << ob.get_ino() << dec - << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid; - - switch (ob.lock_state) { - case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break; - case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break; - case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break; - case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break; - } - - out << "]"; - return out; -} - -#endif diff --git a/branches/sage/pgs/osdc/Objecter.cc b/branches/sage/pgs/osdc/Objecter.cc deleted file mode 100644 index 64d2374b5bd99..0000000000000 --- a/branches/sage/pgs/osdc/Objecter.cc +++ /dev/null @@ -1,852 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "Objecter.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MOSDOp.h" 
-#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDFailure.h" - -#include - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myname() << ".objecter " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myname() << ".objecter " - - -// messages ------------------------------ - -void Objecter::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_OSD_OPREPLY: - handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - default: - dout(1) << "don't know message type " << m->get_type() << endl; - assert(0); - } -} - -void Objecter::handle_osd_map(MOSDMap *m) -{ - assert(osdmap); - - if (m->get_last() <= osdmap->get_epoch()) { - dout(3) << "handle_osd_map ignoring epochs [" - << m->get_first() << "," << m->get_last() - << "] <= " << osdmap->get_epoch() << endl; - } - else { - dout(3) << "handle_osd_map got epochs [" - << m->get_first() << "," << m->get_last() - << "] > " << osdmap->get_epoch() - << endl; - - set changed_pgs; - - for (epoch_t e = osdmap->get_epoch() + 1; - e <= m->get_last(); - e++) { - if (m->incremental_maps.count(e)) { - dout(3) << "handle_osd_map decoding incremental epoch " << e << endl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(m->incremental_maps[e], off); - osdmap->apply_incremental(inc); - - // notify messenger - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) - messenger->mark_down(i->second.addr); - - } - else if (m->maps.count(e)) { - dout(3) << "handle_osd_map decoding full epoch " << e << endl; - osdmap->decode(m->maps[e]); - } - else { - dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), - monmap->get_inst(mon)); - break; - } - - // scan pgs for changes - scan_pgs(changed_pgs); - - assert(e == osdmap->get_epoch()); - } - - // kick requests who might be timing out on the wrong osds - if (!changed_pgs.empty()) - kick_requests(changed_pgs); - } - - delete m; -} - -void Objecter::scan_pgs(set& changed_pgs) -{ - dout(10) << "scan_pgs" << endl; - - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG& pg = i->second; - - // calc new. - vector other; - osdmap->pg_to_acting_osds(pgid, other); - - if (other == pg.acting) - continue; // no change. - - other.swap(pg.acting); - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // same primary? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_SPLAY) { - // same primary and acker? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0] && - other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_CHAIN) { - // any change is significant. - } - - // changed significantly. 
- dout(10) << "scan_pgs pg " << pgid - << " (" << pg.active_tids << ")" - << " " << other << " -> " << pg.acting - << endl; - changed_pgs.insert(pgid); - } -} - -void Objecter::kick_requests(set& changed_pgs) -{ - dout(10) << "kick_requests in pgs " << changed_pgs << endl; - - for (set::iterator i = changed_pgs.begin(); - i != changed_pgs.end(); - i++) { - pg_t pgid = *i; - PG& pg = pg_map[pgid]; - - // resubmit ops! - set tids; - tids.swap( pg.active_tids ); - close_pg( pgid ); // will pbly reopen, unless it's just commits we're missing - - for (set::iterator p = tids.begin(); - p != tids.end(); - p++) { - tid_t tid = *p; - - if (op_modify.count(tid)) { - OSDModify *wr = op_modify[tid]; - op_modify.erase(tid); - - // WRITE - if (wr->tid_version.count(tid)) { - if (wr->op == OSD_OP_WRITE && - !g_conf.objecter_buffer_uncommitted) { - dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl; - } else { - dout(0) << "kick_requests missing commit, replay write " << tid - << " v " << wr->tid_version[tid] << endl; - modifyx_submit(wr, wr->waitfor_commit[tid], tid); - } - } - else if (wr->waitfor_ack.count(tid)) { - dout(0) << "kick_requests missing ack, resub write " << tid << endl; - modifyx_submit(wr, wr->waitfor_ack[tid], tid); - } - } - - else if (op_read.count(tid)) { - // READ - OSDRead *rd = op_read[tid]; - op_read.erase(tid); - dout(0) << "kick_requests resub read " << tid << endl; - - // resubmit - readx_submit(rd, rd->ops[tid], true); - rd->ops.erase(tid); - } - - else if (op_stat.count(tid)) { - OSDStat *st = op_stat[tid]; - op_stat.erase(tid); - - dout(0) << "kick_requests resub stat " << tid << endl; - - // resubmit - stat_submit(st); - } - - else - assert(0); - } - } -} - - - -void Objecter::handle_osd_op_reply(MOSDOpReply *m) -{ - // read or modify? - switch (m->get_op()) { - case OSD_OP_READ: - handle_osd_read_reply(m); - break; - - case OSD_OP_STAT: - handle_osd_stat_reply(m); - break; - - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_WRUNLOCK: - case OSD_OP_WRLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - handle_osd_modify_reply(m); - break; - - default: - assert(0); - } -} - - - -// stat ----------------------------------- - -tid_t Objecter::stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish, - objectrev_t rev) -{ - OSDStat *st = new OSDStat(size); - st->extents.push_back(ObjectExtent(oid, 0, 0)); - st->extents.front().layout = ol; - st->extents.front().rev = rev; - st->onfinish = onfinish; - - return stat_submit(st); -} - -tid_t Objecter::stat_submit(OSDStat *st) -{ - // find OSD - ObjectExtent &ex = st->extents.front(); - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - st->tid = last_tid; - op_stat[last_tid] = st; - - pg.active_tids.insert(last_tid); - - // send? - - dout(10) << "stat_submit " << st << " tid " << last_tid - << " oid " << ex.oid - << " " << ex.layout - << " osd" << pg.acker() - << endl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.layout, osdmap->get_epoch(), - OSD_OP_STAT); - - messenger->send_message(m, osdmap->get_inst(pg.acker())); - } - - return last_tid; -} - -void Objecter::handle_osd_stat_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_stat.count(tid) == 0) { - dout(7) << "handle_osd_stat_reply " << tid << " ... 
stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_stat_reply " << tid - << " r=" << m->get_result() - << " size=" << m->get_object_size() - << endl; - OSDStat *st = op_stat[ tid ]; - op_stat.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << endl; - stat_submit(st); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // ok! - if (m->get_result() < 0) { - *st->size = -1; - } else { - *st->size = m->get_object_size(); - } - - // finish, clean up - Context *onfinish = st->onfinish; - - // done - delete st; - if (onfinish) { - onfinish->finish(m->get_result()); - delete onfinish; - } - - delete m; -} - - -// read ----------------------------------- - - -tid_t Objecter::read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, - Context *onfinish, - objectrev_t rev) -{ - OSDRead *rd = new OSDRead(bl); - rd->extents.push_back(ObjectExtent(oid, off, len)); - rd->extents.front().layout = ol; - rd->extents.front().rev = rev; - readx(rd, onfinish); - return last_tid; -} - - -tid_t Objecter::readx(OSDRead *rd, Context *onfinish) -{ - rd->onfinish = onfinish; - - // issue reads - for (list::iterator it = rd->extents.begin(); - it != rd->extents.end(); - it++) - readx_submit(rd, *it); - - return last_tid; -} - -tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex, bool retry) -{ - // find OSD - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - rd->ops[last_tid] = ex; - op_read[last_tid] = rd; - - pg.active_tids.insert(last_tid); - - // send? - dout(10) << "readx_submit " << rd << " tid " << last_tid - << " oid " << ex.oid << " " << ex.start << "~" << ex.length - << " (" << ex.buffer_extents.size() << " buffer fragments)" - << " " << ex.layout - << " osd" << pg.acker() - << endl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.layout, osdmap->get_epoch(), - OSD_OP_READ); - m->set_length(ex.length); - m->set_offset(ex.start); - m->set_retry_attempt(retry); - - int who = pg.acker(); - if (rd->balance_reads) { - int replica = messenger->get_myname().num() % pg.acting.size(); - who = pg.acting[replica]; - dout(-10) << "readx_submit reading from random replica " << replica - << " = osd" << who << endl; - } - messenger->send_message(m, osdmap->get_inst(who)); - } - - return last_tid; -} - - -void Objecter::handle_osd_read_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_read.count(tid) == 0) { - dout(7) << "handle_osd_read_reply " << tid << " ... stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_read_reply " << tid << endl; - OSDRead *rd = op_read[ tid ]; - op_read.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // our op finished - rd->ops.erase(tid); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << endl; - readx_submit(rd, rd->ops[tid], true); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // what buffer offset are we? 
- dout(7) << " got frag from " << m->get_oid() << " " - << m->get_offset() << "~" << m->get_length() - << ", still have " << rd->ops.size() << " more ops" << endl; - - if (rd->ops.empty()) { - // all done - size_t bytes_read = 0; - - if (rd->read_data.size()) { - dout(15) << " assembling frags" << endl; - - /** FIXME This doesn't handle holes efficiently. - * It allocates zero buffers to fill whole buffer, and - * then discards trailing ones at the end. - * - * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over - * the heap. - */ - - // we have other fragments, assemble them all... blech! - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - - // map extents back into buffer - map by_off; // buffer offset -> bufferlist - - // for each object extent... - for (list::iterator eit = rd->extents.begin(); - eit != rd->extents.end(); - eit++) { - bufferlist *ox_buf = rd->read_data[eit->oid]; - unsigned ox_len = ox_buf->length(); - unsigned ox_off = 0; - assert(ox_len <= eit->length); - - // for each buffer extent we're mapping into... - for (map::iterator bit = eit->buffer_extents.begin(); - bit != eit->buffer_extents.end(); - bit++) { - dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << endl; - by_off[bit->first] = new bufferlist; - - if (ox_off + bit->second <= ox_len) { - // we got the whole bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second); - if (bytes_read < bit->first + bit->second) - bytes_read = bit->first + bit->second; - } else if (ox_off + bit->second > ox_len && ox_off < ox_len) { - // we got part of this bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off)); - if (bytes_read < bit->first + ox_len-ox_off) - bytes_read = bit->first + ox_len-ox_off; - - // zero end of bx - dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << endl; - bufferptr z(ox_off + bit->second - ox_len); - z.zero(); - by_off[bit->first]->append( z ); - } else { - // we got none of this bx. zero whole thing. - assert(ox_off >= ox_len); - dout(21) << " adding all zeros for this bit " << bit->second << endl; - bufferptr z(bit->second); - z.zero(); - by_off[bit->first]->append( z ); - } - ox_off += bit->second; - } - assert(ox_off == eit->length); - } - - // sort and string bits together - for (map::iterator it = by_off.begin(); - it != by_off.end(); - it++) { - assert(it->second->length()); - if (it->first < (off_t)bytes_read) { - dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << endl; - rd->bl->claim_append(*(it->second)); - } else { - dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << endl; - } - delete it->second; - } - - // trim trailing zeros? - if (rd->bl->length() > bytes_read) { - dout(10) << " trimming off trailing zeros . 
bytes_read=" << bytes_read - << " len=" << rd->bl->length() << endl; - rd->bl->splice(bytes_read, rd->bl->length() - bytes_read); - assert(bytes_read == rd->bl->length()); - } - - // hose p->read_data bufferlist*'s - for (map::iterator it = rd->read_data.begin(); - it != rd->read_data.end(); - it++) { - delete it->second; - } - } else { - dout(15) << " only one frag" << endl; - - // only one fragment, easy - rd->bl->claim( m->get_data() ); - bytes_read = rd->bl->length(); - } - - // finish, clean up - Context *onfinish = rd->onfinish; - - dout(7) << " " << bytes_read << " bytes " - << rd->bl->length() - << endl; - - // done - delete rd; - if (onfinish) { - onfinish->finish(bytes_read);// > 0 ? bytes_read:m->get_result()); - delete onfinish; - } - } else { - // store my bufferlist for later assembling - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - } - - delete m; -} - - - -// write ------------------------------------ - -tid_t Objecter::write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, - Context *onack, Context *oncommit, - objectrev_t rev) -{ - OSDWrite *wr = new OSDWrite(bl); - wr->extents.push_back(ObjectExtent(oid, off, len)); - wr->extents.front().layout = ol; - wr->extents.front().buffer_extents[0] = len; - wr->extents.front().rev = rev; - modifyx(wr, onack, oncommit); - return last_tid; -} - - -// zero - -tid_t Objecter::zero(object_t oid, off_t off, size_t len, ObjectLayout ol, - Context *onack, Context *oncommit, - objectrev_t rev) -{ - OSDModify *z = new OSDModify(OSD_OP_ZERO); - z->extents.push_back(ObjectExtent(oid, off, len)); - z->extents.front().layout = ol; - z->extents.front().rev = rev; - modifyx(z, onack, oncommit); - return last_tid; -} - - -// lock ops - -tid_t Objecter::lock(int op, object_t oid, ObjectLayout ol, - Context *onack, Context *oncommit) -{ - OSDModify *l = new OSDModify(op); - l->extents.push_back(ObjectExtent(oid, 0, 0)); - l->extents.front().layout = ol; - modifyx(l, onack, oncommit); - return last_tid; -} - - - -// generic modify ----------------------------------- - -tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit) -{ - wr->onack = onack; - wr->oncommit = oncommit; - - // issue writes/whatevers - for (list::iterator it = wr->extents.begin(); - it != wr->extents.end(); - it++) - modifyx_submit(wr, *it); - - return last_tid; -} - - -tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) -{ - // find - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - tid_t tid; - if (usetid > 0) - tid = usetid; - else - tid = ++last_tid; - assert(client_inc >= 0); - - // add to gather set - wr->waitfor_ack[tid] = ex; - wr->waitfor_commit[tid] = ex; - op_modify[tid] = wr; - pg.active_tids.insert(tid); - - ++num_unacked; - ++num_uncommitted; - - // send? - dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid - << " oid " << ex.oid - << " " << ex.start << "~" << ex.length - << " " << ex.layout - << " osd" << pg.primary() - << endl; - if (pg.primary() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, - ex.oid, ex.layout, osdmap->get_epoch(), - wr->op); - m->set_length(ex.length); - m->set_offset(ex.start); - m->set_rev(ex.rev); - if (usetid > 0) - m->set_retry_attempt(true); - - if (wr->tid_version.count(tid)) - m->set_version(wr->tid_version[tid]); // we're replaying this op! - - // what type of op? 
- switch (wr->op) { - case OSD_OP_WRITE: - { - // map buffer segments into this extent - // (may be fragmented bc of striping) - bufferlist cur; - for (map::iterator bit = ex.buffer_extents.begin(); - bit != ex.buffer_extents.end(); - bit++) { - bufferlist thisbit; - thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second); - cur.claim_append(thisbit); - } - assert(cur.length() == ex.length); - m->set_data(cur);//.claim(cur); - } - break; - } - - messenger->send_message(m, osdmap->get_inst(pg.primary())); - } - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl; - - return tid; -} - - - -void Objecter::handle_osd_modify_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_modify.count(tid) == 0) { - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " ... stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " v " << m->get_version() - << endl; - OSDModify *wr = op_modify[ tid ]; - - Context *onack = 0; - Context *oncommit = 0; - - PG &pg = get_pg( m->get_pg() ); - - // ignore? - if (pg.acker() != m->get_source().num()) { - dout(7) << " ignoring ack|commit from non-acker" << endl; - delete m; - return; - } - - assert(m->get_result() >= 0); - - // ack or commit? - if (m->get_commit()) { - //dout(15) << " handle_osd_write_reply commit on " << tid << endl; - assert(wr->tid_version.count(tid) == 0 || - m->get_version() == wr->tid_version[tid]); - - // remove from tid/osd maps - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // commit. - op_modify.erase( tid ); - wr->waitfor_ack.erase(tid); - wr->waitfor_commit.erase(tid); - - num_uncommitted--; - - if (wr->waitfor_commit.empty()) { - onack = wr->onack; - oncommit = wr->oncommit; - delete wr; - } - } else { - // ack. - //dout(15) << " handle_osd_write_reply ack on " << tid << endl; - assert(wr->waitfor_ack.count(tid)); - wr->waitfor_ack.erase(tid); - - num_unacked--; - - if (wr->tid_version.count(tid) && - wr->tid_version[tid].version != m->get_version().version) { - dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid - << " did not achieve previous ordering" << endl; - } - wr->tid_version[tid] = m->get_version(); - - if (wr->waitfor_ack.empty()) { - onack = wr->onack; - wr->onack = 0; // only do callback once - - // buffer uncommitted? - if (!g_conf.objecter_buffer_uncommitted && - wr->op == OSD_OP_WRITE) { - // discard buffer! 
- ((OSDWrite*)wr)->bl.clear(); - } - } - } - - // do callbacks - if (onack) { - onack->finish(0); - delete onack; - } - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - delete m; -} - - - -void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - if (dest.is_mon()) { - // try a new mon - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", resending to mon" << mon - << endl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping and reporting to mon" << mon - << endl; - messenger->send_message(new MOSDFailure(messenger->get_myinst(), inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else { - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << endl; - delete m; - } -} diff --git a/branches/sage/pgs/osdc/Objecter.h b/branches/sage/pgs/osdc/Objecter.h deleted file mode 100644 index db8f30e5c8573..0000000000000 --- a/branches/sage/pgs/osdc/Objecter.h +++ /dev/null @@ -1,200 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __OBJECTER_H -#define __OBJECTER_H - -#include "include/types.h" -#include "include/buffer.h" - -#include "osd/OSDMap.h" -#include "messages/MOSDOp.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class Context; -class Messenger; -class OSDMap; -class MonMap; -class Message; - -class Objecter { - public: - Messenger *messenger; - MonMap *monmap; - OSDMap *osdmap; - - private: - tid_t last_tid; - int client_inc; - int num_unacked; - int num_uncommitted; - - /*** track pending operations ***/ - // read - public: - class OSDOp { - public: - list extents; - virtual ~OSDOp() {} - }; - - class OSDRead : public OSDOp { - public: - bufferlist *bl; - Context *onfinish; - map ops; - map read_data; // bits of data as they come back - int balance_reads; // if non-zero, direct reads to a pseudo-random replica - - OSDRead(bufferlist *b) : bl(b), onfinish(0), balance_reads(0) { - bl->clear(); - } - }; - - class OSDStat : public OSDOp { - public: - tid_t tid; - off_t *size; // where the size goes. - Context *onfinish; - OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { } - }; - - // generic modify - class OSDModify : public OSDOp { - public: - int op; - list extents; - Context *onack; - Context *oncommit; - map waitfor_ack; - map tid_version; - map waitfor_commit; - - OSDModify(int o) : op(o), onack(0), oncommit(0) {} - }; - - // write (includes the bufferlist) - class OSDWrite : public OSDModify { - public: - bufferlist bl; - OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {} - }; - - - - private: - // pending ops - hash_map op_stat; - hash_map op_read; - hash_map op_modify; - - /** - * track pending ops by pg - * ...so we can cope with failures, map changes - */ - class PG { - public: - vector acting; - set active_tids; // active ops - - PG() {} - - // primary - where i write - int primary() { - if (acting.empty()) return -1; - return acting[0]; - } - // acker - where i read, and receive acks from - int acker() { - if (acting.empty()) return -1; - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return acting[0]; - else - return acting[acting.size() > 1 ? 
1:0]; - } - }; - - hash_map pg_map; - - - PG &get_pg(pg_t pgid) { - if (!pg_map.count(pgid)) - osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting); - return pg_map[pgid]; - } - void close_pg(pg_t pgid) { - assert(pg_map.count(pgid)); - assert(pg_map[pgid].active_tids.empty()); - pg_map.erase(pgid); - } - void scan_pgs(set& chnaged_pgs); - void kick_requests(set& changed_pgs); - - - public: - Objecter(Messenger *m, MonMap *mm, OSDMap *om) : - messenger(m), monmap(mm), osdmap(om), - last_tid(0), client_inc(-1), - num_unacked(0), num_uncommitted(0) - {} - ~Objecter() { - // clean up op_* - // *** - } - - // messages - public: - void dispatch(Message *m); - void handle_osd_op_reply(class MOSDOpReply *m); - void handle_osd_stat_reply(class MOSDOpReply *m); - void handle_osd_read_reply(class MOSDOpReply *m); - void handle_osd_modify_reply(class MOSDOpReply *m); - void handle_osd_lock_reply(class MOSDOpReply *m); - void handle_osd_map(class MOSDMap *m); - - private: - tid_t readx_submit(OSDRead *rd, ObjectExtent& ex, bool retry=false); - tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0); - tid_t stat_submit(OSDStat *st); - - // public interface - public: - bool is_active() { - return !(op_read.empty() && op_modify.empty()); - } - - int get_client_incarnation() { return client_inc; } - void set_client_incarnation(int inc) { - client_inc = inc; - } - - // med level - tid_t readx(OSDRead *read, Context *onfinish); - tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); - //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit); - - // even lazier - tid_t read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, - Context *onfinish, - objectrev_t rev=0); - tid_t write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, - Context *onack, Context *oncommit, - objectrev_t rev=0); - tid_t zero(object_t oid, off_t off, size_t len, ObjectLayout ol, - Context *onack, Context *oncommit, - objectrev_t rev=0); - tid_t stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish, - objectrev_t rev=0); - - tid_t lock(int op, object_t oid, ObjectLayout ol, Context *onack, Context *oncommit); - - - void ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst); - -}; - -#endif diff --git a/branches/sage/pgs/script/add_header.pl b/branches/sage/pgs/script/add_header.pl deleted file mode 100755 index 023c06e455fd1..0000000000000 --- a/branches/sage/pgs/script/add_header.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $old = `cat $fn`; - -my $header = `cat doc/header.txt`; - -# strip existing header -my $new = $old; -if ($new =~ /^(.*)\* Ceph - scalable distributed file system/s) { - my ($a,@b) = split(/\*\/\n/, $new); - $new = join("*/\n",@b); -} -$new = $header . 
$new; - -if ($new ne $old) { - open(O, ">$fn.new"); - print O $new; - close O; - system "diff $fn $fn.new"; - rename "$fn.new", $fn; - #unlink "$fn.new"; - -} - diff --git a/branches/sage/pgs/script/adjusttabs.pl b/branches/sage/pgs/script/adjusttabs.pl deleted file mode 100755 index 66edff2ac6c02..0000000000000 --- a/branches/sage/pgs/script/adjusttabs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my $tablen = shift @ARGV; -my $fn = shift @ARGV; - -my $tab = ' ' x $tablen; -open(I, $fn); -my $f; -my $oldtab = ' ' x 4; -while () { - if (my ($oldlen) = /\-\*\- .*tab-width:(\d)/) { - print "old length was $oldlen\n"; - $oldtab = ' ' x $oldlen; - s/tab-width:\d/tab-width:$tablen/; - } - s/\t/$oldtab/g; - $f .= $_; -} -close I; -open(O, ">$fn.new"); -print O $f; -close O; - -rename "$fn.new", $fn; diff --git a/branches/sage/pgs/script/check_cache_dumps.pl b/branches/sage/pgs/script/check_cache_dumps.pl deleted file mode 100755 index 95bd28a474991..0000000000000 --- a/branches/sage/pgs/script/check_cache_dumps.pl +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl - -my $epoch = shift || die "specify epoch"; - -my %auth; # mds -> id -> replica -> nonce -my %replica; # mds -> id -> auth -> nonce - -print "reading\n"; -for (my $i=0; -e "cachedump.$epoch.mds$i"; $i++) { - open(O,"cachedump.$epoch.mds$i"); - while () { - my ($name,$s); - ($name,$s) = /^\[(inode \d+) \S+ (\S+)/; - ($name,$s) = /^\[(dir \d+) \S+ (\S+)/ unless $name; - ($name,$s) = /^\[dentry (\S+) (\S+)/ unless $name; - if ($name) { - if ($s =~ /^auth/) { - $auth{$i}->{$name} = {}; - my ($rl) = $s =~ /\{(.*)\}/; - for my $r (split(/,/,$rl)) { - my ($who,$nonce) = $r =~ /(\d+)\=(\d+)/; - $auth{$i}->{$name}->{$who} = $nonce; - #print "auth $name rep by $who $nonce $s\n"; - } - } - else { - my ($a,$b,$n) = $s =~ /rep@(\d+)\,([\-\d]+)\.(\d+)/; - die $_ unless $a >= 0; - $replica{$i}->{$name}->{$a} = $n; - if ($b >= 0) { - $replica{$i}->{$name}->{$b} = $n; - } - } - } - } -} - -print "verifying replicas\n"; -for my $mds (keys %replica) { - for my $name (keys %{$replica{$mds}}) { - for my $auth (keys %{$replica{$mds}->{$name}}) { - if ($auth{$auth}->{$name}->{$mds}) { - if ($auth{$auth}->{$name}->{$mds} < $replica{$mds}->{$name}->{$auth}) { - print "problem: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; - } else { - print "ok: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; - } - } else { - print "??: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has no nonce\n"; - } - - } - } -} - - diff --git a/branches/sage/pgs/script/clean_osd_cow.sh b/branches/sage/pgs/script/clean_osd_cow.sh deleted file mode 100755 index 1e443c95e7ebc..0000000000000 --- a/branches/sage/pgs/script/clean_osd_cow.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -rm osddata/*/*\.* diff --git a/branches/sage/pgs/script/clean_trace.pl b/branches/sage/pgs/script/clean_trace.pl deleted file mode 100755 index cb02ff7abe7c2..0000000000000 --- a/branches/sage/pgs/script/clean_trace.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -my $n = 0; -while (<>) { - next unless /trace: /; - my $l = $'; $'; - print $l; -} diff --git a/branches/sage/pgs/script/comb.pl b/branches/sage/pgs/script/comb.pl deleted file mode 100755 index 88a4bb72a7970..0000000000000 --- a/branches/sage/pgs/script/comb.pl +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $xaxis = shift @ARGV; -my @vars; -while 
(@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@vars, $_); -} -my @dirs; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@dirs, $_) if -d $_; -} -my @filt = @ARGV; -push( @filt, '.' ) unless @filt; - -print "#xaxis $xaxis -#vars @vars -#dirs @dirs -#filt @filt -"; - -sub load_sum { - my $fn = shift @_; - - open(I, "$fn"); - my $k = ; - chomp($k); - my @k = split(/\s+/,$k); - shift @k; - - my $s; - while () { - chomp; - s/^\#//; - next unless $_; - my @l = split(/\s+/,$_); - my $k = shift @l; - for my $f (@k) { - $s->{$k}->{$f} = shift @l; - } - - # clnode latency? - if ($fn =~ /cl/) { - $s->{$k}->{'wrlat'} = $s->{$k}->{'wrlsum'} / $s->{$k}->{'wrlnum'} if $s->{$k}->{'wrlnum'} > 0; - $s->{$k}->{'rlat'} = $s->{$k}->{'rlsum'} / $s->{$k}->{'rlnum'} if $s->{$k}->{'rlnum'} > 0; - $s->{$k}->{'lat'} = $s->{$k}->{'lsum'} / $s->{$k}->{'lnum'} if $s->{$k}->{'lnum'} > 0; - $s->{$k}->{'latw'} = $s->{$k}->{'lwsum'} / $s->{$k}->{'lwnum'} if $s->{$k}->{'lwnum'} > 0; - $s->{$k}->{'latr'} = $s->{$k}->{'lrsum'} / $s->{$k}->{'lrnum'} if $s->{$k}->{'lrnum'} > 0; - $s->{$k}->{'statlat'} = $s->{$k}->{'lstatsum'} / $s->{$k}->{'lstatnum'} if $s->{$k}->{'lstatnum'} > 0; - $s->{$k}->{'dirlat'} = $s->{$k}->{'ldirsum'} / $s->{$k}->{'ldirnum'} if $s->{$k}->{'ldirnum'} > 0; - } - } - return $s; -} - - -my %res; -my @key; -my %didkey; -for my $f (@filt) { - my @reg = split(/,/, $f); - #print "reg @reg\n"; - for my $d (@dirs) { - if ($f ne '.') { - my $r = (split(/\//,$d))[-1]; - my @db = split(/,/, $r); - #print "db @db\n"; - my $ok = 1; - for my $r (@reg) { - - $ok = 0 unless grep {$_ eq $r} @db; - } - next unless $ok; - } - #next if ($f ne '.' && $d !~ /$reg/); - #print "$d\n"; - my ($x) = $d =~ /$xaxis=(\d+)/; - - for my $v (@vars) { - my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/; - #print "$what $field .. $v .. $f.$field\n"; - my $s = &load_sum("$d/sum.$what"); - - #print "\t$v"; - if ($field =~ /^sum=/) { - #warn "SUM field $field\n"; - push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'}); - } else { - #warn "avg field $field\n"; - push( @{$res{$x}}, $s->{'avgval'}->{$field} ); - } - - push( @key, "$f.$field" ) unless $didkey{"$f.$field"}; - $didkey{"$f.$field"} = 1; - - if (0 && exists $s->{'avgvaldevt'}) { - push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} ); - push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"}; - $didkey{"$f.$field.dev"} = 1; - } - } - } -} - -print join("\t", "#", @key) . "\n"; -for my $x (sort {$a <=> $b} keys %res) { - print join("\t", $x, @{$res{$x}}) . "\n"; -} diff --git a/branches/sage/pgs/script/find_auth_pins.pl b/branches/sage/pgs/script/find_auth_pins.pl deleted file mode 100755 index d37fb109a48da..0000000000000 --- a/branches/sage/pgs/script/find_auth_pins.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - #cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1 - - if (/adjust_nested_auth_pins/) { - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - $hist{$what} .= "$l: $_" - if defined $pin{$what}; - } - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - elsif (/auth_pin / && !/waiting/) { - #my ($what) = /\[(\w+ \w+) /; - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - #print "$_ add_waiter $c $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - elsif (/auth_unpin/) { - #my ($what) = /\[(\w+ \w+) /;# / on (.*\])/; - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/branches/sage/pgs/script/find_bufferleaks.pl b/branches/sage/pgs/script/find_bufferleaks.pl deleted file mode 100755 index 152515d5e788e..0000000000000 --- a/branches/sage/pgs/script/find_bufferleaks.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %buffers; -my %bufferlists; -my %ref; -my %mal; -my $l = 1; -while (<>) { - #print "$l: $_"; - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0 - - if (/^buffer\.cons /) { - my ($x) = /(0x\S+)/; - $buffers{$x} = 1; - } - if (/^buffer\.des /) { - my ($x) = /(0x\S+)/; - die "des without cons at $l: $_" unless $buffers{$x}; - delete $buffers{$x}; - die "des with ref>0 at $l: $_" unless $ref{$x} == 0; - delete $ref{$x}; - } - - if (/^bufferlist\.cons /) { - my ($x) = /(0x\S+)/; - $bufferlists{$x} = 1; - } - if (/^bufferlist\.des /) { - my ($x) = /(0x\S+)/; - warn "des without cons at $l: $_" unless $bufferlists{$x}; - delete $bufferlists{$x}; - } - - - if (/^buffer\.malloc /) { - my ($x) = /(0x\S+)/; - $mal{$x} = 1; - } - if (/^buffer\.free /) { - my ($x) = /(0x\S+)/; - die "free with malloc at $l: $_" unless $mal{$x}; - delete $mal{$x}; - } - - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}++; - } - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}--; - } - -$l++; -} - -for my $x (keys %bufferlists) { - print "leaked bufferlist $x\n"; -} - -for my $x (keys %buffers) { - print "leaked buffer $x ref $ref{$x}\n"; -} - -for my $x (keys %mal) { - print "leaked buffer dataptr $x ref $ref{$x}\n"; -} diff --git a/branches/sage/pgs/script/find_lost_bdev_ops.pl b/branches/sage/pgs/script/find_lost_bdev_ops.pl deleted file mode 100755 index ac1793b42dfac..0000000000000 --- a/branches/sage/pgs/script/find_lost_bdev_ops.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - chomp; - $line++; - - #bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0) - if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) { - $op{$bio} = $line; - } - - # cancel - #bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8) - if (my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/ && - !(/FAILED/)) { - delete $op{$bio}; - } - - # finish - #bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8) - if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) { - delete $op{$bio}; - } - -} - -for my $bio (keys %op) { - print "---- lost bio $bio\n"; -} diff --git a/branches/sage/pgs/script/find_lost_commit.pl b/branches/sage/pgs/script/find_lost_commit.pl deleted file mode 100755 index 73934248ad5c0..0000000000000 --- a/branches/sage/pgs/script/find_lost_commit.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]] - if (my ($from, $opno, $oid, $op) = /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) { -# print "$op\n"; - if ($opno == 2 || $opno == 11 || $opno == 12 || $opno == 14 || $opno == 15) { - $op{$op} = $from; - } - } - - # commits - #osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788) - if (my ($op) = /op_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - #osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980) - if (my ($op) = /rep_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - - # forwarded? - if (my ($op) = /sending (\w+) to osd/) { - delete $op{$op}; - } - -} - -for my $op (keys %op) { - print "---- lost op $op $op{$op}\n"; -} diff --git a/branches/sage/pgs/script/find_lost_objecter.pl b/branches/sage/pgs/script/find_lost_objecter.pl deleted file mode 100755 index a0c2089140e23..0000000000000 --- a/branches/sage/pgs/script/find_lost_objecter.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %ack; -my %commit; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #client0.objecter writex_submit tid 21 osd0 oid 100000000000001 851424~100000 - if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) { -# print "$who.$tid\n"; - $ack{"$who.$tid"} = $line; - $commit{"$who.$tid"} = $line; - } - - #client1.objecter handle_osd_write_reply 304 commit 0 - #client1.objecter handle_osd_write_reply 777 commit 1 - if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) { -# print "$who.$tid\n"; - delete $ack{"$who.$tid"}; - delete $commit{"$who.$tid"} if $commit; - } - -} - -for my $op (keys %commit) { - print "---- lost commit $op $commit{$op}\n"; -} -for my $op (keys %ack) { - print "---- lost ack $op $commit{$op}\n"; -} diff --git a/branches/sage/pgs/script/find_pathpins.pl b/branches/sage/pgs/script/find_pathpins.pl deleted file mode 100755 index e4a7d81dfb7b7..0000000000000 --- a/branches/sage/pgs/script/find_pathpins.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/path_pinned /) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l pin $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - if (/path_unpinned/) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l unpin $what\n"; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/branches/sage/pgs/script/find_requests.pl b/branches/sage/pgs/script/find_requests.pl deleted file mode 100755 index 5144896249413..0000000000000 --- a/branches/sage/pgs/script/find_requests.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - - #print $line . $_ if /0x8d4f6a0/; - $line++; - if (/request_start/) { - my ($c) = /(0x\w+)/; - my ($what) = $'; #'; - chomp $what; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/request_finish/ || - /request_forward/) { - my ($c) = /(0x\w+)/; - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } -} - -for my $c (@waiting) { - print "---- lost request $c $waiting{$c} -$hist{$c} -"; -} diff --git a/branches/sage/pgs/script/find_waiters.pl b/branches/sage/pgs/script/find_waiters.pl deleted file mode 100755 index c89d2b1a49db7..0000000000000 --- a/branches/sage/pgs/script/find_waiters.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/add_waiter/) { - my ($c) = /(0x\w+)/; - my ($what) = / on (.*\])/; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/take_waiting/) { - my ($c) = /(0x\w+)/; - if (/SKIPPING/) { - #print "skipping\n" if /0x8d4f6a0/; - $hist{$c} .= "$line: $_"; - } elsif (/took/) { - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } else { - die "i don't understand: $_"; - } - } -} - -for my $c (@waiting) { - print "---- lost waiter $c $waiting{$c} -$hist{$c} -"; -} diff --git a/branches/sage/pgs/script/fix_modeline.pl b/branches/sage/pgs/script/fix_modeline.pl deleted file mode 100755 index 8eadde9b54e56..0000000000000 --- a/branches/sage/pgs/script/fix_modeline.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $old = `cat $fn`; -my $header = `cat doc/modeline.txt`; - -# strip existing modeline -my $new = $old; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; - -# add correct header -$new = $header . $new; - -if ($new ne $old) { - print "$fn\n"; - open(O, ">$fn.new"); - print O $new; - close O; - system "diff $fn $fn.new"; - rename "$fn.new", $fn; - #unlink "$fn.new"; -} - diff --git a/branches/sage/pgs/script/grepblock b/branches/sage/pgs/script/grepblock deleted file mode 100755 index f5acf95732abb..0000000000000 --- a/branches/sage/pgs/script/grepblock +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $block = shift ARGV; -die unless int $block; - -while (<>) { - my $yes = 0; - for my $x (/(\d+\~\d+)/) { - my ($s,$l) = split(/\~/,$x); - $yes = 1 if ($block >= $s && $block < $s+$l); - } - print if $yes; -} diff --git a/branches/sage/pgs/script/merge_trace_rw.pl b/branches/sage/pgs/script/merge_trace_rw.pl deleted file mode 100644 index 378d629ef43f6..0000000000000 --- a/branches/sage/pgs/script/merge_trace_rw.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @file = <>; -sub get_op { - my @op = shift @file; - while (@file && - $file[0] !~ /^[a-z]+$/) { - push( @op, shift @file ); - } - #print "op = ( @op )\n"; - return @op; -} - -my $n = 0; -while (@file) { - my ($op, @args) = &get_op; - while ($op eq "read\n" || - $op eq "write\n") { - die unless scalar(@args) == 3; - my ($nop, @nargs) = &get_op; - if ($nop eq $op - && ($args[0] == $nargs[0] ) - && ($args[2] + $args[1] == $nargs[2]) - ) { - die unless scalar(@nargs) == 3; - $args[1] += $nargs[1]; - $args[1] .= "\n"; - die unless scalar(@args) == 3; - #print STDOUT "combining $n $op @args\n"; - $n++; - } else { -# print STDERR "not combinging\n"; - unshift( @file, $nop, @nargs ); - die unless scalar(@args) == 3; - last; - } - } - print $op; - print join('', @args); -} diff --git a/branches/sage/pgs/script/profonly.pl b/branches/sage/pgs/script/profonly.pl deleted file mode 100755 index 6a05dec473ca0..0000000000000 --- a/branches/sage/pgs/script/profonly.pl +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my $rank = shift @ARGV; -my $args = join(' ',@ARGV); -if 
($rank == $ENV{MPD_JRANK}) { - $c = "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"; -} else { - $c = "./newsyn.nopg $args"; -} - -#print "$rank: $c\n"; -system $c; diff --git a/branches/sage/pgs/script/runset.pl b/branches/sage/pgs/script/runset.pl deleted file mode 100755 index 966cf4e5100cb..0000000000000 --- a/branches/sage/pgs/script/runset.pl +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - -=item sample input file - -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 400,#[10, 50, 100, 200, 400], - - # parameters - 'fs' => [ 'ebofs', 'fakestore' ], - 'until' => 150, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ], - 'writefile_mb' => 1000, - - 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0'; - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 120, - - '_psub' => 'alc.tp' # switch to psub mode! -}; - -=cut - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'comb'}; -delete $sim->{'comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif (!(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - - $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ 
s/ /_/g; - my $fn = $out . '/' . $keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'no_mkfs'}; - $c .= " --$h->{'fs'}" if $h->{'fs'}; - $c .= " --syn until $h->{'until'}" if $h->{'until'}; - - $c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'}; - $c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'}; - $c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'}; - $c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'}; - - if ($h->{'ebofs_freelist'}) { - system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist"; - $c .= " --osd_age_time -1"; - } - - for my $k ('nummds', 'numclient', 'numosd', 'kill_after', - 'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits', - 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife', - 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr', - 'mds_cache_size','mds_log_max_len', - 'mds_local_osd', - 'osd_age_time','osd_age', - 'osd_rep', - 'osd_pad_pg_log','ebofs_realloc', - 'osd_balance_reads', - 'tcp_multi_out', - 'client_cache_stat_ttl','client_cache_readdir_ttl', - 'client_oc', - 'fake_osdmap_updates', - 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms', - 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc', - 'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep', - 'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep', - 'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') { - $c .= " --$k $h->{$k}" if defined $h->{$k}; - } - - $c .= ' ' . $h->{'custom'} if $h->{'custom'}; - - $c .= " --log_name $relout/$keys"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin = 1 + int ($h->{'kill_after'} / 60); - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'sleep'}\n"; - sleep $h->{'sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/branches/sage/pgs/script/sum.pl b/branches/sage/pgs/script/sum.pl deleted file mode 100755 index 92ef9a9b222a8..0000000000000 --- a/branches/sage/pgs/script/sum.pl +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $starttime = 1; -my $endtime = -1; - -my $avgrows = 0; - -while ($ARGV[0] =~ /^-/) { - $_ = shift @ARGV; - if ($_ eq '-avg') { - $avgrows = 1; - } - elsif ($_ eq '-start') { - $starttime = shift @ARGV; - } - elsif ($_ eq '-end') { - $endtime = shift @ARGV; - } - else { - die "i don't understand arg $_"; - } -} -my @files = @ARGV; - -if (scalar(@files) == 1 && $files[0] =~ /\*/) { - my ($dir, $pat) = $files[0] =~ /^(.*)\/([^\/]+)$/; - @files = (); - $pat =~ s/\*//; -# print "dir $dir pat $pat\n"; - opendir(D,"$dir"); - for my $f (readdir(D)) { - # print "$f\n"; - next unless $f =~ /^$pat/; - push(@files, "$dir/$f"); - } - closedir(D); - -# print "files = @files\n"; -} - -my @data; -for my $f (@files) { - open(I,$f); - push( @data, ); - close I; -} - -my %sum; # time -> name -> val -my %col; # colnum -> name .. colnums start at 0 (time doesn't count) -my %min; -my %max; -my %avg; -my %tcount; -my $files; -for (@data) { - chomp; - my @r = split(/\s+/,$_); - my $r = shift @r; - - # column headings? 
- if ($r =~ /^\#/) { - my $num = 0; - while (my $name = shift @r) { - $col{$num} = $name; - $num++; - } - next; - } - - next unless int $r; - next if $r < $starttime; - next if $endtime > 0 && $r > $endtime; - - $tcount{$r}++; - $files = $tcount{$r} if $tcount{$r} > $files; - #print "$r: @r\n"; - my $i = 0; - while (@r) { - my $v = shift @r; - $sum{$r}->{$col{$i}} += $v; # if $v > 0; - - $min{$col{$i}} = $v - if ($min{$col{$i}} > $v || !(defined $min{$col{$i}})); - $max{$col{$i}} = $v - if ($max{$col{$i}} < $v); - - $avg{$col{$i}} += $v; - $i++; - } -} - -## dump -my @c = sort {$a <=> $b} keys %col; -# cols -print join("\t",'#', map { $col{$_} } @c) . "\n"; -my $n = 0; -for my $k (sort {$a <=> $b} keys %sum) { - if ($avgrows) { - print join("\t",$k, #map int, - map { $sum{$k}->{$col{$_}}/$tcount{$k} } @c ) . "\n"; - } else { - print join("\t",$k, map { $sum{$k}->{$col{$_}} } @c ) . "\n"; - } - $n++; -} - -my $rows = $n || 1; -#my $files = $tcount{$starttime}; -my %avgval; - -## devt -#warn "rows $rows, files $files\n"; -my %avgvalvart; # std dev of each col avg, over time -for my $k (keys %avg) { - my $av = $avgval{$k} = $avg{$k} / ($rows*$files); - - my $var = 0.0; - for my $t (sort {$a <=> $b} keys %sum) { - my $a = $sum{$t}->{$k} / $files; - $var += ($a - $av) * ($a - $av); - } - - $avgvalvart{$k} = $var / $rows; -} - - - - -print "\n"; -print join("\t",'#', map { $col{$_} } @c) . "\n"; -print join("\t", '#minval', map { $min{$col{$_}} } @c ) . "\n"; -print join("\t", '#maxval', map { $max{$col{$_}} } @c ) . "\n"; -print join("\t", '#rows', map { $rows } @c) . "\n"; -print join("\t", '#files', map { $files } @c) . "\n"; -print join("\t", '#sum', - map { $avg{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgval', #map int, - map { $avgval{$col{$_}} } @c ) . "\n"; -# map { ($rows*$files) ? ($_ / ($rows*$files)):0 } map { $avg{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgvalvart', - map { $avgvalvart{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgvaldevt', - map { sqrt($_) } map { $avgvalvart{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgsum', #map int, - map { $_ / $rows } map { $avg{$col{$_}} } @c ) . 
"\n"; diff --git a/branches/sage/pgs/test/fakemds.cc b/branches/sage/pgs/test/fakemds.cc deleted file mode 100644 index b75b62d58152c..0000000000000 --- a/branches/sage/pgs/test/fakemds.cc +++ /dev/null @@ -1,104 +0,0 @@ - - -#include -#include -#include - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCluster.h" -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/FakeMessenger.h" - -#include "messages/MPing.h" - -using namespace std; - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_fakeclient - -// this parses find output -int play(); - -int main(int oargc, char **oargv) { - cerr << "hi there" << endl; - - int argc; - char **argv; - parse_config_options(oargc, oargv, - argc, argv); - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // local config settings - g_conf.num_client = g_conf.num_fakeclient; // to fool mds, hack gross - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iinit(); - } - - // mount clients - for (int i=0; imount(); - - // loop - fakemessenger_do_loop(); - - //mds[0]->shutdown_start(); - //fakemessenger_do_loop(); - - // - if (argc > 1 && - strcmp(argv[1], "nocheck") == 0) { - cerr << "---- nocheck" << endl; - } else { - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - - // cleanup - cout << "cleanup" << endl; - for (int i=0; i - * Daniel Jönsson - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the Do What The Fuck You Want To - * Public License as published by Banlu Kemiyatorn. See - * http://sam.zoy.org/projects/COPYING.WTFPL for more details. 
- * - * Compilation example: - * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl - * - * Usage example: - * LD_PRELOAD=./gprof-helper.so your_program - */ - -#define _GNU_SOURCE -#include <sys/time.h> -#include <stdio.h> -#include <stdlib.h> -#include <dlfcn.h> -#include <pthread.h> - -static void * wrapper_routine(void *); - -/* Original pthread function */ -static int (*pthread_create_orig)(pthread_t *__restrict, - __const pthread_attr_t *__restrict, - void *(*)(void *), - void *__restrict) = NULL; - -/* Library initialization function */ -void wooinit(void) __attribute__((constructor)); - -void wooinit(void) -{ - pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create"); - fprintf(stderr, "pthreads: using profiling hooks for gprof\n"); - if(pthread_create_orig == NULL) - { - char *error = dlerror(); - if(error == NULL) - { - error = "pthread_create is NULL"; - } - fprintf(stderr, "%s\n", error); - exit(EXIT_FAILURE); - } -} - -/* Our data structure passed to the wrapper */ -typedef struct wrapper_s -{ - void * (*start_routine)(void *); - void * arg; - - pthread_mutex_t lock; - pthread_cond_t wait; - - struct itimerval itimer; - -} wrapper_t; - -/* The wrapper function in charge for setting the itimer value */ -static void * wrapper_routine(void * data) -{ - /* Put user data in thread-local variables */ - void * (*start_routine)(void *) = ((wrapper_t*)data)->start_routine; - void * arg = ((wrapper_t*)data)->arg; - - /* Set the profile timer value */ - setitimer(ITIMER_PROF, &((wrapper_t*)data)->itimer, NULL); - - /* Tell the calling thread that we don't need its data anymore */ - pthread_mutex_lock(&((wrapper_t*)data)->lock); - pthread_cond_signal(&((wrapper_t*)data)->wait); - pthread_mutex_unlock(&((wrapper_t*)data)->lock); - - /* Call the real function */ - return start_routine(arg); -} - -/* Our wrapper function for the real pthread_create() */ -int pthread_create(pthread_t *__restrict thread, - __const pthread_attr_t *__restrict attr, - void * (*start_routine)(void *), - void *__restrict arg) -{ - wrapper_t wrapper_data; - int i_return; - - /* Initialize the wrapper structure */ - wrapper_data.start_routine = start_routine; - wrapper_data.arg = arg; - getitimer(ITIMER_PROF, &wrapper_data.itimer); - pthread_cond_init(&wrapper_data.wait, NULL); - pthread_mutex_init(&wrapper_data.lock, NULL); - pthread_mutex_lock(&wrapper_data.lock); - - /* The real pthread_create call */ - i_return = pthread_create_orig(thread, - attr, - &wrapper_routine, - &wrapper_data); - - /* If the thread was successfully spawned, wait for the data - * to be released */ - if(i_return == 0) - { - pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock); - } - - pthread_mutex_unlock(&wrapper_data.lock); - pthread_mutex_destroy(&wrapper_data.lock); - pthread_cond_destroy(&wrapper_data.wait); - - return i_return; -} - diff --git a/branches/sage/pgs/test/makedirs.cc b/branches/sage/pgs/test/makedirs.cc deleted file mode 100644 index 8fd74d996ef9f..0000000000000 --- a/branches/sage/pgs/test/makedirs.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -using namespace std; - -int make_dirs(const char *basedir, int dirs, int files, int depth) -{ - //if (time_to_stop()) return 0; - - // make sure base dir exists - int r = mkdir(basedir, 0755); - if (r != 0) { - cout << "can't make base dir? 
" << basedir << endl; - return -1; - } - - // children - char d[500]; - cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "mds/MDCluster.h" -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/MPIMessenger.h" -//#include "msg/CheesySerializer.h" - -#include "messages/MPing.h" - - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -// this parses find output -int play(); - -int main(int argc, char **argv) { - cout << "mpitest starting" << endl; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iset_dispatcher(serializer); - - client[i] = new FakeClient(mdc, i, real, g_conf.fakeclient_requests); - client[i]->init(); - } - - // seed initial requests - for (int i=0; iissue_request(); - } - - mpimessenger_start(); // start message loop - mpimessenger_wait(); // wait for thread to finish - mpimessenger_shutdown(); // shutdown MPI - - // - /* - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - */ - - // cleanup - //cout << "cleanup" << endl; - for (int i=0; i -#include "mpi.h" - -#include "messages/MClientRequest.h" -#include "msg/MTMessenger.h" -#include "include/error.h" - -#define SARG_SIZE 64 -#define SERVER_RANK 0 -#define NTHREADS 11 // number of threads per rank -#define NMESSAGES 31 // number of messages per thread - -static void server_loop(MTMessenger &msgr, int world_size) -{ - // we expect this many messages from clients, then we quit - // (world_size-1 since server is one of the processes). 
- int totmsg = NTHREADS * NMESSAGES * (world_size - 1); - int nmsg = 0; - - char buf[SARG_SIZE]; - - while(nmsg < totmsg) { - MClientRequest *req = (MClientRequest*)msgr.recvreq(); - ASSERT(req->get_type() == MSG_CLIENT_REQUEST); - - //cout << "Server acknowledging " << req->get_sarg() << endl; - - sprintf(buf, "%s reply", req->get_sarg().c_str()); - MClientRequest resp(0, 0); - resp.set_sarg(buf); - msgr.sendresp(req, &resp); - - delete req; - nmsg++; - } - - cout << "Server successful" << endl; -} - -// arguments for client thread start function (see pthread_create) -struct client_arg -{ - MTMessenger *msgr; - int rank; - int thread; -}; - -static void *client_session(void *_carg) -{ - client_arg *carg = (client_arg *)_carg; - - char buf[SARG_SIZE]; - - // repeat some number (arbitrary really) of rounds - for (int i = 0; i < NMESSAGES; i++) { - - // send the message, receive the reply and check reply is as - // expected - - MClientRequest request(0, 0); - sprintf(buf, "r%d:t%d:m%d", carg->rank, carg->thread, i); - request.set_sarg(buf); - - //cout << "Client sending " << request.get_sarg() << endl; - - MClientRequest *resp = - (MClientRequest*)carg->msgr->sendrecv(&request, SERVER_RANK); - - ASSERT(resp->get_type() == MSG_CLIENT_REQUEST); - sprintf(buf, "r%d:t%d:m%d reply", carg->rank, carg->thread, i); - ASSERT(strcmp(buf, resp->get_sarg().c_str()) == 0); - - //cout << "Client verified " << resp->get_sarg() << endl; - - delete resp; - } - - cout << "Client (" << carg->rank << "," << carg->thread - << ") successful" << endl; - - delete carg; - return NULL; -} - -static void launch_clients(MTMessenger &msgr, int rank) -{ - pthread_t tid[NTHREADS]; - - // launch some number (arbitrary really) of threads - for (int i = 0; i < NTHREADS; i++) { - - client_arg *carg = (client_arg*)malloc(sizeof(client_arg)); - ASSERT(carg); - carg->msgr = &msgr; - carg->rank = rank; - carg->thread = i; - - if (pthread_create(&tid[i], NULL, client_session, carg) < 0) - SYSERROR(); - } - - // we must wait for all the threads to exit before returning, - // otherwise we shutdown MPI before while the threads are - // chatting. 
- for (int i = 0; i < NTHREADS; i++) { - void *retval; - - if (pthread_join(tid[i], &retval) < 0) - SYSERROR(); - } -} - -int main(int argc, char **argv) -{ - MTMessenger msgr(argc, argv); - - int rank; - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS); - int world_size; - ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS); - - if (rank == SERVER_RANK) - server_loop(msgr, world_size); - else - launch_clients(msgr, rank); - - return 0; -} diff --git a/branches/sage/pgs/test/rushconfig b/branches/sage/pgs/test/rushconfig deleted file mode 100644 index 40d82702ea0a5..0000000000000 --- a/branches/sage/pgs/test/rushconfig +++ /dev/null @@ -1,7 +0,0 @@ -6 -8 10.0 -4 20.0 -7 30.0 -9 10.0 -8 15.0 -5 11.0 diff --git a/branches/sage/pgs/test/rushtest.cc b/branches/sage/pgs/test/rushtest.cc deleted file mode 100644 index ecff83523e0c6..0000000000000 --- a/branches/sage/pgs/test/rushtest.cc +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "../osd/rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/branches/sage/pgs/test/rushtest.cc~ b/branches/sage/pgs/test/rushtest.cc~ deleted file mode 100644 index 0b9512ccd0c3d..0000000000000 --- a/branches/sage/pgs/test/rushtest.cc~ +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/branches/sage/pgs/test/testbucket.cc b/branches/sage/pgs/test/testbucket.cc deleted file mode 100644 index d8676da18faba..0000000000000 --- a/branches/sage/pgs/test/testbucket.cc +++ /dev/null @@ -1,67 +0,0 @@ - - -#include "../crush/Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i 
disks; - for (int i=0; i<20; i++) - disks.push_back(i); - - - /* - UniformBucket ub(1, 1, 0, 10, disks); - ub.make_primes(h); - cout << "primes are " << ub.primes << endl; - */ - - MixedBucket mb(2, 1); - for (int i=0;i<20;i++) - mb.add_item(i, 10); - - /* - MixedBucket b(3, 1); - b.add_item(1, ub.get_weight()); - b.add_item(2, mb.get_weight()); - */ - MixedBucket b= mb; - - vector ocount(disks.size()); - int numrep = 3; - - vector v(numrep); - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -using namespace std; - -#include "include/bufferlist.h" - - -int main() -{ - - bufferptr p1 = new buffer("123456",6); - bufferptr p2 = p1; - - cout << "it is '" << p1.c_str() << "'" << endl; - - bufferptr p3 = new buffer("abcdef",6); - - cout << "p3 is " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - cout << "len is " << bl.length() << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << "leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - -} diff --git a/branches/sage/pgs/test/testcrush.cc b/branches/sage/pgs/test/testcrush.cc deleted file mode 100644 index bd432b23ee95c..0000000000000 --- a/branches/sage/pgs/test/testcrush.cc +++ /dev/null @@ -1,266 +0,0 @@ - - -#include "../crush/crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -/* -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i& d) -{ - d.clear(); - while (n) { - d.push_back(no); - no++; - n--; - } -} - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks, int& nbuckets) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(nbuckets--, h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks, int& nbuckets) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks, nbuckets); - return b->get_id(); -} - - - -int main() -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - vector disks; - int root = -1; - int nbuckets = -1; - int ndisks = 0; - - if (0) { - make_disks(12, ndisks, disks); - UniformBucket ub1(-1, 1, 0, 30, disks); - ub1.make_primes(h); - cout << "ub1 primes are " << ub1.primes << endl; - c.add_bucket(&ub1); - - make_disks(17, ndisks, disks); - UniformBucket ub2(-2, 1, 0, 30, disks); - ub2.make_primes(h); - cout << "ub2 primes are " << ub2.primes << endl; - c.add_bucket(&ub2); - - make_disks(4, ndisks, disks); - UniformBucket ub3(-3, 1, 0, 30, disks); - ub3.make_primes(h); - cout << "ub3 primes are " << ub3.primes << endl; - c.add_bucket(&ub3); - - make_disks(20, ndisks, disks); - MixedBucket umb1(-4, 1); - for (int i=0; i<20; i++) - umb1.add_item(disks[i], 30); - c.add_bucket(&umb1); - - MixedBucket b(-100, 1); - //b.add_item(-2, ub1.get_weight()); - b.add_item(-4, umb1.get_weight()); - //b.add_item(-2, ub2.get_weight()); - //b.add_item(-3, ub3.get_weight()); - } - - if (0) { - int bucket = -1; - MixedBucket *root = new MixedBucket(bucket--, 2); - - for (int i=0; i<5; i++) { - MixedBucket *b = new 
MixedBucket(bucket--, 1); - - int n = 5; - - if (1) { - // add n buckets of n disks - for (int j=0; jadd_item(disks[k], 10); - - //b->add_item(disks[j], 10); - c.add_bucket(d); - b->add_item(d->get_id(), d->get_weight()); - } - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } else { - // add n*n disks - make_disks(n*n, ndisks, disks); - for (int k=0; kadd_item(disks[k], 10); - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } - } - - c.add_bucket(root); - } - - - if (1) { - vector wid; - for (int d=0; d<5; d++) - wid.push_back(10); - root = make_hierarchy(c, wid, ndisks, nbuckets); - } - - - - // rule - int numrep = 1; - - Rule rule; - if (0) { - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - } - if (1) { - /* - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - */ - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - cout << ndisks << " disks, " << 1-nbuckets << " buckets" << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z -using namespace std; - -int print(string s) { - filepath fp = s; - cout << "s = " << s << " filepath = " << fp << endl; - cout << " depth " << fp.depth() << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "config.h" -#include "messages/MPing.h" -#include "common/Mutex.h" - -#include "msg/MPIMessenger.h" - -class Pinger : public Dispatcher { -public: - Messenger *messenger; - Pinger(Messenger *m) : messenger(m) { - m->set_dispatcher(this); - } - void dispatch(Message *m) { - //dout(1) << "got incoming " << m << endl; - delete m; - - } -}; - -int main(int argc, char **argv) { - int num = 1000; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - Pinger *p = new Pinger( new MPIMessenger(myrank) ); - - mpimessenger_start(); - - //while (1) { - for (int i=0; i<10000; i++) { - - // ping random nodes - int d = rand() % world; - if (d != myrank) { - //cout << "sending " << i << " to " << d << endl; - p->messenger->send_message(new MPing(), d); - } - - } - - - //cout << "shutting down" << endl; - //p->messenger->shutdown(); - - mpimessenger_wait(); - mpimessenger_shutdown(); // shutdown MPI -} diff --git a/branches/sage/pgs/test/testnewbuffers.cc b/branches/sage/pgs/test/testnewbuffers.cc deleted file mode 100644 index 0fea7571a4572..0000000000000 --- a/branches/sage/pgs/test/testnewbuffers.cc +++ /dev/null @@ -1,91 +0,0 @@ - -#include -#include -using namespace std; - - -#include "include/newbuffer.h" -//#include "include/bufferlist.h" - -#include "common/Thread.h" - - - class Th : public Thread { - public: - bufferlist bl; - Th(bufferlist& o) : bl(o) { } - - void *entry() { - //cout << "start" << endl; - // thrash it a bit. 
- for (int n=0; n<10000; n++) { - bufferlist bl2; - unsigned off = rand() % (bl.length() -1); - unsigned len = 1 + rand() % (bl.length() - off - 1); - bl2.substr_of(bl, off, len); - bufferlist bl3; - bl3.append(bl); - bl3.append(bl2); - //cout << bl3 << endl; - bl2.clear(); - bl3.clear(); - } - //cout << "end" << endl; - } - }; - -int main() -{ - - bufferptr p1 = buffer::copy("123456",7); - //bufferptr p1 = new buffer("123456",7); - bufferptr p2 = p1; - - cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl; - cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl; - - bufferptr p3 = buffer::copy("abcdef",7); - //bufferptr p3 = new buffer("abcdef",7); - - cout << "p3 is " << p3.c_str() << " " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << ", leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - - cout << "bl before " << bl << endl; - - list ls; - for (int t=0; t<40; t++) { - Th *t = new Th(bl); - cout << "create" << endl; - t->create(); - ls.push_back(t); - } - - bl.clear(); - - while (!ls.empty()) { - cout << "join" << endl; - ls.front()->join(); - delete ls.front(); - ls.pop_front(); - } - - cout << "bl after " << bl << endl; - -} diff --git a/branches/sage/pgs/test/testos.cc b/branches/sage/pgs/test/testos.cc deleted file mode 100644 index 24c81590d899c..0000000000000 --- a/branches/sage/pgs/test/testos.cc +++ /dev/null @@ -1,343 +0,0 @@ -/* testos.cc -- simple ObjectStore test harness. - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. 
*/ - - -#include "osd/ObjectStore.h" -#include "ebofs/Ebofs.h" -#include "osbdb/OSBDB.h" -#include "include/buffer.h" - -#include -#include -#include - -#include -#include - -using namespace std; - -static inline unsigned long long -to_usec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000000) - + ((unsigned long long) time.tv_usec)); -} - -static inline unsigned long long -to_msec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000) - + ((unsigned long long) time.tv_usec / 1000)); -} - -int main (int argc, char **argv) -{ - vector args; - char *osd_name = "ebofs"; - unsigned object_size = 1024; - unsigned object_count = 1024; - unsigned write_iter = 64; - unsigned random_seed = ::time(NULL); - char *device = "/tmp/testos"; - char *mountcmd = "mount /tmp/testos"; - char *umountcmd = "umount /tmp/testos"; - - bool ebofs_raw_device = false; - bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL); - - if (argc > 1 - && (strcmp (argv[1], "-h") == 0 - || strcmp (argv[1], "-help") == 0 - || strcmp (argv[1], "--help") == 0)) - { - cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl; - cout << endl; - cout << "Where the arguments are:" << endl << endl; - cout << " store -- store type; default \"ebofs\"" << endl; - cout << " object-size -- size of objects; default 1024" << endl; - cout << " object-count -- number of objects to write; default 1024" - << endl; - cout << " iterations -- write the objects that many times; default 5" - << endl; - cout << " seed -- random seed; default current time" << endl; - exit (0); - } - - argv_to_vec (argc, argv, args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - parse_config_options (args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - - argc = args.size(); - if (argc > 0) - osd_name = args[0]; - if (argc > 1) - object_size = (unsigned) atol (args[1]); - if (argc > 2) - object_count = (unsigned) atol (args[2]); - if (argc > 3) - write_iter = (unsigned) atol (args[3]); - if (argc > 4) - random_seed = (unsigned) atol (args[4]); - - // algin object size to 'long' - object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof (long); - - char *osd_file = new char[32]; - strcpy (osd_file, "/tmp/testos/testos.XXXXXX"); - mktemp (osd_file); - - if (strcasecmp (osd_name, "ebofs") == 0) - { - char *dev_env = getenv ("TESTOS_EBOFS_DEV"); - if (dev_env != NULL) - { - // Assume it is a true device. - strncpy (osd_file, dev_env, 32); - inhibit_remount = true; - ebofs_raw_device = true; - } - } - - if (!inhibit_remount) - { - if (system (mountcmd) != 0) - { - cerr << "mount failed" << endl; - exit (1); - } - } - - ObjectStore *os = NULL; - if (strcasecmp (osd_name, "ebofs") == 0) - { - if (!ebofs_raw_device) - { - FILE *f = fopen (osd_file, "w"); - if (f == NULL) - { - cerr << "failed to open " << osd_file << ": " << strerror (errno) - << endl; - exit (1); - } - // 1G file. 
- fseek (f, 1024 * 1024 * 1024, SEEK_SET); - fputc ('\0', f); - fclose (f); - } - os = new Ebofs (osd_file); - } - else if (strcasecmp (osd_name, "osbdb") == 0) - { - os = new OSBDB (osd_file); - } - else if (strcasecmp (osd_name, "osbdb-btree") == 0) - { - g_conf.bdbstore_btree = true; - os = new OSBDB (osd_file); - } - else - { - cerr << "I don't know about object store \"" << osd_name << "\"" - << endl; - exit (1); - } - - cout << "Writing " << object_count << " objects of size " - << object_size << " to " << osd_name << endl; - - char *val = (char *) malloc (object_size); - char *val2 = (char *) malloc (object_size); - auto_ptr valptr (val); - auto_ptr valptr2(val2); - if (getenv ("TESTOS_UNALIGNED") != NULL) - { - val = val + 1; - val2 = val2 + 1; - } - - for (unsigned i = 0; i < object_size; i++) - { - val[i] = (char) i; - val2[i] = (char) i; - } - object_t *oids = new object_t[object_count]; - - utime_t writes[write_iter]; - utime_t total_write; - utime_t reads[write_iter]; - utime_t total_read; - for (unsigned i = 0; i < write_iter; i++) - { - cerr << "Iteration " << i << endl; - - int ret = os->mkfs(); - if (ret != 0) - { - cerr << "mkfs(" << osd_file << "): " << strerror (-ret) << endl; - exit (1); - } - ret = os->mount(); - if (ret != 0) - { - cerr << "mount(): " << strerror (-ret) << endl; - exit (1); - } - - srandom (random_seed + i); - - for (unsigned j = 0; j < object_count; j++) - { - oids[j].ino = (uint64_t) random() << 32 | random(); - oids[j].bno = random(); - } - - utime_t begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val, object_size); - bufferlist bl; - bl.push_back (bp); - int ret; - if ((ret = os->write (oids[o], 0L, object_size, bl, NULL)) < 0) - cerr << "write " << oids[o] << " failed: " - << strerror (-ret) << endl; - } - os->sync(); - - utime_t end = g_clock.now() - begin; - - cerr << "Write finished in " << end << endl; - total_write += end; - writes[i] = end; - - os->umount(); - sync(); - - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(2) failed" << endl; - exit (1); - } - } - - os->mount(); - - // Shuffle the OIDs. - for (int j = 0; j < object_count; j++) - { - int x = random() % object_count; - if (x < 0) - x = -x; - object_t o = oids[j]; - oids[j] = oids[x]; - oids[x] = o; - } - - begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val2, object_size); - bufferlist bl; - bl.push_back (bp); - - if (os->read (oids[o], 0L, object_size, bl) < 0) - { - cerr << "object " << oids[o] << " not found!" 
<< endl; - } - } - end = g_clock.now() - begin; - - cerr << "Read finished in " << end << endl; - total_read += end; - reads[i] = end; - - os->umount(); - sync(); - - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(2) failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(3) failed" << endl; - exit (1); - } - } - } - - cerr << "Finished in " << (total_write + total_read) << endl; - - double write_mean = ((double) total_write) / ((double) write_iter); - double write_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) writes[i]) - write_mean; - write_sd += x * x; - } - write_sd = sqrt (write_sd / ((double) write_iter)); - - double read_mean = ((double) total_read) / ((double) write_iter); - double read_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) reads[i]) - read_mean; - write_sd += x * x; - } - read_sd = sqrt (read_sd / ((double) write_iter)); - - cout << "TESTOS: write " << osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << write_mean << " " << write_sd << endl; - - cout << "TESTOS: write.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) writes[i]) << " "; - cout << endl; - - cout << "TESTOS: read " << osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << read_mean << " " << read_sd << endl; - - cout << "TESTOS: read.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) reads[i]) << " "; - cout << endl; - - unlink (osd_file); - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(3) failed" << endl; - exit (1); - } - } - exit (0); -} diff --git a/branches/sage/pgs/test/testosbdb.cc b/branches/sage/pgs/test/testosbdb.cc deleted file mode 100644 index 19268e7587531..0000000000000 --- a/branches/sage/pgs/test/testosbdb.cc +++ /dev/null @@ -1,347 +0,0 @@ -/* testosbdb.cc -- test OSBDB. - Copyright (C) 2007 Casey Marshall */ - - -#include -#include "osbdb/OSBDB.h" - -using namespace std; - -int -main (int argc, char **argv) -{ - vector args; - argv_to_vec (argc, argv, args); - parse_config_options (args); - - g_conf.debug_bdbstore = 10; - //g_conf.bdbstore_btree = true; - char dbfile[256]; - strncpy (dbfile, "/tmp/testosbdb/db.XXXXXX", 256); - mktemp (dbfile); - OSBDB *os = new OSBDB(dbfile); - auto_ptr osPtr (os); - os->mkfs(); - os->mount(); - - // Put an object. - object_t oid (0xDEADBEEF00000000ULL, 0xFEEDFACE); - - cout << "sizeof oid_t is " << sizeof (oid_t) << endl; - cout << "offsetof oid_t.id " << offsetof (oid_t, id) << endl; - - cout << sizeof (object_t) << endl; - cout << sizeof (oid.ino) << endl; - cout << sizeof (oid.bno) << endl; - cout << sizeof (oid.rev) << endl; - - // Shouldn't be there. - if (os->exists (oid)) - { - cout << "FAIL: oid shouldn't be there " << oid << endl; - } - - // Write an object. - char *x = (char *) malloc (1024); - memset(x, 0xaa, 1024); - bufferptr bp (x, 1024); - bufferlist bl; - bl.push_back (bp); - - if (os->write (oid, 0L, 1024, bl, NULL) != 1024) - { - cout << "FAIL: writing object" << endl; - } - - os->sync(); - - // Should be there. 
- if (!os->exists (oid)) - { - cout << "FAIL: oid should be there: " << oid << endl; - } - - memset(x, 0, 1024); - if (os->read (oid, 0, 1024, bl) != 1024) - { - cout << "FAIL: reading object" << endl; - } - - for (int i = 0; i < 1024; i++) - { - if ((x[i] & 0xFF) != 0xaa) - { - cout << "FAIL: data read out is different" << endl; - break; - } - } - - // Set some attributes - if (os->setattr (oid, "alpha", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "beta", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "gamma", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "fred", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - - char *attrs = (char *) malloc (1024); - if (os->listattr (oid, attrs, 1024) != 0) - { - cout << "FAIL: listing attributes" << endl; - } - else - { - char *p = attrs; - if (strcmp (p, "alpha") != 0) - { - cout << "FAIL: should be \"alpha:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "beta") != 0) - { - cout << "FAIL: should be \"beta:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "fred") != 0) - { - cout << "FAIL: should be \"fred:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "gamma") != 0) - { - cout << "FAIL: should be \"gamma:\" \"" << p << "\"" << endl; - } - } - - char attrvalue[256]; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "fred", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr fred" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "beta", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr beta" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "gamma", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr gamma" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->setattr (oid, "alpha", "different", strlen("different")) != 0) - cout << "FAIL: setattr overwrite" << endl; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("different", attrvalue, strlen("different")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->rmattr (oid, "alpha") != 0) - { - cout << "FAIL: rmattr alpha" << endl; - } - if (os->rmattr (oid, "fred") != 0) - { - cout << "FAIL: rmattr fred" << endl; - } - if (os->rmattr (oid, "beta") != 0) - { - cout << "FAIL: rmattr beta" << endl; - } - if (os->rmattr (oid, "gamma") != 0) - { - cout << "FAIL: rmattr gamma" << endl; - } - - coll_t cid = 0xCAFEBABE; - if (os->create_collection (cid) != 0) - { - cout << "FAIL: create_collection" << endl; - } - 
if (os->create_collection (cid + 10) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 5) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (42) != 0) - { - cout << "FAIL: create_collection" << endl; - } - - if (os->collection_add (cid, oid) != 0) - { - cout << "FAIL: collection_add" << endl; - } - - list ls; - if (os->list_collections (ls) < 0) - { - cout << "FAIL: list_collections" << endl; - } - cout << "collections: "; - for (list::iterator it = ls.begin(); it != ls.end(); it++) - { - cout << *it << ", "; - } - cout << endl; - - if (os->destroy_collection (0xCAFEBABE + 10) != 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - if (os->destroy_collection (0xCAFEBADE + 10) == 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - object_t oid2 (12345, 12345); - for (int i = 0; i < 8; i++) - { - oid2.rev++; - if (os->collection_add (cid, oid2) != 0) - { - cout << "FAIL: collection_add" << endl; - } - } - for (int i = 0; i < 8; i++) - { - if (os->collection_remove (cid, oid2) != 0) - { - cout << "FAIL: collection_remove" << endl; - } - oid2.rev--; - } - - if (os->collection_setattr (cid, "alpha", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "beta", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "gamma", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "fred", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_setattr (cid, "alpha", "eulavvalue", 10) != 0) - cout << "FAIL: collection setattr overwrite" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "eulavvalue", 10) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 
0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_rmattr (cid, "alpha") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "fred") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "beta") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "gamma") != 0) - cout << "FAIL: collection_rmattr" << endl; - - if (os->collection_rmattr (cid, "alpha") == 0) - cout << "FAIL: collection_rmattr (nonexistent)" << endl; - - // Truncate the object. - if (os->truncate (oid, 512, NULL) != 0) - { - cout << "FAIL: truncate" << endl; - } - - // Expand the object. - if (os->truncate (oid, 1200, NULL) != 0) - { - cout << "FAIL: expand" << endl; - } - - // Delete the object. - if (os->remove (oid) != 0) - { - cout << "FAIL: could not remove object" << endl; - } - - // Shouldn't be there - if (os->exists (oid)) - { - cout << "FAIL: should not be there" << endl; - } - - os->sync(); - exit (0); -} diff --git a/branches/sage/pgs/test/testtree.cc b/branches/sage/pgs/test/testtree.cc deleted file mode 100644 index 2c21bcbe52e25..0000000000000 --- a/branches/sage/pgs/test/testtree.cc +++ /dev/null @@ -1,46 +0,0 @@ - - -#include "../crush/BinaryTree.h" -using namespace crush; - -#include -#include -using namespace std; - -int main() -{ - BinaryTree t; - - vector nodes; - - for (int i=0; i<30; i++) { - cout << "adding " << i << endl; - int n = t.add_node(1); - nodes.push_back(n); - //cout << t << endl; - } - cout << t << endl; - - for (int k=0; k<10000; k++) { - if (rand() % 2) { - cout << "adding" << endl; - nodes.push_back( t.add_node(1) ); - } else { - if (!nodes.empty()) { - //for (int i=0; i -using namespace std; - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char**argv) -{ - int a = 1; - int b = 2; - - mknod("test", 0600, 0); - - cout << "setxattr " << setxattr("test", "asdf", &a, sizeof(a), 0) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "getxattr " << getxattr("test", "asdf", &b, sizeof(b)) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "a is " << a << " and b is " << b << endl; - return 0; -} diff --git a/branches/sage/pgs/valgrind.supp b/branches/sage/pgs/valgrind.supp deleted file mode 100644 index a6154be057544..0000000000000 --- a/branches/sage/pgs/valgrind.supp +++ /dev/null @@ -1,25 +0,0 @@ -# some valgrind suppressions -# to load these automagically, -# cat > ~/.valgrindrc -# --suppressions=valgrind.supp -# - - -# this one makes valgrind shut up about what appears to be a bug in libc's writev. -{ - writev uninit bytes thing -sage - Memcheck:Param - writev(vector[...]) - fun:writev - fun:_ZN11BlockDevice6_writeEijjRN6buffer4listE - fun:_ZN11BlockDevice5do_ioEiRSt4listIPNS_6biovecESaIS2_EE - fun:_ZN11BlockDevice15io_thread_entryEv - fun:_ZN11BlockDevice8IOThread5entryEv - fun:_ZN6Thread11_entry_funcEPv - fun:start_thread - fun:clone - obj:* - obj:* - obj:* - obj:* -} -- 2.39.5