From c80eca7fcec49a21a3da9adc3118ab0d70563165 Mon Sep 17 00:00:00 2001
From: sageweil <sageweil@29311d96-e01e-0410-9327-a35deaab8ce9>
Date: Fri, 19 Jan 2007 19:48:59 +0000
Subject: [PATCH] sage mds branch

git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1019 29311d96-e01e-0410-9327-a35deaab8ce9
---
 branches/sage/cephmds2/COPYING                |  504 +++
 branches/sage/cephmds2/Makefile               |  230 ++
 branches/sage/cephmds2/README                 |   53 +
 branches/sage/cephmds2/TODO                   |  307 ++
 branches/sage/cephmds2/cfuse.cc               |   91 +
 branches/sage/cephmds2/client/Client.cc       | 2614 ++++++++++++
 branches/sage/cephmds2/client/Client.h        |  588 +++
 branches/sage/cephmds2/client/FileCache.cc    |  171 +
 branches/sage/cephmds2/client/FileCache.h     |   65 +
 .../sage/cephmds2/client/SyntheticClient.cc   | 1226 ++++++
 .../sage/cephmds2/client/SyntheticClient.h    |  198 +
 branches/sage/cephmds2/client/Trace.cc        |  125 +
 branches/sage/cephmds2/client/Trace.h         |   75 +
 branches/sage/cephmds2/client/fuse.cc         |  276 ++
 branches/sage/cephmds2/client/fuse.h          |   23 +
 branches/sage/cephmds2/client/ldceph.cc       |  297 ++
 branches/sage/cephmds2/client/msgthread.h     |   25 +
 branches/sage/cephmds2/common/Clock.cc        |   19 +
 branches/sage/cephmds2/common/Clock.h         |  197 +
 branches/sage/cephmds2/common/Cond.h          |  118 +
 branches/sage/cephmds2/common/DecayCounter.h  |   94 +
 branches/sage/cephmds2/common/LogType.h       |  119 +
 branches/sage/cephmds2/common/Logger.cc       |  206 +
 branches/sage/cephmds2/common/Logger.h        |   74 +
 branches/sage/cephmds2/common/Mutex.h         |   68 +
 branches/sage/cephmds2/common/Semaphore.h     |   52 +
 branches/sage/cephmds2/common/Thread.h        |   60 +
 branches/sage/cephmds2/common/ThreadPool.h    |  138 +
 branches/sage/cephmds2/common/Timer.cc        |  220 ++
 branches/sage/cephmds2/common/Timer.h         |  143 +
 branches/sage/cephmds2/config.cc              |  718 ++++
 branches/sage/cephmds2/config.h               |  297 ++
 branches/sage/cephmds2/cosd.cc                |  118 +
 branches/sage/cephmds2/crush/BinaryTree.h     |  271 ++
 branches/sage/cephmds2/crush/Bucket.h         |  618 +++
 branches/sage/cephmds2/crush/Hash.h           |  287 ++
 branches/sage/cephmds2/crush/crush.h          |  521 +++
 .../cephmds2/crush/test/bucket_movement.cc    |  166 +
 .../cephmds2/crush/test/bucket_variance.cc    |  199 +
 .../cephmds2/crush/test/cluster_movement.cc   |  217 +
 .../crush/test/cluster_movement_remove.cc     |  229 ++
 .../crush/test/cluster_movement_rush.cc       |  218 +
 .../cephmds2/crush/test/creeping_failure.cc   |  276 ++
 .../crush/test/creeping_failure_variance.cc   |  281 ++
 .../cephmds2/crush/test/depth_variance.cc     |  185 +
 branches/sage/cephmds2/crush/test/mixed.cc    |  300 ++
 branches/sage/cephmds2/crush/test/movement.cc |  223 ++
 .../cephmds2/crush/test/movement_failed.cc    |  246 ++
 branches/sage/cephmds2/crush/test/overload.cc |  335 ++
 .../cephmds2/crush/test/overload_variance.cc  |  281 ++
 branches/sage/cephmds2/crush/test/sizes.cc    |  131 +
 .../sage/cephmds2/crush/test/smallbucket.cc   |  138 +
 .../sage/cephmds2/crush/test/speed_bucket.cc  |   86 +
 .../sage/cephmds2/crush/test/speed_depth.cc   |  174 +
 .../sage/cephmds2/crush/test/speed_rush.cc    |  145 +
 branches/sage/cephmds2/crush/test/t.cc        |   25 +
 .../sage/cephmds2/crush/test/testbucket.cc    |   61 +
 .../sage/cephmds2/crush/test/testnormal.cc    |   51 +
 branches/sage/cephmds2/doc/Commitdir.txt      |   22 +
 branches/sage/cephmds2/doc/Replication.txt    |   19 +
 branches/sage/cephmds2/doc/caching.txt        |  200 +
 branches/sage/cephmds2/doc/dentries.txt       |    4 +
 branches/sage/cephmds2/doc/file_modes.txt     |   66 +
 branches/sage/cephmds2/doc/header.txt         |   12 +
 branches/sage/cephmds2/doc/inos.txt           |   11 +
 branches/sage/cephmds2/doc/journal.txt        |  108 +
 branches/sage/cephmds2/doc/lazy_posix.txt     |   53 +
 branches/sage/cephmds2/doc/osd_outline.txt    |   37 +
 .../sage/cephmds2/doc/osd_replication.txt     |  226 ++
 branches/sage/cephmds2/doc/performance.txt    |   36 +
 .../cephmds2/doc/shared_write_states_nogo.txt |   39 +
 branches/sage/cephmds2/doc/shutdown.txt       |   13 +
 branches/sage/cephmds2/ebofs/Allocator.cc     |  692 ++++
 branches/sage/cephmds2/ebofs/Allocator.h      |   85 +
 branches/sage/cephmds2/ebofs/BlockDevice.cc   |  769 ++++
 branches/sage/cephmds2/ebofs/BlockDevice.h    |  331 ++
 branches/sage/cephmds2/ebofs/BufferCache.cc   | 1045 +++++
 branches/sage/cephmds2/ebofs/BufferCache.h    |  681 ++++
 branches/sage/cephmds2/ebofs/Cnode.h          |  100 +
 branches/sage/cephmds2/ebofs/Ebofs.cc         | 3169 +++++++++++++++
 branches/sage/cephmds2/ebofs/Ebofs.h          |  323 ++
 branches/sage/cephmds2/ebofs/Onode.h          |  390 ++
 branches/sage/cephmds2/ebofs/Table.h          |  897 +++++
 branches/sage/cephmds2/ebofs/mkfs.ebofs.cc    |  299 ++
 branches/sage/cephmds2/ebofs/nodes.h          |  583 +++
 branches/sage/cephmds2/ebofs/test.ebofs.cc    |  224 ++
 branches/sage/cephmds2/ebofs/types.h          |  168 +
 branches/sage/cephmds2/fakefuse.cc            |  147 +
 branches/sage/cephmds2/fakemon.cc             |  178 +
 branches/sage/cephmds2/fakesyn.cc             |  176 +
 branches/sage/cephmds2/include/Context.h      |  119 +
 branches/sage/cephmds2/include/Distribution.h |   74 +
 branches/sage/cephmds2/include/buffer.h       |  999 +++++
 branches/sage/cephmds2/include/error.h        |   40 +
 branches/sage/cephmds2/include/filepath.h     |  206 +
 branches/sage/cephmds2/include/interval_set.h |  305 ++
 branches/sage/cephmds2/include/lru.h          |  321 ++
 branches/sage/cephmds2/include/object.h       |   91 +
 branches/sage/cephmds2/include/oldbuffer.h    |  357 ++
 .../sage/cephmds2/include/oldbufferlist.h     |  681 ++++
 branches/sage/cephmds2/include/rangeset.h     |  252 ++
 branches/sage/cephmds2/include/statlite.h     |   70 +
 branches/sage/cephmds2/include/types.h        |  537 +++
 branches/sage/cephmds2/include/uofs.h         |   50 +
 branches/sage/cephmds2/jobs/alc.tp            |   38 +
 branches/sage/cephmds2/jobs/alcdat/makedirs   |   45 +
 .../sage/cephmds2/jobs/alcdat/makedirs.big    |   45 +
 .../sage/cephmds2/jobs/alcdat/makedirs.tput   |   46 +
 .../cephmds2/jobs/alcdat/makefiles.shared     |   32 +
 branches/sage/cephmds2/jobs/alcdat/openshared |   32 +
 .../sage/cephmds2/jobs/alcdat/ossh.include    |   45 +
 .../cephmds2/jobs/alcdat/ossh.include.big     |   46 +
 branches/sage/cephmds2/jobs/alcdat/ossh.lib   |   45 +
 .../sage/cephmds2/jobs/alcdat/ossh.lib.big    |   46 +
 branches/sage/cephmds2/jobs/alcdat/striping   |   48 +
 branches/sage/cephmds2/jobs/mds/log_striping  |   36 +
 branches/sage/cephmds2/jobs/mds/makedir_lat   |   33 +
 branches/sage/cephmds2/jobs/mds/makedirs      |   40 +
 branches/sage/cephmds2/jobs/mds/opensshlib    |   44 +
 branches/sage/cephmds2/jobs/meta1             |   19 +
 branches/sage/cephmds2/jobs/meta1.proc.sh     |   14 +
 branches/sage/cephmds2/jobs/osd/ebofs         |   51 +
 branches/sage/cephmds2/jobs/osd/mds_log       |   43 +
 branches/sage/cephmds2/jobs/osd/osd_threads   |   33 +
 branches/sage/cephmds2/jobs/osd/striping      |   78 +
 branches/sage/cephmds2/jobs/osd/wr_lat2       |   44 +
 branches/sage/cephmds2/jobs/osd/write_sizes   |   60 +
 branches/sage/cephmds2/jobs/rados/map_dist    |   32 +
 branches/sage/cephmds2/jobs/rados/rep_lat     |   43 +
 branches/sage/cephmds2/jobs/rados/wr_sizes    |   50 +
 branches/sage/cephmds2/mds/Anchor.h           |   55 +
 branches/sage/cephmds2/mds/AnchorClient.cc    |  149 +
 branches/sage/cephmds2/mds/AnchorClient.h     |   55 +
 branches/sage/cephmds2/mds/AnchorTable.cc     |  347 ++
 branches/sage/cephmds2/mds/AnchorTable.h      |   82 +
 branches/sage/cephmds2/mds/CDentry.cc         |  141 +
 branches/sage/cephmds2/mds/CDentry.h          |  188 +
 branches/sage/cephmds2/mds/CDir.cc            |  914 +++++
 branches/sage/cephmds2/mds/CDir.h             |  706 ++++
 branches/sage/cephmds2/mds/CInode.cc          |  495 +++
 branches/sage/cephmds2/mds/CInode.h           |  757 ++++
 branches/sage/cephmds2/mds/Capability.h       |  214 +
 branches/sage/cephmds2/mds/ClientMap.h        |   74 +
 branches/sage/cephmds2/mds/IdAllocator.cc     |  188 +
 branches/sage/cephmds2/mds/IdAllocator.h      |   78 +
 branches/sage/cephmds2/mds/Lock.h             |  311 ++
 branches/sage/cephmds2/mds/Locker.cc          | 2286 +++++++++++
 branches/sage/cephmds2/mds/Locker.h           |  123 +
 branches/sage/cephmds2/mds/LogEvent.cc        |   86 +
 branches/sage/cephmds2/mds/LogEvent.h         |   97 +
 branches/sage/cephmds2/mds/MDBalancer.cc      |  902 +++++
 branches/sage/cephmds2/mds/MDBalancer.h       |  106 +
 branches/sage/cephmds2/mds/MDCache.cc         | 2580 ++++++++++++
 branches/sage/cephmds2/mds/MDCache.h          |  282 ++
 branches/sage/cephmds2/mds/MDLog.cc           |  371 ++
 branches/sage/cephmds2/mds/MDLog.h            |   91 +
 branches/sage/cephmds2/mds/MDS.cc             |  692 ++++
 branches/sage/cephmds2/mds/MDS.h              |  252 ++
 branches/sage/cephmds2/mds/MDSMap.h           |  103 +
 branches/sage/cephmds2/mds/MDStore.cc         |  786 ++++
 branches/sage/cephmds2/mds/MDStore.h          |   75 +
 branches/sage/cephmds2/mds/Migrator.cc        | 3192 +++++++++++++++
 branches/sage/cephmds2/mds/Migrator.h         |  199 +
 branches/sage/cephmds2/mds/OSDMonitor.cc      |  523 +++
 branches/sage/cephmds2/mds/OSDMonitor.h       |   85 +
 branches/sage/cephmds2/mds/Renamer.cc         |  915 +++++
 branches/sage/cephmds2/mds/Renamer.h          |   98 +
 branches/sage/cephmds2/mds/Server.cc          | 2151 ++++++++++
 branches/sage/cephmds2/mds/Server.h           |  144 +
 branches/sage/cephmds2/mds/events/EAlloc.h    |  110 +
 .../sage/cephmds2/mds/events/EDirUpdate.h     |   97 +
 .../sage/cephmds2/mds/events/EInodeUpdate.h   |   55 +
 branches/sage/cephmds2/mds/events/EMkdir.h    |   62 +
 branches/sage/cephmds2/mds/events/EMknod.h    |   60 +
 .../sage/cephmds2/mds/events/EPurgeFinish.h   |   49 +
 branches/sage/cephmds2/mds/events/EString.h   |   53 +
 branches/sage/cephmds2/mds/events/ETrace.h    |  119 +
 branches/sage/cephmds2/mds/events/EUnlink.h   |   64 +
 branches/sage/cephmds2/mds/journal.cc         |  345 ++
 branches/sage/cephmds2/mds/mdstypes.h         |  135 +
 branches/sage/cephmds2/mds/oldcachestuff.cc   |  944 +++++
 .../sage/cephmds2/messages/MAnchorReply.h     |   74 +
 .../sage/cephmds2/messages/MAnchorRequest.h   |   76 +
 .../sage/cephmds2/messages/MCacheExpire.h     |   95 +
 .../sage/cephmds2/messages/MClientFileCaps.h  |  102 +
 .../messages/MClientInodeAuthUpdate.h         |   46 +
 .../sage/cephmds2/messages/MClientMount.h     |   50 +
 .../sage/cephmds2/messages/MClientMountAck.h  |   59 +
 .../sage/cephmds2/messages/MClientReply.h     |  302 ++
 .../sage/cephmds2/messages/MClientRequest.h   |  201 +
 .../sage/cephmds2/messages/MDentryUnlink.h    |   45 +
 branches/sage/cephmds2/messages/MDirExpire.h  |   50 +
 .../sage/cephmds2/messages/MDirExpireReq.h    |   49 +
 branches/sage/cephmds2/messages/MDirUpdate.h  |   71 +
 branches/sage/cephmds2/messages/MDiscover.h   |   75 +
 .../sage/cephmds2/messages/MDiscoverReply.h   |  266 ++
 branches/sage/cephmds2/messages/MExportDir.h  |  102 +
 .../sage/cephmds2/messages/MExportDirAck.h    |   42 +
 .../cephmds2/messages/MExportDirDiscover.h    |   51 +
 .../cephmds2/messages/MExportDirDiscoverAck.h |   52 +
 .../sage/cephmds2/messages/MExportDirFinish.h |   43 +
 .../sage/cephmds2/messages/MExportDirNotify.h |  111 +
 .../cephmds2/messages/MExportDirNotifyAck.h   |   46 +
 .../sage/cephmds2/messages/MExportDirPrep.h   |  186 +
 .../cephmds2/messages/MExportDirPrepAck.h     |   44 +
 .../cephmds2/messages/MExportDirWarning.h     |   45 +
 branches/sage/cephmds2/messages/MFailure.h    |   49 +
 branches/sage/cephmds2/messages/MFailureAck.h |   42 +
 .../sage/cephmds2/messages/MGenericMessage.h  |   44 +
 branches/sage/cephmds2/messages/MHashDir.h    |   64 +
 branches/sage/cephmds2/messages/MHashDirAck.h |   42 +
 .../sage/cephmds2/messages/MHashDirDiscover.h |   52 +
 .../cephmds2/messages/MHashDirDiscoverAck.h   |   53 +
 .../sage/cephmds2/messages/MHashDirNotify.h   |   50 +
 .../sage/cephmds2/messages/MHashDirPrep.h     |   93 +
 .../sage/cephmds2/messages/MHashDirPrepAck.h  |   43 +
 .../sage/cephmds2/messages/MHashReaddir.h     |   44 +
 .../cephmds2/messages/MHashReaddirReply.h     |   80 +
 branches/sage/cephmds2/messages/MHeartbeat.h  |   81 +
 .../sage/cephmds2/messages/MInodeExpire.h     |   50 +
 .../sage/cephmds2/messages/MInodeFileCaps.h   |   55 +
 branches/sage/cephmds2/messages/MInodeLink.h  |   47 +
 .../sage/cephmds2/messages/MInodeLinkAck.h    |   47 +
 .../sage/cephmds2/messages/MInodeUnlink.h     |   47 +
 .../sage/cephmds2/messages/MInodeUnlinkAck.h  |   44 +
 .../sage/cephmds2/messages/MInodeUpdate.h     |   61 +
 branches/sage/cephmds2/messages/MLock.h       |  128 +
 branches/sage/cephmds2/messages/MMDSBoot.h    |   38 +
 branches/sage/cephmds2/messages/MMDSGetMap.h  |   38 +
 branches/sage/cephmds2/messages/MMDSMap.h     |   69 +
 .../sage/cephmds2/messages/MMonElectionAck.h  |   46 +
 .../cephmds2/messages/MMonElectionCollect.h   |   42 +
 .../cephmds2/messages/MMonElectionRefresh.h   |   51 +
 .../cephmds2/messages/MMonElectionStatus.h    |   50 +
 .../sage/cephmds2/messages/MMonOSDMapInfo.h   |   49 +
 .../sage/cephmds2/messages/MMonOSDMapLease.h  |   49 +
 .../cephmds2/messages/MMonOSDMapLeaseAck.h    |   44 +
 .../cephmds2/messages/MMonOSDMapUpdateAck.h   |   42 +
 .../messages/MMonOSDMapUpdateCommit.h         |   42 +
 .../messages/MMonOSDMapUpdatePrepare.h        |   52 +
 branches/sage/cephmds2/messages/MNSConnect.h  |   45 +
 .../sage/cephmds2/messages/MNSConnectAck.h    |   53 +
 branches/sage/cephmds2/messages/MNSFailure.h  |   52 +
 branches/sage/cephmds2/messages/MNSLookup.h   |   46 +
 .../sage/cephmds2/messages/MNSLookupReply.h   |   44 +
 branches/sage/cephmds2/messages/MNSRegister.h |   59 +
 .../sage/cephmds2/messages/MNSRegisterAck.h   |   53 +
 branches/sage/cephmds2/messages/MOSDBoot.h    |   43 +
 branches/sage/cephmds2/messages/MOSDFailure.h |   54 +
 branches/sage/cephmds2/messages/MOSDGetMap.h  |   45 +
 branches/sage/cephmds2/messages/MOSDIn.h      |   42 +
 branches/sage/cephmds2/messages/MOSDMap.h     |   69 +
 branches/sage/cephmds2/messages/MOSDOp.h      |  214 +
 branches/sage/cephmds2/messages/MOSDOpReply.h |  146 +
 branches/sage/cephmds2/messages/MOSDOut.h     |   42 +
 branches/sage/cephmds2/messages/MOSDPGLog.h   |   61 +
 .../sage/cephmds2/messages/MOSDPGNotify.h     |   54 +
 branches/sage/cephmds2/messages/MOSDPGPeer.h  |   57 +
 .../sage/cephmds2/messages/MOSDPGPeerAck.h    |   69 +
 .../cephmds2/messages/MOSDPGPeerRequest.h     |   50 +
 branches/sage/cephmds2/messages/MOSDPGQuery.h |   51 +
 .../sage/cephmds2/messages/MOSDPGRemove.h     |   51 +
 .../sage/cephmds2/messages/MOSDPGSummary.h    |   65 +
 .../sage/cephmds2/messages/MOSDPGUpdate.h     |   64 +
 branches/sage/cephmds2/messages/MOSDPing.h    |   50 +
 branches/sage/cephmds2/messages/MPing.h       |   41 +
 branches/sage/cephmds2/messages/MPingAck.h    |   40 +
 branches/sage/cephmds2/messages/MRename.h     |   80 +
 branches/sage/cephmds2/messages/MRenameAck.h  |   42 +
 .../sage/cephmds2/messages/MRenameNotify.h    |   80 +
 .../sage/cephmds2/messages/MRenameNotifyAck.h |   40 +
 branches/sage/cephmds2/messages/MRenamePrep.h |   85 +
 branches/sage/cephmds2/messages/MRenameReq.h  |   79 +
 .../sage/cephmds2/messages/MRenameWarning.h   |   40 +
 branches/sage/cephmds2/messages/MUnhashDir.h  |   42 +
 .../sage/cephmds2/messages/MUnhashDirAck.h    |   65 +
 .../sage/cephmds2/messages/MUnhashDirNotify.h |   50 +
 .../cephmds2/messages/MUnhashDirNotifyAck.h   |   42 +
 .../sage/cephmds2/messages/MUnhashDirPrep.h   |   42 +
 .../cephmds2/messages/MUnhashDirPrepAck.h     |   93 +
 branches/sage/cephmds2/mon/Elector.cc         |  227 ++
 branches/sage/cephmds2/mon/Elector.h          |  163 +
 branches/sage/cephmds2/mon/MDSMonitor.cc      |  158 +
 branches/sage/cephmds2/mon/MDSMonitor.h       |   69 +
 branches/sage/cephmds2/mon/MonMap.h           |   63 +
 branches/sage/cephmds2/mon/Monitor.cc         |  260 ++
 branches/sage/cephmds2/mon/Monitor.h          |  114 +
 branches/sage/cephmds2/mon/OSDMonitor.cc      |  869 ++++
 branches/sage/cephmds2/mon/OSDMonitor.h       |  108 +
 branches/sage/cephmds2/msg/Dispatcher.cc      |   27 +
 branches/sage/cephmds2/msg/Dispatcher.h       |   40 +
 branches/sage/cephmds2/msg/FakeMessenger.cc   |  379 ++
 branches/sage/cephmds2/msg/FakeMessenger.h    |   81 +
 branches/sage/cephmds2/msg/HostMonitor.cc     |  235 ++
 branches/sage/cephmds2/msg/HostMonitor.h      |   97 +
 branches/sage/cephmds2/msg/MPIMessenger.cc    |  608 +++
 branches/sage/cephmds2/msg/MPIMessenger.h     |   56 +
 branches/sage/cephmds2/msg/MTMessenger.cc     |  197 +
 branches/sage/cephmds2/msg/MTMessenger.h      |   50 +
 branches/sage/cephmds2/msg/Message.cc         |  442 +++
 branches/sage/cephmds2/msg/Message.h          |  463 +++
 branches/sage/cephmds2/msg/Messenger.cc       |   84 +
 branches/sage/cephmds2/msg/Messenger.h        |   92 +
 branches/sage/cephmds2/msg/NewMessenger.cc    | 1714 ++++++++
 branches/sage/cephmds2/msg/NewMessenger.h     |  305 ++
 branches/sage/cephmds2/msg/NewerMessenger.cc  | 1791 +++++++++
 branches/sage/cephmds2/msg/NewerMessenger.h   |  343 ++
 branches/sage/cephmds2/msg/RWLock.h           |   49 +
 branches/sage/cephmds2/msg/SerialMessenger.h  |   28 +
 branches/sage/cephmds2/msg/TCPDirectory.cc    |  178 +
 branches/sage/cephmds2/msg/TCPDirectory.h     |  110 +
 branches/sage/cephmds2/msg/TCPMessenger.cc    | 1454 +++++++
 branches/sage/cephmds2/msg/TCPMessenger.h     |  115 +
 branches/sage/cephmds2/msg/error.c            |   77 +
 branches/sage/cephmds2/msg/mpistarter.cc      |   62 +
 branches/sage/cephmds2/msg/new_mpistarter.cc  |   43 +
 branches/sage/cephmds2/msg/tcp.cc             |   87 +
 branches/sage/cephmds2/msg/tcp.h              |   37 +
 branches/sage/cephmds2/newsyn.cc              |  420 ++
 branches/sage/cephmds2/osd/Ager.cc            |  326 ++
 branches/sage/cephmds2/osd/Ager.h             |   42 +
 branches/sage/cephmds2/osd/BDBMap.h           |  136 +
 branches/sage/cephmds2/osd/Fake.h             |  249 ++
 branches/sage/cephmds2/osd/FakeStore.cc       |  364 ++
 branches/sage/cephmds2/osd/FakeStore.h        |   87 +
 .../cephmds2/osd/FakeStoreBDBCollections.h    |  168 +
 branches/sage/cephmds2/osd/OBFSStore.cc       |  244 ++
 branches/sage/cephmds2/osd/OBFSStore.h        |   56 +
 branches/sage/cephmds2/osd/OSD.cc             | 3498 +++++++++++++++++
 branches/sage/cephmds2/osd/OSD.h              |  272 ++
 branches/sage/cephmds2/osd/OSDMap.h           |  515 +++
 branches/sage/cephmds2/osd/ObjectStore.cc     |  149 +
 branches/sage/cephmds2/osd/ObjectStore.h      |  479 +++
 branches/sage/cephmds2/osd/PG.cc              | 1312 +++++++
 branches/sage/cephmds2/osd/PG.h               |  735 ++++
 branches/sage/cephmds2/osd/rush.cc            |  230 ++
 branches/sage/cephmds2/osd/rush.h             |   60 +
 branches/sage/cephmds2/osd/tp.cc              |   80 +
 branches/sage/cephmds2/osdc/Blinker.h         |   91 +
 branches/sage/cephmds2/osdc/Filer.cc          |  235 ++
 branches/sage/cephmds2/osdc/Filer.h           |  158 +
 branches/sage/cephmds2/osdc/Journaler.cc      |  601 +++
 branches/sage/cephmds2/osdc/Journaler.h       |  218 +
 branches/sage/cephmds2/osdc/ObjectCacher.cc   | 1472 +++++++
 branches/sage/cephmds2/osdc/ObjectCacher.h    |  547 +++
 branches/sage/cephmds2/osdc/Objecter.cc       |  831 ++++
 branches/sage/cephmds2/osdc/Objecter.h        |  191 +
 branches/sage/cephmds2/script/add_header.pl   |   29 +
 branches/sage/cephmds2/script/adjusttabs.pl   |   24 +
 .../sage/cephmds2/script/clean_osd_cow.sh     |    3 +
 branches/sage/cephmds2/script/clean_trace.pl  |    8 +
 branches/sage/cephmds2/script/comb.pl         |  113 +
 .../sage/cephmds2/script/find_auth_pins.pl    |   46 +
 .../sage/cephmds2/script/find_bufferleaks.pl  |   69 +
 .../cephmds2/script/find_lost_bdev_ops.pl     |   34 +
 .../sage/cephmds2/script/find_lost_commit.pl  |   38 +
 .../cephmds2/script/find_lost_objecter.pl     |   34 +
 .../sage/cephmds2/script/find_pathpins.pl     |   41 +
 .../sage/cephmds2/script/find_requests.pl     |   42 +
 branches/sage/cephmds2/script/find_waiters.pl |   46 +
 branches/sage/cephmds2/script/grepblock       |   15 +
 .../sage/cephmds2/script/merge_trace_rw.pl    |   42 +
 branches/sage/cephmds2/script/profonly.pl     |   12 +
 branches/sage/cephmds2/script/runset.pl       |  380 ++
 branches/sage/cephmds2/script/sum.pl          |  148 +
 branches/sage/cephmds2/tcpfuse.cc             |   80 +
 branches/sage/cephmds2/tcpsyn.cc              |  292 ++
 branches/sage/cephmds2/test/fakemds.cc        |  104 +
 branches/sage/cephmds2/test/gprof-helper.c    |  120 +
 branches/sage/cephmds2/test/makedirs.cc       |   38 +
 branches/sage/cephmds2/test/mpitest.cc        |  111 +
 branches/sage/cephmds2/test/mttest.cc         |  140 +
 branches/sage/cephmds2/test/rushconfig        |    7 +
 branches/sage/cephmds2/test/rushtest.cc       |   49 +
 branches/sage/cephmds2/test/rushtest.cc~      |   49 +
 branches/sage/cephmds2/test/testbucket.cc     |   67 +
 branches/sage/cephmds2/test/testbuffers.cc    |   40 +
 branches/sage/cephmds2/test/testcrush.cc      |  266 ++
 branches/sage/cephmds2/test/testfilepath.cc   |   22 +
 branches/sage/cephmds2/test/testmpi.cc        |   53 +
 branches/sage/cephmds2/test/testnewbuffers.cc |   91 +
 branches/sage/cephmds2/test/testtree.cc       |   46 +
 branches/sage/cephmds2/test/testxattr.cc      |   31 +
 383 files changed, 88774 insertions(+)
 create mode 100644 branches/sage/cephmds2/COPYING
 create mode 100644 branches/sage/cephmds2/Makefile
 create mode 100644 branches/sage/cephmds2/README
 create mode 100644 branches/sage/cephmds2/TODO
 create mode 100644 branches/sage/cephmds2/cfuse.cc
 create mode 100644 branches/sage/cephmds2/client/Client.cc
 create mode 100644 branches/sage/cephmds2/client/Client.h
 create mode 100644 branches/sage/cephmds2/client/FileCache.cc
 create mode 100644 branches/sage/cephmds2/client/FileCache.h
 create mode 100644 branches/sage/cephmds2/client/SyntheticClient.cc
 create mode 100644 branches/sage/cephmds2/client/SyntheticClient.h
 create mode 100644 branches/sage/cephmds2/client/Trace.cc
 create mode 100644 branches/sage/cephmds2/client/Trace.h
 create mode 100644 branches/sage/cephmds2/client/fuse.cc
 create mode 100644 branches/sage/cephmds2/client/fuse.h
 create mode 100644 branches/sage/cephmds2/client/ldceph.cc
 create mode 100644 branches/sage/cephmds2/client/msgthread.h
 create mode 100644 branches/sage/cephmds2/common/Clock.cc
 create mode 100644 branches/sage/cephmds2/common/Clock.h
 create mode 100644 branches/sage/cephmds2/common/Cond.h
 create mode 100644 branches/sage/cephmds2/common/DecayCounter.h
 create mode 100644 branches/sage/cephmds2/common/LogType.h
 create mode 100644 branches/sage/cephmds2/common/Logger.cc
 create mode 100644 branches/sage/cephmds2/common/Logger.h
 create mode 100755 branches/sage/cephmds2/common/Mutex.h
 create mode 100644 branches/sage/cephmds2/common/Semaphore.h
 create mode 100644 branches/sage/cephmds2/common/Thread.h
 create mode 100644 branches/sage/cephmds2/common/ThreadPool.h
 create mode 100644 branches/sage/cephmds2/common/Timer.cc
 create mode 100644 branches/sage/cephmds2/common/Timer.h
 create mode 100644 branches/sage/cephmds2/config.cc
 create mode 100644 branches/sage/cephmds2/config.h
 create mode 100644 branches/sage/cephmds2/cosd.cc
 create mode 100644 branches/sage/cephmds2/crush/BinaryTree.h
 create mode 100644 branches/sage/cephmds2/crush/Bucket.h
 create mode 100644 branches/sage/cephmds2/crush/Hash.h
 create mode 100644 branches/sage/cephmds2/crush/crush.h
 create mode 100644 branches/sage/cephmds2/crush/test/bucket_movement.cc
 create mode 100644 branches/sage/cephmds2/crush/test/bucket_variance.cc
 create mode 100644 branches/sage/cephmds2/crush/test/cluster_movement.cc
 create mode 100644 branches/sage/cephmds2/crush/test/cluster_movement_remove.cc
 create mode 100644 branches/sage/cephmds2/crush/test/cluster_movement_rush.cc
 create mode 100644 branches/sage/cephmds2/crush/test/creeping_failure.cc
 create mode 100644 branches/sage/cephmds2/crush/test/creeping_failure_variance.cc
 create mode 100644 branches/sage/cephmds2/crush/test/depth_variance.cc
 create mode 100644 branches/sage/cephmds2/crush/test/mixed.cc
 create mode 100644 branches/sage/cephmds2/crush/test/movement.cc
 create mode 100644 branches/sage/cephmds2/crush/test/movement_failed.cc
 create mode 100644 branches/sage/cephmds2/crush/test/overload.cc
 create mode 100644 branches/sage/cephmds2/crush/test/overload_variance.cc
 create mode 100644 branches/sage/cephmds2/crush/test/sizes.cc
 create mode 100644 branches/sage/cephmds2/crush/test/smallbucket.cc
 create mode 100644 branches/sage/cephmds2/crush/test/speed_bucket.cc
 create mode 100644 branches/sage/cephmds2/crush/test/speed_depth.cc
 create mode 100644 branches/sage/cephmds2/crush/test/speed_rush.cc
 create mode 100644 branches/sage/cephmds2/crush/test/t.cc
 create mode 100644 branches/sage/cephmds2/crush/test/testbucket.cc
 create mode 100644 branches/sage/cephmds2/crush/test/testnormal.cc
 create mode 100644 branches/sage/cephmds2/doc/Commitdir.txt
 create mode 100644 branches/sage/cephmds2/doc/Replication.txt
 create mode 100644 branches/sage/cephmds2/doc/caching.txt
 create mode 100644 branches/sage/cephmds2/doc/dentries.txt
 create mode 100644 branches/sage/cephmds2/doc/file_modes.txt
 create mode 100644 branches/sage/cephmds2/doc/header.txt
 create mode 100644 branches/sage/cephmds2/doc/inos.txt
 create mode 100644 branches/sage/cephmds2/doc/journal.txt
 create mode 100644 branches/sage/cephmds2/doc/lazy_posix.txt
 create mode 100644 branches/sage/cephmds2/doc/osd_outline.txt
 create mode 100644 branches/sage/cephmds2/doc/osd_replication.txt
 create mode 100644 branches/sage/cephmds2/doc/performance.txt
 create mode 100644 branches/sage/cephmds2/doc/shared_write_states_nogo.txt
 create mode 100644 branches/sage/cephmds2/doc/shutdown.txt
 create mode 100644 branches/sage/cephmds2/ebofs/Allocator.cc
 create mode 100644 branches/sage/cephmds2/ebofs/Allocator.h
 create mode 100644 branches/sage/cephmds2/ebofs/BlockDevice.cc
 create mode 100644 branches/sage/cephmds2/ebofs/BlockDevice.h
 create mode 100644 branches/sage/cephmds2/ebofs/BufferCache.cc
 create mode 100644 branches/sage/cephmds2/ebofs/BufferCache.h
 create mode 100644 branches/sage/cephmds2/ebofs/Cnode.h
 create mode 100644 branches/sage/cephmds2/ebofs/Ebofs.cc
 create mode 100644 branches/sage/cephmds2/ebofs/Ebofs.h
 create mode 100644 branches/sage/cephmds2/ebofs/Onode.h
 create mode 100644 branches/sage/cephmds2/ebofs/Table.h
 create mode 100644 branches/sage/cephmds2/ebofs/mkfs.ebofs.cc
 create mode 100644 branches/sage/cephmds2/ebofs/nodes.h
 create mode 100644 branches/sage/cephmds2/ebofs/test.ebofs.cc
 create mode 100644 branches/sage/cephmds2/ebofs/types.h
 create mode 100644 branches/sage/cephmds2/fakefuse.cc
 create mode 100644 branches/sage/cephmds2/fakemon.cc
 create mode 100644 branches/sage/cephmds2/fakesyn.cc
 create mode 100644 branches/sage/cephmds2/include/Context.h
 create mode 100644 branches/sage/cephmds2/include/Distribution.h
 create mode 100644 branches/sage/cephmds2/include/buffer.h
 create mode 100644 branches/sage/cephmds2/include/error.h
 create mode 100644 branches/sage/cephmds2/include/filepath.h
 create mode 100644 branches/sage/cephmds2/include/interval_set.h
 create mode 100644 branches/sage/cephmds2/include/lru.h
 create mode 100644 branches/sage/cephmds2/include/object.h
 create mode 100644 branches/sage/cephmds2/include/oldbuffer.h
 create mode 100644 branches/sage/cephmds2/include/oldbufferlist.h
 create mode 100644 branches/sage/cephmds2/include/rangeset.h
 create mode 100644 branches/sage/cephmds2/include/statlite.h
 create mode 100644 branches/sage/cephmds2/include/types.h
 create mode 100644 branches/sage/cephmds2/include/uofs.h
 create mode 100644 branches/sage/cephmds2/jobs/alc.tp
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/makedirs
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/makedirs.big
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/makedirs.tput
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/makefiles.shared
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/openshared
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/ossh.include
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/ossh.include.big
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/ossh.lib
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/ossh.lib.big
 create mode 100644 branches/sage/cephmds2/jobs/alcdat/striping
 create mode 100644 branches/sage/cephmds2/jobs/mds/log_striping
 create mode 100644 branches/sage/cephmds2/jobs/mds/makedir_lat
 create mode 100644 branches/sage/cephmds2/jobs/mds/makedirs
 create mode 100644 branches/sage/cephmds2/jobs/mds/opensshlib
 create mode 100644 branches/sage/cephmds2/jobs/meta1
 create mode 100755 branches/sage/cephmds2/jobs/meta1.proc.sh
 create mode 100644 branches/sage/cephmds2/jobs/osd/ebofs
 create mode 100644 branches/sage/cephmds2/jobs/osd/mds_log
 create mode 100644 branches/sage/cephmds2/jobs/osd/osd_threads
 create mode 100644 branches/sage/cephmds2/jobs/osd/striping
 create mode 100644 branches/sage/cephmds2/jobs/osd/wr_lat2
 create mode 100644 branches/sage/cephmds2/jobs/osd/write_sizes
 create mode 100644 branches/sage/cephmds2/jobs/rados/map_dist
 create mode 100644 branches/sage/cephmds2/jobs/rados/rep_lat
 create mode 100644 branches/sage/cephmds2/jobs/rados/wr_sizes
 create mode 100644 branches/sage/cephmds2/mds/Anchor.h
 create mode 100644 branches/sage/cephmds2/mds/AnchorClient.cc
 create mode 100644 branches/sage/cephmds2/mds/AnchorClient.h
 create mode 100644 branches/sage/cephmds2/mds/AnchorTable.cc
 create mode 100644 branches/sage/cephmds2/mds/AnchorTable.h
 create mode 100644 branches/sage/cephmds2/mds/CDentry.cc
 create mode 100644 branches/sage/cephmds2/mds/CDentry.h
 create mode 100644 branches/sage/cephmds2/mds/CDir.cc
 create mode 100644 branches/sage/cephmds2/mds/CDir.h
 create mode 100644 branches/sage/cephmds2/mds/CInode.cc
 create mode 100644 branches/sage/cephmds2/mds/CInode.h
 create mode 100644 branches/sage/cephmds2/mds/Capability.h
 create mode 100644 branches/sage/cephmds2/mds/ClientMap.h
 create mode 100644 branches/sage/cephmds2/mds/IdAllocator.cc
 create mode 100644 branches/sage/cephmds2/mds/IdAllocator.h
 create mode 100644 branches/sage/cephmds2/mds/Lock.h
 create mode 100644 branches/sage/cephmds2/mds/Locker.cc
 create mode 100644 branches/sage/cephmds2/mds/Locker.h
 create mode 100644 branches/sage/cephmds2/mds/LogEvent.cc
 create mode 100644 branches/sage/cephmds2/mds/LogEvent.h
 create mode 100644 branches/sage/cephmds2/mds/MDBalancer.cc
 create mode 100644 branches/sage/cephmds2/mds/MDBalancer.h
 create mode 100644 branches/sage/cephmds2/mds/MDCache.cc
 create mode 100644 branches/sage/cephmds2/mds/MDCache.h
 create mode 100644 branches/sage/cephmds2/mds/MDLog.cc
 create mode 100644 branches/sage/cephmds2/mds/MDLog.h
 create mode 100644 branches/sage/cephmds2/mds/MDS.cc
 create mode 100644 branches/sage/cephmds2/mds/MDS.h
 create mode 100644 branches/sage/cephmds2/mds/MDSMap.h
 create mode 100644 branches/sage/cephmds2/mds/MDStore.cc
 create mode 100644 branches/sage/cephmds2/mds/MDStore.h
 create mode 100644 branches/sage/cephmds2/mds/Migrator.cc
 create mode 100644 branches/sage/cephmds2/mds/Migrator.h
 create mode 100644 branches/sage/cephmds2/mds/OSDMonitor.cc
 create mode 100644 branches/sage/cephmds2/mds/OSDMonitor.h
 create mode 100644 branches/sage/cephmds2/mds/Renamer.cc
 create mode 100644 branches/sage/cephmds2/mds/Renamer.h
 create mode 100644 branches/sage/cephmds2/mds/Server.cc
 create mode 100644 branches/sage/cephmds2/mds/Server.h
 create mode 100644 branches/sage/cephmds2/mds/events/EAlloc.h
 create mode 100644 branches/sage/cephmds2/mds/events/EDirUpdate.h
 create mode 100644 branches/sage/cephmds2/mds/events/EInodeUpdate.h
 create mode 100644 branches/sage/cephmds2/mds/events/EMkdir.h
 create mode 100644 branches/sage/cephmds2/mds/events/EMknod.h
 create mode 100644 branches/sage/cephmds2/mds/events/EPurgeFinish.h
 create mode 100644 branches/sage/cephmds2/mds/events/EString.h
 create mode 100644 branches/sage/cephmds2/mds/events/ETrace.h
 create mode 100644 branches/sage/cephmds2/mds/events/EUnlink.h
 create mode 100644 branches/sage/cephmds2/mds/journal.cc
 create mode 100644 branches/sage/cephmds2/mds/mdstypes.h
 create mode 100644 branches/sage/cephmds2/mds/oldcachestuff.cc
 create mode 100644 branches/sage/cephmds2/messages/MAnchorReply.h
 create mode 100644 branches/sage/cephmds2/messages/MAnchorRequest.h
 create mode 100644 branches/sage/cephmds2/messages/MCacheExpire.h
 create mode 100644 branches/sage/cephmds2/messages/MClientFileCaps.h
 create mode 100644 branches/sage/cephmds2/messages/MClientInodeAuthUpdate.h
 create mode 100644 branches/sage/cephmds2/messages/MClientMount.h
 create mode 100644 branches/sage/cephmds2/messages/MClientMountAck.h
 create mode 100644 branches/sage/cephmds2/messages/MClientReply.h
 create mode 100644 branches/sage/cephmds2/messages/MClientRequest.h
 create mode 100644 branches/sage/cephmds2/messages/MDentryUnlink.h
 create mode 100644 branches/sage/cephmds2/messages/MDirExpire.h
 create mode 100644 branches/sage/cephmds2/messages/MDirExpireReq.h
 create mode 100644 branches/sage/cephmds2/messages/MDirUpdate.h
 create mode 100644 branches/sage/cephmds2/messages/MDiscover.h
 create mode 100644 branches/sage/cephmds2/messages/MDiscoverReply.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDir.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirAck.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirDiscover.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirDiscoverAck.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirFinish.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirNotify.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirNotifyAck.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirPrep.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirPrepAck.h
 create mode 100644 branches/sage/cephmds2/messages/MExportDirWarning.h
 create mode 100644 branches/sage/cephmds2/messages/MFailure.h
 create mode 100644 branches/sage/cephmds2/messages/MFailureAck.h
 create mode 100644 branches/sage/cephmds2/messages/MGenericMessage.h
 create mode 100644 branches/sage/cephmds2/messages/MHashDir.h
 create mode 100644 branches/sage/cephmds2/messages/MHashDirAck.h
 create mode 100644 branches/sage/cephmds2/messages/MHashDirDiscover.h
 create mode 100644 branches/sage/cephmds2/messages/MHashDirDiscoverAck.h
 create mode 100644 branches/sage/cephmds2/messages/MHashDirNotify.h
 create mode 100644 branches/sage/cephmds2/messages/MHashDirPrep.h
 create mode 100644 branches/sage/cephmds2/messages/MHashDirPrepAck.h
 create mode 100644 branches/sage/cephmds2/messages/MHashReaddir.h
 create mode 100644 branches/sage/cephmds2/messages/MHashReaddirReply.h
 create mode 100644 branches/sage/cephmds2/messages/MHeartbeat.h
 create mode 100644 branches/sage/cephmds2/messages/MInodeExpire.h
 create mode 100644 branches/sage/cephmds2/messages/MInodeFileCaps.h
 create mode 100644 branches/sage/cephmds2/messages/MInodeLink.h
 create mode 100644 branches/sage/cephmds2/messages/MInodeLinkAck.h
 create mode 100644 branches/sage/cephmds2/messages/MInodeUnlink.h
 create mode 100644 branches/sage/cephmds2/messages/MInodeUnlinkAck.h
 create mode 100644 branches/sage/cephmds2/messages/MInodeUpdate.h
 create mode 100644 branches/sage/cephmds2/messages/MLock.h
 create mode 100644 branches/sage/cephmds2/messages/MMDSBoot.h
 create mode 100644 branches/sage/cephmds2/messages/MMDSGetMap.h
 create mode 100644 branches/sage/cephmds2/messages/MMDSMap.h
 create mode 100644 branches/sage/cephmds2/messages/MMonElectionAck.h
 create mode 100644 branches/sage/cephmds2/messages/MMonElectionCollect.h
 create mode 100644 branches/sage/cephmds2/messages/MMonElectionRefresh.h
 create mode 100644 branches/sage/cephmds2/messages/MMonElectionStatus.h
 create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapInfo.h
 create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapLease.h
 create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapLeaseAck.h
 create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapUpdateAck.h
 create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapUpdateCommit.h
 create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapUpdatePrepare.h
 create mode 100644 branches/sage/cephmds2/messages/MNSConnect.h
 create mode 100644 branches/sage/cephmds2/messages/MNSConnectAck.h
 create mode 100644 branches/sage/cephmds2/messages/MNSFailure.h
 create mode 100644 branches/sage/cephmds2/messages/MNSLookup.h
 create mode 100644 branches/sage/cephmds2/messages/MNSLookupReply.h
 create mode 100644 branches/sage/cephmds2/messages/MNSRegister.h
 create mode 100644 branches/sage/cephmds2/messages/MNSRegisterAck.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDBoot.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDFailure.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDGetMap.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDIn.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDMap.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDOp.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDOpReply.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDOut.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGLog.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGNotify.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGPeer.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGPeerAck.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGPeerRequest.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGQuery.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGRemove.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGSummary.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPGUpdate.h
 create mode 100644 branches/sage/cephmds2/messages/MOSDPing.h
 create mode 100644 branches/sage/cephmds2/messages/MPing.h
 create mode 100644 branches/sage/cephmds2/messages/MPingAck.h
 create mode 100644 branches/sage/cephmds2/messages/MRename.h
 create mode 100644 branches/sage/cephmds2/messages/MRenameAck.h
 create mode 100644 branches/sage/cephmds2/messages/MRenameNotify.h
 create mode 100644 branches/sage/cephmds2/messages/MRenameNotifyAck.h
 create mode 100644 branches/sage/cephmds2/messages/MRenamePrep.h
 create mode 100644 branches/sage/cephmds2/messages/MRenameReq.h
 create mode 100644 branches/sage/cephmds2/messages/MRenameWarning.h
 create mode 100644 branches/sage/cephmds2/messages/MUnhashDir.h
 create mode 100644 branches/sage/cephmds2/messages/MUnhashDirAck.h
 create mode 100644 branches/sage/cephmds2/messages/MUnhashDirNotify.h
 create mode 100644 branches/sage/cephmds2/messages/MUnhashDirNotifyAck.h
 create mode 100644 branches/sage/cephmds2/messages/MUnhashDirPrep.h
 create mode 100644 branches/sage/cephmds2/messages/MUnhashDirPrepAck.h
 create mode 100644 branches/sage/cephmds2/mon/Elector.cc
 create mode 100644 branches/sage/cephmds2/mon/Elector.h
 create mode 100644 branches/sage/cephmds2/mon/MDSMonitor.cc
 create mode 100644 branches/sage/cephmds2/mon/MDSMonitor.h
 create mode 100644 branches/sage/cephmds2/mon/MonMap.h
 create mode 100644 branches/sage/cephmds2/mon/Monitor.cc
 create mode 100644 branches/sage/cephmds2/mon/Monitor.h
 create mode 100644 branches/sage/cephmds2/mon/OSDMonitor.cc
 create mode 100644 branches/sage/cephmds2/mon/OSDMonitor.h
 create mode 100644 branches/sage/cephmds2/msg/Dispatcher.cc
 create mode 100644 branches/sage/cephmds2/msg/Dispatcher.h
 create mode 100644 branches/sage/cephmds2/msg/FakeMessenger.cc
 create mode 100644 branches/sage/cephmds2/msg/FakeMessenger.h
 create mode 100644 branches/sage/cephmds2/msg/HostMonitor.cc
 create mode 100644 branches/sage/cephmds2/msg/HostMonitor.h
 create mode 100644 branches/sage/cephmds2/msg/MPIMessenger.cc
 create mode 100644 branches/sage/cephmds2/msg/MPIMessenger.h
 create mode 100644 branches/sage/cephmds2/msg/MTMessenger.cc
 create mode 100644 branches/sage/cephmds2/msg/MTMessenger.h
 create mode 100644 branches/sage/cephmds2/msg/Message.cc
 create mode 100644 branches/sage/cephmds2/msg/Message.h
 create mode 100644 branches/sage/cephmds2/msg/Messenger.cc
 create mode 100644 branches/sage/cephmds2/msg/Messenger.h
 create mode 100644 branches/sage/cephmds2/msg/NewMessenger.cc
 create mode 100644 branches/sage/cephmds2/msg/NewMessenger.h
 create mode 100644 branches/sage/cephmds2/msg/NewerMessenger.cc
 create mode 100644 branches/sage/cephmds2/msg/NewerMessenger.h
 create mode 100644 branches/sage/cephmds2/msg/RWLock.h
 create mode 100644 branches/sage/cephmds2/msg/SerialMessenger.h
 create mode 100644 branches/sage/cephmds2/msg/TCPDirectory.cc
 create mode 100644 branches/sage/cephmds2/msg/TCPDirectory.h
 create mode 100644 branches/sage/cephmds2/msg/TCPMessenger.cc
 create mode 100644 branches/sage/cephmds2/msg/TCPMessenger.h
 create mode 100644 branches/sage/cephmds2/msg/error.c
 create mode 100644 branches/sage/cephmds2/msg/mpistarter.cc
 create mode 100644 branches/sage/cephmds2/msg/new_mpistarter.cc
 create mode 100644 branches/sage/cephmds2/msg/tcp.cc
 create mode 100644 branches/sage/cephmds2/msg/tcp.h
 create mode 100644 branches/sage/cephmds2/newsyn.cc
 create mode 100644 branches/sage/cephmds2/osd/Ager.cc
 create mode 100644 branches/sage/cephmds2/osd/Ager.h
 create mode 100644 branches/sage/cephmds2/osd/BDBMap.h
 create mode 100644 branches/sage/cephmds2/osd/Fake.h
 create mode 100644 branches/sage/cephmds2/osd/FakeStore.cc
 create mode 100644 branches/sage/cephmds2/osd/FakeStore.h
 create mode 100644 branches/sage/cephmds2/osd/FakeStoreBDBCollections.h
 create mode 100644 branches/sage/cephmds2/osd/OBFSStore.cc
 create mode 100644 branches/sage/cephmds2/osd/OBFSStore.h
 create mode 100644 branches/sage/cephmds2/osd/OSD.cc
 create mode 100644 branches/sage/cephmds2/osd/OSD.h
 create mode 100644 branches/sage/cephmds2/osd/OSDMap.h
 create mode 100644 branches/sage/cephmds2/osd/ObjectStore.cc
 create mode 100644 branches/sage/cephmds2/osd/ObjectStore.h
 create mode 100644 branches/sage/cephmds2/osd/PG.cc
 create mode 100644 branches/sage/cephmds2/osd/PG.h
 create mode 100644 branches/sage/cephmds2/osd/rush.cc
 create mode 100644 branches/sage/cephmds2/osd/rush.h
 create mode 100644 branches/sage/cephmds2/osd/tp.cc
 create mode 100644 branches/sage/cephmds2/osdc/Blinker.h
 create mode 100644 branches/sage/cephmds2/osdc/Filer.cc
 create mode 100644 branches/sage/cephmds2/osdc/Filer.h
 create mode 100644 branches/sage/cephmds2/osdc/Journaler.cc
 create mode 100644 branches/sage/cephmds2/osdc/Journaler.h
 create mode 100644 branches/sage/cephmds2/osdc/ObjectCacher.cc
 create mode 100644 branches/sage/cephmds2/osdc/ObjectCacher.h
 create mode 100644 branches/sage/cephmds2/osdc/Objecter.cc
 create mode 100644 branches/sage/cephmds2/osdc/Objecter.h
 create mode 100755 branches/sage/cephmds2/script/add_header.pl
 create mode 100755 branches/sage/cephmds2/script/adjusttabs.pl
 create mode 100755 branches/sage/cephmds2/script/clean_osd_cow.sh
 create mode 100755 branches/sage/cephmds2/script/clean_trace.pl
 create mode 100755 branches/sage/cephmds2/script/comb.pl
 create mode 100755 branches/sage/cephmds2/script/find_auth_pins.pl
 create mode 100755 branches/sage/cephmds2/script/find_bufferleaks.pl
 create mode 100755 branches/sage/cephmds2/script/find_lost_bdev_ops.pl
 create mode 100755 branches/sage/cephmds2/script/find_lost_commit.pl
 create mode 100755 branches/sage/cephmds2/script/find_lost_objecter.pl
 create mode 100755 branches/sage/cephmds2/script/find_pathpins.pl
 create mode 100755 branches/sage/cephmds2/script/find_requests.pl
 create mode 100755 branches/sage/cephmds2/script/find_waiters.pl
 create mode 100755 branches/sage/cephmds2/script/grepblock
 create mode 100644 branches/sage/cephmds2/script/merge_trace_rw.pl
 create mode 100755 branches/sage/cephmds2/script/profonly.pl
 create mode 100755 branches/sage/cephmds2/script/runset.pl
 create mode 100755 branches/sage/cephmds2/script/sum.pl
 create mode 100644 branches/sage/cephmds2/tcpfuse.cc
 create mode 100644 branches/sage/cephmds2/tcpsyn.cc
 create mode 100644 branches/sage/cephmds2/test/fakemds.cc
 create mode 100644 branches/sage/cephmds2/test/gprof-helper.c
 create mode 100644 branches/sage/cephmds2/test/makedirs.cc
 create mode 100644 branches/sage/cephmds2/test/mpitest.cc
 create mode 100644 branches/sage/cephmds2/test/mttest.cc
 create mode 100644 branches/sage/cephmds2/test/rushconfig
 create mode 100644 branches/sage/cephmds2/test/rushtest.cc
 create mode 100644 branches/sage/cephmds2/test/rushtest.cc~
 create mode 100644 branches/sage/cephmds2/test/testbucket.cc
 create mode 100644 branches/sage/cephmds2/test/testbuffers.cc
 create mode 100644 branches/sage/cephmds2/test/testcrush.cc
 create mode 100644 branches/sage/cephmds2/test/testfilepath.cc
 create mode 100644 branches/sage/cephmds2/test/testmpi.cc
 create mode 100644 branches/sage/cephmds2/test/testnewbuffers.cc
 create mode 100644 branches/sage/cephmds2/test/testtree.cc
 create mode 100644 branches/sage/cephmds2/test/testxattr.cc

diff --git a/branches/sage/cephmds2/COPYING b/branches/sage/cephmds2/COPYING
new file mode 100644
index 0000000000000..5ab7695ab8cab
--- /dev/null
+++ b/branches/sage/cephmds2/COPYING
@@ -0,0 +1,504 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+		  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+  
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+			    NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
+
+
diff --git a/branches/sage/cephmds2/Makefile b/branches/sage/cephmds2/Makefile
new file mode 100644
index 0000000000000..1681ac16698a8
--- /dev/null
+++ b/branches/sage/cephmds2/Makefile
@@ -0,0 +1,230 @@
+
+# mpicxx must be on your path; on googoo, this means that
+# /usr/local/mpich2-1.0.2/bin must be on your path.
+
+# For now, use g++ most of the time.
+# When compiling MPI stuff, specify myfile.cc instead of myfile.o so that ${MPICC} is 
+# invoked instead of the generic .o rule (or it'll use g++).
+# This makes it less annoying to build on non-mpi hosts for dev work, and seems to 
+# behave just fine...  change ${CC} back to mpicxx if you get paranoid.
+
+CC = g++
+CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE
+LIBS = -lpthread  
+
+#for normal mpich2 machines
+MPICC = mpicxx
+MPICFLAGS = ${CFLAGS}
+MPILIBS = ${LIBS}
+
+#for LLNL boxes without mpicxx
+#MPICC = g++
+#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/include -L/usr/lib/mpi/mpi_gnu/lib
+#MPILIBS = ${LIBS} -lelan -lmpi
+
+EBOFS_OBJS= \
+	ebofs/BlockDevice.o\
+	ebofs/BufferCache.o\
+	ebofs/Ebofs.o\
+	ebofs/Allocator.o
+
+MDS_OBJS= \
+	mds/MDS.o\
+	mds/journal.o\
+	mds/Server.o\
+	mds/MDCache.o\
+	mds/Locker.o\
+	mds/Migrator.o\
+	mds/Renamer.o\
+	mds/MDBalancer.o\
+	mds/CDentry.o\
+	mds/CDir.o\
+	mds/CInode.o\
+	mds/AnchorTable.o\
+	mds/AnchorClient.o\
+	mds/MDStore.o\
+	mds/LogEvent.o\
+	mds/IdAllocator.o\
+	mds/MDLog.o
+
+OSD_OBJS= \
+	osd/PG.o\
+	osd/Ager.o\
+	osd/FakeStore.o\
+	osd/OSD.o
+
+OSDC_OBJS= \
+	osdc/Objecter.o\
+	osdc/ObjectCacher.o\
+	osdc/Filer.o\
+	osdc/Journaler.o
+
+MON_OBJS= \
+	mon/Monitor.o\
+	mon/OSDMonitor.o\
+	mon/MDSMonitor.o\
+	mon/Elector.o
+
+COMMON_OBJS= \
+	msg/Messenger.o\
+	msg/Message.o\
+	msg/HostMonitor.o\
+	common/Logger.o\
+	common/Clock.o\
+	common/Timer.o\
+	config.o
+
+
+CLIENT_OBJS= \
+	client/FileCache.o\
+	client/Client.o\
+	client/SyntheticClient.o\
+	client/Trace.o
+
+TCP_OBJS = \
+	msg/TCPMessenger.o\
+	msg/TCPDirectory.o
+
+TARGETS = cosd cfuse newsyn fakesyn
+
+SRCS=*.cc */*.cc *.h */*.h */*/*.h
+
+all: depend ${TARGETS}
+
+test: depend ${TEST_TARGETS}
+
+obfs: depend obfstest
+
+
+# real bits
+cmon: cmon.cc mon.o ebofs.o msg/NewerMessenger.o common.o
+	${CC} ${CFLAGS} ${MPILIBS} $^ -o $@
+
+cosd: cosd.cc osd.o ebofs.o msg/NewerMessenger.o common.o
+	${CC} ${CFLAGS} ${MPILIBS} $^ -o $@
+
+cmds: cmds.cc mds.o osdc.o msg/NewerMessenger.o common.o
+	${CC} ${CFLAGS} ${MPILIBS} $^ -o $@
+
+cfuse: cfuse.cc client.o osdc.o client/fuse.o msg/NewerMessenger.o common.o
+	${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
+
+
+# misc
+gprof-helper.so: test/gprof-helper.c
+	gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl 
+
+
+
+# fuse
+fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o msg/FakeMessenger.cc common.o
+	${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@
+
+tcpfuse: tcpfuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o
+	${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
+
+mpifuse: mpifuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o
+	${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
+
+
+# synthetic workload
+fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
+	${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+
+tcpsyn: tcpsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o ${TCP_OBJS} common.o
+	${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
+
+newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/NewerMessenger.o common.o
+	${MPICC} -pg ${MPICFLAGS} ${MPILIBS} $^ -o $@
+
+newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/NewerMessenger.o common.o
+	${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
+
+# + obfs
+fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o
+	${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@
+
+tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o
+	${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ 
+
+
+# ebofs
+
+mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o
+	${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+
+test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o
+	${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+
+
+
+
+# libceph
+libceph.o: client/ldceph.o client/Client.o ${TCP_OBJS} ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS}
+	ld -i $^ -o $@
+
+bench/mdtest/mdtest.o: bench/mdtest/mdtest.c
+	mpicc -c $^ -o $@
+
+mdtest: bench/mdtest/mdtest.o
+	${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
+
+mdtest.ceph: bench/mdtest/mdtest.o libceph.o
+	${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
+
+#
+
+%.so: %.cc
+	${CC} -shared -fPIC ${CFLAGS} $< -o $@
+
+# testmpi: prerequisite names are case-sensitive and must match their .cc sources (common/Clock.cc).
+testmpi: test/testmpi.cc msg/MPIMessenger.cc config.o common/Timer.o common/Clock.o msg/Messenger.o msg/Dispatcher.o msg/error.o
+	${MPICC} ${CFLAGS} ${LIBS} $^ -o $@
+
+
+clean:
+	rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS}
+
+common.o: ${COMMON_OBJS}
+	ld -i -o $@ $^
+
+ebofs.o: ${EBOFS_OBJS}
+	ld -i -o $@ $^
+
+client.o: ${CLIENT_OBJS} 
+	ld -i -o $@ $^
+
+osd.o: ${OSD_OBJS}
+	ld -i -o $@ $^
+
+osdc.o: ${OSDC_OBJS}
+	ld -i -o $@ $^
+
+osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o  # OSD.cc is listed as source so it is compiled with -DUSE_OBFS
+	${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a
+
+mds.o: ${MDS_OBJS}
+	ld -i -o $@ $^
+
+mon.o: ${MON_OBJS}
+	ld -i -o $@ $^
+
+%.o: %.cc
+	${CC} ${CFLAGS} -c $< -o $@
+
+%.po: %.cc
+	${CC} -fPIC ${CFLAGS} -c $< -o $@
+
+count:
+	cat ${SRCS} | wc -l
+	cat ${SRCS} | grep -c \;
+
+.depend:
+	touch .depend
+
+depend:
+	$(RM) .depend
+	makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null
+
+# now add a line to include the dependency list.
+include .depend
diff --git a/branches/sage/cephmds2/README b/branches/sage/cephmds2/README
new file mode 100644
index 0000000000000..97008e49ffe75
--- /dev/null
+++ b/branches/sage/cephmds2/README
@@ -0,0 +1,53 @@
+pmds = parallel metadata server/system
+
+'test' is a standalone process that runs all clients, OSDs, and MDSs
+in a single process with a basic message passer (FakeMessenger).
+Useful for debugging.
+
+'pmds' uses MPI for communication.
+
+'import' builds a metadata store on ./osddata/ by taking find output
+from stdin.  Make sure find is run from the current directory so that
+import can stat the files it's fed.  The find root becomes the file
+system root; feel free to use relative paths.
+
+This is all GPL, etc.
+
+
+Getting started:
+
+ 1- Comment out the LEAKTRACER= line in the Makefile if you don't have
+    LeakTracer installed (you probably don't).
+ 
+ 2- make (test and import targets are testing ones; pmds uses MPI)
+
+ 3- Build an OSD metadata store:
+      # mkdir osddata
+      # find /some/big/dir | ./import root
+
+ 4- Single proc sim:
+      # ./test
+    or more likely,
+      # ./test > out
+
+ 5- Change parameters in config.cc.
+
+ 6- If you want stats logged, mkdir log (make sure you have enough
+    file handles; there's one open file per client).
+
+
+Notes on pmds (MPI version):
+
+ - On mcr/alc I have to 
+     # setenv LD_LIBRARY_PATH /usr/lib/mpi/mpi_gnu/lib
+   for the GNU runtime MPI libs (otherwise you get the Intel ones,
+   which segfault).
+
+ - Each MDS and OSD gets its own node.  Clients are divided over
+   whatever is left over.  So make sure you tell MPI to give you at
+   least num_mds+num_osd+1 processes (num_mds etc defined in
+   config.cc).
+
+
+
+2004.08.25 sage@newdream.net
diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO
new file mode 100644
index 0000000000000..3c1e1f62b437c
--- /dev/null
+++ b/branches/sage/cephmds2/TODO
@@ -0,0 +1,307 @@
+
+- paxos for monitor
+- lnet?
+- crush
+ - xml import/export?
+ - crush tools
+
+== todo
+
+1- pipelining writes?
+2- intervening reads?
+
+inode ops
+ utime       -- no concurrency issues
+ chown/chmod -- should lock
+ truncate    -- should lock
+ 1-> no.  multiple process concurrency on a single inode is not important.
+ 2-> maybe... intervening stats?  probably not important.
+
+directory ops.  parent inode mtime, + dirent xlocks?
+ mknod
+ open+create
+ symlink
+ unlink
+ rmdir
+ rename
+ 1-> yes.  but mtime updates are independent (mtime monotonically increasing), so it's easy.
+ 2-> yes.  
+
+--> so, let's make file/hard wrlock exclusive.
+
+locks
+ namespace
+  path pins -- read lock
+  dentry xlock -- write lock
+ inode
+  hard/file rd start/stop -- read lock
+  hard/file wr start/stop -- write lock
+ 
+
+
+
+- integrate revisions into ObjectCacher
+- clean up oid.rev vs op.rev in osd+osdc
+
+rados paper todo
+- better experiments
+- flush log only in response to subsequent read or write?
+- better behaving recovery
+- justify use of splay.
+  - dynamic replication
+- snapshots
+
+rados snapshots
+- attr.crev is rev we were created in.
+- oid.rev=0 is "live".  defined for attr.crev <= rev.
+- otherwise, defined for attr.crev <= rev < oid.rev  (i.e. oid.rev is upper bound, non-inclusive.)
+
+- write|delete is tagged with op.rev
+  - if attr.crev < op.rev
+    - we clone to oid.rev=rev (clone keeps old crev)
+    - change live attr.crev=rev.
+  - apply update
+- read is tagged with op.rev
+  - if 0, we read from 0 (if it exists).
+  - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev.
+
+- how to get usage feedback to monitor?
+
+- change messenger entity_inst_t
+ - no more rank!  make it a uniquish nonce?
+
+- clean up mds caps release in exporter
+- figure out client failure modes
+- clean up messenger failure modes.  
+- add connection retry.
+
+mds recovery
+- multiple passes?
+ 1- establish import/export map
+ ?- 
+ 2- replay inode, dir, dentry updates
+- single pass
+ - each event needs to embed inode for trace up to the import
+ - second stage will reconcile cached items with other active mds nodes
+ - cached items will be shared with the primary to repopulate its non-dirty cache
+ - query clients for their state too?
+   - mds must journal list of clients with whom we share state?
+
+
+journaler
+- should we pad with zeros to avoid splitting individual entries?
+  - make it a g_conf flag?
+  - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes)
+- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes
+
+
+monitor
+?- monitor user lib that handles resending, redirection of mon requests.
+- elector
+/- organize monitor store
+
+osdmon
+- distribute
+- recovery: store elector epochs with maps..
+- monitor needs to monitor some osds...
+- monitor pgs, notify on out
+- watch osd utilization; adjust overload in cluster map
+
+mdsmon
+
+osd/rados
+- efficiently replicate clone() objects
+- pg_num instead of pg_bits
+- flag missing log entries on crash recovery  --> WRNOOP? or WRLOST?
+- consider implications of nvram writeahead logs
+- fix heartbeat wrt new replication
+- mark residual pgs obsolete  ???
+- rdlocks
+- optimize remove wrt recovery pushes
+- pg_bit/pg_num changes
+- report crashed pgs?
+
+messenger
+/- share same tcp socket for sender and receiver
+/- graceful connection teardown
+- close idle connections
+- generalize out a transport layer?  
+  - eg reliable tcp for most things, connectionless unreliable datagrams for monitors?
+  - or, aggressive connection closing on monitors?  or just max_connections and an lru?
+- osds: forget idle client addrs
+
+objecter
+
+objectcacher
+- ocacher caps transitions vs locks
+- test read locks
+
+reliability
+- heartbeat vs ping
+- osdmonitor, filter
+
+ebofs
+- verify proper behavior of conflicting/overlapping reads of clones
+- test(fix) sync() 
+- combine inodes and/or cnodes into same blocks
+- allow btree sets instead of maps
+- eliminate nodepools
+- nonblocking write on missing onodes?
+- fix bug in node rotation on insert (and reenable)
+- fix NEAR_LAST_FWD (?)
+- journaling? in NVRAM?
+- metadata in nvram?  flash?
+
+
+
+bugs/stability
+- figure out weird 40ms latency with double log entries
+
+
+general
+- timer needs cancel sets, schedulers need to cancel outstanding events on shutdown
+- well, just figure out general timer cancellation strategy that avoids races
+ - use updated Timer as a model?
+
+
+remaining hard problems
+- how to cope with file size changes and read/write sharing
+- mds failure recovery (of course)
+
+
+crush
+- more efficient failure when all/too many osds are down
+- allow forcefeed for more complicated rule structures.  (e.g. make force_stack a list< set<int> >)
+
+
+mds
+- distributed client management
+- anchormgr
+  - 2pc
+  - independent journal
+  - distributed?
+- link count management
+  - also 2pc
+- chdir (directory opens!)
+- rewrite logstream
+  - clean up
+  - be smart about rados ack vs reread
+  - log locking?  root log object
+  - trimming, rotation
+
+- efficient stat for single writers
+- lstat vs stat
+- add FILE_CAP_EXTEND capability bit
+- only share osdmap updates with clients holding capabilities
+- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?)
+- finish hard links!
+ - reclaim danglers from inode file on discover...
+ - fix rename wrt hard links
+- interactive hash/unhash interface
+- test hashed readdir
+- make logstream.flush align itself to stripes
+
+- carefully define/document frozen wrt dir_auth vs hashing
+
+
+
+client
+- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache
+
+- test client caps with meta exports
+- some heuristic behavior to consolidate caps to inode auth
+- client will re-tx anything it needed to say upon rx of new mds notification (?)
+
+
+
+
+
+
+MDS TODO
+- fix hashed readdir: should (optionally) do a lock on dir namespace?
+- fix hard links
+  - they mostly work, but they're fragile
+- sync clients on stat
+  - will need to ditch 10s client metadata caching before this is useful
+  - implement truncate
+- implement hashed directories
+- statfs?
+- rewrite journal + recovery
+- figure out online failure recovery
+- more distributed fh management?
+- btree directories (for efficient large directories)
+- consistency points/snapshots
+
+- fix MExportAck and others to use dir+dentry, not inode
+  (otherwise this all breaks with hard links.. altho it probably needs reworking already?)
+
+
+
+
+
+why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times.
+- mds -> 1,2 : qsync
+- client1 writes at byte 100
+- client1 -> mds : qsync reply (size=100)
+- client1 writes at byte 300
+- client1 -> client2 (outside channel)
+- client2 writes at byte 200
+- client2 -> mds : qsync reply (size=200)
+-> stat results in size 200, even though at no single point in time was the max size 500.
+-> for correct result, need to _stop_ client writers while gathering metadata.
+
+
+SAGE:
+
+- string table?
+
+- hard links
+ - fix MExportAck and others to use dir+dentry, not inode
+   (otherwise this all breaks with hard links.. altho it probably needs reworking already!)
+
+- do real permission checks?
+
+
+
+CLIENT TODO
+
+- statfs
+
+
+
+
+
+ISSUES
+
+
+- discover
+ - soft: authority selectively replicates, or sets a 'forward' flag in reply
+ - hard: authority always replicates (eg. discover for export)
+ - forward flag (see soft)
+ - error flag   (if file not found, etc.)
+ - [what was i talking about?] make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply
+
+
+
+DOCUMENT
+- cache, distributed cache structure and invariants
+- export process
+- hash/unhash process
+
+
+TEST
+- hashing
+ - test hash/unhash operation
+ - hash+export: encode list of replicated dir inodes so they can be discovered before import is processed.
+ - test nauthitems (wrt hashing?)
+
+
+IMPLEMENT
+
+- smarter balancing
+  - popularity calculation and management is inconsistent/wrong.
+  - does it work?
+
+- dump active config in run output somewhere
+
+
diff --git a/branches/sage/cephmds2/cfuse.cc b/branches/sage/cephmds2/cfuse.cc
new file mode 100644
index 0000000000000..b260c4bd3c3f8
--- /dev/null
+++ b/branches/sage/cephmds2/cfuse.cc
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "client/Client.h"
+#include "client/fuse.h"
+
+#include "msg/NewMessenger.h"
+
+#include "common/Timer.h"
+       
+#include <envz.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// cfuse entry point: parse options, load the monitor map from ./.ceph_monmap,
+// start the network and client, mount, and run FUSE.  Returns 1 if the monmap can't be read.
+int main(int argc, char **argv, char *envp[]) {
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  // args for fuse
+  vec_to_argv(args, argc, argv);
+
+  // load monmap; report failures instead of asserting or decoding a partial buffer
+  int fd = ::open(".ceph_monmap", O_RDONLY);
+  struct stat st;
+  if (fd < 0 || ::fstat(fd, &st) < 0) {
+    cerr << "cfuse: unable to open or stat .ceph_monmap" << endl;
+    if (fd >= 0) ::close(fd);
+    return 1;
+  }
+  bufferlist bl;
+  bufferptr bp(st.st_size);
+  bl.append(bp);
+  ssize_t got = ::read(fd, (void*)bl.c_str(), bl.length());
+  ::close(fd);
+  if (got != (ssize_t)st.st_size) {
+    cerr << "cfuse: short or failed read of .ceph_monmap" << endl;
+    return 1;
+  }
+  MonMap *monmap = new MonMap;
+  monmap->decode(bl);
+
+  // start up network
+  rank.set_namer(monmap->get_inst(0).addr);
+  rank.start_rank();
+
+  // start client
+  Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), monmap);
+  client->init();
+
+  // start up fuse; uses my argc, argv (make sure you pass a mount point!)
+  cout << "mounting" << endl;
+  client->mount();
+  cerr << "starting fuse on pid " << getpid() << endl;
+  ceph_fuse_main(client, argc, argv);
+  cerr << "fuse finished on pid " << getpid() << endl;
+
+  client->unmount();
+  cout << "unmounted" << endl;
+  client->shutdown();
+  delete client;
+
+  rank.wait();  // wait for messenger to finish
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/client/Client.cc b/branches/sage/cephmds2/client/Client.cc
new file mode 100644
index 0000000000000..cb3cc2622bae4
--- /dev/null
+++ b/branches/sage/cephmds2/client/Client.cc
@@ -0,0 +1,2614 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+// unix-ey fs stuff
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <utime.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+
+#include <iostream>
+using namespace std;
+
+
+// ceph stuff
+#include "Client.h"
+
+
+#include "messages/MClientMount.h"
+#include "messages/MClientMountAck.h"
+#include "messages/MClientFileCaps.h"
+
+#include "messages/MGenericMessage.h"
+
+#include "messages/MMDSGetMap.h"
+#include "messages/MMDSMap.h"
+
+#include "osdc/Filer.h"
+#include "osdc/Objecter.h"
+#include "osdc/ObjectCacher.h"
+
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Logger.h"
+
+
+#include "config.h"
+// dout(l): debug output to cout, emitted when l is within g_conf.debug or g_conf.debug_client.
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "client" << whoami << "." << pthread_self() << " "
+// tout: trace output to cout, emitted only when g_conf.client_trace is set.  Both expand to a bare `if`.
+#define  tout       if (g_conf.client_trace) cout << "trace: " 
+
+// static logger
+LogType client_logtype;
+Logger  *client_logger = 0;
+
+
+
+// Completion callback: when finished, hands the captured inode to
+// Client::close_release().  The result code passed to finish() is ignored.
+class C_Client_CloseRelease : public Context {
+  Client *client;
+  Inode *inode;
+public:
+  C_Client_CloseRelease(Client *c, Inode *i) : client(c), inode(i) {}
+  void finish(int) { client->close_release(inode); }
+};
+
+// Completion callback: when finished, hands the captured inode to
+// Client::close_safe().  The result code passed to finish() is ignored.
+class C_Client_CloseSafe : public Context {
+  Client *client;
+  Inode *inode;
+public:
+  C_Client_CloseSafe(Client *c, Inode *i) : client(c), inode(i) {}
+  void finish(int) { client->close_safe(inode); }
+};
+
+
+
+
+
+
+// cons/des
+
+Client::Client(Messenger *m, MonMap *mm)
+{
+  // Build a client on messenger m (we become its dispatcher) using monitor map mm.
+  // which client am i?
+  whoami = MSG_ADDR_NUM(m->get_myaddr());
+  monmap = mm;
+  mounted = false;
+  unmounting = false;
+
+  last_tid = 0;
+  unsafe_sync_write = 0;
+
+  mdsmap = 0;
+
+  // no root inode yet; insert_trace() creates it from the first reply
+  root = 0;
+
+  set_cache_size(g_conf.client_cache_size);
+
+  // file handles (presumably the range of fh numbers available for allocation -- confirm)
+  free_fh_set.insert(10, 1<<30);
+
+  // set up messengers
+  messenger = m;
+  messenger->set_dispatcher(this);
+
+  // osd interfaces
+  osdmap = new OSDMap();     // initially blank; filled in by handle_mount_ack()
+  objecter = new Objecter(messenger, monmap, osdmap);
+  objectcacher = new ObjectCacher(objecter, client_lock);
+  filer = new Filer(objecter);
+}
+
+
+Client::~Client() 
+{
+  tear_down_cache();                     // first: cached inodes were built against objectcacher
+  delete messenger;    messenger = 0;    // (delete of a null pointer is a no-op)
+  delete filer;        filer = 0;
+  delete objectcacher; objectcacher = 0;
+  delete objecter;     objecter = 0;
+  delete osdmap;       osdmap = 0;
+  delete mdsmap;       mdsmap = 0;       // allocated by handle_mount_ack/handle_mds_map; was leaked
+}
+
+
+void Client::tear_down_cache()
+{
+  // Forcibly discard all client-side state: open handles, dentries, inodes.
+  // 1. any file handles still open
+  hash_map<fh_t, Fh*>::iterator p = fh_map.begin();
+  while (p != fh_map.end()) {
+    Fh *handle = p->second;
+    dout(1) << "tear_down_cache forcing close of fh " << p->first << " ino " << handle->inode->inode.ino << endl;
+    put_inode(handle->inode);
+    delete handle;
+    ++p;
+  }
+  fh_map.clear();
+
+  // 2. caps are not released here yet
+  // *** FIXME ***
+
+  // 3. shrink the lru to nothing so every dentry is expired
+  lru.lru_set_max(0);
+  trim_cache();
+  assert(lru.lru_get_size() == 0);
+
+  // 4. at most the root inode may remain; drop it
+  assert(inode_map.size() <= 1);
+  if (root && inode_map.size() == 1) {
+    delete root;
+    root = 0;
+    inode_map.clear();
+  }
+  assert(inode_map.empty());
+}
+
+
+
+// debug crapola
+
+void Client::dump_inode(Inode *in, set<Inode*>& did)
+{
+  // Log `in` and its subtree; record it in `did` (never filled before) so nothing is printed twice.
+  if (!did.insert(in).second) return;
+  dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << endl;
+  if (in->dir) {
+    dout(1) << "  dir size " << in->dir->dentries.size() << endl;
+    for (hash_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
+         it != in->dir->dentries.end();
+         ++it) {
+      dout(1) << "    dn " << it->first << " ref " << it->second->ref << endl;
+      dump_inode(it->second->inode, did);
+    }
+  }
+}
+
+void Client::dump_cache()
+{
+  // Debug aid: log the tree reachable from root, then every inode in
+  // inode_map that is not a member of the `seen` set handed to dump_inode().
+  set<Inode*> seen;
+  if (root) dump_inode(root, seen);
+
+  hash_map<inodeno_t, Inode*>::iterator p;
+  for (p = inode_map.begin(); p != inode_map.end(); ++p) {
+    Inode *in = p->second;
+    if (seen.count(in) == 0) {
+      dout(1) << "dump_cache: inode " << p->first
+              << " ref " << in->ref 
+              << " dir " << in->dir << endl;
+      if (in->dir) {
+        dout(1) << "  dir size " << in->dir->dentries.size() << endl;
+      }
+    }
+  }
+}
+
+
+void Client::init() {
+  // currently a no-op: all wiring is done in the constructor
+}
+
+void Client::shutdown() {
+  dout(1) << "shutdown" << endl;
+  messenger->shutdown();   // stops the messenger only; cached state is freed by the destructor
+}
+
+
+
+
+// ===================
+// metadata cache stuff
+
+void Client::trim_cache()
+{
+  // Expire dentries from the lru until it fits within its maximum, stopping
+  // early if an iteration leaves the size unchanged or nothing can be expired.
+  for (unsigned prev = 0; lru.lru_get_size() != prev; ) {
+    prev = lru.lru_get_size();
+    if (lru.lru_get_size() <= lru.lru_get_max())
+      break;   // within budget
+
+    Dentry *victim = (Dentry*)lru.lru_expire();
+    if (victim == 0)
+      break;   // nothing expirable
+    //dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl;
+    unlink(victim);
+  }
+
+  // With the lru empty and root the only inode left, release root too.
+  if (lru.lru_get_size() == 0 && root && inode_map.size() == 1) {
+    delete root;
+    root = 0;
+    inode_map.clear();
+  }
+}
+
+/** insert_inode
+ * Make dentry `dname` in `dir` refer to the inode described by `st`, creating,
+ * relinking or refreshing cache entries as needed.  Returns the linked Inode.
+ */
+Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname)
+{
+  Dentry *dn = NULL;
+  if (dir->dentries.count(dname))
+    dn = dir->dentries[dname];
+
+  dout(12) << "insert_inode " << dname << " ino " << st->inode.ino 
+           << "  size " << st->inode.size
+           << "  mtime " << st->inode.mtime
+           << "  hashed " << st->hashed
+           << endl;
+  
+  // case 1: a dentry of that name exists; keep it only if it names the same ino
+  if (dn) {
+    if (dn->inode->inode.ino == st->inode.ino) {
+      touch_dn(dn);
+      dout(12) << " had dentry " << dname
+               << " with correct ino " << dn->inode->inode.ino
+               << endl;
+    } else {
+      dout(12) << " had dentry " << dname
+               << " with WRONG ino " << dn->inode->inode.ino
+               << endl;
+      unlink(dn);
+      dn = NULL;
+    }
+  }
+  
+  // case 2: no usable dentry, but the inode itself is already cached
+  if (!dn) {
+    // have inode linked elsewhere?  -> unlink and relink!
+    if (inode_map.count(st->inode.ino)) {
+      Inode *in = inode_map[st->inode.ino];
+      assert(in);
+
+      if (in->dn) {
+        dout(12) << " had ino " << in->inode.ino
+                 << " linked at wrong position, unlinking"
+                 << endl;
+        dn = relink(in->dn, dir, dname);
+      } else {
+        // link
+        dout(12) << " had ino " << in->inode.ino
+                 << " unlinked, linking" << endl;
+        dn = link(dir, dname, in);
+      }
+    }
+  }
+  
+  // case 3: nothing cached at all -> create; otherwise refresh the cached inode from st
+  if (!dn) {
+    Inode *in = new Inode(st->inode, objectcacher);
+    inode_map[st->inode.ino] = in;
+    dn = link(dir, dname, in);
+    dout(12) << " new dentry+node with ino " << st->inode.ino << endl;
+  } else {
+    // actually update info
+    dout(12) << " stat inode mask is " << st->inode.mask << endl;
+    dn->inode->inode = st->inode;
+
+    // ...but don't clobber our mtime, size!
+    // (applies when the stat's mask lacks that field and our written value is larger)
+    if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 &&
+        dn->inode->file_wr_size > dn->inode->inode.size) 
+      dn->inode->inode.size = dn->inode->file_wr_size;
+    if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 &&
+        dn->inode->file_wr_mtime > dn->inode->inode.mtime) 
+      dn->inode->inode.mtime = dn->inode->file_wr_mtime;
+  }
+  // OK, we found it!
+  assert(dn && dn->inode);
+  
+  // or do we have newer size/mtime from writing?
+  if (dn->inode->file_caps() & CAP_FILE_WR) {
+    if (dn->inode->file_wr_size > dn->inode->inode.size)
+      dn->inode->inode.size = dn->inode->file_wr_size;
+    if (dn->inode->file_wr_mtime > dn->inode->inode.mtime)
+      dn->inode->inode.mtime = dn->inode->file_wr_mtime;
+  }
+  // symlink?  keep a private copy of the target string
+  if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) {
+    if (!dn->inode->symlink) 
+      dn->inode->symlink = new string;
+    *(dn->inode->symlink) = st->symlink;
+  }
+  return dn->inode;
+}
+
+/** update_inode_dist
+ *
+ * refresh the cached MDS placement hints of one inode from a stat record
+ */
+void Client::update_inode_dist(Inode *in, InodeStat *st)
+{
+  in->dir_auth       = st->dir_auth;
+  in->dir_hashed     = st->hashed;
+  in->dir_replicated = st->replicated;
+  if (!st->spec_defined)
+    return;   // no replication spec in this stat; keep the contacts we have
+
+  const bool had_contacts = !in->dir_contacts.empty();
+  const bool has_dist = !st->dist.empty();
+  if (!has_dist && had_contacts) {
+    dout(9) << "lost dist spec for " << in->inode.ino 
+            << " " << st->dist << endl;
+  } else if (has_dist && !had_contacts) {
+    dout(9) << "got dist spec for " << in->inode.ino 
+            << " " << st->dist << endl;
+  }
+  in->dir_contacts = st->dist;
+}
+
+
+/** insert_trace
+ * Walk the inode trace of an MDS reply from the root down, inserting each
+ * dentry+inode into the cache.  Returns the last inode visited (root if none).
+ */
+Inode* Client::insert_trace(MClientReply *reply)
+{
+  Inode *cur = root;
+  time_t now = time(NULL);
+
+  dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << endl;
+
+  // dentry names run parallel to the inode list, offset by one (the root has no name)
+  list<string>::const_iterator pdn = reply->get_trace_dn().begin();
+
+  for (list<InodeStat*>::const_iterator pin = reply->get_trace_in().begin();
+       pin != reply->get_trace_in().end();
+       ++pin) {
+    
+    if (pin == reply->get_trace_in().begin()) {
+      // root
+      dout(10) << "insert_trace root" << endl;
+      if (!root) {
+        // create
+        cur = root = new Inode((*pin)->inode, objectcacher);
+        inode_map[root->inode.ino] = root;
+      }
+    } else {
+      // not root.
+      dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << endl;
+      Dir *dir = cur->open_dir();
+      cur = this->insert_inode(dir, *pin, *pdn);
+      ++pdn;      
+
+      // move to top of lru!
+      if (cur->dn) 
+        lru.lru_touch(cur->dn);
+    }
+
+    // update dist info
+    update_inode_dist(cur, *pin);
+
+    // set cache ttl (only when a ttl is configured)
+    if (g_conf.client_cache_stat_ttl)
+      cur->valid_until = now + g_conf.client_cache_stat_ttl;
+  }
+  return cur;
+}
+
+
+
+
+Dentry *Client::lookup(filepath& path)
+{
+  // Resolve `path` purely from the local cache, one segment at a time from
+  // root.  Returns the final dentry, or NULL on any miss (or empty path).
+  dout(14) << "lookup " << path << endl;
+
+  if (!root) return NULL;
+
+  Inode *here = root;
+  Dentry *found = 0;
+  for (unsigned seg = 0; seg < path.depth(); ++seg) {
+    dout(14) << " seg " << seg << " = " << path[seg] << endl;
+
+    // can only descend through a directory whose contents we have open
+    const bool can_descend = (here->inode.mode & INODE_MODE_DIR) && here->dir;
+    if (!can_descend)
+      return NULL;  // not a dir
+
+    Dir *dir = here->dir;
+    if (dir->dentries.count(path[seg]) == 0) {
+      dout(14) << " dentry " << path[seg] << " dne" << endl;
+      return NULL;
+    }
+    found = dir->dentries[path[seg]];
+    dout(14) << " hit dentry " << path[seg] << " inode is " << found->inode << " valid_until " << found->inode->valid_until << endl;
+    here = found->inode;
+    assert(here);
+  }
+
+  if (found) {
+    dout(11) << "lookup '" << path << "' found " << found->name << " inode " << found->inode->inode.ino << " valid_until " << found->inode->valid_until<< endl;
+  }
+  return found;
+}
+
+// -------
+
+MClientReply *Client::make_request(MClientRequest *req, 
+                                   bool auth_best, 
+                                   int use_mds)  // this param is icky, debug weirdness!
+{
+  // Choose a target MDS from what the cache knows about req's path, send the
+  // request via sendrecv(), record latency if client_logger is set, return the reply.
+  // assign a unique tid
+  req->set_tid(++last_tid);
+
+  // find deepest known prefix
+  Inode *diri = root;   // the deepest known containing dir
+  Inode *item = 0;      // the actual item... if we know it
+  int missing_dn = -1;  // which dn we miss on (if we miss)
+  
+  unsigned depth = req->get_filepath().depth();
+  for (unsigned i=0; i<depth; i++) {
+    // dir?
+    if (diri && diri->inode.mode & INODE_MODE_DIR && diri->dir) {
+      Dir *dir = diri->dir;
+
+      // do we have the next dentry?
+      if (dir->dentries.count( req->get_filepath()[i] ) == 0) {
+        missing_dn = i;  // no.
+        break;
+      }
+      
+      dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << endl;
+
+      if (i == depth-1) {  // last one!
+        item = dir->dentries[ req->get_filepath()[i] ]->inode;
+        break;
+      } 
+
+      // continue..
+      diri = dir->dentries[ req->get_filepath()[i] ]->inode;
+      assert(diri);
+    } else {
+      missing_dn = i;
+      break;
+    }
+  }
+
+  // choose an mds
+  int mds = 0;
+  if (diri) {
+    if (auth_best) {
+      // pick the actual auth (as best we can)
+      if (item) {
+        mds = item->authority(mdsmap);
+      } else if (diri->dir_hashed && missing_dn >= 0) {
+        mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(),
+                                     mdsmap);
+      } else {
+        mds = diri->authority(mdsmap);
+      }
+    } else {
+      // balance our traffic!
+      if (diri->dir_hashed && missing_dn >= 0) 
+        mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(),
+                                     mdsmap);
+      else 
+        mds = diri->pick_replica(mdsmap);
+    }
+  } else {
+    // no root info, pick a random MDS
+    mds = rand() % mdsmap->get_num_mds();
+  }
+  dout(20) << "mds is " << mds << endl;
+
+  // force use of a particular mds?  (any non-negative use_mds overrides the choice above)
+  if (use_mds >= 0) mds = use_mds;
+
+  // time the call
+  utime_t start = g_clock.now();
+  
+  // ops in this list are accounted as "lr*" in the latency stats, all others as "lw*"
+  bool nojournal = false;
+  int op = req->get_op();
+  if (op == MDS_OP_STAT ||
+      op == MDS_OP_LSTAT ||
+      op == MDS_OP_READDIR ||
+      op == MDS_OP_OPEN ||
+      op == MDS_OP_RELEASE)
+    nojournal = true;
+
+  MClientReply *reply = sendrecv(req, mds);
+
+  if (client_logger) {
+    utime_t lat = g_clock.now();
+    lat -= start;
+    dout(20) << "lat " << lat << endl;
+    client_logger->finc("lsum",(double)lat);
+    client_logger->inc("lnum");
+
+    if (nojournal) {
+      client_logger->finc("lrsum",(double)lat);
+      client_logger->inc("lrnum");
+    } else {
+      client_logger->finc("lwsum",(double)lat);
+      client_logger->inc("lwnum");
+    }
+    
+    if (op == MDS_OP_STAT) {
+      client_logger->finc("lstatsum",(double)lat);
+      client_logger->inc("lstatnum");
+    }
+    else if (op == MDS_OP_READDIR) {
+      client_logger->finc("ldirsum",(double)lat);
+      client_logger->inc("ldirnum");
+    }
+  }
+
+  return reply;
+}
+
+
+MClientReply* Client::sendrecv(MClientRequest *req, int mds)
+{
+  // Send req to `mds` and block on client_lock until handle_client_reply()
+  // has stored the reply for our tid; then release the dispatcher and return it.
+  Cond cond;
+  tid_t tid = req->get_tid();
+  mds_rpc_cond[tid] = &cond;   // points at a local: the entry is erased below before we return
+  
+  messenger->send_message(req, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), MDS_PORT_SERVER);
+  
+  // wait
+  while (mds_rpc_reply.count(tid) == 0) {
+    dout(20) << "sendrecv awaiting reply kick on " << &cond << endl;
+    cond.Wait(client_lock);
+  }
+  
+  // got it!
+  MClientReply *reply = mds_rpc_reply[tid];
+  
+  // kick dispatcher (we've got it!)
+  assert(mds_rpc_dispatch_cond.count(tid));
+  mds_rpc_dispatch_cond[tid]->Signal();
+  dout(20) << "sendrecv kickback on tid " << tid << " " << mds_rpc_dispatch_cond[tid] << endl;
+  
+  // clean up.  (erasing our mds_rpc_cond entry is what ends the dispatcher's wait loop)
+  mds_rpc_cond.erase(tid);
+  mds_rpc_reply.erase(tid);
+  return reply;
+}
+
+void Client::handle_client_reply(MClientReply *reply)
+{
+  // Called from dispatch() with client_lock held.  Publishes the reply for the
+  // sendrecv() waiting on its tid, then waits until that caller has taken it.
+  tid_t tid = reply->get_tid();
+  // store reply
+  mds_rpc_reply[tid] = reply;
+
+  // wake up waiter
+  assert(mds_rpc_cond.count(tid));
+  dout(20) << "handle_client_reply kicking caller on " << mds_rpc_cond[tid] << endl;
+  mds_rpc_cond[tid]->Signal();
+
+  // wake for kick back
+  assert(mds_rpc_dispatch_cond.count(tid) == 0);
+  Cond cond;
+  mds_rpc_dispatch_cond[tid] = &cond;
+  while (mds_rpc_cond.count(tid)) {
+    dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl;
+    cond.Wait(client_lock);
+  }
+  // ok, clean up!
+  mds_rpc_dispatch_cond.erase(tid);
+}
+
+
+// ------------------------
+// incoming messages
+
+void Client::dispatch(Message *m)
+{
+  // Messenger entry point: route m by type under client_lock.  While an
+  // unmount is in progress, every message also triggers a cache trim pass.
+  client_lock.Lock();
+
+  switch (m->get_type()) {
+    // osd
+  case MSG_OSD_OPREPLY:
+    objecter->handle_osd_op_reply((MOSDOpReply*)m);
+    break;
+
+  case MSG_OSD_MAP:
+    objecter->handle_osd_map((class MOSDMap*)m);
+    break;
+    
+    // client
+  case MSG_MDS_MAP:
+    handle_mds_map((MMDSMap*)m);
+    break;
+    
+  case MSG_CLIENT_REPLY:
+    handle_client_reply((MClientReply*)m);
+    break;
+
+  case MSG_CLIENT_FILECAPS:
+    handle_file_caps((MClientFileCaps*)m);
+    break;
+
+  case MSG_CLIENT_MOUNTACK:
+    handle_mount_ack((MClientMountAck*)m);
+    break;
+  case MSG_CLIENT_UNMOUNT:
+    handle_unmount_ack(m);
+    break;
+  default:
+    cout << "dispatch doesn't recognize message type " << m->get_type() << endl;
+    assert(0);  // fail loudly
+    break;
+  }
+
+  // unmounting?
+  if (unmounting) {
+    dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size() 
+             << "+" << inode_map.size() << endl;
+    trim_cache();
+    if (lru.lru_get_size() == 0 && inode_map.empty()) {
+      dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << endl;
+      mount_cond.Signal();
+    } else {
+      dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size() 
+               << "+" << inode_map.size() << endl;
+      dump_cache();      
+    }
+  }
+
+  client_lock.Unlock();
+}
+
+void Client::handle_mount_ack(MClientMountAck *m)
+{
+  // Mount succeeded: load the MDS and OSD maps carried by the ack, mark us
+  // mounted and wake mount().  Consumes m.
+  // mdsmap!
+  if (!mdsmap) mdsmap = new MDSMap;
+  mdsmap->decode(m->get_mds_map_state());
+  // we got osdmap!
+  osdmap->decode(m->get_osd_map_state());
+  dout(2) << "mounted" << endl;
+  mounted = true;
+  mount_cond.Signal();
+
+  delete m;
+}
+
+
+void Client::handle_unmount_ack(Message* m)
+{
+  dout(1) << "got unmount ack" << endl;
+  mounted = false;
+  mount_cond.Signal();   // unmount() is waiting for `mounted` to clear
+  delete m;              // nothing else is read from the ack
+}
+
+
+void Client::handle_mds_map(MMDSMap* m)
+{
+  // Adopt the highest-epoch map carried by m and wake anyone blocked in mount().
+  if (m->maps.empty()) { delete m; return; }  // nothing to decode; rbegin() would equal rend()
+
+  if (mdsmap == 0)
+    mdsmap = new MDSMap;
+
+  map<epoch_t, bufferlist>::reverse_iterator p = m->maps.rbegin();
+  dout(1) << "handle_mds_map epoch " << p->first << endl;
+  mdsmap->decode(p->second);
+  delete m;
+  mount_cond.Signal();  // mount might be waiting for this.
+}
+
+
+/****
+ * caps
+ */
+
+
+// Deferred ack: on completion, passes the stored caps message and inode to
+// Client::implemented_caps().  The completion code is not used.
+class C_Client_ImplementedCaps : public Context {
+  Client *cl;
+  MClientFileCaps *capmsg;
+  Inode *inode;
+public:
+  C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : cl(c), capmsg(m), inode(i) {}
+  void finish(int) { cl->implemented_caps(capmsg, inode); }
+};
+
+/** handle_file_caps
+ * handle caps update from mds.  including mds to mds caps transitions.
+ * do not block.  m is consumed: queued, sent back as the ack, or deleted.
+ */
+void Client::handle_file_caps(MClientFileCaps *m)
+{
+  int mds = MSG_ADDR_NUM(m->get_source());
+  Inode *in = 0;
+  if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ];
+
+  m->clear_payload();  // for if/when we send back to MDS
+
+  // reap?
+  if (m->get_special() == MClientFileCaps::FILECAP_REAP) {
+    int other = m->get_mds();
+
+    if (in && in->stale_caps.count(other)) {
+      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << endl;
+
+      // fresh from new mds?
+      if (!in->caps.count(mds)) {
+        if (in->caps.empty()) in->get();
+        in->caps[mds].seq = m->get_seq();
+        in->caps[mds].caps = m->get_caps();
+      }
+      
+      assert(in->stale_caps.count(other));
+      in->stale_caps.erase(other);
+      if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in
+      
+      // fall-thru!
+    } else {
+      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << endl;
+      // delay!  (key by the message's ino: `in` may be NULL on this path)
+      cap_reap_queue[m->get_ino()][other] = m;
+      return;
+    }
+  }
+
+  assert(in);
+  
+  // stale?
+  if (m->get_special() == MClientFileCaps::FILECAP_STALE) {
+    dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << endl;
+    
+    // move to stale list
+    assert(in->caps.count(mds));
+    if (in->stale_caps.empty()) in->get();
+    in->stale_caps[mds] = in->caps[mds];
+
+    assert(in->caps.count(mds));
+    in->caps.erase(mds);
+    if (in->caps.empty()) in->put();
+
+    // delayed reap?
+    if (cap_reap_queue.count(in->ino()) &&
+        cap_reap_queue[in->ino()].count(mds)) {
+      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << endl;
+      
+      // process delayed reap (the recursive call takes over the queued message)
+      handle_file_caps( cap_reap_queue[in->ino()][mds] );
+
+      cap_reap_queue[in->ino()].erase(mds);
+      if (cap_reap_queue[in->ino()].empty())
+        cap_reap_queue.erase(in->ino());
+    }
+    delete m;   // the stale notice itself is not forwarded anywhere
+    return;
+  }
+
+  // release?
+  if (m->get_special() == MClientFileCaps::FILECAP_RELEASE) {
+    dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl;
+    assert(in->caps.count(mds));
+    in->caps.erase(mds);
+    for (map<int,InodeCap>::iterator p = in->caps.begin();
+         p != in->caps.end();
+         p++)
+      dout(20) << " left cap " << p->first << " " 
+              << cap_string(p->second.caps) << " " 
+              << p->second.seq << endl;
+    for (map<int,InodeCap>::iterator p = in->stale_caps.begin();
+         p != in->stale_caps.end();
+         p++)
+      dout(20) << " left stale cap " << p->first << " " 
+              << cap_string(p->second.caps) << " " 
+              << p->second.seq << endl;
+
+    if (in->caps.empty()) {
+      //dout(0) << "did put_inode" << endl;
+      put_inode(in);
+    } else {
+      //dout(0) << "didn't put_inode" << endl;
+    }
+    delete m;   // release notice fully handled
+    return;
+  }
+
+  // don't want?
+  if (in->file_caps_wanted() == 0) {
+    dout(5) << "handle_file_caps on ino " << m->get_ino() 
+            << " seq " << m->get_seq() 
+            << " " << cap_string(m->get_caps()) 
+            << ", which we don't want caps for, releasing." << endl;
+    m->set_caps(0);
+    m->set_wanted(0);
+    entity_inst_t srcinst = m->get_source_inst();
+    messenger->send_message(m, m->get_source(), srcinst, m->get_source_port());
+    return;
+  }
+
+  assert(in->caps.count(mds));
+
+  // update per-mds caps
+  const int old_caps = in->caps[mds].caps;
+  const int new_caps = m->get_caps();
+  in->caps[mds].caps = new_caps;
+  in->caps[mds].seq = m->get_seq();
+  dout(5) << "handle_file_caps on in " << m->get_ino() 
+          << " mds" << mds << " seq " << m->get_seq() 
+          << " caps now " << cap_string(new_caps) 
+          << " was " << cap_string(old_caps) << endl;
+  
+  // did file size decrease?
+  if ((old_caps & new_caps & CAP_FILE_RDCACHE) &&
+      in->inode.size > m->get_inode().size) {
+    dout(10) << "**** file size decreased from " << in->inode.size << " to " << m->get_inode().size << " FIXME" << endl;
+    // must have been a truncate() by someone.
+    // trim the buffer cache
+    // ***** fixme write me ****
+
+    in->file_wr_size = m->get_inode().size; //??
+  }
+
+  // update inode
+  in->inode = m->get_inode();      // might have updated size... FIXME this is overkill!
+
+  // preserve our (possibly newer) file size, mtime
+  if (in->file_wr_size > in->inode.size)
+    m->get_inode().size = in->inode.size = in->file_wr_size;
+  if (in->file_wr_mtime > in->inode.mtime)
+    m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime;
+
+  if (g_conf.client_oc) {
+    // caching on, use FileCache.
+    Context *onimplement = 0;
+    if (old_caps & ~new_caps) {     // this mds is revoking caps
+      if (in->fc.get_caps() & ~(in->file_caps()))   // net revocation
+        onimplement = new C_Client_ImplementedCaps(this, m, in);
+      else {
+        implemented_caps(m, in);        // ack now.
+      }
+    }
+    in->fc.set_caps(new_caps, onimplement);
+
+  } else {
+    // caching off.
+
+    // wake up waiters?
+    if (new_caps & CAP_FILE_RD) {
+      for (list<Cond*>::iterator it = in->waitfor_read.begin();
+           it != in->waitfor_read.end();
+           it++) {
+        dout(5) << "signaling read waiter " << *it << endl;
+        (*it)->Signal();
+      }
+      in->waitfor_read.clear();
+    }
+    if (new_caps & CAP_FILE_WR) {
+      for (list<Cond*>::iterator it = in->waitfor_write.begin();
+           it != in->waitfor_write.end();
+           it++) {
+        dout(5) << "signaling write waiter " << *it << endl;
+        (*it)->Signal();
+      }
+      in->waitfor_write.clear();
+    }
+    if (new_caps & CAP_FILE_LAZYIO) {
+      for (list<Cond*>::iterator it = in->waitfor_lazy.begin();
+           it != in->waitfor_lazy.end();
+           it++) {
+        dout(5) << "signaling lazy waiter " << *it << endl;
+        (*it)->Signal();
+      }
+      in->waitfor_lazy.clear();
+    }
+
+    // ack?
+    if (old_caps & ~new_caps) {
+      if (in->sync_writes) {
+        // wait for sync writes to finish
+        dout(5) << "sync writes in progress, will ack on finish" << endl;
+        in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in));
+      } else {
+        // ok now
+        implemented_caps(m, in);
+      }
+    } else {
+      // discard
+      delete m;
+    }
+  }
+}
+
+void Client::implemented_caps(MClientFileCaps *m, Inode *in)
+{
+  // Ack a caps change back to the MDS that sent m (m itself is reused as the reply).
+  dout(5) << "implemented_caps " << cap_string(m->get_caps()) 
+          << ", acking to " << m->get_source() << endl;
+  if (in->file_caps() == 0) {   // no caps left: forget locally tracked size/mtime
+    in->file_wr_mtime = 0;
+    in->file_wr_size = 0;
+  }
+  entity_inst_t srcinst = m->get_source_inst();   // copy first, as handle_file_caps() does for its reply
+  messenger->send_message(m, m->get_source(), srcinst, m->get_source_port());
+}
+
+
+void Client::release_caps(Inode *in,
+                          int retain)
+{
+  // Narrow every per-MDS cap on `in` to the bits in `retain` and report the
+  // result to each of those MDSs.
+  dout(5) << "releasing caps on ino " << in->inode.ino << dec
+          << " had " << cap_string(in->file_caps())
+          << " retaining " << cap_string(retain) 
+          << endl;
+
+  map<int,InodeCap>::iterator p = in->caps.begin();
+  while (p != in->caps.end()) {
+    // unconditional for now (the "only if something is dropped" test is disabled)
+    InodeCap& cap = p->second;
+    cap.caps &= retain;
+    // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do
+    MClientFileCaps *msg = new MClientFileCaps(in->inode,
+                                               cap.seq, cap.caps,
+                                               in->file_caps_wanted());
+    messenger->send_message(msg, MSG_ADDR_MDS(p->first), mdsmap->get_inst(p->first), MDS_PORT_LOCKER);
+    ++p;
+  }
+
+  // nothing held any more: reset locally tracked write size/mtime
+  if (in->file_caps() == 0) {
+    in->file_wr_mtime = 0;
+    in->file_wr_size = 0;
+  }
+}
+
+void Client::update_caps_wanted(Inode *in)
+{
+  // Re-announce our wanted caps for `in` to every MDS we hold caps from.
+  dout(5) << "updating caps wanted on ino " << in->inode.ino 
+          << " to " << cap_string(in->file_caps_wanted())
+          << endl;
+
+  // FIXME: pick a single mds and let the others off the hook..
+  map<int,InodeCap>::iterator p = in->caps.begin();
+  for (; p != in->caps.end(); ++p) {
+    const int mds = p->first;
+    MClientFileCaps *update = new MClientFileCaps(in->inode, 
+                                                  p->second.seq,
+                                                  p->second.caps,
+                                                  in->file_caps_wanted());
+    messenger->send_message(update, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), MDS_PORT_LOCKER);
+  }
+}
+
+
+
+// -------------------
+// fs ops
+
+int Client::mount(int mkfs)
+{
+  client_lock.Lock();
+  // Fetch a fresh MDS map from a monitor, then send a mount request to mds0 and block until acked.
+  assert(!mounted);  // caller is confused?
+
+  // FIXME mds map update race with mount.
+
+  dout(2) << "fetching latest mds map" << endl;
+  delete mdsmap;
+  mdsmap = 0;   // must be cleared: otherwise the wait below is skipped and the freed map is used
+  int mon = monmap->pick_mon();
+  messenger->send_message(new MMDSGetMap(),
+                          MSG_ADDR_MON(mon), monmap->get_inst(mon));
+
+  while (!mdsmap)
+    mount_cond.Wait(client_lock);
+  
+  dout(2) << "mounting" << endl;
+  MClientMount *m = new MClientMount();
+  if (mkfs) m->set_mkfs(mkfs);
+
+  messenger->send_message(m, MSG_ADDR_MDS(0), mdsmap->get_inst(0), MDS_PORT_SERVER);
+
+  while (!mounted)
+    mount_cond.Wait(client_lock);
+
+  client_lock.Unlock();
+
+  /*
+  dout(3) << "op: // client trace data structs" << endl;
+  dout(3) << "op: struct stat st;" << endl;
+  dout(3) << "op: struct utimbuf utim;" << endl;
+  dout(3) << "op: int readlinkbuf_len = 1000;" << endl;
+  dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl;
+  dout(3) << "op: map<string, inode_t*> dir_contents;" << endl;
+  dout(3) << "op: map<fh_t, fh_t> open_files;" << endl;
+  dout(3) << "op: fh_t fh;" << endl;
+  */
+  return 0;
+}
+
+int Client::unmount()
+{
+  client_lock.Lock();
+  // Drain the cache (releasing caps when the object cache is on), then tell mds0 and wait for its ack.
+  assert(mounted);  // caller is confused?
+  // from here on dispatch() runs a trim pass after every incoming message
+  dout(2) << "unmounting" << endl;
+  unmounting = true;
+
+  // NOTE: i'm assuming all caches are already flushing (because all files are closed).
+  assert(fh_map.empty());
+  
+  // empty lru cache
+  lru.lru_set_max(0);
+  trim_cache();
+
+  if (g_conf.client_oc) {
+    // release any/all caps
+    for (hash_map<inodeno_t, Inode*>::iterator p = inode_map.begin();
+         p != inode_map.end();
+         p++) {
+      Inode *in = p->second;
+      if (!in->caps.empty()) {
+        in->fc.release_clean();
+        if (in->fc.is_dirty()) {
+          dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << endl;
+          in->fc.empty(new C_Client_CloseRelease(this, in));
+        } else {
+          dout(10) << "unmount residual caps on " << in->ino()  << ", releasing" << endl;
+          release_caps(in);
+        }
+      }
+    }
+  }
+
+  // dispatch() signals mount_cond once both the lru and inode_map are empty
+  while (lru.lru_get_size() > 0 || 
+         !inode_map.empty()) {
+    dout(2) << "cache still has " << lru.lru_get_size() 
+            << "+" << inode_map.size() << " items" 
+            << ", waiting (presumably for safe or for caps to be released?)"
+            << endl;
+    dump_cache();
+    mount_cond.Wait(client_lock);
+  }
+  assert(lru.lru_get_size() == 0);
+  assert(inode_map.empty());
+  // unsafe writes
+  if (!g_conf.client_oc) {
+    while (unsafe_sync_write > 0) {
+      dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting" 
+              << endl;
+      mount_cond.Wait(client_lock);
+    }
+  }
+  
+  // send unmount!
+  Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT);
+  messenger->send_message(req, MSG_ADDR_MDS(0), mdsmap->get_inst(0), MDS_PORT_SERVER);
+
+  // handle_unmount_ack() clears `mounted` and signals us
+  while (mounted)
+    mount_cond.Wait(client_lock);
+
+  dout(2) << "unmounted" << endl;
+
+  client_lock.Unlock();
+  return 0;
+}
+
+
+
+// namespace ops
+
+int Client::link(const char *existing, const char *newname) 
+{
+  client_lock.Lock();
+  dout(3) << "op: client->link(\"" << existing << "\", \"" << newname << "\");" << endl;
+  tout << "link" << endl;
+  tout << existing << endl;
+  tout << newname << endl;
+  // Ask the MDS to link `newname` to `existing`; returns the MDS result code.
+  // NOTE(review): unlike unlink/rename/mkdir, neither path goes through mkabspath() -- confirm intended.
+
+  // main path arg is new link name
+  // sarg is target (existing file)
+
+  MClientRequest *req = new MClientRequest(MDS_OP_LINK, whoami);
+  req->set_path(newname);
+  req->set_sarg(existing);
+  
+  // FIXME where does FUSE maintain user information
+  req->set_caller_uid(getuid());
+  req->set_caller_gid(getgid());
+  
+  MClientReply *reply = make_request(req, true);
+  int res = reply->get_result();
+  
+  // the reply's trace is folded into the cache whatever the result code
+  insert_trace(reply);
+  delete reply;
+  dout(10) << "link result is " << res << endl;
+  trim_cache();
+  client_lock.Unlock();
+  return res;
+}
+
+
+int Client::unlink(const char *relpath)
+{
+  client_lock.Lock();
+  // Ask the MDS to unlink relpath; on success also drop the dentry from the local cache.
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *path = abspath.c_str();
+
+  dout(3) << "op: client->unlink\(\"" << path << "\");" << endl;
+  tout << "unlink" << endl;
+  tout << path << endl;
+
+
+  MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, whoami);
+  req->set_path(path);
+ 
+  // FIXME where does FUSE maintain user information
+  req->set_caller_uid(getuid());
+  req->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+   
+  MClientReply *reply = make_request(req, true);
+  int res = reply->get_result();
+  if (res == 0) {
+    // remove from local cache
+    filepath fp(path);
+    Dentry *dn = lookup(fp);
+    if (dn) {
+      assert(dn->inode);
+      unlink(dn);
+    }
+  }
+  insert_trace(reply);
+  delete reply;
+  dout(10) << "unlink result is " << res << endl;
+
+  trim_cache();
+  client_lock.Unlock();
+  return res;
+}
+
+// Rename `relfrom` to `relto`.
+//
+// Both paths are made absolute with mkabspath().  The source is sent as the
+// request path and the destination as the string argument.  Returns the
+// result code carried by the MDS reply.
+int Client::rename(const char *relfrom, const char *relto)
+{
+  client_lock.Lock();
+
+  string absfrom;
+  string absto;
+  mkabspath(relfrom, absfrom);
+  mkabspath(relto, absto);
+  const char *src = absfrom.c_str();
+  const char *dst = absto.c_str();
+
+  dout(3) << "op: client->rename(\"" << src << "\", \"" << dst << "\");" << endl;
+  tout << "rename" << endl;
+  tout << src << endl;
+  tout << dst << endl;
+
+  MClientRequest *request = new MClientRequest(MDS_OP_RENAME, whoami);
+  request->set_path(src);
+  request->set_sarg(dst);
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+
+  MClientReply *answer = make_request(request, true);
+  const int rc = answer->get_result();
+
+  // update the metadata cache from the reply, then drop the reply
+  insert_trace(answer);
+  delete answer;
+  dout(10) << "rename result is " << rc << endl;
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+// dirs
+
+// Create a directory at `relpath` with permission bits `mode`.
+//
+// The path is made absolute with mkabspath(); the mode is sent as the
+// request's integer argument.  Returns the result code carried by the MDS
+// reply.
+int Client::mkdir(const char *relpath, mode_t mode)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *dirpath = abspath.c_str();
+
+  dout(3) << "op: client->mkdir(\"" << dirpath << "\", " << mode << ");" << endl;
+  tout << "mkdir" << endl;
+  tout << dirpath << endl;
+  tout << mode << endl;
+
+  MClientRequest *request = new MClientRequest(MDS_OP_MKDIR, whoami);
+  request->set_path(dirpath);
+  request->set_iarg( (int)mode );
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+
+  MClientReply *answer = make_request(request, true);
+  const int rc = answer->get_result();
+
+  // update the metadata cache from the reply, then drop the reply
+  insert_trace(answer);
+  delete answer;
+  dout(10) << "mkdir result is " << rc << endl;
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+// Remove the directory at `relpath`.
+//
+// The path is made absolute with mkabspath() and sent as an MDS_OP_RMDIR
+// request.  On success (result 0) a cached dentry for the path is unlinked
+// locally; its open Dir, if any and empty, is closed first.  Returns the
+// result code carried by the MDS reply.
+int Client::rmdir(const char *relpath)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *dirpath = abspath.c_str();
+
+  dout(3) << "op: client->rmdir(\"" << dirpath << "\");" << endl;
+  tout << "rmdir" << endl;
+  tout << dirpath << endl;
+
+  MClientRequest *request = new MClientRequest(MDS_OP_RMDIR, whoami);
+  request->set_path(dirpath);
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+
+  MClientReply *answer = make_request(request, true);
+  const int rc = answer->get_result();
+  if (rc == 0) {
+    // the MDS removed it: drop our cached copy too
+    filepath fp(dirpath);
+    Dentry *dn = lookup(fp);
+    if (dn) {
+      if (dn->inode->dir) {
+        // FIXME: maybe i should proactively hose the whole subtree from cache?
+        if (dn->inode->dir->is_empty())
+          close_dir(dn->inode->dir);
+      }
+      unlink(dn);
+    }
+  }
+
+  // update the metadata cache from the reply, then drop the reply
+  insert_trace(answer);
+  delete answer;
+  dout(10) << "rmdir result is " << rc << endl;
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+// symlinks
+  
+// Create a symbolic link at `rellink` pointing at `reltarget`.
+//
+// Both arguments are passed through mkabspath() before use.  The link path
+// is sent as the request path and the target as the string argument.
+// Returns the result code carried by the MDS reply.
+int Client::symlink(const char *reltarget, const char *rellink)
+{
+  client_lock.Lock();
+
+  string abstarget;
+  string abslink;
+  mkabspath(reltarget, abstarget);
+  mkabspath(rellink, abslink);
+  const char *targetpath = abstarget.c_str();
+  const char *linkpath = abslink.c_str();
+
+  dout(3) << "op: client->symlink(\"" << targetpath << "\", \"" << linkpath << "\");" << endl;
+  tout << "symlink" << endl;
+  tout << targetpath << endl;
+  tout << linkpath << endl;
+
+  MClientRequest *request = new MClientRequest(MDS_OP_SYMLINK, whoami);
+  request->set_path(linkpath);
+  request->set_sarg(targetpath);
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+
+  MClientReply *answer = make_request(request, true);
+  const int rc = answer->get_result();
+
+  //FIXME assuming trace of link, not of target
+  insert_trace(answer);
+  delete answer;
+  dout(10) << "symlink result is " << rc << endl;
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+// Copy the content of the symlink at `relpath` into `buf`.
+//
+// At most `size` bytes are copied and no terminating NUL is written.  The
+// path is first lstat()ed (which takes and releases client_lock itself, so
+// the lock is dropped around that call); a non-zero lstat result is returned
+// unchanged.  Otherwise the number of bytes copied is returned.
+int Client::readlink(const char *relpath, char *buf, off_t size) 
+{ 
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *path = abspath.c_str();
+
+  dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl;
+  tout << "readlink" << endl;
+  tout << path << endl;
+  client_lock.Unlock();
+
+  // stat first  (FIXME, PERF access cache directly) ****
+  struct stat stbuf;
+  int r = this->lstat(path, &stbuf);
+  if (r != 0) return r;
+
+  client_lock.Lock();
+
+  // pull symlink content from cache
+  Inode *in = inode_map[stbuf.st_ino];
+  assert(in);  // i just did a stat
+
+  // NOTE(review): lstat succeeds for non-symlinks too, and in->symlink is
+  // presumably null for those.  Fail loudly here instead of dereferencing a
+  // null pointer; returning an error such as -EINVAL would be the proper
+  // fix -- confirm how callers expect this to be reported.
+  assert(in->symlink);
+  
+  // copy into buf (at most size bytes).  The clamp is done in signed
+  // arithmetic: comparing an unsigned length against a negative off_t used
+  // to turn the negative size into a huge memcpy length.
+  if (size < 0) size = 0;
+  off_t res = in->symlink->length();
+  if (res > size) res = size;
+  memcpy(buf, in->symlink->c_str(), res);
+
+  trim_cache();
+  client_lock.Unlock();
+  return (int)res;  // return length in bytes (to mimic the system call)
+}
+
+
+
+// inode stuff
+
+// Look up the inode for `path`, from the cache when possible and otherwise
+// from the MDS.  No locking is done here; the callers in this file hold
+// client_lock around the call.
+//
+//   path  path to stat
+//   mask  inode mask sent as the integer argument of the MDS_OP_LSTAT
+//         request.  The cache-hit test does not look at it: it always
+//         requires INODE_MASK_ALL_STAT to be present.
+//   in    out: the inode on success, 0 when the MDS reports a failure
+//
+// Returns 0 on a cache hit, otherwise the result code of the MDS reply.
+int Client::_lstat(const char *path, int mask, Inode **in)
+{  
+  filepath fpath(path);
+  
+  // check whether cache content is fresh enough
+  int res = 0;
+
+  Dentry *dn = lookup(fpath);
+  time_t now = time(NULL);
+  if (dn && 
+      now <= dn->inode->valid_until &&
+      ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) {
+    dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl;
+    
+    if (g_conf.client_cache_stat_ttl == 0)
+      dn->inode->valid_until = 0;           // only one stat allowed after each readdir
+
+    *in = dn->inode;
+  } else {  
+    // FIXME where does FUSE maintain user information
+    //struct fuse_context *fc = fuse_get_context();
+    //req->set_caller_uid(fc->uid);
+    //req->set_caller_gid(fc->gid);
+    
+    MClientRequest *req = new MClientRequest(MDS_OP_LSTAT, whoami);
+    req->set_iarg(mask);
+    req->set_path(fpath);
+
+    MClientReply *reply = make_request(req);
+    res = reply->get_result();
+    dout(10) << "lstat res is " << res << endl;
+
+    // Update the metadata cache on success; callers read the inode through
+    // *in, so no separate copy of the reply's inode is kept.
+    if (res == 0) {
+      *in = insert_trace(reply);
+    } else {
+      *in = 0;     // not a success.
+    }
+
+    delete reply;
+  }
+     
+  return res;
+}
+
+
+// Fill a POSIX `struct stat` from an inode_t.
+//
+// The structure is zeroed first, so every field not assigned below reads as
+// 0.  The block size is reported as 4096 and st_blocks is the file size
+// rounded up to that many bytes per block.
+void Client::fill_stat(inode_t& inode, struct stat *st) 
+{
+  memset(st, 0, sizeof(*st));
+
+  // identity and ownership
+  st->st_ino = inode.ino;
+  st->st_mode = inode.mode;
+  st->st_nlink = inode.nlink;
+  st->st_uid = inode.uid;
+  st->st_gid = inode.gid;
+
+  // timestamps
+  st->st_atime = inode.atime;
+  st->st_mtime = inode.mtime;
+  st->st_ctime = inode.ctime;
+
+  // size and block accounting
+  st->st_size = inode.size;
+  st->st_blksize = 4096;
+  if (inode.size)
+    st->st_blocks = (inode.size - 1) / 4096 + 1;
+  else
+    st->st_blocks = 0;
+}
+
+// Fill a `struct statlite` from an inode_t.
+//
+// The structure is zeroed first, so every field not assigned below reads as
+// 0.  The block size is reported as 4096 and st_blocks is the file size
+// rounded up to that many bytes per block.  The code that would set the
+// st_litemask validity bits is still disabled (see the comment at the end).
+void Client::fill_statlite(inode_t& inode, struct statlite *st) 
+{
+  // Clear exactly the object we were given.  This used to pass
+  // sizeof(struct stat), which is the size of a different structure: it
+  // either writes past the end of *st or leaves part of it uninitialized.
+  memset(st, 0, sizeof(*st));
+  st->st_ino = inode.ino;
+  st->st_mode = inode.mode;
+  st->st_nlink = inode.nlink;
+  st->st_uid = inode.uid;
+  st->st_gid = inode.gid;
+  st->st_ctime = inode.ctime;
+  st->st_atime = inode.atime;
+  st->st_mtime = inode.mtime;
+  st->st_size = inode.size;
+  st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0;
+  st->st_blksize = 4096;
+  
+  /*
+  S_REQUIREBLKSIZE(st->st_litemask);
+  if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask);
+  if (inode.mask & INODE_MASK_SIZE) {
+    S_REQUIRESIZE(st->st_litemask);
+    S_REQUIREBLOCKS(st->st_litemask);
+  }
+  if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask);
+  if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask);
+  */
+}
+
+
+// stat the file at `relpath` without following a final symlink.
+//
+// The path is made absolute with mkabspath() and resolved through _lstat()
+// with the full INODE_MASK_ALL_STAT mask.  `stbuf` is filled in only when
+// the result is 0.  Returns the result of _lstat().
+int Client::lstat(const char *relpath, struct stat *stbuf)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *path = abspath.c_str();
+
+  dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl;
+  tout << "lstat" << endl;
+  tout << path << endl;
+
+  Inode *found = 0;
+  const int rc = _lstat(path, INODE_MASK_ALL_STAT, &found);
+
+  if (rc == 0) {
+    // success always comes with an inode
+    assert(found);
+    fill_stat(found->inode, stbuf);
+    dout(10) << "stat sez size = " << found->inode.size << " ino = " << stbuf->st_ino << endl;
+  }
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+
+// "Lite" variant of lstat().
+//
+// The inode mask requested from _lstat() always contains the base and
+// permission bits; size, mtime and atime are added according to the bits
+// tested in stl->st_litemask on entry.  `stl` is filled in only when the
+// result is 0.  Returns the result of _lstat().
+int Client::lstatlite(const char *relpath, struct statlite *stl)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *path = abspath.c_str();
+
+  dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << endl;
+  tout << "lstatlite" << endl;
+  tout << path << endl;
+
+  // build the inode mask from the caller's litemask
+  int wanted = INODE_MASK_BASE | INODE_MASK_PERM;
+  if (S_ISVALIDSIZE(stl->st_litemask) || S_ISVALIDBLOCKS(stl->st_litemask)) {
+    wanted |= INODE_MASK_SIZE;
+  }
+  if (S_ISVALIDMTIME(stl->st_litemask)) {
+    wanted |= INODE_MASK_MTIME;
+  }
+  if (S_ISVALIDATIME(stl->st_litemask)) {
+    wanted |= INODE_MASK_ATIME;
+  }
+
+  Inode *found = 0;
+  const int rc = _lstat(path, wanted, &found);
+
+  if (rc == 0) {
+    fill_statlite(found->inode, stl);
+    dout(10) << "stat sez size = " << found->inode.size << " ino = " << found->inode.ino << endl;
+  }
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+
+
+// Change the permission bits of `relpath` to `mode`.
+//
+// The path is made absolute with mkabspath(); the mode is sent as the
+// request's integer argument.  Returns the result code carried by the MDS
+// reply.
+int Client::chmod(const char *relpath, mode_t mode)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *target = abspath.c_str();
+
+  dout(3) << "op: client->chmod(\"" << target << "\", " << mode << ");" << endl;
+  tout << "chmod" << endl;
+  tout << target << endl;
+  tout << mode << endl;
+
+  MClientRequest *request = new MClientRequest(MDS_OP_CHMOD, whoami);
+  request->set_path(target);
+  request->set_iarg( (int)mode );
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  MClientReply *answer = make_request(request, true);
+  const int rc = answer->get_result();
+
+  // update the metadata cache from the reply, then drop the reply
+  insert_trace(answer);
+  delete answer;
+  dout(10) << "chmod result is " << rc << endl;
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+// Change the owner and group of `relpath`.
+//
+// The path is made absolute with mkabspath(); `uid` goes out as the first
+// integer argument of the request and `gid` as the second.  Returns the
+// result code carried by the MDS reply.
+int Client::chown(const char *relpath, uid_t uid, gid_t gid)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *target = abspath.c_str();
+
+  dout(3) << "op: client->chown(\"" << target << "\", " << uid << ", " << gid << ");" << endl;
+  tout << "chown" << endl;
+  tout << target << endl;
+  tout << uid << endl;
+  tout << gid << endl;
+
+  MClientRequest *request = new MClientRequest(MDS_OP_CHOWN, whoami);
+  request->set_path(target);
+  request->set_iarg( (int)uid );
+  request->set_iarg2( (int)gid );
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+
+  MClientReply *answer = make_request(request, true);
+  const int rc = answer->get_result();
+
+  // update the metadata cache from the reply, then drop the reply
+  insert_trace(answer);
+  delete answer;
+  dout(10) << "chown result is " << rc << endl;
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+// Set the access and modification times of `relpath`.
+//
+//   relpath  path, made absolute with mkabspath()
+//   buf      times to set.  May be NULL, in which case both times are set
+//            to the current time, as utime(2) specifies.  (A NULL pointer
+//            used to be dereferenced unconditionally.)
+//
+// The modification time is sent as the first time argument of the request
+// and the access time as the second.  Returns the result code carried by
+// the MDS reply.
+int Client::utime(const char *relpath, struct utimbuf *buf)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *path = abspath.c_str();
+
+  // resolve the times once so logging, tracing and the request agree
+  time_t now = time(NULL);
+  time_t actime = buf ? buf->actime : now;
+  time_t modtime = buf ? buf->modtime : now;
+
+  dout(3) << "op: utim.actime = " << actime << "; utim.modtime = " << modtime << ";" << endl;
+  dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl;
+  tout << "utime" << endl;
+  tout << path << endl;
+  tout << actime << endl;
+  tout << modtime << endl;
+
+
+  MClientRequest *req = new MClientRequest(MDS_OP_UTIME, whoami);
+  req->set_path(path); 
+  req->set_targ( modtime );
+  req->set_targ2( actime );
+
+  // FIXME where does FUSE maintain user information
+  req->set_caller_uid(getuid());
+  req->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+   
+  MClientReply *reply = make_request(req, true);
+  int res = reply->get_result();
+  // update the metadata cache from the reply
+  insert_trace(reply);  
+  delete reply;
+  dout(10) << "utime result is " << res << endl;
+
+  trim_cache();
+  client_lock.Unlock();
+  return res;
+}
+
+
+
+// Create a filesystem node at `relpath` with mode `mode`.
+//
+// The path is made absolute with mkabspath(); the mode is sent as the
+// request's integer argument.  Returns the result code carried by the MDS
+// reply.
+int Client::mknod(const char *relpath, mode_t mode) 
+{ 
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *nodepath = abspath.c_str();
+
+  dout(3) << "op: client->mknod(\"" << nodepath << "\", " << mode << ");" << endl;
+  tout << "mknod" << endl;
+  tout << nodepath << endl;
+  tout << mode << endl;
+
+  MClientRequest *request = new MClientRequest(MDS_OP_MKNOD, whoami);
+  request->set_path(nodepath);
+  request->set_iarg( mode );
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+
+  MClientReply *answer = make_request(request, true);
+  const int rc = answer->get_result();
+
+  // update the metadata cache from the reply
+  insert_trace(answer);
+
+  dout(10) << "mknod result is " << rc << endl;
+
+  delete answer;
+
+  trim_cache();
+  client_lock.Unlock();
+  return rc;
+}
+
+
+
+  
+// readdir usually includes inode info for each entry, except for locked entries
+
+//
+// getdir
+
+// fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino);
+
+// Read the directory at `relpath` from the MDS.
+//
+// Every entry in the reply is inserted into the local cache and copied into
+// `contents`, keyed by name.  For a directory other than the root that has
+// a parent dentry, a ".." entry carrying the parent's inode is added too.
+//
+// Returns the number of entries taken from the reply (the ".." entry is not
+// counted), or the non-zero result code of the MDS reply.
+int Client::getdir(const char *relpath, map<string,inode_t>& contents) 
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *dirpath = abspath.c_str();
+
+  dout(3) << "op: client->getdir(\"" << dirpath << "\", dir_contents);" << endl;
+  tout << "getdir" << endl;
+  tout << dirpath << endl;
+
+  MClientRequest *request = new MClientRequest(MDS_OP_READDIR, whoami);
+  request->set_path(dirpath);
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  //FIXME enforce caller uid rights?
+
+  MClientReply *answer = make_request(request, true);
+  int res = answer->get_result();
+  insert_trace(answer);
+
+  if (res == 0) {
+    // the directory's own inode must be in the cache by now
+    inodeno_t ino = answer->get_ino();
+    Inode *diri = inode_map[ ino ];
+    assert(diri);
+    assert(diri->inode.mode & INODE_MODE_DIR);
+
+    // only open dir if we're actually adding stuff to it!
+    if (!answer->get_dir_in().empty()) {
+      Dir *dir = diri->open_dir();
+      assert(dir);
+      time_t now = time(NULL);
+
+      // names and inode stats are two parallel lists; walk them in step
+      list<string>::const_iterator name = answer->get_dir_dn().begin();
+      list<InodeStat*>::const_iterator st = answer->get_dir_in().begin();
+      while (st != answer->get_dir_in().end()) {
+        // res was 0 on entry, so it ends up as the entry count
+        res++;
+
+        // put in cache
+        Inode *in = this->insert_inode(dir, *st, *name);
+
+        // the stat ttl takes precedence over the readdir ttl
+        if (g_conf.client_cache_stat_ttl) {
+          in->valid_until = now + g_conf.client_cache_stat_ttl;
+        } else if (g_conf.client_cache_readdir_ttl) {
+          in->valid_until = now + g_conf.client_cache_readdir_ttl;
+        }
+
+        // contents to caller too!
+        contents[*name] = in->inode;
+
+        ++st;
+        ++name;
+      }
+    }
+
+    // add .. too?
+    if (diri != root && diri->dn && diri->dn->dir) {
+      Inode *parent = diri->dn->dir->parent_inode;
+      contents[".."] = parent->inode;
+    }
+
+    // FIXME: remove items in cache that weren't in my readdir?
+    // ***
+  }
+
+  delete answer;     //fix thing above first
+
+  client_lock.Unlock();
+  return res;
+}
+
+
+/** POSIX stubs **/
+
+// Open a directory stream for `name`.
+//
+// The whole directory is fetched up front with getdir(); the returned
+// handle is really a DirResult and must be released with closedir().  A
+// handle is returned even when getdir() fails; in that case d->size holds
+// getdir()'s result and the listing is empty.
+DIR *Client::opendir(const char *name) 
+{
+  DirResult *d = new DirResult;
+  // Remember the directory's path: readdirplus() builds the path it passes
+  // to lstat() from d->path, which was never being set.
+  d->path = name;
+  d->size = getdir(name, d->contents);
+  d->p = d->contents.begin();
+  d->off = 0;
+  return (DIR*)d;
+}
+
+// Release a directory stream obtained from opendir().  Always returns 0.
+int Client::closedir(DIR *dir) 
+{
+  // the DIR* handed out by opendir() is a DirResult in disguise
+  delete (DirResult*)dir;
+  return 0;
+}
+
+//struct dirent {
+//  ino_t          d_ino;       /* inode number */
+//  off_t          d_off;       /* offset to the next dirent */
+//  unsigned short d_reclen;    /* length of this record */
+//  unsigned char  d_type;      /* type of file */
+//  char           d_name[256]; /* filename */
+//};
+
+// Return the next entry of the directory stream `dirp`, or 0 at the end.
+//
+// The returned dirent lives inside the DirResult and is overwritten by the
+// next readdir()/readdirplus() call on the same stream.
+struct dirent *Client::readdir(DIR *dirp)
+{
+  DirResult *d = (DirResult*)dirp;
+
+  // end of dir?
+  if (d->p == d->contents.end()) 
+    return 0;
+
+  // fill the dirent
+  d->dp.d_dirent.d_ino = d->p->second.ino;
+#ifndef __CYGWIN__
+  if (d->p->second.is_symlink())
+    d->dp.d_dirent.d_type = DT_LNK;
+  else if (d->p->second.is_dir())
+    d->dp.d_dirent.d_type = DT_DIR;
+  else if (d->p->second.is_file())
+    d->dp.d_dirent.d_type = DT_REG;
+  else
+    d->dp.d_dirent.d_type = DT_UNKNOWN;
+
+  d->dp.d_dirent.d_off = d->off;
+  d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
+#endif
+
+  // strncpy() does not terminate the destination when the source fills it,
+  // so copy at most 255 bytes and always write the NUL ourselves.
+  strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 255);
+  d->dp.d_dirent.d_name[255] = 0;
+
+  // move up
+  ++d->off;
+  ++d->p;
+
+  return &d->dp.d_dirent;
+}
+ 
+// Reset the directory stream `dirp` to its first entry.
+void Client::rewinddir(DIR *dirp)
+{
+  DirResult *stream = (DirResult*)dirp;
+  stream->off = 0;
+  stream->p = stream->contents.begin();
+}
+ 
+// Return the current position of the directory stream `dirp`, i.e. the
+// number of entries stepped over since the stream was last reset.
+off_t Client::telldir(DIR *dirp)
+{
+  return ((DirResult*)dirp)->off;
+}
+
+// Move the directory stream `dirp` to entry number `offset`, counted from
+// the start as reported by telldir().
+//
+// An offset <= 0 rewinds the stream.  An offset at or beyond the number of
+// stored entries leaves the stream at end-of-directory.
+//
+// The walk is bounded by the container itself rather than by d->size.
+// d->size is getdir()'s return value, which need not match contents.size(),
+// so the old clamp could not restore the end position reported by telldir()
+// and did not stop the iterator from being stepped past contents.end().
+void Client::seekdir(DIR *dirp, off_t offset)
+{
+  DirResult *d = (DirResult*)dirp;
+
+  d->p = d->contents.begin();
+  d->off = 0;
+
+  while (offset > 0 && d->p != d->contents.end()) {
+    ++d->p;
+    ++d->off;
+    --offset;
+  }
+}
+
+// Like readdir(), but also returns stat information for the entry.
+//
+// When the stored inode carries the full INODE_MASK_ALL_STAT mask the stat
+// is filled from it directly; otherwise the entry is lstat()ed, using a
+// path built from d->path and the entry name, and the lstat() result is
+// stored in d_stat_err.  Returns 0 at end of directory.  The returned
+// structure lives inside the DirResult and is overwritten by the next call
+// on the same stream.
+struct dirent_plus *Client::readdirplus(DIR *dirp)
+{
+  DirResult *d = (DirResult*)dirp;
+
+  // end of dir?
+  if (d->p == d->contents.end()) 
+    return 0;
+
+  // fill the dirent
+  d->dp.d_dirent.d_ino = d->p->second.ino;
+#ifndef __CYGWIN__
+  if (d->p->second.is_symlink())
+    d->dp.d_dirent.d_type = DT_LNK;
+  else if (d->p->second.is_dir())
+    d->dp.d_dirent.d_type = DT_DIR;
+  else if (d->p->second.is_file())
+    d->dp.d_dirent.d_type = DT_REG;
+  else
+    d->dp.d_dirent.d_type = DT_UNKNOWN;
+
+  d->dp.d_dirent.d_off = d->off;
+  d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
+#endif
+
+  // strncpy() does not terminate the destination when the source fills it,
+  // so copy at most 255 bytes and always write the NUL ourselves.
+  strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 255);
+  d->dp.d_dirent.d_name[255] = 0;
+
+  // plus
+  if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
+    // have it
+    fill_stat(d->p->second, &d->dp.d_stat);
+    d->dp.d_stat_err = 0;
+  } else {
+    // don't have it, stat it
+    string path = d->path;
+    path += "/";
+    path += d->p->first;
+    d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat);
+  }
+
+  // move up
+  ++d->off;
+  ++d->p;
+
+  return &d->dp;
+}
+
+/*
+struct dirent_lite *Client::readdirlite(DIR *dirp)
+{
+  DirResult *d = (DirResult*)dirp;
+
+  // end of dir?
+  if (d->p == d->contents.end()) 
+    return 0;
+
+  // fill the dirent
+  d->dp.d_dirent.d_ino = d->p->second.ino;
+  if (d->p->second.is_symlink())
+    d->dp.d_dirent.d_type = DT_LNK;
+  else if (d->p->second.is_dir())
+    d->dp.d_dirent.d_type = DT_DIR;
+  else if (d->p->second.is_file())
+    d->dp.d_dirent.d_type = DT_REG;
+  else
+    d->dp.d_dirent.d_type = DT_UNKNOWN;
+  strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256);
+
+  d->dp.d_dirent.d_off = d->off;
+  d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
+
+  // plus
+  if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
+    // have it
+    fill_statlite(d->p->second,d->dp.d_stat);
+    d->dp.d_stat_err = 0;
+  } else {
+    // don't have it, stat it
+    string path = p->path;
+    path += "/";
+    path += p->first;
+    d->dp.d_statlite
+    d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite);
+  }
+
+  // move up
+  ++d->off;
+  ++d->p;
+
+  return &d->dp;
+}
+*/
+
+
+
+
+
+
+/****** file i/o **********/
+
+// Open the file at `relpath`.
+//
+// `flags` are open(2)-style flags plus O_LAZY.  They are translated into a
+// client file mode, an MDS_OP_OPEN request is issued, and on success a new
+// Fh is registered in fh_map.  The capabilities carried by the reply are
+// recorded on the inode under the replying MDS.
+//
+// Returns the new file handle on success, otherwise the (negative) result
+// of the MDS reply.
+int Client::open(const char *relpath, int flags) 
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *path = abspath.c_str();
+
+  dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl;
+  tout << "open" << endl;
+  tout << path << endl;
+  tout << flags << endl;
+
+  // translate the flags: read-only unless a branch below says otherwise
+  int cmode = FILE_MODE_R;
+  bool tryauth = false;
+  if (flags & O_LAZY) {
+    cmode = FILE_MODE_LAZY;
+  } else if (flags & O_WRONLY) {
+    cmode = FILE_MODE_W;
+    tryauth = true;
+  } else if (flags & O_RDWR) {
+    cmode = FILE_MODE_RW;
+    tryauth = true;
+  } else if (flags & O_APPEND) {
+    cmode = FILE_MODE_W;
+    tryauth = true;
+  }
+
+  // go
+  MClientRequest *request = new MClientRequest(MDS_OP_OPEN, whoami);
+  request->set_path(path);
+  request->set_iarg(flags);
+  request->set_iarg2(cmode);
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  MClientReply *reply = make_request(request, tryauth); // try auth if writer
+
+  assert(reply);
+  dout(3) << "op: open_files[" << reply->get_result() << "] = fh;  // fh = " << reply->get_result() << endl;
+  tout << reply->get_result() << endl;
+
+  insert_trace(reply);
+  int result = reply->get_result();
+
+  if (result < 0) {
+    dout(0) << "open failure result " << result << endl;
+  } else {
+    // yay
+    Fh *f = new Fh;
+    f->mode = cmode;
+
+    // the file handle holds a reference on the inode
+    Inode *in = inode_map[reply->get_ino()];
+    f->inode = in;
+    assert(in);
+    in->get();
+
+    if (cmode & FILE_MODE_R) in->num_open_rd++;
+    if (cmode & FILE_MODE_W) in->num_open_wr++;
+    if (cmode & FILE_MODE_LAZY) in->num_open_lazy++;
+
+    // caps included?
+    int mds = MSG_ADDR_NUM(reply->get_source());
+
+    // the first capability on an inode takes a reference of its own
+    if (in->caps.empty()) {
+      dout(7) << " first caps on " << in->inode.ino << endl;
+      in->get();
+    }
+
+    int new_caps = reply->get_file_caps();
+
+    assert(reply->get_file_caps_seq() >= in->caps[mds].seq);
+    if (reply->get_file_caps_seq() > in->caps[mds].seq) {
+      dout(7) << "open got caps " << cap_string(new_caps)
+              << " for " << in->ino() 
+              << " seq " << reply->get_file_caps_seq() 
+              << " from mds" << mds << endl;
+
+      int old_caps = in->caps[mds].caps;
+      in->caps[mds].caps = new_caps;
+      in->caps[mds].seq = reply->get_file_caps_seq();
+
+      // we shouldn't ever lose caps at this point.
+      // actually, we might...?
+      assert((old_caps & ~in->caps[mds].caps) == 0);
+
+      if (g_conf.client_oc)
+        in->fc.set_caps(new_caps);
+    } else {
+      dout(7) << "open got SAME caps " << cap_string(new_caps) 
+              << " for " << in->ino() 
+              << " seq " << reply->get_file_caps_seq() 
+              << " from mds" << mds << endl;
+    }
+
+    // put in map; the handle doubles as the return value
+    fh_t fh = get_fh();
+    result = fh;
+    assert(fh_map.count(fh) == 0);
+    fh_map[fh] = f;
+
+    dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(in->file_caps()) << endl;
+  }
+
+  delete reply;
+
+  trim_cache();
+  client_lock.Unlock();
+
+  return result;
+}
+
+
+
+
+
+// Release the capabilities on `in` that are no longer needed after a close.
+//
+// Clean cached data is dropped first when nobody has the file open for
+// reading.  Write caps are retained while there are writers or dirty data;
+// read caps are retained while there are readers or cached data.  The
+// resulting set is passed to release_caps() as the caps to keep.
+void Client::close_release(Inode *in)
+{
+  dout(10) << "close_release on " << in->ino() << endl;
+
+  if (!in->num_open_rd) 
+    in->fc.release_clean();
+
+  int retain = 0;
+  if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER;
+  // The reader/cached case must keep the READ caps.  This line used to
+  // repeat the write caps from the line above (copy-paste slip).
+  if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE;
+
+  release_caps(in, retain);              // release caps now.
+}
+
+// Drop one reference on `in` via put_inode() and, when an unmount is in
+// progress, signal mount_cond so the unmounting thread re-checks its state.
+void Client::close_safe(Inode *in)
+{
+  dout(10) << "close_safe on " << in->ino() << endl;
+  put_inode(in);
+
+  if (!unmounting)
+    return;
+  mount_cond.Signal();
+}
+
+// Close the file handle `fh`.
+//
+// The handle must exist in fh_map (asserted).  The inode's open counters
+// are decremented according to the handle's mode, the MDS is told when the
+// wanted caps changed to a non-empty set, and the Fh is destroyed.  With
+// the object cache enabled, cached data is flushed or released as needed
+// and the inode is pinned until outstanding writes are safe; without it the
+// caps are released once nobody has the file open.  Always returns 0.
+int Client::close(fh_t fh)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl;
+  dout(3) << "op: open_files.erase( " << fh << " );" << endl;
+  tout << "close" << endl;
+  tout << fh << endl;
+
+  // get Fh, Inode
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  // update inode rd/wr/lazy counts; these mirror the increments in open()
+  int before = in->file_caps_wanted();
+  if (f->mode & FILE_MODE_R)     
+    in->num_open_rd--;
+  if (f->mode & FILE_MODE_W)
+    in->num_open_wr--;
+  // open() bumps num_open_lazy for lazy handles; it was never dropped again
+  if (f->mode & FILE_MODE_LAZY)
+    in->num_open_lazy--;
+  int after = in->file_caps_wanted();
+
+  // does this change what caps we want?
+  if (before != after && after)
+    update_caps_wanted(in);
+
+  // hose fh
+  fh_map.erase(fh);
+  delete f;
+
+  // release caps right away?
+  dout(10) << "num_open_rd " << in->num_open_rd << "  num_open_wr " << in->num_open_wr << endl;
+
+  if (g_conf.client_oc) {
+    // caching on.
+    if (in->num_open_rd == 0 && in->num_open_wr == 0) {
+      // last close: empty the file cache, then release caps from the callback
+      in->fc.empty(new C_Client_CloseRelease(this, in));
+    } 
+    else if (in->num_open_rd == 0) {
+      // writers remain: only the clean (read) data can go
+      in->fc.release_clean();
+      close_release(in);
+    } 
+    else if (in->num_open_wr == 0) {
+      // readers remain: flush dirty data, then release caps from the callback
+      in->fc.flush_dirty(new C_Client_CloseRelease(this,in));
+    }
+
+    // pin until safe?
+    if (in->num_open_wr == 0 && !in->fc.all_safe()) {
+      dout(10) << "pinning ino " << in->ino() << " until safe" << endl;
+      in->get();
+      in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in));
+    }
+  } else {
+    // caching off.
+    if (in->num_open_rd == 0 && in->num_open_wr == 0) {
+      dout(10) << "  releasing caps on " << in->ino() << endl;
+      release_caps(in);              // release caps now.
+    }
+  }
+  
+  // drop the reference the file handle held on the inode
+  put_inode( in );
+  int result = 0;
+
+  client_lock.Unlock();
+  return result;
+}
+
+
+
+// ------------
+// read, write
+
+// blocking osd interface
+
+// Read up to `size` bytes at `offset` from the open file `fh` into `buf`.
+//
+// Blocks until the inode holds the read capability (or the lazy-io
+// capability for a handle opened in lazy mode).  While buffering/caching
+// caps are held the request is clipped to the known file size; otherwise
+// the bounds are left to the OSDs.  The handle's position is moved to the
+// end of the data actually read.
+//
+// Returns the value produced by the file cache or by the filer completion
+// (`rvalue`); 0 when the clipped request is empty.
+//
+// NOTE(review): the assert below rejects negative offsets, which makes the
+// "offset < 0 means current position" fallback after it unreachable in
+// assert-enabled builds -- confirm which of the two is intended.
+int Client::read(fh_t fh, char *buf, off_t size, off_t offset) 
+{
+  client_lock.Lock();
+
+  dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << ");   // that's " << offset << "~" << size << endl;
+  tout << "read" << endl;
+  tout << fh << endl;
+  tout << size << endl;
+  tout << offset << endl;
+
+  assert(offset >= 0);
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  if (offset < 0) 
+    offset = f->pos;
+
+  bool lazy = f->mode == FILE_MODE_LAZY;
+  
+  // do we have read file cap?
+  while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) {
+    dout(7) << " don't have read cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_read.push_back(&cond);
+    cond.Wait(client_lock);
+  }  
+  // lazy cap?
+  while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+     dout(7) << " don't have lazy cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_lazy.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+ 
+  // determine whether read range overlaps with file
+  // ...ONLY if we're doing async io
+  if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) {
+    // we're doing buffered i/o.  make sure we're inside the file.
+    // we can trust size info bc we get accurate info when buffering/caching caps are issued.
+    dout(10) << "file size: " << in->inode.size << endl;
+
+    // Do the clipping in off_t.  The former (unsigned) casts truncated the
+    // file size to the width of unsigned int, which breaks reads on files
+    // whose size does not fit in it.
+    const off_t file_size = in->inode.size;
+    if (offset > 0 && offset >= file_size) {
+      client_lock.Unlock();
+      return 0;
+    }
+    if (offset + size > file_size) size = file_size - offset;
+    
+    if (size == 0) {
+      dout(10) << "read is size=0, returning 0" << endl;
+      client_lock.Unlock();
+      return 0;
+    }
+  } else {
+    // unbuffered, synchronous file i/o.  
+    // or lazy.
+    // defer to OSDs for file bounds.
+  }
+  
+  bufferlist blist;   // data will go here
+  int rvalue = 0;
+  int r = 0;
+
+  if (g_conf.client_oc) {
+    // object cache ON
+    rvalue = r = in->fc.read(offset, size, blist, client_lock);  // may block.
+  } else {
+    // object cache OFF -- legacy inconsistent way.
+    Cond cond;
+    bool done = false;
+    C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue);
+
+    r = filer->read(in->inode, offset, size, &blist, onfinish);
+
+    assert(r >= 0);
+
+    // wait!
+    while (!done)
+      cond.Wait(client_lock);
+  }
+
+  // adjust fd pos
+  f->pos = offset+blist.length();
+
+  // copy data into caller's char* buf
+  blist.copy(0, blist.length(), buf);
+
+  //dout(10) << "i read '" << blist.c_str() << "'" << endl;
+  dout(10) << "read rvalue " << rvalue << ", r " << r << endl;
+
+  // done!
+  client_lock.Unlock();
+  return rvalue;
+}
+
+
+
+/*
+ * hack -- 
+ *  until we properly implement synchronous writes wrt buffer cache,
+ *  make sure we delay shutdown until they're all safe on disk!
+ */
+// Completion context that forwards to Client::hack_sync_write_safe().
+// Client::write() passes one as the "onsafe" callback of each synchronous
+// write.
+class C_Client_HackUnsafe : public Context {
+  Client *owner;   // client to notify
+public:
+  C_Client_HackUnsafe(Client *c) : owner(c) {}
+  void finish(int) {
+    owner->hack_sync_write_safe();
+  }
+};
+
+// Account for one synchronous write having become safe.
+//
+// Takes client_lock, decrements unsafe_sync_write (which must be positive)
+// and, once the count reaches zero during an unmount, signals mount_cond.
+void Client::hack_sync_write_safe()
+{
+  client_lock.Lock();
+
+  assert(unsafe_sync_write > 0);
+  --unsafe_sync_write;
+
+  const bool drained = (unsafe_sync_write == 0);
+  if (drained && unmounting) {
+    dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << endl;
+    mount_cond.Signal();
+  }
+
+  client_lock.Unlock();
+}
+
+// Write `size` bytes from `buf` at `offset` to the open file `fh`.
+//
+// Blocks until the inode holds the write capability (or the lazy-io
+// capability for a handle opened in lazy mode).  The data is copied into a
+// fresh buffer and then either handed to the file cache (object cache on)
+// or written synchronously through the filer.  Afterwards the cached inode
+// size is extended if the write went past it and the mtime is updated.
+//
+// Returns `size`; success is assumed (see the FIXME below).
+int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) 
+{
+  client_lock.Lock();
+
+  //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl;
+  dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl;
+  tout << "write" << endl;
+  tout << fh << endl;
+  tout << size << endl;
+  tout << offset << endl;
+
+  assert(offset >= 0);
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  if (offset < 0) 
+    offset = f->pos;
+
+  const bool lazy = f->mode == FILE_MODE_LAZY;
+
+  dout(10) << "cur file size is " << in->inode.size << "    wr size " << in->file_wr_size << endl;
+
+  // wait for the capability this handle needs
+  if (lazy) {
+    while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+      dout(7) << " don't have lazy cap, waiting" << endl;
+      Cond cond;
+      in->waitfor_lazy.push_back(&cond);
+      cond.Wait(client_lock);
+    }
+  } else {
+    // do we have write file cap?
+    while ((in->file_caps() & CAP_FILE_WR) == 0) {
+      dout(7) << " don't have write cap, waiting" << endl;
+      Cond cond;
+      in->waitfor_write.push_back(&cond);
+      cond.Wait(client_lock);
+    }
+  }
+
+  // adjust fd pos
+  f->pos = offset+size;
+
+  // time it.
+  utime_t start = g_clock.now();
+
+  // copy into fresh buffer (since our write may be resub, async)
+  bufferptr bp = buffer::copy(buf, size);
+  bufferlist blist;
+  blist.push_back( bp );
+
+  if (!g_conf.client_oc) {
+    // legacy, inconsistent synchronous write.
+    dout(7) << "synchronous write" << endl;
+
+    // prepare write
+    Cond cond;
+    bool done = false;
+    C_Cond *onfinish = new C_Cond(&cond, &done);
+    C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this);
+    unsafe_sync_write++;
+    in->sync_writes++;
+
+    dout(20) << " sync write start " << onfinish << endl;
+
+    // (an extra trailing argument, 1+((int)g_clock.now()) / 10, was once
+    //  passed here as a hack to test osd revision snapshots)
+    filer->write(in->inode, offset, size, blist, 0, 
+                 onfinish, onsafe);
+
+    while (!done) {
+      cond.Wait(client_lock);
+      dout(20) << " sync write bump " << onfinish << endl;
+    }
+
+    // last synchronous write on this inode: run whoever waited for that
+    in->sync_writes--;
+    if (in->sync_writes == 0) {
+      for (list<Context*>::iterator waiter = in->waitfor_no_write.begin();
+           waiter != in->waitfor_no_write.end();
+           ++waiter)
+        (*waiter)->finish(0);
+      in->waitfor_no_write.clear();
+    }
+
+    dout(20) << " sync write done " << onfinish << endl;
+  } else {
+    // buffer cache ON
+    assert(objectcacher);
+
+    // write (this may block!)
+    in->fc.write(offset, size, blist, client_lock);
+  }
+
+  // time
+  utime_t lat = g_clock.now();
+  lat -= start;
+  if (client_logger) {
+    client_logger->finc("wrlsum",(double)lat);
+    client_logger->inc("wrlnum");
+  }
+
+  // assume success for now.  FIXME.
+  off_t totalwritten = size;
+
+  // extend file?
+  if (totalwritten + offset > in->inode.size) {
+    in->inode.size = in->file_wr_size = totalwritten + offset;
+    dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl;
+  } else {
+    dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << endl;
+  }
+
+  // mtime
+  in->file_wr_mtime = in->inode.mtime = g_clock.gettime();
+
+  // ok!
+  client_lock.Unlock();
+  return totalwritten;  
+}
+
+
+int Client::truncate(const char *file, off_t size) 
+{
+  client_lock.Lock();
+  dout(3) << "op: client->truncate(\"" << file << "\", " << size << ");" << endl;
+  tout << "truncate" << endl;
+  tout << file << endl;
+  tout << size << endl;
+  // Truncate by path: build an MDS_OP_TRUNCATE request carrying the new size,
+  // send it with make_request(), and return the result code from the reply.
+  MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, whoami);
+  req->set_path(file); 
+  req->set_sizearg( size );
+
+  // FIXME where does FUSE maintain user information
+  req->set_caller_uid(getuid());   // for now: uid/gid of this process
+  req->set_caller_gid(getgid());
+  
+  MClientReply *reply = make_request(req, true);   // second arg is auth_best
+  int res = reply->get_result();
+  insert_trace(reply);  
+  delete reply;
+
+  dout(10) << " truncate result is " << res << endl;
+
+  client_lock.Unlock();
+  return res;
+}
+
+
+int Client::fsync(fh_t fh, bool syncdataonly) 
+{
+  client_lock.Lock();
+  dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl;
+  tout << "fsync" << endl;
+  tout << fh << endl;
+  tout << syncdataonly << endl;
+  // Block until the object cacher reports this inode's data committed.
+  int r = 0;                 // nothing below sets an error; always returns 0
+
+  assert(fh_map.count(fh));  // fh must be an open handle
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl;
+
+  // metadata?
+  if (!syncdataonly) {
+    dout(0) << "fsync - not syncing metadata yet.. implement me" << endl;
+  }
+  // data?  if commit_set() reports the set already safe it does not keep the
+  // callback (cf. FileCache::add_safe_waiter), so free it instead of leaking.
+  Cond cond;
+  bool done = false;
+  C_Cond *onsafe = new C_Cond(&cond, &done);
+  if (objectcacher->commit_set(in->ino(), onsafe))
+    delete onsafe;
+  else
+    while (!done) cond.Wait(client_lock);  // wait for callback
+
+  client_lock.Unlock();
+  return r;
+}
+
+
+// not written yet, but i want to link!
+
+int Client::chdir(const char *path)
+{
+  // fake it for now: only the locally stored cwd string is updated here.
+  string abs;
+  mkabspath(path, abs);      // relative paths are resolved against the old cwd
+  dout(3) << "chdir " << path << " -> cwd now " << abs << endl;
+  cwd = abs;
+  return 0;
+}
+
+int Client::statfs(const char *path, struct statfs *stbuf) 
+{
+  assert(0);  // implement me -- stub; neither path nor stbuf is used yet
+  return 0;
+}
+
+
+
+int Client::lazyio_propogate(int fd, off_t offset, size_t count)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->lazyio_propogate(" << fd
+          << ", " << offset << ", " << count << ")" << endl;
+  // Flush this file's dirty cached data; offset/count are only logged here.
+  assert(fh_map.count(fd));
+  Fh *f = fh_map[fd];
+  Inode *in = f->inode;
+
+  if (f->mode & FILE_MODE_LAZY) {
+    // wait for lazy cap
+    while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+      dout(7) << " don't have lazy cap, waiting" << endl;
+      Cond cond;
+      in->waitfor_lazy.push_back(&cond);
+      cond.Wait(client_lock);
+    }
+
+    if (g_conf.client_oc) {
+      Cond cond;
+      bool done = false;
+      in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done));
+      // block until done is set -- presumably by the C_SafeCond callback
+      while (!done)
+        cond.Wait(client_lock);
+      
+    } else {
+      // mmm, nothin to do.
+    }
+  }
+
+  client_lock.Unlock();
+  return 0;
+}
+
+int Client::lazyio_synchronize(int fd, off_t offset, size_t count)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->lazyio_synchronize(" << fd
+          << ", " << offset << ", " << count << ")" << endl;
+  // Drop this file's cached data; offset/count are only logged here.
+  assert(fh_map.count(fd));
+  Fh *f = fh_map[fd];
+  Inode *in = f->inode;
+  
+  if (f->mode & FILE_MODE_LAZY) {
+    // wait for lazy cap
+    while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+      dout(7) << " don't have lazy cap, waiting" << endl;
+      Cond cond;
+      in->waitfor_lazy.push_back(&cond);
+      cond.Wait(client_lock);
+    }
+    
+    if (g_conf.client_oc) {
+      in->fc.flush_dirty(0);       // flush to invalidate (no completion callback).
+      in->fc.release_clean();      // then release whatever is clean
+    } else {
+      // mm, nothin to do.
+    }
+  }
+  
+  client_lock.Unlock();
+  return 0;
+}
+
+
+void Client::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+{
+  if (dest.is_mon()) {
+    // resend to a different monitor.
+    int mon = monmap->pick_mon(true);
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst 
+            << ", resending to mon" << mon 
+            << endl;
+    messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+  }
+  else if (dest.is_osd()) {
+    objecter->ms_handle_failure(m, dest, inst);   // osd traffic: hand off to the objecter
+  } 
+  else if (dest.is_mds()) {
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst << endl;
+    // help!  no recovery path for a failed mds send yet; abort.
+    assert(0);
+  }
+  else {
+    // client?  nothing to retry; free the undelivered message.
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst 
+            << ", dropping" << endl;
+    delete m;
+  }
+}
+
diff --git a/branches/sage/cephmds2/client/Client.h b/branches/sage/cephmds2/client/Client.h
new file mode 100644
index 0000000000000..626176f9f9f47
--- /dev/null
+++ b/branches/sage/cephmds2/client/Client.h
@@ -0,0 +1,588 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __CLIENT_H
+#define __CLIENT_H
+
+
+#include "mds/MDSMap.h"
+#include "osd/OSDMap.h"
+#include "mon/MonMap.h"
+
+#include "msg/Message.h"
+#include "msg/Dispatcher.h"
+#include "msg/Messenger.h"
+#include "msg/SerialMessenger.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MClientReply.h"
+
+//#include "msgthread.h"
+
+#include "include/types.h"
+#include "include/lru.h"
+#include "include/filepath.h"
+#include "include/interval_set.h"
+
+#include "common/Mutex.h"
+
+#include "FileCache.h"
+
+// stl
+#include <set>
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#define O_LAZY 01000000
+
+
+class Filer;
+class Objecter;
+class ObjectCacher;
+
+extern class LogType client_logtype;
+extern class Logger  *client_logger;
+
+
+
+// ============================================
+// types for my local metadata cache
+/* basic structure:
+   
+ - Dentries live in an LRU loop.  they get expired based on last access.
+      see include/lru.h.  items can be bumped to "mid" or "top" of list, etc.
+ - Inode has ref count for each Fh, Dir, or Dentry that points to it.
+ - when Inode ref goes to 0, it's expired.
+ - when Dir is empty, it's removed (and it's Inode ref--)
+ 
+*/
+
+typedef int fh_t;
+
+class Dir;
+class Inode;
+
+class Dentry : public LRUObject {
+ public:
+  string  name;                      // sort of lame
+  //const char *name;
+  Dir     *dir;                      // directory this dentry is linked into
+  Inode   *inode;                    // inode this name refers to
+  int     ref;                       // 1 if there's a dir beneath me.
+  // ref is only ever 0 or 1: get() pins us in the LRU, put() unpins.
+  void get() { assert(ref == 0); ref++; lru_pin(); }
+  void put() { assert(ref == 1); ref--; lru_unpin(); }
+  
+  Dentry() : dir(0), inode(0), ref(0) { }
+
+  /*Dentry() : name(0), dir(0), inode(0), ref(0) { }
+  Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { 
+    name = new char[n.length()+1];
+    strcpy((char*)name, n.c_str());
+  }
+  ~Dentry() {
+    delete[] name;
+    }*/
+};
+
+class Dir {
+ public:
+  Inode    *parent_inode;  // my inode
+  //hash_map<const char*, Dentry*, hash<const char*>, eqstr> dentries;
+  hash_map<string, Dentry*> dentries;   // name -> dentry
+
+  Dir(Inode* in) { parent_inode = in; }
+
+  bool is_empty() {  return dentries.empty(); }   // true when no dentries are linked
+};
+
+
+class InodeCap {
+ public:
+  int  caps;   // capability bits; OR'd together across mds's by Inode::file_caps()
+  long seq;    // presumably the sequence number of the latest cap message -- TODO confirm
+  InodeCap() : caps(0), seq(0) {}
+};
+
+
+class Inode {
+ public:
+  inode_t   inode;    // the actual inode
+  time_t    valid_until;
+
+  // about the dir (if this is one!)
+  int       dir_auth;        // -1 when unknown (see authority())
+  set<int>    dir_contacts;  // candidate mds's used by pick_replica()
+  bool      dir_hashed, dir_replicated;
+
+  // per-mds caps
+  map<int,InodeCap> caps;            // mds -> InodeCap
+  map<int,InodeCap> stale_caps;      // mds -> cap .. stale
+
+  time_t    file_wr_mtime;   // [writers] time of last write
+  off_t     file_wr_size;    // [writers] largest offset we've written to
+  int       num_open_rd, num_open_wr, num_open_lazy;  // open counts: readers, writers, lazy
+
+  int       ref;      // ref count. 1 for each dentry, fh, or open Dir that links to me.
+  Dir       *dir;     // if i'm a dir.
+  Dentry    *dn;      // if i'm linked to a dentry.
+  string    *symlink; // symlink content, if it's a symlink (owned; freed in dtor)
+
+  // for caching i/o mode
+  FileCache fc;
+
+  // for sync i/o mode
+  int       sync_reads;   // sync reads in progress
+  int       sync_writes;  // sync writes in progress
+
+  list<Cond*>       waitfor_write;
+  list<Cond*>       waitfor_read;
+  list<Cond*>       waitfor_lazy;     // waiters for CAP_FILE_LAZYIO
+  list<Context*>    waitfor_no_read, waitfor_no_write;
+
+  void get() { 
+    ref++; 
+    //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl;
+  }
+  void put() { 
+    ref--; assert(ref >= 0); 
+    //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl;
+  }
+
+  Inode(inode_t _inode, ObjectCacher *_oc) : 
+    inode(_inode),
+    valid_until(0),
+    dir_auth(-1), dir_hashed(false), dir_replicated(false), 
+    file_wr_mtime(0), file_wr_size(0), 
+    num_open_rd(0), num_open_wr(0), num_open_lazy(0),
+    ref(0), dir(0), dn(0), symlink(0),
+    fc(_oc, _inode),
+    sync_reads(0), sync_writes(0)
+  { }
+  ~Inode() {
+    if (symlink) { delete symlink; symlink = 0; }
+  }
+
+  inodeno_t ino() { return inode.ino; }
+
+  bool is_dir() {
+    return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR;
+  }
+
+  // union of the cap bits held from every mds, stale entries included.
+  int file_caps() {
+    int c = 0;
+    for (map<int,InodeCap>::iterator it = caps.begin();
+         it != caps.end();
+         it++)
+      c |= it->second.caps;
+    for (map<int,InodeCap>::iterator it = stale_caps.begin();
+         it != stale_caps.end();
+         it++)
+      c |= it->second.caps;
+    return c;
+  }
+
+  // cap bits we want, derived from the open counts above.
+  int file_caps_wanted() {
+    int w = 0;
+    if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE;
+    if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER;
+    if (num_open_lazy) w |= CAP_FILE_LAZYIO;
+    return w;
+  }
+
+  // walk up through parent dirs until an inode with dir_auth >= 0 is found.
+  int authority(MDSMap *mdsmap) {
+    //cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl;
+    // parent?
+    if (dn && dn->dir && dn->dir->parent_inode) {
+      // parent hashed?
+      if (dn->dir->parent_inode->dir_hashed) {
+        // hashed
+	assert(0); 
+	// fixme
+        //return mdcluster->hash_dentry( dn->dir->parent_inode->ino(),
+	//dn->name );
+      }
+
+      if (dir_auth >= 0)
+        return dir_auth;
+      else
+        return dn->dir->parent_inode->authority(mdsmap);
+    }
+
+    if (dir_auth >= 0)
+      return dir_auth;
+
+    assert(0);    // !!!  no parent and no dir_auth: cannot tell
+    return 0;
+  }
+  int dentry_authority(const char *dn,
+                       MDSMap *mdsmap) {
+    assert(0);    // not implemented
+    return 0;
+    //return ->hash_dentry( ino(),
+    //dn );
+  }
+  int pick_replica(MDSMap *mdsmap) {
+    // replicas?  (skipped for ino 1, which is handled below)
+    if (ino() > 1ULL && dir_contacts.size()) {
+      //cout << "dir_contacts if " << dir_contacts << endl;
+      set<int>::iterator it = dir_contacts.begin();
+      if (dir_contacts.size() == 1)
+        return *it;
+      else {
+        int r = rand() % dir_contacts.size();   // random contact
+        while (r--) it++;
+        return *it;
+      }
+    }
+
+    if (dir_replicated || ino() == 1) {
+      //cout << "num_mds is " << mdcluster->get_num_mds() << endl;
+      return rand() % mdsmap->get_num_mds();  // huh.. pick a random mds!
+    }
+    else
+      return authority(mdsmap);
+  }
+
+
+  // open Dir for an inode.  if it's not open, allocate it (and pin dentry in memory).
+  Dir *open_dir() {
+    if (!dir) {
+      if (dn) dn->get();      // pin dentry
+      get();                  // the Dir holds a ref on its inode
+      dir = new Dir(this);
+    }
+    return dir;
+  }
+
+};
+
+
+
+
+// file handle for any open file state
+
+struct Fh {
+  Inode    *inode;
+  off_t     pos;        // current file position
+  int       mds;        // have to talk to mds we opened with (for now)
+  int       mode;       // the mode i opened the file with
+  // NOTE(review): lazyio_* test mode against FILE_MODE_LAZY, not O_LAZY -- confirm encoding.
+  bool is_lazy() { return mode & O_LAZY; }
+
+  Fh() : inode(0), pos(0), mds(0), mode(0) {}
+};
+
+
+
+
+
+// ========================================================
+// client interface
+
+class Client : public Dispatcher {
+ public:
+  
+  /* getdir result */
+  struct DirResult {
+    string path;
+    map<string,inode_t> contents;
+    map<string,inode_t>::iterator p;
+    int off;
+    int size;
+    struct dirent_plus dp;
+    struct dirent_lite dl;
+    DirResult() : p(contents.end()), off(-1), size(0) {}
+  };
+
+
+ protected:
+  Messenger *messenger;  
+  int whoami;
+  MonMap *monmap;
+  
+  // mds fake RPC
+  tid_t last_tid;
+  map<tid_t, Cond*>                mds_rpc_cond;
+  map<tid_t, class MClientReply*>  mds_rpc_reply;
+  map<tid_t, Cond*>                mds_rpc_dispatch_cond;
+
+  // cluster descriptors
+  MDSMap *mdsmap; 
+  OSDMap *osdmap;
+
+  bool   mounted;
+  bool   unmounting;
+  Cond   mount_cond;  
+
+  int    unsafe_sync_write;
+public:
+  msg_addr_t get_myaddr() { return messenger->get_myaddr(); } 
+  void hack_sync_write_safe();
+
+protected:
+  Filer                 *filer;     
+  ObjectCacher          *objectcacher;
+  Objecter              *objecter;     // (non-blocking) osd interface
+  
+  // cache
+  hash_map<inodeno_t, Inode*> inode_map;
+  Inode*                 root;
+  LRU                    lru;    // lru list of Dentry's in our local metadata cache.
+
+  // cap weirdness
+  map<inodeno_t, map<int, class MClientFileCaps*> > cap_reap_queue;  // ino -> mds -> msg .. set of (would-be) stale caps to reap
+
+
+  // file handles, etc.
+  string                 cwd;
+  interval_set<fh_t>     free_fh_set;  // unused fh's
+  hash_map<fh_t, Fh*>    fh_map;
+  
+  fh_t get_fh() {
+    fh_t fh = free_fh_set.start();
+    free_fh_set.erase(fh, 1);
+    return fh;
+  }
+  void put_fh(fh_t fh) {
+    free_fh_set.insert(fh, 1);
+  }
+
+  void mkabspath(const char *rel, string& abs) {
+    if (rel[0] == '/') {
+      abs = rel;
+    } else {
+      abs = cwd;
+      abs += "/";
+      abs += rel;
+    }
+  }
+
+
+  // global client lock
+  //  - protects Client and buffer cache both!
+  Mutex                  client_lock;
+
+
+  // -- metadata cache stuff
+
+  // decrease inode ref.  delete if dangling.
+  void put_inode(Inode *in) {
+    in->put();
+    if (in->ref == 0) {
+      inode_map.erase(in->inode.ino);
+      if (in == root) root = 0;
+      delete in;
+    }
+  }
+
+  void close_dir(Dir *dir) {
+    assert(dir->is_empty());
+    
+    Inode *in = dir->parent_inode;
+    if (in->dn) in->dn->put();   // unpin dentry
+    
+    delete in->dir;
+    in->dir = 0;
+    put_inode(in);
+  }
+
+  int get_cache_size() { return lru.lru_get_size(); }
+  void set_cache_size(int m) { lru.lru_set_max(m); }
+
+  Dentry* link(Dir *dir, const string& name, Inode *in) {
+    Dentry *dn = new Dentry;
+    dn->name = name;
+    
+    // link to dir
+    dn->dir = dir;
+    dir->dentries[dn->name] = dn;
+
+    // link to inode
+    dn->inode = in;
+    in->dn = dn;
+    in->get();
+
+    lru.lru_insert_mid(dn);    // mid or top?
+    return dn;
+  }
+
+  void unlink(Dentry *dn) {
+    Inode *in = dn->inode;
+
+    // unlink from inode
+    dn->inode = 0;
+    in->dn = 0;
+    put_inode(in);
+    
+    // unlink from dir
+    dn->dir->dentries.erase(dn->name);
+    if (dn->dir->is_empty()) 
+      close_dir(dn->dir);
+    dn->dir = 0;
+
+    // delete den
+    lru.lru_remove(dn);
+    delete dn;
+  }
+
+  Dentry *relink(Dentry *dn, Dir *dir, const string& name) {
+    // first link new dn to dir
+    /*
+    char *oldname = (char*)dn->name;
+    dn->name = new char[name.length()+1];
+    strcpy((char*)dn->name, name.c_str());
+    dir->dentries[dn->name] = dn;
+    */
+    dir->dentries[name] = dn;
+    // unlink from old dir -- unless old and new slot are the same, in which
+    // case erasing would drop the entry we just (re)inserted above.
+    if (dn->dir != dir || dn->name != name)
+      dn->dir->dentries.erase(dn->name);
+    if (dn->dir->is_empty()) 
+      close_dir(dn->dir);
+
+    // fix up dn
+    dn->name = name;
+    dn->dir = dir;
+
+    return dn;
+  }
+
+  // move dentry to top of lru
+  void touch_dn(Dentry *dn) { lru.lru_touch(dn); }  
+
+  // trim cache.
+  void trim_cache();
+  void dump_inode(Inode *in, set<Inode*>& did);
+  void dump_cache();  // debug
+  
+  // find dentry based on filepath
+  Dentry *lookup(filepath& path);
+
+  // make blocking mds request
+  MClientReply *make_request(MClientRequest *req, bool auth_best=false, int use_auth=-1);
+  MClientReply* sendrecv(MClientRequest *req, int mds);
+  void handle_client_reply(MClientReply *reply);
+
+  void fill_stat(inode_t& inode, struct stat *st);
+  void fill_statlite(inode_t& inode, struct statlite *st);
+
+
+  // friends
+  friend class SyntheticClient;
+
+ public:
+  Client(Messenger *m, MonMap *mm);
+  ~Client();
+  void tear_down_cache();   
+
+  int get_nodeid() { return whoami; }
+
+  void init();
+  void shutdown();
+
+  // messaging
+  void dispatch(Message *m);
+
+  void handle_mount_ack(class MClientMountAck*);
+  void handle_unmount_ack(Message*);
+  void handle_mds_map(class MMDSMap *m);
+
+  // file caps
+  void handle_file_caps(class MClientFileCaps *m);
+  void implemented_caps(class MClientFileCaps *m, Inode *in);
+  void release_caps(Inode *in, int retain=0);
+  void update_caps_wanted(Inode *in);
+
+  void close_release(Inode *in);
+  void close_safe(Inode *in);
+
+  // metadata cache
+  Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn);
+  void update_inode_dist(Inode *in, InodeStat *st);
+  Inode* insert_trace(MClientReply *reply);
+
+  // ----------------------
+  // fs ops.
+  int mount(int mkfs=0);
+  int unmount();
+
+  // these should (more or less) mirror the actual system calls.
+  int statfs(const char *path, struct statfs *stbuf);
+
+  // crap
+  int chdir(const char *s);
+
+  // namespace ops
+  int getdir(const char *path, list<string>& contents);
+  int getdir(const char *path, map<string,inode_t>& contents);
+
+  DIR *opendir(const char *name);
+  int closedir(DIR *dir);
+  struct dirent *readdir(DIR *dir); 
+  void rewinddir(DIR *dir); 
+  off_t telldir(DIR *dir);
+  void seekdir(DIR *dir, off_t offset);
+
+  struct dirent_plus *readdirplus(DIR *dirp);
+  int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result);
+  struct dirent_lite *readdirlite(DIR *dirp);
+  int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result);
+ 
+
+  int link(const char *existing, const char *newname);
+  int unlink(const char *path);
+  int rename(const char *from, const char *to);
+
+  // dirs
+  int mkdir(const char *path, mode_t mode);
+  int rmdir(const char *path);
+
+  // symlinks
+  int readlink(const char *path, char *buf, off_t size);
+  int symlink(const char *existing, const char *newname);
+
+  // inode stuff
+  int _lstat(const char *path, int mask, Inode **in);
+  int lstat(const char *path, struct stat *stbuf);
+  int lstatlite(const char *path, struct statlite *buf);
+
+  int chmod(const char *path, mode_t mode);
+  int chown(const char *path, uid_t uid, gid_t gid);
+  int utime(const char *path, struct utimbuf *buf);
+  
+  // file ops
+  int mknod(const char *path, mode_t mode);
+  int open(const char *path, int mode);
+  int close(fh_t fh);
+  int read(fh_t fh, char *buf, off_t size, off_t offset=-1);
+  int write(fh_t fh, const char *buf, off_t size, off_t offset=-1);
+  int truncate(const char *file, off_t size);
+    //int truncate(fh_t fh, long long size);
+  int fsync(fh_t fh, bool syncdataonly);
+
+  // hpc lazyio
+  int lazyio_propogate(int fd, off_t offset, size_t count);
+  int lazyio_synchronize(int fd, off_t offset, size_t count);
+
+  int describe_layout(char *fn, list<ObjectExtent>& result);
+
+  void ms_handle_failure(Message*, msg_addr_t dest, const entity_inst_t& inst);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/client/FileCache.cc b/branches/sage/cephmds2/client/FileCache.cc
new file mode 100644
index 0000000000000..36b28dc600391
--- /dev/null
+++ b/branches/sage/cephmds2/client/FileCache.cc
@@ -0,0 +1,171 @@
+
+#include "config.h"
+#include "include/types.h"
+
+#include "FileCache.h"
+#include "osdc/ObjectCacher.h"
+
+#include "msg/Messenger.h"
+
+#undef dout
+#define dout(x)  if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache "
+#define derr(x)  if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache "
+
+
+// flush/release/clean
+
+void FileCache::flush_dirty(Context *onflush)   // onflush may be 0 (the default)
+{
+  if (oc->flush_set(inode.ino, onflush) && onflush) {   // true: callback not queued
+    onflush->finish(0);
+    delete onflush;
+  }
+}
+
+off_t FileCache::release_clean()
+{
+  return oc->release_set(inode.ino);   // thin wrapper: result comes straight from the cacher
+}
+
+bool FileCache::is_cached()
+{
+  return oc->set_is_cached(inode.ino);   // asks the object cacher about this inode's set
+}
+
+bool FileCache::is_dirty() 
+{
+  return oc->set_is_dirty_or_committing(inode.ino);   // "dirty" here includes committing
+}
+
+void FileCache::empty(Context *onempty)
+{
+  off_t unclean = release_clean();
+  bool clean = oc->flush_set(inode.ino, onempty);
+  assert(!unclean == clean);
+  // already clean: complete the callback now, if one was given (default is 0).
+  if (clean && onempty) {
+    onempty->finish(0);
+    delete onempty;
+  }
+}
+
+
+// caps
+
+void FileCache::set_caps(int caps, Context *onimplement) 
+{
+  if (onimplement) {
+    assert(latest_caps & ~caps);  // we should be losing caps.
+    caps_callbacks[caps].push_back(onimplement);   // queued under the new cap set
+  }
+  
+  latest_caps = caps;
+  check_caps();                   // may run queued callbacks right away
+}
+
+
+void FileCache::check_caps()
+{
+  int used = 0;
+  if (num_reading) used |= CAP_FILE_RD;
+  if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE;
+  if (num_writing) used |= CAP_FILE_WR;
+  if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER;
+  dout(10) << "check_caps used " << cap_string(used) << endl;
+  // A cap set (the map key) is "implemented" once nothing we still use lies
+  // outside it; run and drop the callbacks queued under every such key.
+  map<int, list<Context*> >::iterator p = caps_callbacks.begin();
+  while (p != caps_callbacks.end()) {
+    if ((used & ~(p->first)) == 0) {
+      // implemented.  (advance p before erasing the finished entry)
+      dout(10) << "used is " << cap_string(used) 
+               << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << endl;
+      finish_contexts(p->second);
+      map<int, list<Context*> >::iterator o = p;
+      p++;
+      caps_callbacks.erase(o);
+    } else {
+      dout(10) << "used is " << cap_string(used) 
+               << ", caps " << cap_string(p->first) << " not yet implemented" << endl;
+      p++;
+    }
+  }
+}
+
+
+
+// read/write
+
+int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock)
+{
+  int r = 0;
+
+  // inc reading counter (check_caps() counts this as CAP_FILE_RD in use)
+  num_reading++;
+  
+  if (latest_caps & CAP_FILE_RDCACHE) {
+    // read (and block)
+    Cond cond;
+    bool done = false;
+    int rvalue = 0;
+    C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue);
+    
+    r = oc->file_read(inode, offset, size, &blist, onfinish);
+    
+    if (r == 0) {
+      // block; the result is then taken from rvalue
+      while (!done) 
+        cond.Wait(client_lock);
+      r = rvalue;
+    } else {
+      // it was cached.  the callback was not used, so free it.
+      delete onfinish;
+    }
+  } else {
+    r = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock);
+  }
+
+  // dec reading counter
+  num_reading--;
+  // last reader gone: cap callbacks may now be satisfiable
+  if (num_reading == 0 && !caps_callbacks.empty()) 
+    check_caps();
+
+  return r;
+}
+
+void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock)
+{
+  // inc writing counter (check_caps() counts this as CAP_FILE_WR in use)
+  num_writing++;
+
+  if (latest_caps & CAP_FILE_WRBUFFER) {   // caps buffered write?
+    // wait? (this may block!)
+    oc->wait_for_write(size, client_lock);
+
+    // async, caching, non-blocking.
+    oc->file_write(inode, offset, size, blist);
+  } else {
+    // atomic, synchronous, blocking.
+    oc->file_atomic_sync_write(inode, offset, size, blist, client_lock);
+  }    
+    
+  // dec writing counter; last writer gone means cap callbacks may be satisfiable
+  num_writing--;
+  if (num_writing == 0 && !caps_callbacks.empty())
+    check_caps();
+}
+
+bool FileCache::all_safe()
+{
+  return !oc->set_is_dirty_or_committing(inode.ino);   // i.e. the negation of is_dirty()
+}
+
+void FileCache::add_safe_waiter(Context *c) 
+{
+  bool safe = oc->commit_set(inode.ino, c);
+  if (safe) {          // already safe: we still own c, so complete and free it here
+    c->finish(0);
+    delete c;
+  }
+}
diff --git a/branches/sage/cephmds2/client/FileCache.h b/branches/sage/cephmds2/client/FileCache.h
new file mode 100644
index 0000000000000..742ec98733d9b
--- /dev/null
+++ b/branches/sage/cephmds2/client/FileCache.h
@@ -0,0 +1,65 @@
+#ifndef __FILECACHE_H
+#define __FILECACHE_H
+
+#include <iostream>
+using namespace std;
+
+#include "common/Cond.h"
+#include "mds/Capability.h"
+
+class ObjectCacher;
+
+class FileCache {
+  ObjectCacher *oc;
+  inode_t inode;      // by-value copy taken at construction (not a reference to Inode::inode)
+  
+  // caps
+  int latest_caps;                            // last value passed to set_caps()
+  map<int, list<Context*> > caps_callbacks;   // cap set -> callbacks to run once implemented
+
+  int num_reading;    // read() calls in progress
+  int num_writing;    // write() calls in progress
+  //int num_unsafe;
+
+  // waiters
+  list<Cond*> waitfor_read;
+  list<Cond*> waitfor_write;
+  //list<Context*> waitfor_safe;
+  bool waitfor_release;
+
+ public:
+  FileCache(ObjectCacher *_oc, inode_t _inode) : 
+    oc(_oc), 
+    inode(_inode),
+    latest_caps(0),
+    num_reading(0), num_writing(0),// num_unsafe(0),
+    waitfor_release(false) {}
+
+  // waiters/waiting
+  bool can_read() { return latest_caps & CAP_FILE_RD; }
+  bool can_write() { return latest_caps & CAP_FILE_WR; }
+  bool all_safe();// { return num_unsafe == 0; }
+
+  void add_read_waiter(Cond *c) { waitfor_read.push_back(c); }
+  void add_write_waiter(Cond *c) { waitfor_write.push_back(c); }
+  void add_safe_waiter(Context *c);// { waitfor_safe.push_back(c); }
+
+  // flush / release / query cache state for this inode
+  void flush_dirty(Context *onflush=0);
+  off_t release_clean();
+  void empty(Context *onempty=0);
+  bool is_empty() { return !(is_cached() || is_dirty()); }
+  bool is_cached();
+  bool is_dirty();  
+
+  int get_caps() { return latest_caps; }
+  void set_caps(int caps, Context *onimplement=0);
+  void check_caps();
+  
+  int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock);  // may block.
+  void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock);  // may block.
+
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/client/SyntheticClient.cc b/branches/sage/cephmds2/client/SyntheticClient.cc
new file mode 100644
index 0000000000000..b0569d52e553e
--- /dev/null
+++ b/branches/sage/cephmds2/client/SyntheticClient.cc
@@ -0,0 +1,1226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <iostream>
+using namespace std;
+
+
+
+#include "SyntheticClient.h"
+
+#include "include/filepath.h"
+#include "mds/MDS.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <utime.h>
+#include <math.h>
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "synthetic" << client->get_nodeid() << " "
+
+// traces
+//void trace_include(SyntheticClient *syn, Client *cl, string& prefix);
+//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix);
+
+
+list<int> syn_modes;
+list<int> syn_iargs;
+list<string> syn_sargs;
+
+void parse_syn_options(vector<char*>& args)
+{
+  // Removes every "--syn <mode> [arguments]" group from args and queues it on the
+  // file-level syn_modes / syn_iargs / syn_sargs lists.  All other arguments are kept,
+  // in their original order, and written back to args on return.
+  //
+  // The argument count of a mode is checked before anything is read: the previous
+  // chain of unchecked args[++i] accesses indexed past the end of the vector on a
+  // truncated command line such as a trailing "--syn" or "--syn writefile 10".
+
+  // Modes that take nothing but a fixed number of integer arguments
+  // (name on the command line, SYNCLIENT_MODE_* value, argument count).
+  static const struct {
+    const char *name;
+    int mode;
+    unsigned num_iargs;
+  } simple_modes[] = {
+    // file i/o
+    { "writefile",    SYNCLIENT_MODE_WRITEFILE,    2 },
+    { "wrshared",     SYNCLIENT_MODE_WRSHARED,     2 },
+    { "writebatch",   SYNCLIENT_MODE_WRITEBATCH,   3 },
+    { "readfile",     SYNCLIENT_MODE_READFILE,     2 },
+    // directory trees
+    { "makedirs",     SYNCLIENT_MODE_MAKEDIRS,     3 },
+    { "statdirs",     SYNCLIENT_MODE_STATDIRS,     3 },
+    { "readdirs",     SYNCLIENT_MODE_READDIRS,     3 },
+    // many files
+    { "makefiles",    SYNCLIENT_MODE_MAKEFILES,    3 },
+    { "createshared", SYNCLIENT_MODE_CREATESHARED, 1 },
+    { "openshared",   SYNCLIENT_MODE_OPENSHARED,   2 },
+    // walks
+    { "fullwalk",     SYNCLIENT_MODE_FULLWALK,     0 },
+    { "randomwalk",   SYNCLIENT_MODE_RANDOMWALK,   1 },
+    // scheduling
+    { "until",        SYNCLIENT_MODE_UNTIL,        1 },
+    { "sleepuntil",   SYNCLIENT_MODE_SLEEPUNTIL,   1 },
+    { "only",         SYNCLIENT_MODE_ONLY,         1 },
+    { "sleep",        SYNCLIENT_MODE_SLEEP,        1 },
+    { "randomsleep",  SYNCLIENT_MODE_RANDOMSLEEP,  1 },
+    // small tests
+    { "opentest",     SYNCLIENT_MODE_OPENTEST,     1 },
+    { "optest",       SYNCLIENT_MODE_OPTEST,       1 },
+  };
+  const unsigned num_simple = sizeof(simple_modes) / sizeof(simple_modes[0]);
+
+  // arguments that are not part of a --syn group
+  vector<char*> nargs;
+
+  for (unsigned i=0; i<args.size(); i++) {
+    if (strcmp(args[i],"--syn") != 0) {
+      nargs.push_back(args[i]);
+      continue;
+    }
+
+    if (++i >= args.size()) {
+      cerr << "--syn requires a mode" << endl;
+      assert(0);
+      break;                      // only reached when assert() is compiled out
+    }
+    const char *name = args[i];
+
+    // identify the mode and the number of arguments that must follow it
+    const bool is_rw = (strcmp(name,"rw") == 0);
+    const bool is_trace = (strcmp(name,"trace") == 0);
+    int mode = -1;
+    unsigned want = 2;            // "rw" <a> <b> and "trace" <file> <count> both take two
+    if (!is_rw && !is_trace) {
+      unsigned k = 0;
+      while (k < num_simple && strcmp(name, simple_modes[k].name) != 0)
+        k++;
+      if (k == num_simple) {
+        // same diagnostic as before; skip the word if assert() is compiled out
+        cerr << "unknown syn arg " << args[i] << endl;
+        assert(0);
+        continue;
+      }
+      mode = simple_modes[k].mode;
+      want = simple_modes[k].num_iargs;
+    }
+
+    if (args.size() - 1 - i < want) {
+      cerr << "syn mode " << name << " needs " << want << " argument(s)" << endl;
+      assert(0);
+      break;                      // only reached when assert() is compiled out
+    }
+
+    if (is_rw) {
+      // shorthand: a writefile followed by a readfile with the same two arguments
+      int a = atoi(args[++i]);
+      int b = atoi(args[++i]);
+      syn_modes.push_back( SYNCLIENT_MODE_WRITEFILE );
+      syn_iargs.push_back( a );
+      syn_iargs.push_back( b );
+      syn_modes.push_back( SYNCLIENT_MODE_READFILE );
+      syn_iargs.push_back( a );
+      syn_iargs.push_back( b );
+    } else if (is_trace) {
+      // the only mode with a string argument
+      syn_modes.push_back( SYNCLIENT_MODE_TRACE );
+      syn_sargs.push_back( args[++i] );
+      syn_iargs.push_back( atoi(args[++i]) );
+    } else {
+      syn_modes.push_back( mode );
+      for (unsigned n=0; n<want; n++)
+        syn_iargs.push_back( atoi(args[++i]) );
+    }
+  }
+
+  // hand the remaining arguments back to the caller
+  args = nargs;
+}
+
+
+SyntheticClient::SyntheticClient(Client *client) 
+{
+  // Each synthetic client works from its own copy of the workload that
+  // parse_syn_options() queued in the file-level syn_* lists.
+  modes = syn_modes;
+  iargs = syn_iargs;
+  sargs = syn_sargs;
+
+  this->client = client;   // the parameter shadows the member, hence this->
+  thread_id = 0;           // start_thread() asserts on this being unset
+  did_readdir = false;
+  run_only = -1;           // overwritten by the "only" mode in run()
+  // the "until"/"sleepuntil" modes in run() measure from this moment
+  run_start = g_clock.now();
+}
+
+
+
+
+#define DBL 2
+
+void *synthetic_client_thread_entry(void *ptr)
+{
+  // pthread start routine used by start_thread(): ptr is the SyntheticClient to run.
+  // The int result of run() is dropped; the thread always exits with a null value.
+  static_cast<SyntheticClient*>(ptr)->run();
+  return 0;
+}
+
+string SyntheticClient::get_sarg(int seq) 
+{
+  // Next queued string arg, or the generated name "syn.<whoami>.<seq>" if it is missing, empty or "~".
+  string a;
+  if (!sargs.empty()) {
+    a = sargs.front(); 
+    sargs.pop_front();
+  }
+  if (a.length() == 0 || a == "~") {
+    char s[32];  // "syn." + 2 ints (<= 11 chars each) + "." + NUL = 28 bytes; was char[20]
+    snprintf(s, sizeof(s), "syn.%d.%d", client->whoami, seq);
+    a = s;
+  } 
+  return a;
+}
+
+int SyntheticClient::run()
+{ 
+  // Runs the queued modes in order and returns 0; an unknown mode trips assert(0).
+  run_until = utime_t(0,0);
+  dout(5) << "run" << endl;
+  // Every case that tests run_me() pops its arguments first, so skipped modes still consume theirs.
+  for (list<int>::iterator it = modes.begin();
+       it != modes.end();
+       it++) {
+    int mode = *it;
+    dout(3) << "mode " << mode << endl;
+
+    switch (mode) {
+    case SYNCLIENT_MODE_RANDOMSLEEP:
+      {
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (run_me()) {
+          srand(time(0) + getpid() + client->whoami);
+          sleep(iarg1 > 0 ? rand() % iarg1 : 0);   // "randomsleep 0" used to divide by zero
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_SLEEP:
+      {
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "sleep " << iarg1 << endl;
+          sleep(iarg1);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_ONLY:
+      {
+        run_only = iargs.front();
+        iargs.pop_front();
+        if (run_only == client->get_nodeid())
+          dout(2) << "only " << run_only << endl;
+      }
+      break;
+
+    case SYNCLIENT_MODE_UNTIL:
+      {
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (iarg1) {
+          dout(2) << "until " << iarg1 << endl;
+          utime_t dur(iarg1,0);
+          run_until = run_start + dur;   // deadline is relative to construction time
+        } else {
+          dout(2) << "until " << iarg1 << " (no limit)" << endl;
+          run_until = utime_t(0,0);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_SLEEPUNTIL:
+      {
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (iarg1) {
+          dout(2) << "sleepuntil " << iarg1 << endl;
+          utime_t at = g_clock.now() - run_start;   // time elapsed since construction
+          if (at.sec() < iarg1) 
+            sleep(iarg1 - at.sec());
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_RANDOMWALK:
+      {
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "randomwalk " << iarg1 << endl;
+          random_walk(iarg1);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_MAKEDIRS:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front();  iargs.pop_front();
+        int iarg2 = iargs.front();  iargs.pop_front();
+        int iarg3 = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl;
+          make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_STATDIRS:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front();  iargs.pop_front();
+        int iarg2 = iargs.front();  iargs.pop_front();
+        int iarg3 = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl;
+          stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_READDIRS:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front();  iargs.pop_front();
+        int iarg2 = iargs.front();  iargs.pop_front();
+        int iarg3 = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl;
+          read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3);
+        }
+      }
+      break;
+
+
+    case SYNCLIENT_MODE_MAKEFILES:
+      {
+        int num = iargs.front();  iargs.pop_front();
+        int count = iargs.front();  iargs.pop_front();
+        int priv = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "makefiles " << num << " " << count << " " << priv << endl;
+          make_files(num, count, priv, false);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_MAKEFILES2:
+      {
+        int num = iargs.front();  iargs.pop_front();
+        int count = iargs.front();  iargs.pop_front();
+        int priv = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "makefiles2 " << num << " " << count << " " << priv << endl;
+          make_files(num, count, priv, true);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_CREATESHARED:
+      {
+        string sarg1 = get_sarg(0);   // result unused, but this still pops one queued string
+        int num = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "createshared " << num << endl;
+          create_shared(num);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_OPENSHARED:
+      {
+        string sarg1 = get_sarg(0);   // result unused, but this still pops one queued string
+        int num = iargs.front();  iargs.pop_front();
+        int count = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "openshared " << num << endl;
+          open_shared(num, count);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_FULLWALK:
+      {
+        string sarg1 = get_sarg(0);
+        if (run_me()) {
+          dout(2) << "fullwalk" << sarg1 << endl;
+          full_walk(sarg1);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_REPEATWALK:
+      {
+        string sarg1 = get_sarg(0);
+        if (run_me()) {
+          dout(2) << "repeatwalk " << sarg1 << endl;
+          while (full_walk(sarg1) == 0) ;   // until a walk fails or time_to_stop()
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_WRITEFILE:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front();  iargs.pop_front();
+        int iarg2 = iargs.front();  iargs.pop_front();
+        if (run_me())
+          write_file(sarg1, iarg1, iarg2);
+      }
+      break;
+    case SYNCLIENT_MODE_WRSHARED:
+      {
+        string sarg1 = "shared";      // fixed name: every client writes the same file
+        int iarg1 = iargs.front();  iargs.pop_front();
+        int iarg2 = iargs.front();  iargs.pop_front();
+        if (run_me())
+          write_file(sarg1, iarg1, iarg2);
+      }
+      break;
+    case SYNCLIENT_MODE_WRITEBATCH:
+      {
+        int iarg1 = iargs.front(); iargs.pop_front();
+        int iarg2 = iargs.front(); iargs.pop_front();
+        int iarg3 = iargs.front(); iargs.pop_front();
+
+        if (run_me())
+          write_batch(iarg1, iarg2, iarg3);
+      }
+      break;
+
+    case SYNCLIENT_MODE_READFILE:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front();  iargs.pop_front();
+        int iarg2 = iargs.front();  iargs.pop_front();
+        if (run_me())
+          read_file(sarg1, iarg1, iarg2);
+      }
+      break;
+
+    case SYNCLIENT_MODE_TRACE:
+      {
+        string tfile = get_sarg(0);
+        sargs.push_front(string("~"));   // makes the get_sarg() below generate the prefix name
+        int iarg1 = iargs.front();  iargs.pop_front();
+        string prefix = get_sarg(0);
+
+        if (run_me()) {
+          dout(2) << "trace " << tfile << " prefix " << prefix << " ... " << iarg1 << " times" << endl;
+          
+          Trace t(tfile.c_str());
+          
+          client->mkdir(prefix.c_str(), 0755);
+          
+          for (int i=0; i<iarg1; i++) {
+            utime_t start = g_clock.now();
+            
+            if (time_to_stop()) break;
+            play_trace(t, prefix);
+            if (time_to_stop()) break;
+            clean_dir(prefix);
+            
+            utime_t lat = g_clock.now();
+            lat -= start;
+            
+            dout(1) << " trace " << tfile << " loop " << (i+1) << "/" << iarg1 << " done in " << (double)lat << " seconds" << endl;
+            if (client_logger 
+                && i > 0
+                && i < iarg1-1
+                ) {   // the first and last iteration are left out of the trsum/trnum stats
+              client_logger->finc("trsum", (double)lat);
+              client_logger->inc("trnum");
+            }
+          }
+        }
+      }
+      break;
+
+
+    case SYNCLIENT_MODE_OPENTEST:
+      {
+        int count = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          for (int i=0; i<count; i++) {
+            int fd = client->open("test", rand()%2 ? (O_WRONLY|O_CREAT):O_RDONLY);
+            if (fd > 0) client->close(fd);
+          }
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_OPTEST:
+      {
+        int count = iargs.front();  iargs.pop_front();
+        if (run_me()) {
+          client->mknod("test",0777);
+          struct stat st;
+          for (int i=0; i<count; i++) {
+            client->lstat("test", &st);
+            client->chmod("test", 0777);
+          }
+        }
+      }
+      break;
+      
+    default:
+      assert(0);
+    }
+  }
+  return 0;
+}
+
+
+int SyntheticClient::start_thread()
+{
+  assert(!thread_id);   // at most one worker thread per SyntheticClient
+  // thread_id is only defined when pthread_create() succeeds, so check its result instead
+  int r = pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this);
+  assert(r == 0);
+  return r;             // 0 on success, as before
+}
+
+int SyntheticClient::join_thread()
+{
+  // Waits for the thread recorded in thread_id; its exit value is not needed, so none is requested.
+  assert(thread_id);
+  pthread_join(thread_id, NULL);
+  return 0;
+}
+
+
+bool roll_die(float p) 
+{
+  // True with probability of roughly p: r is drawn from
+  // {0, 0.00001, ..., 0.99999} via rand() and compared with p,
+  // so p <= 0 never succeeds and p >= 1 always does.
+  float r = (float)(rand() % 100000) / 100000.0;
+  return r < p;
+}
+
+void SyntheticClient::init_op_dist()
+{  // Rebuilds op_dist: one weight per metadata op taken from g_conf.fakeclient_op_*, then normalize().
+  op_dist.clear();
+  op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat );
+  op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime );
+  op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod );
+  op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown );
+
+  op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir );
+  op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod );
+  op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link );
+  op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink );
+  op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename );
+
+  op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir );
+  op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir );
+  op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink );
+
+  op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd );  // the open weight comes from the "openrd" setting
+  //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read );
+  //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write );
+  op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate );
+  op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync );
+  op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close );  // actually, close()
+  op_dist.normalize();
+}
+
+void SyntheticClient::up()
+{  // "cd ..": shortens cwd by one path component, logs it, then calls clear_dir()
+  cwd = cwd.prefixpath(cwd.depth()-1);  // NOTE(review): random_walk() also calls up() when depth() may be 0; what prefixpath(-1) does is not visible here -- confirm
+  dout(DBL) << "cd .. -> " << cwd << endl;
+  clear_dir();
+}
+
+
+int SyntheticClient::play_trace(Trace& t, string& prefix)
+{  // Replays trace t against the client from its start; stops early on time_to_stop(). Always returns 0.
+  dout(4) << "play trace" << endl;
+  t.start();
+
+  utime_t start = g_clock.now();  // NOTE(review): never read again in this function
+
+  const char *p = prefix.c_str();  // handed to t.get_string(p) for every path argument
+
+  map<__int64_t, __int64_t> open_files;  // id recorded in the trace -> handle returned by client->open()
+
+  while (!t.end()) {
+    
+    if (time_to_stop()) break;
+    
+    // op
+    const char *op = t.get_string();
+    dout(4) << "trace op " << op << endl;
+    if (strcmp(op, "link") == 0) {
+      const char *a = t.get_string(p);
+      const char *b = t.get_string(p);
+      client->link(a,b);      
+    } else if (strcmp(op, "unlink") == 0) {
+      const char *a = t.get_string(p);
+      client->unlink(a);
+    } else if (strcmp(op, "rename") == 0) {
+      const char *a = t.get_string(p);
+      const char *b = t.get_string(p);
+      client->rename(a,b);      
+    } else if (strcmp(op, "mkdir") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      client->mkdir(a, b);
+    } else if (strcmp(op, "rmdir") == 0) {
+      const char *a = t.get_string(p);
+      client->rmdir(a);
+    } else if (strcmp(op, "symlink") == 0) {
+      const char *a = t.get_string(p);
+      const char *b = t.get_string(p);
+      client->symlink(a,b);      
+    } else if (strcmp(op, "readlink") == 0) {
+      const char *a = t.get_string(p);
+      char buf[100];
+      client->readlink(a, buf, 100);
+    } else if (strcmp(op, "lstat") == 0) {
+      struct stat st;
+      const char *a = t.get_string(p);
+      client->lstat(a, &st);
+    } else if (strcmp(op, "chmod") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      client->chmod(a, b);
+    } else if (strcmp(op, "chown") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      __int64_t c = t.get_int();
+      client->chown(a, b, c);
+    } else if (strcmp(op, "utime") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      __int64_t c = t.get_int();
+      struct utimbuf u;
+      u.actime = b;
+      u.modtime = c;
+      client->utime(a, &u);
+    } else if (strcmp(op, "mknod") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      client->mknod(a, b);
+    } else if (strcmp(op, "getdir") == 0) {
+      const char *a = t.get_string(p);
+      map<string,inode_t> contents;
+      client->getdir(a, contents);
+    } else if (strcmp(op, "open") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int(); 
+      __int64_t id = t.get_int();
+      __int64_t fh = client->open(a, b);
+      open_files[id] = fh;  // stored even when the open failed (fh <= 0)
+    } else if (strcmp(op, "close") == 0) {
+      __int64_t id = t.get_int();
+      __int64_t fh = open_files[id];  // operator[] yields 0 for an id that was never opened
+      if (fh > 0) client->close(fh);
+      open_files.erase(id);
+    } else if (strcmp(op, "truncate") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      client->truncate(a,b);
+    } else if (strcmp(op, "read") == 0) {
+      __int64_t id = t.get_int();
+      __int64_t fh = open_files[id];  // NOTE(review): an unknown id inserts a 0 entry and reads from handle 0
+      int size = t.get_int();
+      int off = t.get_int();
+      char *buf = new char[size];
+      client->read(fh, buf, size, off);
+      delete[] buf;
+    } else if (strcmp(op, "write") == 0) {
+      __int64_t id = t.get_int();
+      __int64_t fh = open_files[id];  // NOTE(review): same unknown-id behaviour as "read"
+      int size = t.get_int();
+      int off = t.get_int();
+      char *buf = new char[size];
+      memset(buf, 1, size);            // let's write 1's!
+      client->write(fh, buf, size, off);
+      delete[] buf;
+    } else if (strcmp(op, "fsync") == 0) {
+      assert(0);  // fsync ops in a trace are not handled
+    } else 
+      assert(0);  // unknown op name
+  }
+
+  // close open files
+  for (map<__int64_t, __int64_t>::iterator fi = open_files.begin();
+       fi != open_files.end();
+       fi++) {
+    dout(1) << "leftover close " << fi->second << endl;
+    if (fi->second > 0) client->close(fi->second);
+  }
+  
+  return 0;
+}
+
+
+int SyntheticClient::clean_dir(string& basedir)
+{
+  // Recursively removes everything below basedir (basedir itself is kept).
+  // Returns the getdir() error if the listing fails, otherwise 0; lstat errors
+  // are logged, unlink/rmdir results are ignored, and the sweep carries on.
+  map<string, inode_t> entries;
+  int err = client->getdir(basedir.c_str(), entries);
+  if (err < 0) {
+    dout(1) << "readdir on " << basedir << " returns " << err << endl;
+    return err;
+  }
+
+  map<string, inode_t>::iterator ent = entries.begin();
+  for ( ; ent != entries.end(); ++ent) {
+    if (time_to_stop()) break;
+    string path = basedir + "/" + ent->first;
+
+    struct stat st;
+    int serr = client->lstat(path.c_str(), &st);
+    if (serr < 0) {
+      dout(1) << "stat error on " << path << " r=" << serr << endl;
+      continue;
+    }
+
+    const bool is_dir = (st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR;
+    if (!is_dir) {
+      client->unlink(path.c_str());
+      continue;
+    }
+    clean_dir(path);              // empty the subdirectory first ...
+    client->rmdir(path.c_str());  // ... then remove it
+  }
+
+  return 0;
+}
+
+
+int SyntheticClient::full_walk(string& basedir) 
+{
+  // Depth-first walk: lists basedir, lstat()s each entry, recurses into directories.
+  // Returns -1 if time_to_stop() holds on entry, the getdir() error if the listing
+  // fails, else 0 (results of the recursive calls are not propagated).
+  if (time_to_stop()) return -1;
+
+  map<string, inode_t> entries;
+  int err = client->getdir(basedir.c_str(), entries);
+  if (err < 0) {
+    dout(1) << "readdir on " << basedir << " returns " << err << endl;
+    return err;
+  }
+
+  map<string, inode_t>::iterator ent;
+  for (ent = entries.begin(); ent != entries.end(); ++ent) {
+    string path = basedir + "/" + ent->first;
+    struct stat st;
+    int serr = client->lstat(path.c_str(), &st);
+    if (serr < 0) {
+      dout(1) << "stat error on " << path << " r=" << serr << endl;
+      continue;
+    }
+    if ((st.st_mode & INODE_TYPE_MASK) != INODE_MODE_DIR) continue;
+    full_walk(path);
+  }
+
+  return 0;
+}
+
+int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth)
+{
+  // Creates basedir holding `files` empty files and, while depth > 0, `dirs`
+  // subdirectories built the same way with depth-1.  Returns -1 if basedir
+  // cannot be created, 0 otherwise (also when time_to_stop() cuts it short).
+  if (time_to_stop()) return 0;
+
+  int r = client->mkdir(basedir, 0755);
+  if (r != 0) {
+    dout(1) << "can't make base dir? " << basedir << endl;
+    return -1;
+  }
+
+  // paths grow with every level: snprintf truncates where sprintf wrote past the end of d
+  char d[500];
+  dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
+  for (int i=0; i<files; i++) {
+    snprintf(d, sizeof(d), "%s/file.%d", basedir, i);
+    client->mknod(d, 0644);
+  }
+  if (depth == 0) return 0;
+
+  for (int i=0; i<dirs; i++) {
+    snprintf(d, sizeof(d), "%s/dir.%d", basedir, i);
+    make_dirs(d, dirs, files, depth-1);
+  }
+  return 0;
+}
+
+int SyntheticClient::stat_dirs(const char *basedir, int dirs, int files, int depth)
+{
+  // lstat()s the layout produced by make_dirs(): basedir, its file.<i> entries and,
+  // while depth > 0, every dir.<i> subtree.  Returns -1 if basedir cannot be stat'ed, else 0.
+  if (time_to_stop()) return 0;
+
+  struct stat st;
+  int r = client->lstat(basedir, &st);
+  if (r != 0) {
+    dout(1) << "can't stat base dir? " << basedir << endl;   // the message used to say "make"
+    return -1;
+  }
+
+  // paths grow with every level: snprintf truncates where sprintf wrote past the end of d
+  char d[500];
+  dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
+  for (int i=0; i<files; i++) {
+    snprintf(d, sizeof(d), "%s/file.%d", basedir, i);
+    client->lstat(d, &st);
+  }
+
+  if (depth == 0) return 0;
+
+  for (int i=0; i<dirs; i++) {
+    snprintf(d, sizeof(d), "%s/dir.%d", basedir, i);
+    stat_dirs(d, dirs, files, depth-1);
+  }
+  return 0;
+}
+int SyntheticClient::read_dirs(const char *basedir, int dirs, int files, int depth)
+{
+  // Timed getdir() of basedir plus a timed lstat() of each file.<i>, recursing into dir.<i>
+  // while depth > 0.  Returns -1 on the first getdir/lstat failure (also from a subtree), else 0.
+  if (time_to_stop()) return 0;
+
+  struct stat st;
+  char d[500];   // filled with snprintf below: deep paths are truncated instead of overrunning d
+  dout(3) << "read_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
+
+  map<string,inode_t> contents;
+  utime_t s = g_clock.now();
+  int r = client->getdir(basedir, contents);
+  utime_t e = g_clock.now();
+  e -= s;
+  if (client_logger) client_logger->finc("readdir", e);
+  if (r < 0) {
+    dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << endl;
+    return -1;
+  }
+
+  for (int i=0; i<files; i++) {
+    snprintf(d, sizeof(d), "%s/file.%d", basedir, i);
+    utime_t s = g_clock.now();
+    if (client->lstat(d, &st) < 0) {
+      dout(2) << "read_dirs failed stat on " << d << ", stopping" << endl;
+      return -1;
+    }
+    utime_t e = g_clock.now();
+    e -= s;
+    if (client_logger) client_logger->finc("stat", e);
+  }
+
+  if (depth > 0) 
+    for (int i=0; i<dirs; i++) {
+      snprintf(d, sizeof(d), "%s/dir.%d", basedir, i);
+      if (read_dirs(d, dirs, files, depth-1) < 0) return -1;
+    }
+
+  return 0;
+}
+
+
+int SyntheticClient::make_files(int num, int count, int priv, bool more)
+{
+  // Creates `num` files in each of `count` run directories.  With priv set, every client
+  // uses its own dir.<whoami>.run<c>; otherwise all clients share dir.0.run<c>.
+  // With `more`, each new file is also stat'ed, opened, unlinked and closed.  Always returns 0.
+  int whoami = client->get_nodeid();
+  char d[255];
+
+  if (priv) {
+    for (int c=0; c<count; c++) {
+      sprintf(d,"dir.%d.run%d", whoami, c);
+      client->mkdir(d, 0755);
+    }
+  } else {
+    // shared: client 0 makes the directories, everyone else just waits a while
+    if (whoami == 0) {
+      for (int c=0; c<count; c++) {
+        sprintf(d,"dir.%d.run%d", 0, c);
+        client->mkdir(d, 0755);
+      }
+    } else {
+      sleep(5);
+    }
+  }
+  
+  // files
+  struct stat st;
+  for (int c=0; c<count; c++) {
+    for (int n=0; n<num; n++) {
+      sprintf(d,"dir.%d.run%d/file.client%d.%d", priv ? whoami:0, c, whoami, n);
+      client->mknod(d, 0644);
+      if (more) {
+        client->lstat(d, &st);
+        int fd = client->open(d, O_RDONLY);
+        client->unlink(d);
+        if (fd >= 0) client->close(fd);   // a failed open leaves nothing to close
+      }
+
+      if (time_to_stop()) return 0;
+    }
+  }
+  return 0;
+}
+
+
+int SyntheticClient::create_shared(int num)
+{
+  // mknod()s "file.0" .. "file.<num-1>" with mode 0644; results are ignored, always returns 0
+  char name[255];
+  int n = 0;
+  while (n < num) {
+    sprintf(name, "file.%d", n++);
+    client->mknod(name, 0644);
+  }
+  return 0;
+}
+
+int SyntheticClient::open_shared(int num, int count)
+{
+  // `count` rounds of: open file.0 .. file.<num-1> read-only, then close them
+  // again in the order they were opened.  Always returns 0.
+  char d[255];
+  for (int c=0; c<count; c++) {
+    list<int> fds;
+    for (int n=0; n<num; n++) {
+      sprintf(d,"file.%d", n);
+      int fd = client->open(d,O_RDONLY);
+      if (fd >= 0)          // failed opens used to be queued and passed to close()
+        fds.push_back(fd);
+    }
+    // close
+    while (!fds.empty()) {
+      int fd = fds.front();
+      fds.pop_front();
+      client->close(fd);
+    }
+  }
+  return 0;
+}
+
+
+int SyntheticClient::write_file(string& fn, int size, int wrsize)   // size is in MB, wrsize in bytes
+{
+  // Writes `size` MB to fn in wrsize-byte chunks.  Every 16 bytes carry the fingerprint
+  // {offset within chunk, chunk number, client id, 0} that read_file() checks.
+  // Returns the open() error, -1 for a non-positive wrsize, otherwise 0.
+  if (wrsize <= 0) return -1;          // would divide by zero / allocate a bogus size below
+  int fd = client->open(fn.c_str(), O_RDWR|O_CREAT);
+  dout(5) << "writing to " << fn << " fd " << fd << endl;
+  if (fd < 0) return fd;               // buf is allocated after this, so this path no longer leaks it
+
+  char *buf = new char[wrsize+100];    // slack: the fill loop works in whole 16-byte records
+  memset(buf, 7, wrsize);
+  __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)wrsize;
+
+  for (unsigned i=0; i<chunks; i++) {
+    if (time_to_stop()) {
+      dout(0) << "stopping" << endl;
+      break;
+    }
+    dout(2) << "writing block " << i << "/" << chunks << endl;
+    // fill buf with a fingerprint
+    int *p = (int*)buf;
+    while ((char*)p < buf + wrsize) {
+      *p = (char*)p - buf;
+      p++;
+      *p = i;
+      p++;
+      *p = client->get_nodeid();
+      p++;
+      *p = 0;
+      p++;
+    }
+    // offset computed in 64 bits: unsigned*int wrapped once the file passed 4 GB
+    client->write(fd, buf, wrsize, (__uint64_t)i * (__uint64_t)wrsize);
+  }
+  client->close(fd);
+  delete[] buf;
+  return 0;
+}
+
+int SyntheticClient::write_batch(int nfile, int size, int wrsize)   // one write_file() per name from get_sarg(i); returns 0
+{
+  for (int i=0; i<nfile; ++i) {
+    string name = get_sarg(i);
+    dout(0) << "Write file " << name << endl;
+    write_file(name, size, wrsize);
+  }
+  return 0;
+}
+
+int SyntheticClient::read_file(string& fn, int size, int rdsize)   // size is in MB, rdsize in bytes
+{
+  // Reads `size` MB from fn in rdsize-byte chunks and checks the 16-byte fingerprints written
+  // by write_file().  Returns the open() error, -1 for a non-positive rdsize, otherwise 0.
+  if (rdsize <= 0) return -1;        // would divide by zero below
+  int fd = client->open(fn.c_str(), O_RDONLY);
+  dout(5) << "reading from " << fn << " fd " << fd << endl;
+  if (fd < 0) return fd;             // buf is allocated after this, so this path no longer leaks it
+
+  char *buf = new char[rdsize]; 
+  memset(buf, 1, rdsize);
+  __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize;
+  for (unsigned i=0; i<chunks; i++) {
+    if (time_to_stop()) break;
+    dout(2) << "reading block " << i << "/" << chunks << endl;
+    client->read(fd, buf, rdsize, (__uint64_t)i * (__uint64_t)rdsize);   // 64-bit offset; unsigned*int wrapped at 4 GB
+
+    // verify fingerprint (the result of read() is not checked first)
+    int *p = (int*)buf;
+    int bad = 0;
+    int boff, bgoff, bchunk, bclient, bzero;
+    while ((char*)p + 32 < buf + rdsize) {
+      boff = *p;
+      bgoff = (int)((char*)p - buf);
+      p++;
+      bchunk = *p;
+      p++;
+      bclient = *p;
+      p++;
+      bzero = *p;
+      p++;
+      if (boff != bgoff ||
+          bchunk != (int)i ||
+          bclient != client->get_nodeid() ||
+          bzero != 0) {
+        if (!bad)
+          dout(0) << "WARNING: wrong data from OSD, it should be " 
+                  << "(block=" << i 
+                  << " offset=" << bgoff
+                  << " client=" << client->get_nodeid() << ")"
+                  << " .. but i read back .. " 
+                  << "(block=" << bchunk
+                  << " offset=" << boff
+                  << " client=" << bclient << " zero=" << bzero << ")" << endl;
+        bad++;
+      }
+    }
+    if (bad) 
+      dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl;
+  }
+
+  client->close(fd);
+  delete[] buf;
+
+  return 0;
+}
+
+
+
+int SyntheticClient::random_walk(int num_req)
+{
+  // Issues up to num_req randomly chosen metadata ops, moving up and down the
+  // directory tree as it goes, then closes every file it still has open.
+  // Each loop pass (a plain cd up/down included) uses up one of the num_req.  Returns 0.
+  int left = num_req;
+  init_op_dist();  // set up metadata op distribution
+
+  while (left > 0) {
+    left--;
+
+    if (time_to_stop()) break;
+
+    // ascend?
+    if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) {
+      dout(DBL) << "die says up" << endl;
+      up();
+      continue;
+    }
+
+    // descend?  NOTE(review): .9 scales the bool result, so it has no effect on the test -- intended?
+    if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) {
+      string s = get_random_subdir();
+      cwd.add_dentry( s );
+      dout(DBL) << "cd " << s << " -> " << cwd << endl;
+      clear_dir();
+      continue;
+    }
+
+    int op = 0;
+    filepath path;
+
+    if (contents.empty() && roll_die(.3)) {
+      if (did_readdir) {
+        dout(DBL) << "empty dir, up" << endl;
+        up();
+      } else
+        op = MDS_OP_READDIR;
+    } else {
+      op = op_dist.sample();
+    }
+    //dout(DBL) << "op is " << op << endl;
+
+    int r = 0;
+
+    // do op.  A branch may rewrite op to a fallback that is handled by a LATER branch
+    // of this same pass (READDIR is last, STAT second to last).
+    if (op == MDS_OP_UNLINK) {
+      if (contents.empty())
+        op = MDS_OP_READDIR;
+      else 
+        r = client->unlink( get_random_sub() );   // will fail on dirs
+    }
+     
+    if (op == MDS_OP_RENAME) {
+      if (contents.empty())
+        op = MDS_OP_READDIR;
+      else {
+        r = client->rename( get_random_sub(), make_sub("ren") );
+      }
+    }
+    
+    if (op == MDS_OP_MKDIR) {
+      r = client->mkdir( make_sub("mkdir"), 0755);
+    }
+    
+    if (op == MDS_OP_RMDIR) {
+      if (!subdirs.empty())
+        r = client->rmdir( get_random_subdir() );
+      else
+        r = client->rmdir( cwd.c_str() );     // will probably fail
+    }
+    
+    if (op == MDS_OP_SYMLINK) {
+    }
+    
+    if (op == MDS_OP_CHMOD) {
+      if (contents.empty())
+        op = MDS_OP_READDIR;
+      else
+        r = client->chmod( get_random_sub(), rand() & 0755 );
+    }
+    
+    if (op == MDS_OP_CHOWN) {
+      if (contents.empty())         r = client->chown( cwd.c_str(), rand(), rand() );
+      else
+        r = client->chown( get_random_sub(), rand(), rand() );
+    }
+     
+    if (op == MDS_OP_LINK) {
+    }
+     
+    if (op == MDS_OP_UTIME) {
+      struct utimbuf b;
+      memset(&b, 1, sizeof(b));
+      if (contents.empty()) 
+        r = client->utime( cwd.c_str(), &b );
+      else
+        r = client->utime( get_random_sub(), &b );
+    }
+    
+    if (op == MDS_OP_MKNOD) {
+      r = client->mknod( make_sub("mknod"), 0644);
+    }
+     
+    if (op == MDS_OP_OPEN) {
+      if (contents.empty())
+        op = MDS_OP_READDIR;
+      else {
+        r = client->open( get_random_sub(), O_RDONLY );
+        if (r > 0) {
+          assert(open_files.count(r) == 0);
+          open_files.insert(r);
+        }
+      }
+    }
+
+    if (op == MDS_OP_RELEASE) {   // actually, close
+      if (open_files.empty())
+        op = MDS_OP_STAT;
+      else {
+        int fh = get_random_fh();
+        r = client->close( fh );
+        if (r == 0) open_files.erase(fh);
+      }
+    }
+    
+    if (op == MDS_OP_STAT) {
+      struct stat st;
+      if (contents.empty()) {
+        if (did_readdir) {
+          if (roll_die(.1)) {
+            dout(DBL) << "stat in empty dir, up" << endl;
+            up();
+          } else {
+            op = MDS_OP_MKNOD;   // the MKNOD branch is above, so nothing more happens this pass
+          }
+        } else
+          op = MDS_OP_READDIR;
+      } else
+        r = client->lstat(get_random_sub(), &st);
+    }
+
+    if (op == MDS_OP_READDIR) {
+      clear_dir();
+      
+      map<string, inode_t> c;
+      r = client->getdir( cwd.c_str(), c );
+      
+      for (map<string, inode_t>::iterator it = c.begin();
+           it != c.end();
+           it++) {
+        //dout(DBL) << " got " << it->first << endl;
+        contents[it->first] = it->second;
+        if (it->second.is_dir()) 
+          subdirs.insert(it->first);
+      }
+      
+      did_readdir = true;
+    }
+      
+    // errors?
+    if (r < 0) {
+      // reevaluate cwd.
+      //while (cwd.depth()) {
+      //if (client->lookup(cwd)) break;   // it's in the cache
+        
+      //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << endl;
+      dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." << endl;
+      up();
+      //}      
+    }
+  }
+
+  // close files
+  dout(DBL) << "closing files" << endl;
+  while (!open_files.empty()) {
+    int fh = get_random_fh();   // presumably a member of open_files -- TODO confirm
+    client->close( fh );
+    open_files.erase(fh);       // dropped even if close() failed; erasing only on success could spin forever
+  }
+
+  dout(DBL) << "done" << endl;
+  return 0;
+}
+
+
diff --git a/branches/sage/cephmds2/client/SyntheticClient.h b/branches/sage/cephmds2/client/SyntheticClient.h
new file mode 100644
index 0000000000000..14720bdd412b2
--- /dev/null
+++ b/branches/sage/cephmds2/client/SyntheticClient.h
@@ -0,0 +1,198 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __SYNTHETICCLIENT_H
+#define __SYNTHETICCLIENT_H
+
+#include <pthread.h>
+
+#include "Client.h"
+#include "include/Distribution.h"
+
+#include "Trace.h"
+
+#define SYNCLIENT_MODE_RANDOMWALK  1
+#define SYNCLIENT_MODE_FULLWALK    2
+#define SYNCLIENT_MODE_REPEATWALK  7
+
+#define SYNCLIENT_MODE_MAKEDIRS     8      // dirs files depth
+#define SYNCLIENT_MODE_STATDIRS     9     // dirs files depth
+#define SYNCLIENT_MODE_READDIRS     10     // dirs files depth
+
+#define SYNCLIENT_MODE_MAKEFILES    11     // num count private
+#define SYNCLIENT_MODE_MAKEFILES2   12     // num count private
+#define SYNCLIENT_MODE_CREATESHARED 13     // num
+#define SYNCLIENT_MODE_OPENSHARED   14     // num count
+
+#define SYNCLIENT_MODE_WRITEFILE   20
+#define SYNCLIENT_MODE_READFILE    21
+#define SYNCLIENT_MODE_WRITEBATCH  22
+#define SYNCLIENT_MODE_WRSHARED    23
+
+#define SYNCLIENT_MODE_TRACE       30
+
+#define SYNCLIENT_MODE_OPENTEST     40
+#define SYNCLIENT_MODE_OPTEST       41
+
+#define SYNCLIENT_MODE_ONLY        50
+#define SYNCLIENT_MODE_UNTIL       51
+#define SYNCLIENT_MODE_SLEEPUNTIL  52
+
+#define SYNCLIENT_MODE_RANDOMSLEEP  61
+#define SYNCLIENT_MODE_SLEEP        62
+
+
+
+
+void parse_syn_options(vector<char*>& args);
+
+class SyntheticClient {
+  Client *client;
+
+  pthread_t thread_id;
+
+  Distribution op_dist;
+
+  void init_op_dist();
+  int get_op();
+
+  // directory-walk state: the current directory plus a cached listing of it
+  filepath             cwd;
+  map<string, inode_t> contents;
+  set<string>          subdirs;
+  bool                 did_readdir;
+  set<int>             open_files;
+
+  void up();
+  // drop the cached listing of cwd and mark it as not yet read
+  void clear_dir() {
+    contents.clear();
+    subdirs.clear();
+    did_readdir = false;
+  }
+  // pick one of the currently open file handles at random; open_files must not be empty
+  int get_random_fh() {
+    assert(!open_files.empty());   // rand() % 0 would be undefined behaviour
+    int r = rand() % open_files.size();
+    set<int>::iterator it = open_files.begin();
+    while (r--) it++;
+    return *it;
+  }
+
+  filepath n1;   // scratch path backing the pointer returned by get_random_subdir()
+  const char *get_random_subdir() {
+    assert(!subdirs.empty());
+    int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2;  // non-uniform distn
+    set<string>::iterator it = subdirs.begin();
+    while (r--) it++;
+
+    n1 = cwd;
+    n1.add_dentry( *it );
+    return n1.get_path().c_str();
+  }
+  filepath n2;   // scratch path backing the pointer returned by get_random_sub()
+  const char *get_random_sub() {
+    assert(!contents.empty());
+    int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2;  // non-uniform distn
+    if (cwd.depth() && cwd.last_bit().length()) 
+      r += cwd.last_bit().c_str()[0];                                         // slightly permuted
+    r %= contents.size();
+
+    map<string,inode_t>::iterator it = contents.begin();
+    while (r--) it++;
+
+    n2 = cwd;
+    n2.add_dentry( it->first );
+    return n2.get_path().c_str();
+  }
+  // build the path of entry "<base>.<0..99>" under cwd in 'sub'; an over-long base is truncated to fit sub_s
+  filepath sub;
+  char sub_s[50];
+  const char *make_sub(char *base) {
+    snprintf(sub_s, sizeof(sub_s), "%s.%d", base, rand() % 100);
+    string f = sub_s;
+    sub = cwd;
+    sub.add_dentry(f);
+    return sub.c_str();
+  }
+
+ public:
+  SyntheticClient(Client *client);
+
+  int start_thread();
+  int join_thread();
+
+  int run();
+  // one-shot filter: true unless run_only names a different node id; run_only is reset to -1 either way
+  bool run_me() {
+    if (run_only >= 0) {
+      if (run_only == client->get_nodeid()) {
+        run_only = -1;
+        return true;
+      }
+      run_only = -1;
+      return false;
+    }
+    return true;
+  }
+
+  // run() will do one of these things:
+  list<int> modes;
+  list<string> sargs;
+  list<int> iargs;
+  utime_t run_start;
+  utime_t run_until;
+
+  int     run_only;
+
+  string get_sarg(int seq);
+  // true once a deadline is set (run_until has non-zero seconds) and the clock has passed it
+  bool time_to_stop() {
+    utime_t now = g_clock.now();
+    if (0) cout << "time_to_stop .. now " << now 
+         << " until " << run_until 
+         << " start " << run_start 
+         << endl;
+    if (run_until.sec() && now > run_until) 
+      return true;
+    else
+      return false;
+  }
+  // plain concatenation; no separator is inserted between prefix and rest
+  string compose_path(string& prefix, char *rest) {
+    return prefix + rest;
+  }
+
+  int full_walk(string& fromdir);
+  int random_walk(int n);
+
+  int make_dirs(const char *basedir, int dirs, int files, int depth);
+  int stat_dirs(const char *basedir, int dirs, int files, int depth);
+  int read_dirs(const char *basedir, int dirs, int files, int depth);
+  int make_files(int num, int count, int priv, bool more);
+
+  int create_shared(int num);
+  int open_shared(int num, int count);
+
+  int write_file(string& fn, int mb, int chunk);
+  int write_batch(int nfile, int mb, int chunk);
+  int read_file(string& fn, int mb, int chunk);
+
+  int clean_dir(string& basedir);
+
+  int play_trace(Trace& t, string& prefix);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/client/Trace.cc b/branches/sage/cephmds2/client/Trace.cc
new file mode 100644
index 0000000000000..43459653011a1
--- /dev/null
+++ b/branches/sage/cephmds2/client/Trace.cc
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "Trace.h"
+
+#include <iostream>
+#include <cassert>
+#include <map>
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+#include "common/Mutex.h"
+
+#include "config.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+
+Mutex trace_lock;
+
+class TokenList {
+public:
+  string filename;             // name of the trace file this list was loaded from
+  char *data;                  // owned buffer holding the file contents; the tokens point into it
+  int len;                     // size of 'data' in bytes
+  list<const char *> tokens;   // start of each non-empty line inside 'data'
+  // number of Trace objects currently sharing this list
+  int ref;
+  // len is zeroed as well, so a list that was never loaded carries no indeterminate field
+  TokenList() : data(0), len(0), ref(0) {}
+  ~TokenList() {
+    delete[] data;
+  }
+};
+
+map<string, TokenList*> traces;
+
+
+//
+Trace::Trace(const char* f)
+{
+  string filename = f;
+  // token lists are shared per filename through the global 'traces' map, guarded by trace_lock
+  trace_lock.Lock();
+  
+  if (traces.count(filename))
+    tl = traces[filename];
+  else {
+    tl = new TokenList;
+    tl->filename = filename;
+
+    // open file and slurp it into a rope
+    crope cr;
+    int fd = open(filename.c_str(), O_RDONLY);
+    assert(fd >= 0);   // open() reports failure as -1; descriptor 0 is a legitimate result
+    char buf[4096];
+    while (1) {
+      int r = read(fd, buf, sizeof(buf));
+      if (r == 0) break;
+      assert(r > 0);
+      cr.append(buf, r);
+    }
+    close(fd);
+    
+    // copy, with one extra byte holding a newline so the last line is always terminated
+    tl->len = cr.length()+1;
+    tl->data = new char[tl->len];
+    memcpy(tl->data, cr.c_str(), cr.length());
+    tl->data[tl->len-1] = '\n';
+
+    // index: turn every newline into a NUL and remember where each non-empty line starts
+    int o = 0;
+    while (o < tl->len) {
+      char *n = tl->data + o;
+      
+      // find newline (the sentinel written above bounds this scan)
+      while (tl->data[o] != '\n') o++;
+      assert(tl->data[o] == '\n');
+      tl->data[o] = 0;
+      
+      if (tl->data + o > n) tl->tokens.push_back(n);
+      o++;
+    }
+
+    dout(1) << "trace " << filename << " loaded with " << tl->tokens.size() << " tokens" << endl;
+    traces[filename] = tl;
+  }
+
+  tl->ref++;
+
+  trace_lock.Unlock();
+}
+
+Trace::~Trace()
+{
+  // drop this Trace's reference on the shared token list, under trace_lock
+  trace_lock.Lock();
+  if (--tl->ref == 0) {
+    // no Trace uses the list any more: unregister it from 'traces' and free it
+    traces.erase(tl->filename);
+    delete tl;
+  }
+
+  trace_lock.Unlock();
+}
+
+
+list<const char*>& Trace::get_list()   // the token list of the shared TokenList this Trace points at
+{
+  return tl->tokens;
+}
diff --git a/branches/sage/cephmds2/client/Trace.h b/branches/sage/cephmds2/client/Trace.h
new file mode 100644
index 0000000000000..08b1fa8ff2722
--- /dev/null
+++ b/branches/sage/cephmds2/client/Trace.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __CLIENT_TRACE_H
+#define __CLIENT_TRACE_H
+
+#include <cassert>
+#include <list>
+#include <string>
+using namespace std;
+
+/*
+
+ this class is more like an iterator over a constant tokenlist (which 
+ is protected by a mutex, see Trace.cc)
+
+ */
+
+class Trace {
+  class TokenList *tl;   // token list this Trace iterates over (class defined in Trace.cc)
+ public:
+  Trace(const char* filename);
+  ~Trace();
+  list<const char*>& get_list();
+
+  list<const char*>::iterator _cur;   // iteration state over get_list(); set by start()
+  list<const char*>::iterator _end;
+  // rewind to the first token and reset the scratch-buffer index
+  void start() {
+    _cur = get_list().begin();
+    _end = get_list().end();
+    ns = 0;
+  }
+  // ring of scratch buffers for rewritten paths; a returned pointer is overwritten 10 rewrites later
+  char strings[10][200];
+  int ns;
+  // return the next token; with a prefix, a token starting with "/prefix" (at offset 0 or 1) has it substituted
+  const char *get_string(const char *prefix = 0) {
+    assert(_cur != _end);
+    const char *s = *_cur;
+    _cur++;
+    if (prefix) {
+      const char *hit = strstr(s, "/prefix");
+      if (hit == s || hit == s+1) {
+        // bounded copy into the slot; the tail is taken from just past the match itself
+        char *dst = strings[ns];
+        const size_t room = sizeof(strings[ns]) - 1;
+        strncpy(dst, prefix, room);
+        dst[room] = 0;
+        strncat(dst, hit + strlen("/prefix"), room - strlen(dst));
+        s = dst;
+        ns = (ns + 1) % 10;
+      }
+    } 
+    return s;
+  }
+  __int64_t get_int() {
+    return atoll(get_string());
+  }
+  bool end() {
+    return _cur == _end;
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/client/fuse.cc b/branches/sage/cephmds2/client/fuse.cc
new file mode 100644
index 0000000000000..560a515a95240
--- /dev/null
+++ b/branches/sage/cephmds2/client/fuse.cc
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+/*
+    FUSE: Filesystem in Userspace
+    Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+    This program can be distributed under the terms of the GNU GPL.
+    See the file COPYING.
+*/
+
+
+// fuse crap
+#ifdef linux
+/* For pread()/pwrite() */
+#define _XOPEN_SOURCE 500
+#endif
+
+#define FUSE_USE_VERSION 22
+
+#include <fuse.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <errno.h>
+#include <sys/statfs.h>
+
+
+// ceph stuff
+#include "include/types.h"
+
+#include "Client.h"
+
+#include "config.h"
+
+// stl
+#include <map>
+using namespace std;
+
+
+// globals
+Client *client;     // the ceph client
+
+
+
+// ------
+// fuse hooks
+
+static int ceph_getattr(const char *path, struct stat *stbuf)   // fuse getattr hook: answered with client->lstat()
+{
+  return client->lstat(path, stbuf);
+}
+
+static int ceph_readlink(const char *path, char *buf, size_t size)
+{
+  // fuse readlink hook: fills buf with the NUL-terminated link target, returns 0 or a negative error
+  if (size == 0) return -EINVAL;   // no room even for the terminator; size - 1 below would wrap around
+  int res = client->readlink(path, buf, size - 1);
+  if (res < 0) return res;
+  // res is the number of bytes the client stored; terminate right after them
+  buf[res] = '\0';
+  return 0;
+}
+
+
+static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler)
+{
+  map<string, inode_t> contents;
+  // fetch the whole listing first; a negative result is handed back to fuse as-is
+  int res = client->getdir(path, contents);
+  if (res < 0) return res;
+  // NOTE(review): for an empty listing the value returned below is getdir()'s own non-negative result
+  // return contents to fuse via callback
+  for (map<string, inode_t>::iterator it = contents.begin();
+       it != contents.end();
+       it++) {
+    // (immutable) inode contents too.
+    res = filler(h,                                    // fuse's handle
+                 it->first.c_str(),                    // dentry as char*
+                 it->second.mode & INODE_TYPE_MASK,   // mask type bits from mode
+                 it->second.ino);                     // ino.. 64->32 bit issue here? FIXME
+    if (res != 0) break;   // fuse has had enough
+  }
+  return res;
+}
+
+static int ceph_mknod(const char *path, mode_t mode, dev_t rdev)   // fuse mknod hook; rdev is not passed on
+{
+  return client->mknod(path, mode);
+}
+
+static int ceph_mkdir(const char *path, mode_t mode)   // fuse mkdir hook: forwards to client->mkdir()
+{
+  return client->mkdir(path, mode);
+}
+
+static int ceph_unlink(const char *path)   // fuse unlink hook: forwards to client->unlink()
+{
+  return client->unlink(path);
+}
+
+static int ceph_rmdir(const char *path)   // fuse rmdir hook: forwards to client->rmdir()
+{
+  return client->rmdir(path);
+}
+
+static int ceph_symlink(const char *from, const char *to)   // fuse symlink hook: arguments passed through in order
+{
+  return client->symlink(from, to);
+}
+
+static int ceph_rename(const char *from, const char *to)   // fuse rename hook: forwards to client->rename()
+{
+  return client->rename(from, to);
+}
+
+static int ceph_link(const char *from, const char *to)   // fuse link hook: forwards to client->link()
+{
+  return client->link(from, to);
+}
+
+static int ceph_chmod(const char *path, mode_t mode)   // fuse chmod hook: forwards to client->chmod()
+{
+  return client->chmod(path, mode);
+}
+
+static int ceph_chown(const char *path, uid_t uid, gid_t gid)   // fuse chown hook: forwards to client->chown()
+{
+  return client->chown(path, uid, gid);
+}
+
+static int ceph_truncate(const char *path, off_t size)   // fuse truncate hook: forwards to client->truncate()
+{
+  return client->truncate(path, size);      
+}
+
+static int ceph_utime(const char *path, struct utimbuf *buf)   // fuse utime hook: forwards to client->utime()
+{
+  return client->utime(path, buf);
+}
+
+
+static int ceph_open(const char *path, struct fuse_file_info *fi)
+{
+  // fuse open hook: open through the ceph client and remember its handle in fi->fh
+  const int handle = client->open(path, fi->flags);
+  if (handle < 0)
+    return handle;   // negative results go back to fuse unchanged
+  fi->fh = handle;
+  return 0;  // fuse wants 0 on success
+}
+
+static int ceph_read(const char *path, char *buf, size_t size, off_t offset,
+                     struct fuse_file_info *fi)   // fuse read hook: path is unused, the handle comes from fi->fh
+{
+  fh_t fh = fi->fh;
+  return client->read(fh, buf, size, offset);
+}
+
+static int ceph_write(const char *path, const char *buf, size_t size,
+                     off_t offset, struct fuse_file_info *fi)   // fuse write hook: path is unused, the handle comes from fi->fh
+{
+  fh_t fh = fi->fh;
+  return client->write(fh, buf, size, offset);
+}
+
+/*
+static int ceph_flush(const char *path, struct fuse_file_info *fi)
+{
+  fh_t fh = fi->fh;
+  return client->flush(fh);
+}
+*/
+
+static int ceph_statfs(const char *path, struct statfs *stbuf)   // fuse statfs hook: forwards to client->statfs()
+{
+  return client->statfs(path, stbuf);
+}
+
+
+
+static int ceph_release(const char *path, struct fuse_file_info *fi)
+{
+  // fuse release hook: close the client handle stored in fi->fh by ceph_open;
+  // the client's result is returned to fuse unchanged
+  return client->close(fi->fh);
+}
+
+static int ceph_fsync(const char *path, int isdatasync,
+                     struct fuse_file_info *fi)   // fuse fsync hook: isdatasync is narrowed to a bool for the client
+{
+  fh_t fh = fi->fh;
+  return client->fsync(fh, isdatasync ? true:false);
+}
+
+
+static struct fuse_operations ceph_oper = {   // callback table given to fuse_main(); uses the "field: value" initializer form
+  getattr: ceph_getattr,
+  readlink: ceph_readlink,
+  getdir: ceph_getdir,
+  mknod: ceph_mknod,
+  mkdir: ceph_mkdir,
+  unlink: ceph_unlink,
+  rmdir: ceph_rmdir,
+  symlink: ceph_symlink,
+  rename: ceph_rename,
+  link: ceph_link,
+  chmod: ceph_chmod,
+  chown: ceph_chown,
+  truncate: ceph_truncate,
+  utime: ceph_utime,
+  open: ceph_open,
+  read: ceph_read,
+  write: ceph_write,
+  statfs: ceph_statfs,
+  flush: 0, //ceph_flush,   (hook is commented out above, so no flush callback is registered)
+  release: ceph_release,
+  fsync: ceph_fsync
+};
+
+
+int ceph_fuse_main(Client *c, int argc, char *argv[])
+{
+  // init client (the fuse hooks reach it through the file-level 'client' pointer)
+  client = c;
+
+  // set up fuse argc/argv: up to 10 slots of our own (NULL terminator included) plus the caller's args
+  int newargc = 0;
+  char **newargv = (char **) malloc((argc + 10) * sizeof(char *));
+  if (!newargv) return 1;   // out of memory: report failure instead of writing through NULL
+  newargv[newargc++] = argv[0];
+  // allow other (all!) users to see my file system
+  // NOTE: echo user_allow_other >> /etc/fuse.conf
+  newargv[newargc++] = "-o";
+  newargv[newargc++] = "allow_other";
+  // use inos
+  newargv[newargc++] = "-o";
+  newargv[newargc++] = "use_ino";
+
+  // large reads, direct_io (no kernel caching)
+  //newargv[newargc++] = "-o";
+  //newargv[newargc++] = "large_read";
+  if (g_conf.fuse_direct_io) {
+    newargv[newargc++] = "-o";
+    newargv[newargc++] = "direct_io";
+  }
+
+  // disable stupid fuse unlink hiding thing
+  newargv[newargc++] = "-o";
+  newargv[newargc++] = "hard_remove";
+
+  // force into foreground
+  //   -> we can watch stdout this way!!
+  newargv[newargc++] = "-f";
+  // copy rest of cmdline (hopefully, the mount point!), then NULL-terminate like a regular argv
+  for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr];
+  newargv[newargc] = 0;
+  // go fuse go
+  cout << "ok, calling fuse_main" << endl;
+  int r = fuse_main(newargc, newargv, &ceph_oper);
+  free(newargv);   // only the pointer array is ours; the strings belong to argv and to literals
+  return r;
+}
diff --git a/branches/sage/cephmds2/client/fuse.h b/branches/sage/cephmds2/client/fuse.h
new file mode 100644
index 0000000000000..d0b8dcb1154f5
--- /dev/null
+++ b/branches/sage/cephmds2/client/fuse.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+/* ceph_fuse_main
+ * - start up fuse glue, attached to Client* cl.
+ * - argc, argv should include a mount point, and 
+ *   any weird fuse options you want.  by default,
+ *   we will put fuse in the foreground so that it
+ *   won't fork and we can see stdout.
+ */
+int ceph_fuse_main(Client *cl, int argc, char *argv[]);
diff --git a/branches/sage/cephmds2/client/ldceph.cc b/branches/sage/cephmds2/client/ldceph.cc
new file mode 100644
index 0000000000000..9706fd49cad99
--- /dev/null
+++ b/branches/sage/cephmds2/client/ldceph.cc
@@ -0,0 +1,297 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <iostream>
+using namespace std;
+
+// ceph stuff
+#include "config.h"
+#include "client/Client.h"
+#include "msg/TCPMessenger.h"
+
+// syscall fun
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sys/types.h>
+//#include <sys/stat.h>
+
+#define _FCNTL_H
+#include <bits/fcntl.h>
+
+#define CEPH_FD_OFF  50000
+
+
+/****** startup etc *******/
+
+class LdCeph {
+public:
+  // globals
+  bool    started;
+  char   *mount_point;
+  char   *mount_point_parent;
+  int     mount_point_len;
+
+  Client *client;
+
+  filepath fp_mount_point;
+  filepath cwd;
+  bool     cwd_above_mp, cwd_in_mp;
+  // map an application path to a path inside ceph; 0 means "not ours" (buf is scratch for the rebuilt relative case)
+  const char *get_ceph_path(const char *orig, char *buf) {
+    if (!started) return 0;
+
+    // relative path?    BUG: this won't catch "blah/../../asdf"
+    if (orig[0] &&
+        orig[0] != '/' &&
+        !(orig[0] == '.' && orig[1] == '.')) {
+
+      if (cwd_in_mp) return orig;   // inside mount point, definitely ceph
+      if (!cwd_above_mp) return 0;  // not above mount point, definitely not ceph
+
+      // relative, above mp: resolve against cwd component by component
+      filepath o = orig;
+      filepath p = cwd;
+      for (unsigned b = 0; b < o.depth(); b++) {
+        if (o[b] == "..")
+          p.pop_dentry();
+        else
+          p.add_dentry(o[b]);
+      }
+
+      // FIXME rewrite
+      if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) {
+        if (p.c_str()[mount_point_len] == 0)
+          return "/";
+        if (p.c_str()[mount_point_len] == '/') {
+          strcpy(buf, p.c_str() + mount_point_len);   // NOTE(review): unbounded; callers in this file pass char[255]
+          return buf;
+        }
+      }
+      return 0;
+    } else {
+      // absolute
+      if (strncmp(orig, mount_point, mount_point_len) == 0) {
+        if (orig[mount_point_len] == 0)
+          return "/";
+        if (orig[mount_point_len] == '/')
+          return orig + mount_point_len;
+      }
+      return 0;
+    }
+  }
+  // re-read the kernel's cwd and recompute whether it lies inside, or on the way down to, the mount point
+  void refresh_cwd() {
+    char buf[255];
+    if (syscall(SYS_getcwd, buf, 255) < 0) return;   // buf holds nothing usable: keep the previous state
+    cwd = buf;
+    cwd_in_mp = cwd_above_mp = false;   // recompute both flags from scratch so a stale 'true' cannot survive
+    if (strncmp(buf, mount_point, mount_point_len) == 0 &&
+        (buf[mount_point_len] == 0 ||
+         buf[mount_point_len] == '/'))
+      cwd_in_mp = true;
+    else {
+      if (cwd.depth() > fp_mount_point.depth())
+        cwd_above_mp = false;
+      else {
+        cwd_above_mp = true;
+        for (unsigned i=0; i<cwd.depth(); i++) {
+          if (cwd[i] != fp_mount_point[i]) {
+            cwd_above_mp = false;
+            break;
+          }
+        }
+      }
+    }
+    //cout << "refresh_cwd '" << cwd << "', above=" << cwd_above_mp << ", in=" << cwd_in_mp << endl;
+  }
+
+  // bring up the messenger and mount; on any failure 'started' stays false and every call falls through to the kernel
+  LdCeph() :
+    started(false),
+    mount_point(0), mount_point_parent(0),
+    mount_point_len(0), client(0),
+    cwd_above_mp(false), cwd_in_mp(false) {
+
+    // args
+    vector<char *> args;
+    env_to_vec(args);
+    parse_config_options(args);
+
+
+    tcpaddr_t nsa;
+    if (tcpmessenger_findns(nsa) < 0)
+      return;
+    tcpmessenger_init();
+    tcpmessenger_start();
+    tcpmessenger_start_rankserver(nsa);
+
+    client = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW));
+    client->init();
+    int r = client->mount();
+    if (r < 0) {
+      // failure
+      cerr << "ldceph init: mount failed " << r << endl;
+      delete client;
+      client = 0;
+    } else {
+      // success
+      started = true;
+      mount_point = "/ceph";
+      mount_point_parent = "/";
+      mount_point_len = 5;
+
+      fp_mount_point = mount_point;
+
+      cerr << "ldceph init: mounted on " << mount_point << " as " << client->get_myaddr() << endl;
+
+      refresh_cwd();
+    }
+  }
+  ~LdCeph() {
+    cout << "ldceph fini" << endl;
+    if (false && client) {
+      client->unmount();
+      client->shutdown();
+      delete client;
+      client = 0;
+      tcpmessenger_wait();
+      tcpmessenger_shutdown();
+    }
+  }
+
+} ldceph;
+
+
+
+/****** original functions ****/
+
+
+
+/****** captured functions ****/
+
+
+#define MYFD(f)      ((f) > CEPH_FD_OFF && ldceph.started)         /* is f one of ours? (tests its own argument) */
+#define TO_FD(fd)    ((fd) > 0 ? (fd)+CEPH_FD_OFF : (fd))           /* client fh -> app fd; expands fd more than once */
+#define FROM_FD(fd)  ((fd) - CEPH_FD_OFF)                           /* app fd -> client fh */
+
+extern "C" {
+  
+  // open/close
+  //int open(const char *pathname, int flags) {
+  int open(const char *pathname, int flags, mode_t mode) {
+    char buf[255];
+    const char *c = ldceph.get_ceph_path(pathname, buf);
+    if (!c) return syscall(SYS_open, pathname, flags, mode);   // not a ceph path: real syscall
+    int fh = ldceph.client->open(c, flags);   // open exactly once: TO_FD() expands its argument repeatedly
+    return TO_FD(fh);
+  }
+
+  int creat(const char *pathname, mode_t mode) {   // expressed through the open() wrapper in this file
+    return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode);
+  }
+  int close(int fd) {   // descriptors accepted by MYFD go to the ceph client, everything else to the kernel
+    if (MYFD(fd)) 
+      return ldceph.client->close(FROM_FD(fd));
+    else
+      return syscall(SYS_close, fd);
+  }
+  
+  
+  // read/write
+  ssize_t write(int fd, const void *buf, size_t count) {   // ceph descriptors: client->write(); others: SYS_write
+    if (MYFD(fd)) 
+      return ldceph.client->write(FROM_FD(fd), (char*)buf, count);
+    else
+      return syscall(SYS_write, fd, buf, count);
+  }
+
+  ssize_t read(int fd, void *buf, size_t count) {   // ceph descriptors: client->read(); others: SYS_read
+    if (MYFD(fd)) 
+      return ldceph.client->read(FROM_FD(fd), (char*)buf, count);
+    else
+      return syscall(SYS_read, fd, buf, count);
+  }
+
+  //int fsync(int fd);
+  //int fdatasync(int fd);
+
+
+  // namespace
+  int rmdir(const char *pathname) {   // paths that map into ceph go to the client, others to SYS_rmdir
+    char buf[255];
+    if (const char *c = ldceph.get_ceph_path(pathname, buf))
+      return ldceph.client->rmdir(c);
+    else
+      return syscall(SYS_rmdir, pathname);
+  }
+  int mkdir(const char *pathname, mode_t mode) {   // paths that map into ceph go to the client, others to SYS_mkdir
+    char buf[255];
+    if (const char *c = ldceph.get_ceph_path(pathname, buf)) 
+      return ldceph.client->mkdir(c, mode);
+    else
+      return syscall(SYS_mkdir, pathname, mode);
+  }
+  int unlink(const char *pathname) {   // paths that map into ceph go to the client, others to SYS_unlink
+    char buf[255];
+    if (const char *c = ldceph.get_ceph_path(pathname, buf))
+      return ldceph.client->unlink(c);
+    else
+      return syscall(SYS_unlink, pathname);
+  }
+
+  int stat(const char *pathname, struct stat *st) {   // ceph paths are answered with client->lstat(), not a stat()
+    //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) {  // stoopid GLIBC
+    //struct stat *st = (struct stat*)st64;
+    char buf[255];
+    if (const char *c = ldceph.get_ceph_path(pathname, buf))
+      return ldceph.client->lstat(c, st);   // FIXME
+    else
+      return syscall(SYS_stat, pathname, st);
+  }
+  //int fstat(int filedes, struct stat *buf);
+  //int lstat(const char *file_name, struct stat *buf);
+
+  int chdir(const char *pathname) {   // keeps ldceph.cwd and the in/above-mount-point flags in step with the move
+    char buf[255];
+    if (const char *c = ldceph.get_ceph_path(pathname, buf)) {
+      int r = ldceph.client->chdir(c);
+      if (r == 0) {
+        if (!ldceph.cwd_in_mp)
+          syscall(SYS_chdir, ldceph.mount_point_parent);   // park the kernel cwd next to the mount point
+        ldceph.cwd_in_mp = true;
+        ldceph.cwd_above_mp = false;
+        ldceph.cwd = ldceph.mount_point;
+        filepath fpc = c;
+        ldceph.cwd.append(fpc);
+      }
+      return r;
+    } else {
+      int r = syscall(SYS_chdir, pathname);
+      if (r == 0) {   // the kernel cwd only changes when the call succeeds, so that is when to re-read it
+        ldceph.refresh_cwd();
+      }
+      return r;
+    }
+  }
+  char *getcwd(char *buf, size_t size) {   // reports the cwd tracked in ldceph.cwd while mounted
+    if (!ldceph.started) return syscall(SYS_getcwd, buf, size) < 0 ? 0 : buf;   // not mounted: defer to the kernel
+    return (buf && strlen(ldceph.cwd.c_str()) < size) ? strcpy(buf, ldceph.cwd.c_str()) : 0;   // NULL if it will not fit with its NUL
+  }
+  //int fchdir(int fd);
+
+  
+
+
+}
diff --git a/branches/sage/cephmds2/client/msgthread.h b/branches/sage/cephmds2/client/msgthread.h
new file mode 100644
index 0000000000000..69d10be9f6a56
--- /dev/null
+++ b/branches/sage/cephmds2/client/msgthread.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "msg/Message.h"
+
+// send the message, expecting no response.  threads other than the
+// MPI thread use this function; if the MPI thread uses this function
+// it could deadlock: this function could wait for the out queue to be
+// emptied, but only the MPI thread can empty it.
+void obfsmpi_send(Message *m);
+
+// send the message to a server and wait for the response.  threads
+// other than the MPI thread use this function.
+Message *obfsmpi_sendrecv(Message *m);
diff --git a/branches/sage/cephmds2/common/Clock.cc b/branches/sage/cephmds2/common/Clock.cc
new file mode 100644
index 0000000000000..c970a337826b6
--- /dev/null
+++ b/branches/sage/cephmds2/common/Clock.cc
@@ -0,0 +1,19 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "Clock.h"
+
+// public
+Clock g_clock;
+
diff --git a/branches/sage/cephmds2/common/Clock.h b/branches/sage/cephmds2/common/Clock.h
new file mode 100644
index 0000000000000..c1789dedc2461
--- /dev/null
+++ b/branches/sage/cephmds2/common/Clock.h
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __CLOCK_H
+#define __CLOCK_H
+
+#include <iostream>
+#include <iomanip>
+
+#include <sys/time.h>
+#include <time.h>
+#include <math.h>
+
+#include "Mutex.h"
+
+
+// --------
+// utime_t
+
+class utime_t {
+ private:
+  struct timeval tv;   // seconds + microseconds
+
+  struct timeval& timeval()  { return tv; }   // raw access, used by the friend class Clock
+  friend class Clock;
+
+  // fold whole seconds accumulated in tv_usec into tv_sec, leaving tv_usec below one million
+ public:
+  void normalize() {
+    if (tv.tv_usec >= 1000*1000) {   // >= : exactly 1000000 usec is a full second too
+      tv.tv_sec += tv.tv_usec / (1000*1000);
+      tv.tv_usec %= 1000*1000;
+    }
+  }
+
+  // cons
+  utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); }
+  utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); }
+  
+  // accessors
+  time_t        sec()  const { return tv.tv_sec; } 
+  long          usec() const { return tv.tv_usec; }
+  int           nsec() const { return tv.tv_usec*1000; }
+
+  // ref accessors/modifiers (callers are responsible for keeping usec in range)
+  time_t&         sec_ref()  { return tv.tv_sec; } 
+  long&           usec_ref() { return tv.tv_usec; }
+
+  // cast to double: seconds, with the microseconds as the fractional part
+  operator double() {
+    return (double)sec() + ((double)usec() / 1000000.0L);
+  }
+};
+
+// arithmetic operators
+inline utime_t operator+(const utime_t& l, const utime_t& r) {   // sum; usec overflow is carried into the seconds
+  return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L,
+                  (l.usec()+r.usec())%1000000L );
+}
+inline utime_t& operator+=(utime_t& l, const utime_t& r) {
+  l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L;   // carry is computed before l's usec is modified below
+  l.usec_ref() += r.usec();
+  l.usec_ref() %= 1000000L;
+  return l;
+}
+inline utime_t& operator+=(utime_t& l, double f) {   // add f, given in seconds, to l
+  double fs = trunc(f);                        // whole seconds
+  double us = (f - fs) * (double)1000000.0;    // fraction of a second expressed in microseconds
+  l.sec_ref() += (long)fs;
+  l.usec_ref() += (long)us;
+  l.normalize();                               // carry any usec overflow into the seconds
+  return l;
+}
+
+inline utime_t operator-(const utime_t& l, const utime_t& r) {   // difference; borrows one second when l.usec < r.usec
+  return utime_t( l.sec() - r.sec() - (l.usec()<r.usec() ? 1:0),
+                  l.usec() - r.usec() + (l.usec()<r.usec() ? 1000000:0) );
+}
+inline utime_t& operator-=(utime_t& l, const utime_t& r) {
+  l.sec_ref() -= r.sec();
+  if (l.usec() >= r.usec())
+    l.usec_ref() -= r.usec();
+  else {
+    l.usec_ref() += 1000000L - r.usec();   // borrow one second to keep usec non-negative
+    l.sec_ref()--;
+  }
+  return l;
+}
+
+inline bool operator>(const utime_t& a, const utime_t& b)   // compares seconds first, then microseconds
+{
+  return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec());
+}
+inline bool operator<(const utime_t& a, const utime_t& b)   // compares seconds first, then microseconds
+{
+  return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec());
+}
+
+// ostream
+inline std::ostream& operator<<(std::ostream& out, const utime_t& t)
+{
+  // prints "<sec>.<usec>" with the microseconds zero-padded to six digits
+  out << (long)t.sec() << ".";
+  // remember the caller's fill character and alignment so the padding does not leak into later output
+  const char old_fill = out.fill('0');
+  const std::ios::fmtflags old_flags = out.setf(std::ios::right, std::ios::adjustfield);
+  out << std::setw(6) << t.usec();
+  out.setf(old_flags, std::ios::adjustfield);
+  out.fill(old_fill);
+  return out;
+}
+
+
+
+
+// -- clock --
+class Clock {
+ protected:
+  //utime_t start_offset;
+  //utime_t abs_last;
+  utime_t last;   // most recent value produced by now()
+  utime_t zero;   // gettimeofday() reading captured by tare(); now() is measured from it
+
+  Mutex lock;     // NOTE(review): the Lock()/Unlock() calls in now() are commented out, so now() runs unlocked
+
+ public:
+  Clock() {
+    // set offset
+    tare();
+  }
+
+  // real time: now() with 'zero' added back
+  utime_t real_now() {
+    utime_t realnow = now();
+    realnow += zero;
+    //gettimeofday(&realnow.timeval(), NULL);
+    return realnow;
+  }
+
+  // relative time (from the last tare(), which the constructor calls)
+  void tare() {
+    gettimeofday(&zero.timeval(), NULL);
+  }
+  utime_t now() {
+    //lock.Lock();  
+    utime_t n;
+    gettimeofday(&n.timeval(), NULL);
+    n -= zero;
+    if (n < last) {
+      //std::cerr << "WARNING: clock jumped backwards from " << last << " to " << n << std::endl;
+      n = last;    // clock jumped backwards!
+    } else
+      last = n;
+    //lock.Unlock();
+    return n;
+  }
+  utime_t recent_now() {
+    return last;
+  }
+  // convert a relative time (as returned by now()) to real time, in place
+  void realify(utime_t& t) {
+    t += zero;
+  }
+  // fill *ts with the real-time equivalent of the relative time t; t itself is left untouched
+  void make_timespec(utime_t& t, struct timespec *ts) {
+    utime_t real = t;
+    realify(real);
+
+    memset(ts, 0, sizeof(*ts));
+    ts->tv_sec = real.sec();
+    ts->tv_nsec = real.nsec();
+  }
+
+
+
+  // absolute time: the seconds part of real_now()
+  time_t gettime() {
+    return real_now().sec();
+  }
+
+};
+
+extern Clock g_clock;
+
+#endif
diff --git a/branches/sage/cephmds2/common/Cond.h b/branches/sage/cephmds2/common/Cond.h
new file mode 100644
index 0000000000000..ed465ce3762d6
--- /dev/null
+++ b/branches/sage/cephmds2/common/Cond.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __COND_H
+#define __COND_H
+
+#include <time.h>
+
+#include "Mutex.h"
+#include "Clock.h"
+
+#include "include/Context.h"
+
+#include <pthread.h>
+#include <cassert>
+
+class Cond {
+  // my bits
+  pthread_cond_t _c;
+
+  // don't allow copying.
+  void operator=(Cond &C) {}
+  Cond( const Cond &C ) {}
+
+ public:
+  // default attributes; a failed init trips the assert
+  Cond() {
+    int r = pthread_cond_init(&_c,NULL);
+    assert(r == 0);
+  }
+  virtual ~Cond() { 
+    pthread_cond_destroy(&_c); 
+  }
+
+  // block until signalled; returns pthread_cond_wait()'s result
+  int Wait(Mutex &mutex)  { 
+    return pthread_cond_wait(&_c, &mutex._m);
+  }
+
+  // same as Wait(mutex); the label s is not used
+  int Wait(Mutex &mutex, char* s)  { 
+    return pthread_cond_wait(&_c, &mutex._m);
+  }
+
+  // Block until signalled or until `when` is reached.  `when` is on the
+  // g_clock timeline; make_timespec() converts it to wall-clock time.
+  int WaitUntil(Mutex &mutex, utime_t when) {
+    struct timespec deadline;
+    g_clock.make_timespec(when, &deadline);
+    return pthread_cond_timedwait(&_c, &mutex._m, &deadline);
+  }
+
+  // block until signalled or until `interval` has passed from now
+  int WaitInterval(Mutex &mutex, utime_t interval) {
+    utime_t deadline = g_clock.now();
+    deadline += interval;
+    return WaitUntil(mutex, deadline);
+  }
+
+  // NOTE: wakes every waiter (broadcast), exactly like SignalAll()
+  int Signal() { 
+    return pthread_cond_broadcast(&_c);
+  }
+  // wake one waiter
+  int SignalOne() { 
+    return pthread_cond_signal(&_c);
+  }
+  // wake every waiter
+  int SignalAll() { 
+    return pthread_cond_broadcast(&_c);
+  }
+};
+
+class C_Cond : public Context {   // finish() stores the result, sets *done, signals cond
+  Cond *cond;
+  bool *done;   // cleared at construction, set by finish()
+  int *rval;    // optional: receives finish()'s argument
+public:
+  C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) {
+    *d = false;
+  }
+  void finish(int r) {
+    if (rval != 0) *rval = r;
+    *done = true;
+    cond->Signal();
+  }
+};
+
+class C_SafeCond : public Context {   // like C_Cond, but finish() runs under *lock
+  Mutex *lock;   // held while the results are published and cond is signalled
+  Cond *cond;
+  bool *done;    // cleared at construction, set by finish()
+  int *rval;     // optional: receives finish()'s argument
+public:
+  C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) {
+    *d = false;
+  }
+  void finish(int r) {
+    lock->Lock();
+    if (rval != 0) *rval = r;
+    *done = true;
+    cond->Signal();
+    lock->Unlock();
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/DecayCounter.h b/branches/sage/cephmds2/common/DecayCounter.h
new file mode 100644
index 0000000000000..b95ebea815b7c
--- /dev/null
+++ b/branches/sage/cephmds2/common/DecayCounter.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __DECAYCOUNTER_H
+#define __DECAYCOUNTER_H
+
+#include <math.h>
+#include "Clock.h"
+
+#include "config.h"
+
+class DecayCounter {
+ protected:
+  double val;              // value
+
+  double half_life;        // in seconds
+  double k;                // k = ln(.5)/half_life
+
+  utime_t last_decay;   // time of last decay
+
+ public:
+  // starts at zero with the half-life from g_conf.mds_decay_halflife
+  DecayCounter() : val(0) {
+    set_halflife( g_conf.mds_decay_halflife );
+    reset();
+  }
+
+  void adjust(double a) {
+    decay();
+    val += a;
+  }
+  void adjust_down(const DecayCounter& other) {
+    // assume other has same time stamp as us...
+    val -= other.val;
+  }
+
+  void set_halflife(double hl) {
+    half_life = hl;
+    k = log(.5) / hl;
+  }
+
+  // become a copy of other (plain assignment), then reset other
+  void take(DecayCounter& other) {
+    *this = other;
+    other.reset();
+  }
+
+  void reset() {
+    last_decay.sec_ref() = 0;
+    last_decay.usec_ref() = 0;
+    val = 0;
+  }
+
+  // Scale val by exp(elapsed * k) for the time since last_decay.  Nothing
+  // happens until at least one whole second has elapsed.
+  void decay() {
+    // Read the clock once.  recent_now() can move between two calls, and
+    // stamping last_decay with a later reading would skip that gap.
+    const utime_t now = g_clock.recent_now();
+    utime_t el = now;
+    el -= last_decay;
+    if (el.sec() >= 1) {
+      val = val * exp((double)el * k);
+      if (val < .01) val = 0;
+      last_decay = now;
+    }
+  }
+
+  double get() {
+    decay();
+    return val;
+  }
+
+  double hit(double v = 1.0) {
+    decay();
+    val += v;
+    return val;
+  }
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/common/LogType.h b/branches/sage/cephmds2/common/LogType.h
new file mode 100644
index 0000000000000..3de17751ec2f8
--- /dev/null
+++ b/branches/sage/cephmds2/common/LogType.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __LOGTYPE_H
+#define __LOGTYPE_H
+
+#include "include/types.h"
+
+#include <string>
+#include <fstream>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+#include "Mutex.h"
+
+
+class LogType {
+ protected:
+  hash_map<__uint64_t, int> keymap;   // key pointer value -> index into keys
+  vector<const char*>   keys;         // index -> key name (pointer is stored, not copied)
+  set<int>              inc_keys;     // indices added through add_inc()
+
+  int version;                        // bumped whenever a key is added
+
+  // HACK to avoid the hash table as often as possible...
+  // cache recent key name lookups in a small ring buffer
+  const static int cache_keys = 10;
+  __uint64_t kc_ptr[cache_keys];
+  int kc_val[cache_keys];
+  int kc_pos;
+
+  friend class Logger;
+
+  // map/cache key for a name: the numeric value of its pointer
+  static __uint64_t ptr_key(const char* key) {
+#ifdef __LP64__
+    return (__uint64_t)key;
+#else
+    return (__uint32_t)key;
+#endif
+  }
+ public:
+  LogType() {
+    version = 1;
+    for (int i=0;i<cache_keys;i++) {
+      kc_ptr[i] = 0;
+      kc_val[i] = -1;   // an unused slot must not answer with a garbage index
+    }
+    kc_pos = 0;
+  }
+
+  // register key if new and return its index; is_inc marks an incrementing counter
+  int add_key(const char* key, bool is_inc) {
+    int i = lookup_key(key);
+    if (i >= 0) return i;
+    i = keys.size();
+    keys.push_back(key);
+    keymap[ptr_key(key)] = i;
+    if (is_inc) inc_keys.insert(i);
+    version++;
+    return i;
+  }
+  int add_inc(const char* key) {
+    return add_key(key, true);
+  }
+  int add_set(const char *key) {
+    return add_key(key, false);
+  }
+
+  // true when key is already registered
+  bool have_key(const char* key) {
+    return lookup_key(key) >= 0;
+  }
+
+  // Index of key, or -1 when unknown.  Pointer identity is tried first
+  // (hash map, then ring buffer); otherwise names are compared with strcmp
+  // and a match is remembered under the new pointer.
+  int lookup_key(const char* key) {
+    const __uint64_t p = ptr_key(key);
+    hash_map<__uint64_t, int>::iterator hit = keymap.find(p);
+    if (hit != keymap.end())
+      return hit->second;
+
+    // try kc ringbuffer
+    int pos = kc_pos-1;
+    for (int j=0; j<cache_keys; j++) {
+      if (pos < 0) pos = cache_keys - 1;
+      if (kc_ptr[pos] == p) return kc_val[pos];
+      pos--;
+    }
+
+    for (unsigned i=0; i<keys.size(); i++)
+      if (strcmp(keys[i], key) == 0) {
+        keymap[p] = i;
+        // put in kc ringbuffer
+        kc_ptr[kc_pos] = p;
+        kc_val[kc_pos] = i;
+        kc_pos++;
+        if (kc_pos == cache_keys) kc_pos = 0;
+        return i; 
+      }
+    return -1;
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/Logger.cc b/branches/sage/cephmds2/common/Logger.cc
new file mode 100644
index 0000000000000..37ceb22321d8f
--- /dev/null
+++ b/branches/sage/cephmds2/common/Logger.cc
@@ -0,0 +1,206 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <string>
+
+#include "LogType.h"
+#include "Logger.h"
+
+#include <iostream>
+#include "Clock.h"
+
+#include "config.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+
+// per-process lock.  lame, but this way I protect LogType too!
+Mutex logger_lock;
+
+Logger::Logger(string fn, LogType *type)
+{
+  logger_lock.Lock();
+
+  // path: log/<log_name>/<fn> when a log_name is configured, else log/<fn>
+  filename = "log/";
+  if (g_conf.log_name) {
+    filename += g_conf.log_name;
+    ::mkdir( filename.c_str(), 0755 );   // make sure dir exists
+    filename += "/";
+  }
+  filename += fn;
+
+  // bookkeeping
+  this->type = type;
+  version = 0;
+  interval = g_conf.log_interval;
+  start = g_clock.now();     // time 0!
+  last_logged = 0;
+  wrote_header = -1;
+  wrote_header_last = 0;
+  open = false;
+
+  logger_lock.Unlock();
+  flush(false);
+}
+
+Logger::~Logger()
+{
+  flush(true);   // force one final log line (flush() does nothing when g_conf.log is off)
+  out.close();
+}
+
+// add v to the integer counter for key (registering the key on first use)
+long Logger::inc(const char *key, long v)
+{
+  if (!g_conf.log) return 0;
+  logger_lock.Lock();
+  int idx = type->lookup_key(key);
+  if (idx < 0) idx = type->add_inc(key);
+  flush();
+  const long total = (vals[idx] += v);
+  logger_lock.Unlock();
+  return total;
+}
+
+// add v to the floating-point counter for key (registering the key on first use)
+double Logger::finc(const char *key, double v)
+{
+  if (!g_conf.log) return 0;
+  logger_lock.Lock();
+  int idx = type->lookup_key(key);
+  if (idx < 0) idx = type->add_inc(key);
+  flush();
+  const double total = (fvals[idx] += v);
+  logger_lock.Unlock();
+  return total;
+}
+
+long Logger::set(const char *key, long v)
+{ // overwrite the integer value stored for key; returns v (0 when logging is off)
+  if (!g_conf.log) return 0;
+  logger_lock.Lock();
+  int idx = type->lookup_key(key);
+  if (idx < 0) idx = type->add_set(key);
+  flush();
+  vals[idx] = v;
+  logger_lock.Unlock();
+  return v;
+}
+
+
+double Logger::fset(const char *key, double v)
+{ // overwrite the floating-point value stored for key; returns v (0 when logging is off)
+  if (!g_conf.log) return 0;
+  logger_lock.Lock();
+  int idx = type->lookup_key(key);
+  if (idx < 0) idx = type->add_set(key);
+  flush();
+  fvals[idx] = v;
+  logger_lock.Unlock();
+  return v;
+}
+
+// current integer value for key; 0 if unknown, out of range, or logging is off
+long Logger::get(const char* key)
+{
+  if (!g_conf.log) return 0;
+  logger_lock.Lock();
+  const int idx = type->lookup_key(key);
+  const bool known = (idx >= 0) && (idx < (int)vals.size());
+  const long value = known ? vals[idx] : 0;
+  logger_lock.Unlock();
+  return value;
+}
+
+// Sync vals/fvals with the LogType, make sure the file is open, then write one
+// log line per elapsed `interval` seconds (exactly one line when forced).
+void Logger::flush(bool force) 
+{
+  if (!g_conf.log) return;
+  logger_lock.Lock();
+
+  // the type has keys we have not sized for yet: grow our value arrays
+  if (version != type->version) {
+    while (type->keys.size() > vals.size())
+      vals.push_back(0);
+    while (type->keys.size() > fvals.size())
+      fvals.push_back(0);
+    version = type->version;
+  }
+
+  if (!open) {
+    out.open(filename.c_str(), ofstream::out);
+    open = true;
+  }
+
+  // a clock reading before `start` is treated as fatal (assert); with asserts
+  // compiled out, the origin is moved to the current reading instead
+  utime_t fromstart = g_clock.now();
+  if (fromstart < start) {
+    cerr << "logger time jumped backwards from " << start << " to " << fromstart << endl;
+    assert(0);
+    start = fromstart;
+  }
+  fromstart -= start;
+
+  // With interval <= 0, last_logged could never catch up with the clock and
+  // this loop would not terminate, so periodic lines need interval > 0.  A
+  // forced flush still writes its single line.
+  while (force ||
+         ((interval > 0) &&
+          (fromstart.sec() > last_logged) &&
+          (fromstart.sec() - last_logged >= interval))) {
+    last_logged += interval;
+    force = false;
+
+    // header?  written when the type changed, and again after more than 10 lines
+    wrote_header_last++;
+    if (wrote_header != type->version ||
+        wrote_header_last > 10) {
+      out << "#" << type->keymap.size();
+      for (unsigned i=0; i<type->keys.size(); i++) 
+        out << "\t" << type->keys[i];
+      out << endl;
+      wrote_header = type->version;
+      wrote_header_last = 0;
+    }
+
+    // write line to log
+    out << last_logged;
+    for (unsigned i=0; i<type->keys.size(); i++) {
+      if (fvals[i] > 0 && vals[i] == 0)
+        out << "\t" << fvals[i];
+      else
+        out << "\t" << vals[i];
+    }
+    out << endl;
+
+    // reset the counters
+    for (unsigned i=0; i<type->keys.size(); i++) {
+      if (type->inc_keys.count(i)) {
+        this->vals[i] = 0;
+        this->fvals[i] = 0;
+      }
+    }
+  }
+
+  logger_lock.Unlock();
+}
+
+
+
+
diff --git a/branches/sage/cephmds2/common/Logger.h b/branches/sage/cephmds2/common/Logger.h
new file mode 100644
index 0000000000000..85102acd90370
--- /dev/null
+++ b/branches/sage/cephmds2/common/Logger.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __LOGGER_H
+#define __LOGGER_H
+
+#include "include/types.h"
+#include "Clock.h"
+#include "Mutex.h"
+
+#include <string>
+#include <fstream>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include "LogType.h"
+
+
+
+
+class Logger {
+ protected:
+  // per-key values, indexed by the key's position in the LogType
+  // (vals holds the integer values, fvals the floating-point ones)
+  vector<long> vals;
+  vector<double> fvals;
+
+  // key registry; Logger.cc takes the global logger_lock around its use
+  LogType *type;
+
+  utime_t start;           // origin for log timestamps (a g_clock reading)
+  int last_logged;         // seconds since start stamped on the last line written
+  int interval;            // seconds between log lines
+  int wrote_header;        // LogType version at the last header written
+  int wrote_header_last;   // lines written since the last header
+
+  string filename;
+
+  int version;             // LogType version vals/fvals were last sized for
+
+  ofstream out;
+  bool open;               // set once out.open() has been called
+
+ public:
+  Logger(string fn, LogType *type);
+  ~Logger();
+
+  void set_start(const utime_t& a) { start = a; }
+  utime_t& get_start() { return start; }
+
+  long inc(const char *s, long v = 1);
+  long set(const char *s, long v);
+  long get(const char *s);
+
+  double fset(const char *s, double v);
+  double finc(const char *s, double v);
+
+  void flush(bool force = false);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/Mutex.h b/branches/sage/cephmds2/common/Mutex.h
new file mode 100755
index 0000000000000..c4615a3ff4c6e
--- /dev/null
+++ b/branches/sage/cephmds2/common/Mutex.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MUTEX_H
+#define __MUTEX_H
+
+#include <pthread.h>
+#include <cassert>
+
+class Mutex {
+private:
+  pthread_mutex_t _m;
+  int nlock;        // Lock() calls not yet matched by Unlock()
+  bool recursive;   // _m was created as PTHREAD_MUTEX_RECURSIVE
+
+  // don't allow copying.
+  void operator=(Mutex &M) {}
+  Mutex( const Mutex &M ) {}
+
+public:
+  // r picks a recursive (the default) or a plain pthread mutex
+  Mutex(bool r = true) : nlock(0), recursive(r) {
+    if (!recursive) {
+      pthread_mutex_init(&_m,NULL);
+      return;
+    }
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE);
+    pthread_mutex_init(&_m,&attr);
+    pthread_mutexattr_destroy(&attr);
+  }
+  virtual ~Mutex() {
+    assert(nlock == 0);   // must not be destroyed while held
+    pthread_mutex_destroy(&_m); 
+  }
+
+  bool is_locked() {
+    return nlock > 0;
+  }
+
+  void Lock() {
+    const int r = pthread_mutex_lock(&_m);
+    assert(r == 0);
+    ++nlock;
+    assert(recursive || nlock == 1);
+  }
+
+  void Unlock() {
+    assert(nlock > 0);
+    nlock--;
+    const int r = pthread_mutex_unlock(&_m);
+    assert(r == 0);
+  }
+  friend class Cond;
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/Semaphore.h b/branches/sage/cephmds2/common/Semaphore.h
new file mode 100644
index 0000000000000..7526f5c1ec9c8
--- /dev/null
+++ b/branches/sage/cephmds2/common/Semaphore.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef _Sem_Posix_
+#define _Sem_Posix_
+
+#include <cassert>
+
+class Semaphore
+{
+  Mutex m;     // guards count
+  Cond c;      // signalled by Put()
+  int count;   // Put() calls not yet consumed by Get()
+
+  public:
+
+  Semaphore() : count(0)
+  {
+  }
+
+  // add one unit and wake the waiters
+  void Put()
+  { 
+    m.Lock();
+    ++count;
+    c.Signal();
+    m.Unlock();
+  }
+
+  // block until a unit is available, then take it
+  void Get() 
+  { 
+    m.Lock();
+    while (count < 1)
+      c.Wait(m);
+    --count;
+    m.Unlock();
+  }
+};
+
+#endif // !_Sem_Posix_
diff --git a/branches/sage/cephmds2/common/Thread.h b/branches/sage/cephmds2/common/Thread.h
new file mode 100644
index 0000000000000..43e2942e84c5f
--- /dev/null
+++ b/branches/sage/cephmds2/common/Thread.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __THREAD_H
+#define __THREAD_H
+
+#include <pthread.h>
+
+class Thread {
+ private:
+  pthread_t thread_id;   // 0 when there is no thread to join
+ public:
+  Thread() : thread_id(0) {}
+  virtual ~Thread() {}
+  pthread_t &get_thread_id() { return thread_id; }
+  bool is_started() { return thread_id != 0; }
+
+  virtual void *entry() = 0;   // thread body, supplied by the subclass
+ private:
+  static void *_entry_func(void *arg) {   // trampoline handed to pthread_create()
+    return ((Thread*)arg)->entry();
+  }
+
+ public:
+  // start the thread; returns pthread_create()'s result
+  int create() {
+    return pthread_create( &thread_id, NULL, _entry_func, (void*)this );
+  }
+
+  // is the calling thread this thread?
+  bool am_self() {
+    return pthread_equal(pthread_self(), thread_id) != 0;
+  }
+
+  // wait for the thread; -1 if it was never started, else pthread_join()'s result
+  int join(void **prval = 0) {
+    if (!is_started()) return -1;   // never started.
+    const int status = pthread_join(thread_id, prval);
+    if (status != 0) {
+      cout << "join status = " << status << endl;
+      assert(0);
+    } else {
+      thread_id = 0;
+    }
+    return status;
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/ThreadPool.h b/branches/sage/cephmds2/common/ThreadPool.h
new file mode 100644
index 0000000000000..674053bfe1087
--- /dev/null
+++ b/branches/sage/cephmds2/common/ThreadPool.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef THREADPOOL
+#define THREADPOOL
+
+#include <list>
+using namespace std;
+
+
+#include <pthread.h>
+#include <common/Mutex.h>
+#include <common/Cond.h>
+#include <common/Semaphore.h>
+
+
+// debug output.  tpdout is written as if/else so a following `else` cannot bind to it.
+#include "config.h"
+#define tpdout(x) if ((x) > g_conf.debug) {} else cout << myname 
+#define DBLVL 15
+
+
+using namespace std;
+ 
+#define MAX_THREADS 1000
+
+template <class U, class T>
+class ThreadPool {
+ private:
+  list<T> q;            // pending ops, guarded by q_lock
+  Mutex q_lock;
+  Semaphore q_sem;      // one unit per queued op, plus one per worker at shutdown
+
+  int num_ops;
+  int num_threads;
+  vector<pthread_t> thread;
+
+  U u;
+  void (*func)(U,T);
+  void (*prefunc)(U,T);
+  string myname;
+
+  static void *foo(void *arg)
+  {
+    ThreadPool *t = (ThreadPool *)arg;
+    t->do_ops(arg);
+    return 0;
+  }
+
+  // Worker loop: take one op per semaphore unit.  Finding the queue empty
+  // means the destructor bumped the semaphore, so the worker exits.
+  void *do_ops(void *nothing)
+  {
+    tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << endl;
+    while (1) {
+      q_sem.Get();
+      T op;
+      if (!get_op(op)) break;
+      tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << endl;
+      func(u, op);
+    }
+    tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << endl;
+    return 0;
+  }
+
+  // Pop the front op into `op` and run prefunc on it, all under q_lock.
+  // Returns false (op untouched) when the queue is empty.  The emptiness
+  // test lives here, under the lock, so it cannot race with put_op() or
+  // with another worker popping the last element.
+  bool get_op(T& op)
+  {
+    bool got = false;
+    q_lock.Lock();
+    if (!q.empty()) {
+      op = q.front();
+      q.pop_front();
+      num_ops--;
+      got = true;
+      if (prefunc && op) {
+        tpdout(DBLVL) << ".prefunc thread "<< pthread_self() << " on " << op << endl;
+        prefunc(u, op);
+      }
+    }
+    q_lock.Unlock();
+    return got;
+  }
+
+ public:
+  ThreadPool(const char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) :
+    num_ops(0), num_threads(howmany), 
+    thread(num_threads),
+    u(obj),
+    func(f), prefunc(pf), 
+    myname(myname) {
+    tpdout(DBLVL) << ".cons num_threads " << num_threads << endl;
+    
+    // start threads
+    int status;
+    for(int i = 0; i < howmany; i++) {
+      status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this);
+      assert(status == 0);
+    }
+  }
+  
+  ~ThreadPool() {
+    // bump sem to make threads exit cleanly
+    for(int i = 0; i < num_threads; i++) 
+      q_sem.Put();
+    
+    // wait for them to die
+    for(int i = 0; i < num_threads; i++) {
+      tpdout(DBLVL) << ".des joining thread " << thread[i] << endl;
+      void *rval = 0;  // we don't actually care
+      pthread_join(thread[i], &rval);
+    }
+  }
+  
+  void put_op(T op) {
+    tpdout(DBLVL) << ".put_op " << op << endl;
+    q_lock.Lock();
+    q.push_back(op);
+    num_ops++;
+    q_sem.Put();
+    q_lock.Unlock();
+  }
+};
+#endif
diff --git a/branches/sage/cephmds2/common/Timer.cc b/branches/sage/cephmds2/common/Timer.cc
new file mode 100644
index 0000000000000..d70259c3e0a08
--- /dev/null
+++ b/branches/sage/cephmds2/common/Timer.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+
+#include "Timer.h"
+#include "Cond.h"
+
+#include "config.h"
+#include "include/Context.h"
+
+#undef dout
+// written as if/else so a following `else` cannot bind to the macro; x is parenthesized
+#define dout(x)  if ((x) > g_conf.debug) {} else cout << "Timer: "
+#define DBL 10
+
+#include <signal.h>
+#include <sys/time.h>
+#include <math.h>
+
+// single global instance
+Timer      g_timer;
+
+
+/**** thread solution *****/
+
+void Timer::timer_entry()
+{
+  lock.Lock();
+  // loop until cancel_timer() sets thread_stop; `lock` is dropped only around the callbacks
+  while (!thread_stop) {
+
+    // now
+    utime_t now = g_clock.now();
+
+    // any events due?
+    utime_t next;
+    Context *event = get_next_scheduled(next);
+    // (event is only tested for non-null below; `next` is the earliest scheduled time)
+    list<Context*> pending;
+
+    if (event && now >= next) {
+      // move every event scheduled at or before `now` to the pending list
+      map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+      while (it != scheduled.end()) {
+        if (it->first > now) break;
+
+        utime_t t = it->first;
+        dout(DBL) << "queueing event(s) scheduled at " << t << endl;
+
+        for (multiset<Context*>::iterator cit = it->second.begin();
+             cit != it->second.end();
+             cit++) {
+          pending.push_back(*cit);
+          event_times.erase(*cit);
+          num_event--;
+        }
+        // step past this slot before erasing it, so `it` stays valid
+        map< utime_t, multiset<Context*> >::iterator previt = it;
+        it++;
+        scheduled.erase(previt);
+      }
+
+      if (!pending.empty()) {
+        sleeping = false;
+        lock.Unlock();
+        { // make sure we're not holding any locks while we do callbacks
+          // make the callbacks myself.
+          for (list<Context*>::iterator cit = pending.begin();
+               cit != pending.end();
+               cit++) {
+            dout(DBL) << "doing callback " << *cit << endl;
+            (*cit)->finish(0);
+          }
+          pending.clear();
+          assert(pending.empty());
+        }
+        lock.Lock();
+      }
+
+    }
+
+    else {
+      // sleep (`sleeping` is not cleared on wake-up; it is reset only before callbacks run)
+      if (event) {
+        dout(DBL) << "sleeping until " << next << endl;
+        timed_sleep = true;
+        sleeping = true;
+        timeout_cond.WaitUntil(lock, next);  // wait for waker or time
+        utime_t now = g_clock.now();
+        dout(DBL) << "kicked or timed out at " << now << endl;
+      } else {
+        dout(DBL) << "sleeping" << endl;
+        timed_sleep = false;
+        sleeping = true;
+        sleep_cond.Wait(lock);         // wait for waker
+        utime_t now = g_clock.now();
+        dout(DBL) << "kicked at " << now << endl;
+      }
+    }
+  }
+
+  lock.Unlock();
+}
+
+
+
+/**
+ * Timer bits
+ */
+
+void Timer::register_timer()
+{
+  // no thread yet: start one
+  if (!timer_thread.is_started()) {
+    dout(DBL) << "register_timer starting thread" << endl;
+    timer_thread.create();
+    return;
+  }
+  // thread is running but not sleeping (it's probably doing callbacks)
+  if (!sleeping) {
+    dout(DBL) << "register_timer doing nothing; thread is alive but not sleeping" << endl;
+    return;
+  }
+  // kick whichever condition the thread is waiting on
+  dout(DBL) << "register_timer kicking thread" << endl;
+  Cond &kick = timed_sleep ? timeout_cond : sleep_cond;
+  kick.SignalAll();
+}
+
+void Timer::cancel_timer()
+{
+  // nothing to stop if the thread was never started
+  if (!timer_thread.is_started())
+    return;
+
+  // raise the stop flag and wake the thread from whichever wait it is in
+  dout(10) << "setting thread_stop flag" << endl;
+  lock.Lock();
+  thread_stop = true;
+  Cond &kick = timed_sleep ? timeout_cond : sleep_cond;
+  kick.SignalAll();
+  lock.Unlock();
+
+  // wait for timer_entry() to return
+  dout(10) << "waiting for thread to finish" << endl;
+  void *ptr;
+  timer_thread.join(&ptr);
+  dout(10) << "thread finished, exit code " << ptr << endl;
+}
+
+
+/*
+ * schedule.  add_event_after honors the sub-second part of `seconds`.
+ */
+void Timer::add_event_after(float seconds, Context *callback) 
+{
+  utime_t when = g_clock.now();
+  const int whole = (int)seconds;   // truncates toward zero
+  when.sec_ref() += whole;
+  if (seconds > whole) when.usec_ref() += (int)((seconds - whole) * 1000000);  // fraction, < 1s
+  if (when.usec_ref() >= 1000000) { when.sec_ref() += 1; when.usec_ref() -= 1000000; }  // carry
+  add_event_at(when, callback);
+}
+
+// Queue callback to run at `when` and make sure the timer thread notices.
+void Timer::add_event_at(utime_t when, Context *callback) 
+{
+  dout(DBL) << "add_event " << callback << " at " << when << endl;
+
+  lock.Lock();
+
+  // index the event both ways: time -> contexts and context -> time
+  multiset<Context*> &slot = scheduled[when];
+  slot.insert(callback);
+  assert(event_times.count(callback) == 0);     // err.. there can be only one (for now!)
+  event_times[callback] = when;
+  ++num_event;
+
+  // make sure i wake up
+  register_timer();
+  lock.Unlock();
+}
+
+// Unschedule callback; returns true once removed (an unknown callback trips the assert).
+bool Timer::cancel_event(Context *callback) 
+{
+  lock.Lock();
+  dout(DBL) << "cancel_event " << callback << endl;
+  if (!event_times.count(callback)) {
+    dout(DBL) << "cancel_event " << callback << " wasn't scheduled?" << endl;
+    lock.Unlock();
+    assert(0);
+    return false;     // wasn't scheduled.
+  }
+  utime_t tp = event_times[callback];
+  assert(scheduled.count(tp));
+  multiset<Context*> &slot = scheduled[tp];
+  multiset<Context*>::iterator p = slot.find(callback);  // there may be more than one?
+  assert(p != slot.end());
+  slot.erase(p);
+  // Drop the time slot once it is empty: get_next_scheduled() dereferences
+  // the first element of the earliest slot, so an empty one must not remain.
+  if (slot.empty()) scheduled.erase(tp);
+  event_times.erase(callback);
+  num_event--;        // add_event_at() counted it; keep num_event in step
+  lock.Unlock();
+  return true;
+}
diff --git a/branches/sage/cephmds2/common/Timer.h b/branches/sage/cephmds2/common/Timer.h
new file mode 100644
index 0000000000000..bd63d7173a3d3
--- /dev/null
+++ b/branches/sage/cephmds2/common/Timer.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __TIMER_H
+#define __TIMER_H
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "Clock.h"
+
+#include "Mutex.h"
+#include "Cond.h"
+#include "Thread.h"
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+/*** Timer
+ * schedule callbacks
+ */
+
+//class Messenger;
+
+
+namespace __gnu_cxx {
+  // lets Context* be used as a hash_map key: hash the pointer's integer value
+  template<> struct hash<Context*> {
+    size_t operator()(const Context *p) const { 
+      return hash<unsigned long>()((unsigned long)p); 
+    }
+  };
+}
+
+
+class Timer {
+ private:
+  map< utime_t, multiset<Context*> >  scheduled;    // time -> (context ...)
+  hash_map< Context*, utime_t >  event_times;  // event -> time
+
+  // Get the earliest scheduled event and its time; returns 0 when nothing
+  // is queued.  Leading time slots whose set is empty are discarded first,
+  // so begin() of an empty multiset is never dereferenced.
+  Context* get_next_scheduled(utime_t& when) {
+    while (!scheduled.empty() && scheduled.begin()->second.empty())
+      scheduled.erase(scheduled.begin());
+    if (scheduled.empty()) return 0;
+    map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+    when = it->first;
+    return *(it->second.begin());
+  }
+
+  void register_timer();  // start the timer thread, or kick it if it is sleeping
+  void cancel_timer();    // stop the timer thread and join it
+
+  bool      thread_stop;   // tells timer_entry() to leave its loop
+  Mutex     lock;
+  bool      timed_sleep;   // thread waits on timeout_cond (true) or sleep_cond (false)
+  bool      sleeping;
+  Cond      sleep_cond;
+  Cond      timeout_cond;
+
+ public:
+  void timer_entry();    // waiter thread (that wakes us up)
+
+  class TimerThread : public Thread {
+    Timer *t;
+  public:
+    void *entry() {
+      t->timer_entry();
+      return 0;
+    }
+    TimerThread(Timer *_t) : t(_t) {}
+  } timer_thread;
+
+  int num_event;
+
+ public:
+  Timer() :
+    thread_stop(false),
+    timed_sleep(false),
+    sleeping(false),
+    timer_thread(this),
+    num_event(0)
+  { 
+  }
+  ~Timer() { 
+    // stop.
+    cancel_timer();
+
+    // delete the contexts that are still scheduled
+    for (map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+         it != scheduled.end();
+         it++) {
+      for (multiset<Context*>::iterator sit = it->second.begin();
+           sit != it->second.end();
+           sit++)
+        delete *sit;
+    }
+    scheduled.clear();
+  }
+  
+  void init() {
+    register_timer();
+  }
+  void shutdown() {
+    cancel_timer();
+  }
+
+  // schedule events
+  void add_event_after(float seconds,
+                       Context *callback);
+  void add_event_at(utime_t when,
+                    Context *callback);
+  bool cancel_event(Context *callback);
+
+  // execute pending events
+  void execute_pending();
+
+};
+
+
+// single global instance
+extern Timer g_timer;
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/config.cc b/branches/sage/cephmds2/config.cc
new file mode 100644
index 0000000000000..fe7261f703cf0
--- /dev/null
+++ b/branches/sage/cephmds2/config.cc
@@ -0,0 +1,718 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "config.h"
+#include "include/types.h"
+
+//#define MDS_CACHE_SIZE        4*10000   -> <20mb
+//#define MDS_CACHE_SIZE        80000         62mb
+
+#define AVG_PER_INODE_SIZE    450
+#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE)
+
+//#define MDS_CACHE_SIZE       MDS_CACHE_MB_TO_INODES( 50 )
+//#define MDS_CACHE_SIZE 1500000
+#define MDS_CACHE_SIZE 150000
+
+
+// hack hack hack ugly FIXME
+#include "common/Mutex.h"
+long buffer_total_alloc = 0;
+Mutex bufferlock;
+
+
+
+FileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20, 2 );  // stripe over 1M objects, 2x replication
+//FileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 );   // 128k stripes over sets of 4
+
+// ??
+//FileLayout g_OSD_MDDirLayout( 1<<8, 1<<2, 1<<19, 3 );  // this is stupid, but can bring out an ebofs table bug?
+FileLayout g_OSD_MDDirLayout( 1<<20, 1, 1<<20, 2 );  // 1M objects, 2x replication
+
+// mds log layout.  the active layout below uses 1M (1<<20) stripes; the commented-out 128 byte stripe variant is the one to keep in sync with mds_log_pad_entry below.
+FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, 2 );  // 1M objects
+//FileLayout g_OSD_MDLogLayout( 1<<8, 1<<2, 1<<19, 3 );  // 256 byte bits
+//FileLayout g_OSD_MDLogLayout( 1<<7, 32, 1<<20, 3 );  // 128 byte stripes over 32 1M objects
+//FileLayout g_OSD_MDLogLayout( 57, 32, 1<<20 );  // pathological case to test striping buffer mapping
+//FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20 );  // old way
+
+// fake osd failures: osd -> time
+std::map<int,float> g_fake_osd_down;
+std::map<int,float> g_fake_osd_out;
+
+md_config_t g_debug_after_conf;
+
+// Compiled-in default configuration.  parse_config_options() below
+// overwrites individual members from the command line.
+//
+// This uses the GNU "member: value" designated-initializer form, and the
+// entries are listed in the same order as the members of md_config_t in
+// config.h; keep the two in step when adding or moving a member.
+md_config_t g_conf = {
+  num_mon: 1,
+  num_mds: 1,
+  num_osd: 4,
+  num_client: 1,
+
+  mkfs: false,
+
+  // profiling and debugging
+  log: true,
+  log_interval: 1,
+  log_name: (char*)0,
+
+  log_messages: true,
+  log_pins: true,
+
+  fake_clock: false,
+  fakemessenger_serialize: true,
+
+  fake_osdmap_expand: 0,
+  fake_osdmap_updates: 0,
+  fake_osd_mttf: 0,
+  fake_osd_mttr: 0,
+
+  osd_remount_at: 0,
+
+  kill_after: 0,
+
+  tick: 0,
+
+  // per-subsystem debug levels (set with the --debug* options)
+  debug: 0,
+  debug_mds: 1,
+  debug_mds_balancer: 1,
+  debug_mds_log: 1,
+  debug_buffer: 0,
+  debug_filer: 0,
+  debug_objecter: 0,
+  debug_objectcacher: 0,
+  debug_client: 0,
+  debug_osd: 0,
+  debug_ebofs: 1,
+  debug_bdev: 1,         // block device
+  debug_ns: 0,
+  debug_ms: 0,
+  debug_mon: 0,
+  
+  debug_after: 0,
+  
+  // --- clock ---
+  clock_lock: false,
+  
+  // --- messenger ---
+  ms_single_dispatch: false,
+  ms_requeue_on_sender_fail: false,
+
+  ms_stripe_osds: false,
+  ms_skip_rank0: false,
+  ms_overlay_clients: false,
+
+  ms_die_on_failure: false,
+
+  /*tcp_skip_rank0: false,
+  tcp_overlay_clients: false,  // over osds!
+  tcp_log: false,
+  tcp_serial_marshall: true,
+  tcp_serial_out: false,
+  tcp_multi_out: true,
+  tcp_multi_dispatch: false,  // not fully implemented yet
+  */
+
+  // --- mon ---
+  mon_tick_interval: 5,
+  mon_osd_down_out_interval: 5,  // seconds
+  mon_lease: 2.000,  // seconds
+
+  // --- client ---
+  client_cache_size: 300,
+  client_cache_mid: .5,
+  client_cache_stat_ttl: 0, // seconds until cached stat results become invalid
+  client_cache_readdir_ttl: 1,  // 1 second only
+  client_use_random_mds:  false,
+
+  client_sync_writes: 0,
+
+  client_oc: true,
+  client_oc_size:      1024*1024* 5,    // MB * n
+  client_oc_max_dirty: 1024*1024* 5,    // MB * n
+  client_oc_max_sync_write: 128*1024,   // writes >= this use wrlock
+
+  client_trace: 0,
+  fuse_direct_io: 0,
+  
+  // --- objecter ---
+  objecter_buffer_uncommitted: true,
+
+  // --- journaler ---
+  journaler_allow_split_entries: false,
+
+  // --- mds ---
+  mds_cache_size: MDS_CACHE_SIZE,
+  mds_cache_mid: .7,
+
+  mds_decay_halflife: 30,
+
+  mds_log: true,
+  mds_log_max_len:  MDS_CACHE_SIZE / 3,
+  mds_log_max_trimming: 10000,
+  mds_log_read_inc: 1<<20,
+  mds_log_pad_entry: 128,//256,//64,
+  mds_log_before_reply: true,
+  mds_log_flush_on_shutdown: true,
+
+  mds_bal_replicate_threshold: 2000,
+  mds_bal_unreplicate_threshold: 0,//500,
+  mds_bal_hash_rd: 10000,
+  mds_bal_unhash_rd: 1000,
+  mds_bal_hash_wr: 10000,
+  mds_bal_unhash_wr: 1000,
+  mds_bal_interval: 30,           // seconds
+  mds_bal_hash_interval: 5,      // seconds
+  mds_bal_idle_threshold: .1,
+  mds_bal_max: -1,
+  mds_bal_max_until: -1,
+
+  mds_bal_mode: 0,
+  mds_bal_min_start: .2,      // if we need less than this, we don't do anything
+  mds_bal_need_min: .8,       // take within this range of what we need
+  mds_bal_need_max: 1.2,
+  mds_bal_midchunk: .3,       // any sub bigger than this taken in full
+  mds_bal_minchunk: .001,     // never take anything smaller than this
+
+  mds_commit_on_shutdown: true,
+  mds_shutdown_check: 0, //30,
+
+  mds_verify_export_dirauth: true,
+
+  mds_local_osd: false,
+
+
+  // --- osd ---
+  osd_rep: OSD_REP_PRIMARY,
+  osd_balance_reads: false,
+  osd_pg_bits: 0,  // 0 == let osdmonitor decide
+  osd_object_layout: OBJECT_LAYOUT_HASHINO,
+  osd_pg_layout: PG_LAYOUT_CRUSH,
+  osd_max_rep: 4,
+  osd_maxthreads: 2,    // 0 == no threading
+  osd_max_opq: 10,
+  osd_mkfs: false,
+  osd_age: .8,
+  osd_age_time: 0,
+  osd_heartbeat_interval: 5,   // shut up while i'm debugging
+  osd_replay_window: 5,
+  osd_max_pull: 2,
+  osd_pad_pg_log: false,
+  
+  // --- fakestore ---
+  fakestore_fake_sync: 2,    // 2 seconds
+  fakestore_fsync: false,//true,
+  fakestore_writesync: false,
+  fakestore_syncthreads: 4,
+  fakestore_fakeattr: true,   
+  fakestore_dev: 0,
+
+  // --- ebofs ---
+  ebofs: 1,             // --ebofs sets this to 1, --fakestore sets it to 0
+  ebofs_cloneable: false,
+  ebofs_verify: false,
+  ebofs_commit_ms:      2000,       // 0 = no forced commit timeout (for debugging/tracing)
+  ebofs_idle_commit_ms: 100,        // 0 = no idle detection.  use this -or- bdev_idle_kick_after_ms
+  ebofs_oc_size:        10000,      // onode cache
+  ebofs_cc_size:        10000,      // cnode cache
+  ebofs_bc_size:        (80 *256), // 4k blocks, *256 for MB
+  ebofs_bc_max_dirty:   (60 *256), // before write() will block
+  ebofs_max_prefetch: 1000, // 4k blocks
+  ebofs_realloc: true,
+  
+  ebofs_abp_zero: false,          // zero newly allocated buffers (may shut up valgrind)
+  ebofs_abp_max_alloc: 4096*16,   // max size of new buffers (larger -> more memory fragmentation)
+
+  // --- obfs ---
+  uofs: 0,                // --obfs sets this to 1
+  uofs_fake_sync: 2,      // 2 seconds
+  uofs_cache_size:             1 << 28,        //256MB
+  uofs_onode_size:             (int)1024,
+  uofs_small_block_size:       (int)4096,      //4KB
+  uofs_large_block_size:       (int)524288,    //512KB
+  uofs_segment_size:           (int)268435456, //256MB
+  uofs_block_meta_ratio:       (int)10,
+  uofs_sync_write:             (int)0,
+  uofs_nr_hash_buckets:        (int)1023,
+  uofs_flush_interval:         (int)5,         //seconds
+  uofs_min_flush_pages:        (int)1024,      //4k-pages (NOTE(review): this comment used to say 4096; the value is 1024)
+  uofs_delay_allocation:       (int)1,         //true
+
+  // --- block device ---
+  bdev_lock: true,
+  bdev_iothreads:    1,         // number of ios to queue with kernel
+  bdev_idle_kick_after_ms: 0,//100, // ms   ** FIXME ** this seems to break things, not sure why yet **
+  bdev_el_fw_max_ms: 10000,      // restart elevator at least once every 10000 ms
+  bdev_el_bw_max_ms: 3000,       // restart elevator at least once every 3000 ms
+  bdev_el_bidir: true,          // bidirectional elevator?
+  bdev_iov_max: 512,            // max # iov's to collect into a single readv()/writev() call
+  bdev_debug_check_io_overlap: true,   // [DEBUG] check for any pending io overlaps
+  bdev_fake_mb: 0,
+  bdev_fake_max_mb:  0,
+
+  // --- fakeclient (mds regression testing) (ancient history) ---
+  num_fakeclient: 100,
+  fakeclient_requests: 100,
+  fakeclient_deterministic: false,
+
+  // the fakeclient_op_* members are ints; 'false' below is just 0
+  fakeclient_op_statfs:     false,
+
+  // loosely based on Roselli workload paper numbers
+  fakeclient_op_stat:     610,
+  fakeclient_op_lstat:      false,
+  fakeclient_op_utime:    0,
+  fakeclient_op_chmod:    1,
+  fakeclient_op_chown:    1,
+
+  fakeclient_op_readdir:  2,
+  fakeclient_op_mknod:    30,
+  fakeclient_op_link:     false,
+  fakeclient_op_unlink:   20,
+  fakeclient_op_rename:   0,//40,
+
+  fakeclient_op_mkdir:    10,
+  fakeclient_op_rmdir:    20,
+  fakeclient_op_symlink:  20,
+
+  fakeclient_op_openrd:   200,
+  fakeclient_op_openwr:   0,
+  fakeclient_op_openwrc:  0,
+  fakeclient_op_read:       false,  // osd!
+  fakeclient_op_write:      false,  // osd!
+  fakeclient_op_truncate:   false,
+  fakeclient_op_fsync:      false,
+  fakeclient_op_close:    200
+};
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+// Split the CEPH_ARGS environment variable on single spaces and append each
+// non-empty word to 'args'.  Does nothing if CEPH_ARGS is unset.
+//
+// The words point into a private heap copy of the variable.  That copy is
+// deliberately never freed once a word has been handed out, so the pointers
+// stay valid for the rest of the process and across repeated calls.  (The
+// previous fixed 1000-byte static buffer overflowed on long values and was
+// overwritten by a second call.)
+void env_to_vec(std::vector<char*>& args) 
+{
+  const char *p = getenv("CEPH_ARGS");
+  if (!p) return;
+
+  char *buf = strdup(p);
+  if (!buf) return;   // out of memory: behave as if CEPH_ARGS were unset
+
+  const size_t before = args.size();
+  char *word = buf;
+  for (char *c = buf; ; c++) {
+    const bool last = (*c == 0);
+    if (*c == ' ' || last) {
+      *c = 0;
+      if (*word)   // skip empty words from leading/trailing/repeated spaces
+        args.push_back(word);
+      word = c + 1;
+      if (last) break;
+    }
+  }
+
+  // nothing references the copy if no word was found
+  if (args.size() == before)
+    free(buf);
+}
+
+
+// Append argv[1] .. argv[argc-1] to 'args'; argv[0] (the program name) is
+// skipped.
+void argv_to_vec(int argc, char **argv,
+                 std::vector<char*>& args)
+{
+  int idx = 1;
+  while (idx < argc) {
+    args.push_back(argv[idx]);
+    ++idx;
+  }
+}
+
+// Build a main()-style argument array from 'args'.
+//
+// On return argv is a newly malloc'ed array: argv[0] is the placeholder
+// program name "asdf", argv[1..] are the pointers from 'args' (not copied),
+// and argv[argc] is a terminating null pointer.  argc is set to
+// args.size() + 1.  The incoming value of argc is ignored; the array used to
+// be sized from it, which overflowed whenever it was smaller than
+// args.size() + 1.
+void vec_to_argv(std::vector<char*>& args,
+                 int& argc, char **&argv)
+{
+  // writable storage: argv elements are char*, not const char*
+  static char progname[] = "asdf";
+
+  // one slot for the program name, one for the trailing null
+  argv = (char**)malloc(sizeof(char*) * (args.size() + 2));
+  argc = 1;
+  argv[0] = progname;
+
+  for (unsigned i=0; i<args.size(); i++) 
+    argv[argc++] = args[i];
+  argv[argc] = 0;
+}
+
+// Return the value that follows option args[i] and advance i onto it.
+// A value-taking option given as the very last argument has no value; the
+// parser used to index one past the end of the vector in that case.  Report
+// the problem and exit instead.
+static char *next_arg(std::vector<char*>& args, unsigned& i)
+{
+  if (i + 1 >= args.size()) {
+    fprintf(stderr, "option %s requires an argument\n", args[i]);
+    exit(1);
+  }
+  return args[++i];
+}
+
+// Config that a --debug* level is stored in: g_conf until a nonzero
+// --debug_after has been parsed, g_debug_after_conf from then on.
+static md_config_t& debug_conf()
+{
+  return g_conf.debug_after ? g_debug_after_conf : g_conf;
+}
+
+// Apply every recognized option in 'args' to g_conf, g_debug_after_conf, the
+// fake osd failure maps and the global file layouts.  On return 'args' holds
+// only the arguments that were not recognized, in their original order.
+//
+// Integer and boolean members are parsed with atoi(); members declared float
+// in md_config_t are parsed with atof() so fractional values are not
+// truncated.  An unknown name given to --osd_object_layout or
+// --osd_pg_layout trips assert(0).
+void parse_config_options(std::vector<char*>& args)
+{
+  std::vector<char*> nargs;
+
+  for (unsigned i=0; i<args.size(); i++) {
+    if (strcmp(args[i], "--nummon") == 0)
+      g_conf.num_mon = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--nummds") == 0)
+      g_conf.num_mds = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--numclient") == 0)
+      g_conf.num_client = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--numosd") == 0)
+      g_conf.num_osd = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--ms_single_dispatch") == 0)
+      g_conf.ms_single_dispatch = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ms_stripe_osds") == 0)
+      g_conf.ms_stripe_osds = true;
+    else if (strcmp(args[i], "--ms_skip_rank0") == 0)
+      g_conf.ms_skip_rank0 = true;
+    else if (strcmp(args[i], "--ms_overlay_clients") == 0)
+      g_conf.ms_overlay_clients = true;
+    else if (strcmp(args[i], "--ms_die_on_failure") == 0)
+      g_conf.ms_die_on_failure = true;
+
+    /*else if (strcmp(args[i], "--tcp_log") == 0)
+      g_conf.tcp_log = true;
+    else if (strcmp(args[i], "--tcp_multi_out") == 0)
+      g_conf.tcp_multi_out = atoi(args[++i]);
+    */
+
+    else if (strcmp(args[i], "--mkfs") == 0)
+      g_conf.osd_mkfs = g_conf.mkfs = 1; //atoi(args[++i]);
+
+    else if (strcmp(args[i], "--fake_osdmap_expand") == 0)
+      g_conf.fake_osdmap_expand = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--fake_osdmap_updates") == 0)
+      g_conf.fake_osdmap_updates = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--fake_osd_mttf") == 0)
+      g_conf.fake_osd_mttf = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--fake_osd_mttr") == 0)
+      g_conf.fake_osd_mttr = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--fake_osd_down") == 0) {
+      // two values: osd id, then time
+      int osd = atoi(next_arg(args, i));
+      float when = atof(next_arg(args, i));
+      g_fake_osd_down[osd] = when;
+    }
+    else if (strcmp(args[i], "--fake_osd_out") == 0) {
+      // two values: osd id, then time
+      int osd = atoi(next_arg(args, i));
+      float when = atof(next_arg(args, i));
+      g_fake_osd_out[osd] = when;
+    }
+    else if (strcmp(args[i], "--osd_remount_at") == 0)
+      g_conf.osd_remount_at = atoi(next_arg(args, i));
+    //else if (strcmp(args[i], "--fake_osd_sync") == 0) 
+    //g_conf.fake_osd_sync = atoi(args[++i]);
+
+    // debug levels: stored in g_conf, or in g_debug_after_conf once a
+    // nonzero --debug_after has been seen (see debug_conf())
+    else if (strcmp(args[i], "--debug") == 0)
+      debug_conf().debug = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_mds") == 0)
+      debug_conf().debug_mds = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_mds_balancer") == 0)
+      debug_conf().debug_mds_balancer = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_mds_log") == 0)
+      debug_conf().debug_mds_log = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_buffer") == 0)
+      debug_conf().debug_buffer = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_filer") == 0)
+      debug_conf().debug_filer = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_objecter") == 0)
+      debug_conf().debug_objecter = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_objectcacher") == 0)
+      debug_conf().debug_objectcacher = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_client") == 0)
+      debug_conf().debug_client = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_osd") == 0)
+      debug_conf().debug_osd = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_ebofs") == 0)
+      debug_conf().debug_ebofs = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_bdev") == 0)
+      debug_conf().debug_bdev = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_ms") == 0)
+      debug_conf().debug_ms = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--debug_mon") == 0)
+      debug_conf().debug_mon = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--debug_after") == 0) {
+      // snapshot the config parsed so far; later --debug* options then
+      // modify the snapshot instead of g_conf
+      g_conf.debug_after = atoi(next_arg(args, i));
+      g_debug_after_conf = g_conf;
+    }
+
+    else if (strcmp(args[i], "--log") == 0)
+      g_conf.log = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--log_name") == 0)
+      g_conf.log_name = next_arg(args, i);
+
+    else if (strcmp(args[i], "--fakemessenger_serialize") == 0)
+      g_conf.fakemessenger_serialize = atoi(next_arg(args, i));
+
+
+    else if (strcmp(args[i], "--clock_lock") == 0)
+      g_conf.clock_lock = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--objecter_buffer_uncommitted") == 0)
+      g_conf.objecter_buffer_uncommitted = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--mds_cache_size") == 0)
+      g_conf.mds_cache_size = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--mds_log") == 0)
+      g_conf.mds_log = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_log_before_reply") == 0)
+      g_conf.mds_log_before_reply = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_log_max_len") == 0)
+      g_conf.mds_log_max_len = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_log_read_inc") == 0)
+      g_conf.mds_log_read_inc = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_log_max_trimming") == 0)
+      g_conf.mds_log_max_trimming = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--mds_commit_on_shutdown") == 0)
+      g_conf.mds_commit_on_shutdown = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_shutdown_check") == 0)
+      g_conf.mds_shutdown_check = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_log_flush_on_shutdown") == 0)
+      g_conf.mds_log_flush_on_shutdown = atoi(next_arg(args, i));
+
+    // float member
+    else if (strcmp(args[i], "--mds_decay_halflife") == 0)
+      g_conf.mds_decay_halflife = atof(next_arg(args, i));
+
+    else if (strcmp(args[i], "--mds_bal_interval") == 0)
+      g_conf.mds_bal_interval = atoi(next_arg(args, i));
+    // float members
+    else if (strcmp(args[i], "--mds_bal_rep") == 0)
+      g_conf.mds_bal_replicate_threshold = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_unrep") == 0)
+      g_conf.mds_bal_unreplicate_threshold = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_max") == 0)
+      g_conf.mds_bal_max = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_max_until") == 0)
+      g_conf.mds_bal_max_until = atoi(next_arg(args, i));
+
+    // float members
+    else if (strcmp(args[i], "--mds_bal_hash_rd") == 0)
+      g_conf.mds_bal_hash_rd = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_hash_wr") == 0)
+      g_conf.mds_bal_hash_wr = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_unhash_rd") == 0)
+      g_conf.mds_bal_unhash_rd = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_unhash_wr") == 0)
+      g_conf.mds_bal_unhash_wr = atof(next_arg(args, i));
+
+    else if (strcmp(args[i], "--mds_bal_mode") == 0)
+      g_conf.mds_bal_mode = atoi(next_arg(args, i));
+    // float members whose defaults are fractions (.2, .8, 1.2, .3, .001)
+    else if (strcmp(args[i], "--mds_bal_min_start") == 0)
+      g_conf.mds_bal_min_start = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_need_min") == 0)
+      g_conf.mds_bal_need_min = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_need_max") == 0)
+      g_conf.mds_bal_need_max = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_midchunk") == 0)
+      g_conf.mds_bal_midchunk = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--mds_bal_minchunk") == 0)
+      g_conf.mds_bal_minchunk = atof(next_arg(args, i));
+
+    else if (strcmp(args[i], "--mds_local_osd") == 0)
+      g_conf.mds_local_osd = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--client_cache_size") == 0)
+      g_conf.client_cache_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--client_cache_stat_ttl") == 0)
+      g_conf.client_cache_stat_ttl = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--client_cache_readdir_ttl") == 0)
+      g_conf.client_cache_readdir_ttl = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--client_trace") == 0)
+      g_conf.client_trace = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--fuse_direct_io") == 0)
+      g_conf.fuse_direct_io = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--mon_osd_down_out_interval") == 0)
+      g_conf.mon_osd_down_out_interval = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--client_sync_writes") == 0)
+      g_conf.client_sync_writes = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--client_oc") == 0)
+      g_conf.client_oc = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--client_oc_size") == 0)
+      g_conf.client_oc_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--client_oc_max_dirty") == 0)
+      g_conf.client_oc_max_dirty = atoi(next_arg(args, i));
+
+
+    else if (strcmp(args[i], "--ebofs") == 0)
+      g_conf.ebofs = 1;
+    else if (strcmp(args[i], "--ebofs_cloneable") == 0)
+      g_conf.ebofs_cloneable = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_verify") == 0)
+      g_conf.ebofs_verify = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_commit_ms") == 0)
+      g_conf.ebofs_commit_ms = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_idle_commit_ms") == 0)
+      g_conf.ebofs_idle_commit_ms = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_oc_size") == 0)
+      g_conf.ebofs_oc_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_cc_size") == 0)
+      g_conf.ebofs_cc_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_bc_size") == 0)
+      g_conf.ebofs_bc_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_bc_max_dirty") == 0)
+      g_conf.ebofs_bc_max_dirty = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_abp_max_alloc") == 0)
+      g_conf.ebofs_abp_max_alloc = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_max_prefetch") == 0)
+      g_conf.ebofs_max_prefetch = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--ebofs_realloc") == 0)
+      g_conf.ebofs_realloc = atoi(next_arg(args, i));
+
+
+    else if (strcmp(args[i], "--fakestore") == 0) {
+      g_conf.ebofs = 0;
+      //g_conf.osd_pg_bits = 5;
+      //g_conf.osd_maxthreads = 1;   // fucking hell
+    }
+    else if (strcmp(args[i], "--fakestore_fsync") == 0)
+      g_conf.fakestore_fsync = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--fakestore_writesync") == 0)
+      g_conf.fakestore_writesync = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--fakestore_dev") == 0)
+      g_conf.fakestore_dev = next_arg(args, i);
+
+    else if (strcmp(args[i], "--obfs") == 0) {
+      g_conf.uofs = 1;
+      g_conf.osd_maxthreads = 1;   // until feng merges joel's fixes
+    }
+
+
+    else if (strcmp(args[i], "--osd_balance_reads") == 0)
+      g_conf.osd_balance_reads = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_rep") == 0)
+      g_conf.osd_rep = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_rep_chain") == 0)
+      g_conf.osd_rep = OSD_REP_CHAIN;
+    else if (strcmp(args[i], "--osd_rep_splay") == 0)
+      g_conf.osd_rep = OSD_REP_SPLAY;
+    else if (strcmp(args[i], "--osd_rep_primary") == 0)
+      g_conf.osd_rep = OSD_REP_PRIMARY;
+    else if (strcmp(args[i], "--osd_mkfs") == 0)
+      g_conf.osd_mkfs = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_age") == 0)
+      g_conf.osd_age = atof(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_age_time") == 0)
+      g_conf.osd_age_time = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_pg_bits") == 0)
+      g_conf.osd_pg_bits = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_max_rep") == 0)
+      g_conf.osd_max_rep = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_maxthreads") == 0)
+      g_conf.osd_maxthreads = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_max_pull") == 0)
+      g_conf.osd_max_pull = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--osd_pad_pg_log") == 0)
+      g_conf.osd_pad_pg_log = atoi(next_arg(args, i));
+
+
+    else if (strcmp(args[i], "--bdev_lock") == 0)
+      g_conf.bdev_lock = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--bdev_el_bidir") == 0)
+      g_conf.bdev_el_bidir = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--bdev_iothreads") == 0)
+      g_conf.bdev_iothreads = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--bdev_idle_kick_after_ms") == 0)
+      g_conf.bdev_idle_kick_after_ms = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--bdev_fake_mb") == 0)
+      g_conf.bdev_fake_mb = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--bdev_fake_max_mb") == 0)
+      g_conf.bdev_fake_max_mb = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--osd_object_layout") == 0) {
+      const char *name = next_arg(args, i);
+      if (strcmp(name, "linear") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_LINEAR;
+      else if (strcmp(name, "hashino") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_HASHINO;
+      else if (strcmp(name, "hash") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_HASH;
+      else assert(0);
+    }
+
+    else if (strcmp(args[i], "--osd_pg_layout") == 0) {
+      const char *name = next_arg(args, i);
+      if (strcmp(name, "linear") == 0) g_conf.osd_pg_layout = PG_LAYOUT_LINEAR;
+      else if (strcmp(name, "hash") == 0) g_conf.osd_pg_layout = PG_LAYOUT_HASH;
+      else if (strcmp(name, "hybrid") == 0) g_conf.osd_pg_layout = PG_LAYOUT_HYBRID;
+      else if (strcmp(name, "crush") == 0) g_conf.osd_pg_layout = PG_LAYOUT_CRUSH;
+      else assert(0);
+    }
+
+    else if (strcmp(args[i], "--kill_after") == 0)
+      g_conf.kill_after = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--tick") == 0)
+      g_conf.tick = atoi(next_arg(args, i));
+
+    else if (strcmp(args[i], "--file_layout_ssize") == 0)
+      g_OSD_FileLayout.stripe_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--file_layout_scount") == 0)
+      g_OSD_FileLayout.stripe_count = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--file_layout_osize") == 0)
+      g_OSD_FileLayout.object_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--file_layout_num_rep") == 0)
+      g_OSD_FileLayout.num_rep = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--meta_dir_layout_ssize") == 0)
+      g_OSD_MDDirLayout.stripe_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--meta_dir_layout_scount") == 0)
+      g_OSD_MDDirLayout.stripe_count = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--meta_dir_layout_osize") == 0)
+      g_OSD_MDDirLayout.object_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--meta_dir_layout_num_rep") == 0)
+      g_OSD_MDDirLayout.num_rep = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--meta_log_layout_ssize") == 0)
+      g_OSD_MDLogLayout.stripe_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--meta_log_layout_scount") == 0)
+      g_OSD_MDLogLayout.stripe_count = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--meta_log_layout_osize") == 0)
+      g_OSD_MDLogLayout.object_size = atoi(next_arg(args, i));
+    else if (strcmp(args[i], "--meta_log_layout_num_rep") == 0) {
+      // zero replicas for the log also turns the mds log off
+      g_OSD_MDLogLayout.num_rep = atoi(next_arg(args, i));
+      if (!g_OSD_MDLogLayout.num_rep)
+        g_conf.mds_log = false;
+    }
+
+    else {
+      // not ours: hand it back to the caller
+      nargs.push_back(args[i]);
+    }
+  }
+
+  args = nargs;
+}
diff --git a/branches/sage/cephmds2/config.h b/branches/sage/cephmds2/config.h
new file mode 100644
index 0000000000000..b3a9d73ee433a
--- /dev/null
+++ b/branches/sage/cephmds2/config.h
@@ -0,0 +1,297 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __CONFIG_H
+#define __CONFIG_H
+
+extern class FileLayout g_OSD_FileLayout;
+extern class FileLayout g_OSD_MDDirLayout;
+extern class FileLayout g_OSD_MDLogLayout;
+
+#include <vector>
+#include <map>
+
+extern std::map<int,float> g_fake_osd_down;
+extern std::map<int,float> g_fake_osd_out;
+
+#define OSD_REP_PRIMARY 0
+#define OSD_REP_SPLAY   1
+#define OSD_REP_CHAIN   2
+
+// All run-time tunables in one plain struct.  The defaults are the g_conf
+// initializer in config.cc, which lists the members in this same order;
+// parse_config_options() overrides them from the command line.
+struct md_config_t {
+  int  num_mon;
+  int  num_mds;
+  int  num_osd;
+  int  num_client;
+
+  bool mkfs;
+
+  // profiling
+  bool  log;
+  int   log_interval;
+  char *log_name;
+
+  bool log_messages;
+  bool log_pins;
+
+  bool fake_clock;
+  bool fakemessenger_serialize;
+
+  int fake_osdmap_expand;
+  int fake_osdmap_updates;
+  int fake_osd_mttf;
+  int fake_osd_mttr;
+
+  int osd_remount_at;
+
+  int kill_after;
+
+  int tick;
+
+  // debug levels.  C_Debug in cosd.cc memcpy()s the memory range that starts
+  // at 'debug' and ends just before 'debug_after' from g_debug_after_conf
+  // into g_conf, so these members must stay adjacent, and 'debug_after' must
+  // stay directly after them.
+  int debug;
+  int debug_mds;
+  int debug_mds_balancer;
+  int debug_mds_log;
+  int debug_buffer;
+  int debug_filer;
+  int debug_objecter;
+  int debug_objectcacher;
+  int debug_client;
+  int debug_osd;
+  int debug_ebofs;
+  int debug_bdev;
+  int debug_ns;
+  int debug_ms;
+  int debug_mon;
+
+  int debug_after;
+
+  // clock
+  bool clock_lock;
+
+  // messenger
+
+  /*bool tcp_skip_rank0;
+  bool tcp_overlay_clients;
+  bool tcp_log;
+  bool tcp_serial_marshall;
+  bool tcp_serial_out;
+  bool tcp_multi_out;
+  bool tcp_multi_dispatch;
+  */
+
+  bool ms_single_dispatch;
+  bool ms_requeue_on_sender_fail;
+
+  bool ms_stripe_osds;
+  bool ms_skip_rank0;
+  bool ms_overlay_clients;
+  bool ms_die_on_failure;
+
+  // mon
+  int mon_tick_interval;
+  int mon_osd_down_out_interval;
+  float mon_lease;
+
+  // client
+  int      client_cache_size;
+  float    client_cache_mid;
+  int      client_cache_stat_ttl;
+  int      client_cache_readdir_ttl;
+  bool     client_use_random_mds;          // debug flag
+
+  bool     client_sync_writes;
+
+  bool     client_oc;
+  int      client_oc_size;
+  int      client_oc_max_dirty;
+  size_t   client_oc_max_sync_write;
+
+  
+
+  /*
+  bool     client_bcache;
+  int      client_bcache_alloc_minsize;
+  int      client_bcache_alloc_maxsize;
+  int      client_bcache_ttl;
+  off_t    client_bcache_size;
+  int      client_bcache_lowater;
+  int      client_bcache_hiwater;
+  size_t   client_bcache_align;
+  */
+
+  int      client_trace;
+  int      fuse_direct_io;
+
+  // objecter
+  bool  objecter_buffer_uncommitted;
+
+  // journaler
+  bool  journaler_allow_split_entries;
+
+  // mds
+  int   mds_cache_size;
+  float mds_cache_mid;
+  
+  float mds_decay_halflife;
+
+  bool mds_log;
+  int mds_log_max_len;
+  int mds_log_max_trimming;
+  int mds_log_read_inc;
+  int mds_log_pad_entry;
+  bool  mds_log_before_reply;
+  bool  mds_log_flush_on_shutdown;
+  
+  // mds load balancer.  note which of these are float: their command-line
+  // parsing has to keep the fractional part.
+  float mds_bal_replicate_threshold;
+  float mds_bal_unreplicate_threshold;
+  float mds_bal_hash_rd;
+  float mds_bal_unhash_rd;
+  float mds_bal_hash_wr;
+  float mds_bal_unhash_wr;
+  int   mds_bal_interval;
+  int   mds_bal_hash_interval;
+  float mds_bal_idle_threshold;
+  int   mds_bal_max;
+  int   mds_bal_max_until;
+
+  int   mds_bal_mode;
+  float mds_bal_min_start;
+  float mds_bal_need_min;
+  float mds_bal_need_max;
+  float mds_bal_midchunk;
+  float mds_bal_minchunk;
+
+  bool  mds_commit_on_shutdown;
+  int   mds_shutdown_check;
+  bool  mds_verify_export_dirauth;     // debug flag
+
+  bool  mds_local_osd;
+
+
+  // osd
+  int   osd_rep;             // one of the OSD_REP_* values defined above
+  bool  osd_balance_reads;
+  int   osd_pg_bits;
+  int   osd_object_layout;   // an OBJECT_LAYOUT_* value
+  int   osd_pg_layout;       // a PG_LAYOUT_* value
+  int   osd_max_rep;
+  int   osd_maxthreads;
+  int   osd_max_opq;
+  bool  osd_mkfs;
+  float   osd_age;
+  int   osd_age_time;
+  int   osd_heartbeat_interval;
+  int   osd_replay_window;
+  int   osd_max_pull;
+  bool  osd_pad_pg_log;
+
+  // fakestore
+  int   fakestore_fake_sync;
+  bool  fakestore_fsync;
+  bool  fakestore_writesync;
+  int   fakestore_syncthreads;   // such crap
+  bool  fakestore_fakeattr;
+  char  *fakestore_dev;
+
+  // ebofs
+  int   ebofs;               // set to 1 by --ebofs, to 0 by --fakestore
+  bool  ebofs_cloneable;
+  bool  ebofs_verify;
+  int   ebofs_commit_ms;
+  int   ebofs_idle_commit_ms;
+  int   ebofs_oc_size;
+  int   ebofs_cc_size;
+  off_t ebofs_bc_size;
+  off_t ebofs_bc_max_dirty;
+  unsigned ebofs_max_prefetch;
+  bool  ebofs_realloc;
+
+  bool   ebofs_abp_zero;
+  size_t ebofs_abp_max_alloc;
+
+  // obfs (--obfs sets uofs to 1)
+  int uofs;
+  int uofs_fake_sync;
+  int     uofs_cache_size;
+  int     uofs_onode_size;
+  int     uofs_small_block_size;
+  int     uofs_large_block_size;
+  int     uofs_segment_size;
+  int     uofs_block_meta_ratio;
+  int     uofs_sync_write;
+  
+  int     uofs_nr_hash_buckets;
+  int     uofs_flush_interval;
+  int     uofs_min_flush_pages;
+  int     uofs_delay_allocation;
+
+  // block device
+  bool  bdev_lock;
+  int   bdev_iothreads;
+  int   bdev_idle_kick_after_ms;
+  int   bdev_el_fw_max_ms;  
+  int   bdev_el_bw_max_ms;
+  bool  bdev_el_bidir;
+  int   bdev_iov_max;
+  bool  bdev_debug_check_io_overlap;
+  int   bdev_fake_mb;
+  int   bdev_fake_max_mb;
+
+  // fake client
+  int      num_fakeclient;
+  unsigned fakeclient_requests;
+  bool     fakeclient_deterministic;     // debug flag
+
+  // NOTE(review): the fakeclient_op_* values look like relative weights of
+  // each operation in the synthetic workload -- confirm against their user.
+  int fakeclient_op_statfs;
+
+  int fakeclient_op_stat;
+  int fakeclient_op_lstat;
+  int fakeclient_op_utime;
+  int fakeclient_op_chmod;
+  int fakeclient_op_chown;
+
+  int fakeclient_op_readdir;
+  int fakeclient_op_mknod;
+  int fakeclient_op_link;
+  int fakeclient_op_unlink;
+  int fakeclient_op_rename;
+
+  int fakeclient_op_mkdir;
+  int fakeclient_op_rmdir;
+  int fakeclient_op_symlink;
+
+  int fakeclient_op_openrd;
+  int fakeclient_op_openwr;
+  int fakeclient_op_openwrc;
+  int fakeclient_op_read;
+  int fakeclient_op_write;
+  int fakeclient_op_truncate;
+  int fakeclient_op_fsync;
+  int fakeclient_op_close;
+
+};
+
+extern md_config_t g_conf;     
+extern md_config_t g_debug_after_conf;     
+
+#define dout(x)  if ((x) <= g_conf.debug) std::cout
+#define dout2(x) if ((x) <= g_conf.debug) std::cout
+
+void env_to_vec(std::vector<char*>& args);
+void argv_to_vec(int argc, char **argv,
+                 std::vector<char*>& args);
+void vec_to_argv(std::vector<char*>& args,
+                 int& argc, char **&argv);
+
+void parse_config_options(std::vector<char*>& args);
+
+#endif
diff --git a/branches/sage/cephmds2/cosd.cc b/branches/sage/cephmds2/cosd.cc
new file mode 100644
index 0000000000000..cb60ed492515b
--- /dev/null
+++ b/branches/sage/cephmds2/cosd.cc
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mon/MonMap.h"
+
+#include "osd/OSD.h"
+#include "ebofs/Ebofs.h"
+
+#include "msg/NewMessenger.h"
+
+#include "common/Timer.h"
+
+
+class C_Die : public Context {   // Context whose finish() ends the process
+public:
+  void finish(int) {             // the int argument is ignored
+    cerr << "die" << endl;
+    exit(1);                     // exit status 1
+  }
+};
+
+class C_Debug : public Context {  // Context that copies the debug levels of g_debug_after_conf over those of g_conf
+  public:
+  void finish(int) {
+    int size = (char*)&g_conf.debug_after - (char*)&g_conf.debug;  // span in BYTES; subtracting the typed pointers counted elements
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);  // fields from debug up to, not including, debug_after
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+
+int main(int argc, char **argv) 
+{
+  // config options are handled by parse_config_options; what is left must be the device
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+  // optional timers: end the process / flip the debug levels after a delay
+  if (g_conf.kill_after) 
+    g_timer.add_event_after(g_conf.kill_after, new C_Die);
+  if (g_conf.debug_after) 
+    g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+
+  assert(args.size() == 1);
+  char *dev = args[0];
+  cerr << "dev " << dev << endl;
+
+  // who am i?   peek at superblock!
+  OSDSuperblock sb;
+  ObjectStore *store = new Ebofs(dev);
+  bufferlist bl;
+  store->mount();
+  int r = store->read(object_t(0,0), 0, sizeof(sb), bl);
+  if (r < 0) {
+    cerr << "couldn't read superblock object on " << dev << endl;
+    exit(1);  // was exit(0): a failed startup must not report success
+  }
+  bl.copy(0, sizeof(sb), (char*)&sb);
+  store->umount();
+  delete store;
+
+  cout << "osd fs says i am osd" << sb.whoami << endl;
+
+  // load monmap; each step is checked so a missing or short file never reaches decode()
+  bl.clear();
+  int fd = ::open(".ceph_monmap", O_RDONLY);
+  struct stat st;
+  if (fd < 0 || ::fstat(fd, &st) < 0) {
+    cerr << "couldn't open .ceph_monmap" << endl;
+    exit(1);
+  }
+  bufferptr bp(st.st_size);
+  bl.append(bp);
+  int got = ::read(fd, (void*)bl.c_str(), bl.length());
+  ::close(fd);
+  if (got != (int)bl.length()) {
+    cerr << "couldn't read .ceph_monmap" << endl;
+    exit(1);
+  }
+  MonMap *monmap = new MonMap;
+  monmap->decode(bl);
+
+  // start up network
+  rank.set_namer(monmap->get_inst(0).addr);
+  rank.start_rank();
+
+  // start osd
+  Messenger *m = rank.register_entity(MSG_ADDR_OSD(sb.whoami));
+  assert(m);
+  OSD *osd = new OSD(sb.whoami, m, monmap, dev);
+  osd->init();
+  // block in rank.wait(), then clean up
+  rank.wait();
+  delete osd;
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/crush/BinaryTree.h b/branches/sage/cephmds2/crush/BinaryTree.h
new file mode 100644
index 0000000000000..4f8524bf4ddce
--- /dev/null
+++ b/branches/sage/cephmds2/crush/BinaryTree.h
@@ -0,0 +1,271 @@
+#ifndef __crush_BINARYTREE_H
+#define __crush_BINARYTREE_H
+
+#include <cassert>
+#include <iostream>
+#include <map>
+#include <vector>
+//#include <set>
+using namespace std;
+
+#include "include/buffer.h"
+
+namespace crush {
+
+  class BinaryTree {
+  private:
+    // tree def
+    int             root_node;       // 0 for empty tree.
+    int             alloc;
+    vector<int>     node_nested;     // all existing nodes in this map
+    vector<float>   node_weight;     // and this one
+    vector<int>     node_complete;   // only nodes with all possible children
+
+  public:
+    BinaryTree() : root_node(0), alloc(0) {}
+    
+    void _encode(bufferlist& bl) {                      // append root_node, alloc, then the three per-node arrays
+      bl.append((char*)&root_node, sizeof(root_node));  // raw in-memory bytes of the int
+      bl.append((char*)&alloc, sizeof(alloc));
+      ::_encode(node_nested, bl);
+      ::_encode(node_weight, bl);
+      ::_encode(node_complete, bl);
+    }
+    void _decode(bufferlist& bl, int& off) {            // reads the fields back in the order _encode wrote them, starting at off
+      bl.copy(off, sizeof(root_node), (char*)&root_node);
+      off += sizeof(root_node);
+      bl.copy(off, sizeof(alloc), (char*)&alloc);
+      off += sizeof(alloc);
+      ::_decode(node_nested, bl, off);                  // presumably advances off itself (it is passed along) -- confirm in include/buffer.h
+      ::_decode(node_weight, bl, off);
+      ::_decode(node_complete, bl, off);
+    }
+
+    // accessors
+    bool  empty() const { return root_node == 0; }   // root id 0 means "no nodes"
+    bool  exists(int n) const { return n < alloc && node_nested[n]; }   // no lower-bound check: n must be >= 0; a node exists while its nested count is non-zero
+    int   nested(int n) const { return exists(n) ? node_nested[n]:0; }  // 0 for missing nodes
+    float weight(int n) const { return exists(n) ? node_weight[n]:0; }  // 0 for missing nodes
+    bool  complete(int n) const { return exists(n) ? node_complete[n]:false; }  // false for missing nodes
+
+    int   root() const { return root_node; }   // 0 when the tree is empty
+    
+    void   realloc(int n) {
+      // Make index n usable in all three per-node arrays, which are grown
+      // together here; newly created slots are zero (= node does not exist).
+      // Nothing happens when n is already covered.
+      //
+      // (An earlier version pushed one zero at a time onto each array until
+      // alloc exceeded n; the bulk inserts below do the same in one step.)
+      if (n < alloc)
+        return;
+
+      const int grow = n + 1 - alloc;
+      node_nested.insert(node_nested.end(), grow, 0);
+      node_weight.insert(node_weight.end(), grow, 0);
+      node_complete.insert(node_complete.end(), grow, 0);
+
+      alloc = n + 1;
+    }
+
+    // tree navigation
+    bool terminal(int n) const { return n & 1; }  // odd nodes are leaves.
+    int height(int n) const {   // number of trailing zero bits of n: 0 for leaves, one more per level above
+      assert(n);                // 0 is not a node id (the loop below would never end on it)
+      int h = 0;
+      while ((n & 1) == 0) {
+        assert(n > 0);
+        h++; n = n >> 1;
+      }
+      return h;
+    }
+    int left(int n) const {    // left child of interior node n: n - 2^(height-1)
+      int h = height(n);
+      assert(h > 0);           // a leaf (odd n) has no children, and 1 << -1 would be undefined
+      return n - (1 << (h-1));
+    }
+    int right(int n) const {   // right child of interior node n: n + 2^(height-1)
+      int h = height(n);
+      assert(h > 0);           // a leaf (odd n) has no children, and 1 << -1 would be undefined
+      return n + (1 << (h-1));
+    }
+    bool on_right(int n, int h = -1) const {   // is n the right child of its parent?  pass h = height(n) if already known
+      if (h < 0) h = height(n);
+      return n & (1 << (h+1));                 // bit h+1 is set exactly when n == parent + 2^h (compare right())
+    }
+    bool on_left(int n) const { return !on_right(n); }
+    int parent(int n) const {                  // id of the node directly above n (one level higher)
+      int h = height(n);
+      if (on_right(n, h))
+        return n - (1<<h);
+      else
+        return n + (1<<h);
+    }
+    
+    // modifiers
+    void adjust_node_weight(int n, float w) {   // add w to node n and to every ancestor up to and including the root
+      assert(exists(n));
+      node_weight[n] += w;
+     
+      int p = n;
+      while (p != root_node) {
+        p = parent(p);
+        node_weight[p] += w;
+      }
+    }
+
+    void remove_node(int n) {
+      assert(exists(n));
+      // NOTE(review): callers presumably pass a leaf; nothing here checks terminal(n) -- confirm
+      // erase node
+      node_nested[n] = 0;
+      node_weight[n] = 0;
+
+      // adjust parents (!complete, -weight)
+      int p = n;
+      while (p != root_node) {
+        p = parent(p);
+
+        node_complete[p] = 0;
+        node_weight[p] = weight(left(p)) + weight(right(p));   // recomputed from the children rather than decremented
+        node_nested[p]--;
+
+        if (nested(p) == 0) {   // subtree is now empty: zero it so exists(p) turns false
+          node_weight[p] = 0;
+          node_nested[p] = 0;
+        }
+      }
+      
+      // hose root?  (drop interior roots that are left with at most one non-empty child)
+      while (!terminal(root_node) &&
+             (nested(left(root_node)) == 0 ||
+             nested(right(root_node)) == 0)) {
+        // root now one child..
+        node_weight[root_node] = 0;
+        node_nested[root_node] = 0;
+        if (nested(left(root_node)) == 0)
+          root_node = right(root_node);   // also taken when both sides are empty; the walk then ends on a leaf
+        else 
+          root_node = left(root_node);
+      }
+
+      if (terminal(root_node) && 
+          nested(root_node) == 0) {
+        // empty!
+        node_weight[root_node] = 0;
+        node_nested[root_node] = 0;
+        root_node = 0;   // 0 is what empty() tests for
+      }
+
+    }
+
+    int add_node_root(float w) {   // like add_node(w), but always grows a new root first (force_root = true)
+      return add_node(w, true);
+    }
+    
+    int add_node(float w, bool force_root=false) {   // creates a leaf of weight w and returns its node id
+      int n;
+      if (!root_node) {
+        // empty tree!
+        root_node = n = 1;
+      } else {
+        // existing tree.
+        // expand tree?
+        if (force_root || complete(root_node)) {
+          // add new root
+          int newroot = parent(root_node);
+          realloc(newroot);
+          node_weight[newroot] = node_weight[root_node];   // starts as a copy of the old root; w is added by the ancestor loop below
+          node_nested[newroot] = nested(root_node);
+
+          // go right or left?  (into whichever side of the new root is still unused)
+          if (left(newroot) == root_node)
+            n = right(newroot);
+          else
+            n = left(newroot);
+          root_node = newroot;
+
+          // then go left until terminal
+          while (!terminal(n))
+            n = left(n);
+        }
+        else {
+          // tree isn't complete.
+          n = root_node;
+          while (!terminal(n)) {
+            if (!exists(left(n)) || !complete(left(n))) {
+              // left isn't complete
+              n = left(n);
+            } else {
+              assert(!exists(right(n)) || !complete(right(n)));
+              // right isn't complete
+              n = right(n);
+            }
+          }
+        }
+      }
+      
+      // create at n
+      //cout << "creating " << n << endl;
+      realloc(n);
+      node_weight[n] = w;
+      node_nested[n] = 1;
+      node_complete[n] = 1;   // a leaf counts as complete
+
+      // ancestors: create, adjust weight, complete as appropriate
+      int p = n;
+      while (p != root_node) {
+        p = parent(p);
+        realloc(p);
+
+        // complete?
+        if (!complete(p) &&
+            complete(left(p)) && 
+            complete(right(p))) 
+          node_complete[p] = 1;
+        
+        // weight (and implicitly create)
+        node_weight[p] += w;
+        node_nested[p]++;   // a non-zero nested count is what makes exists(p) true
+      }
+
+      return n;
+
+    }
+    
+
+  };
+
+
+  // print it out
+  inline void print_binary_tree_node(ostream& out, const BinaryTree& tree, int n, int i) {
+    // Dumps node n and, recursively, its existing children; i is the number
+    // of two-space indent units and grows by 2 per tree level.
+    for (int pad = 0; pad < i; ++pad)
+      out << "  ";
+
+    // position tag: root, or which side of its parent the node sits on
+    const char *tag = "root  ";
+    if (tree.root() != n)
+      tag = tree.on_left(n) ? "left  " : "right ";
+    out << tag << n << " : nested " << tree.nested(n) << "   weight " << tree.weight(n);
+    if (tree.complete(n))
+      out << "  complete";
+    out << endl;
+
+    if (tree.terminal(n)) return;   // leaves have no children to print
+    const int kids[2] = { tree.left(n), tree.right(n) };
+    for (int k = 0; k < 2; ++k)
+      if (tree.exists(kids[k])) print_binary_tree_node(out, tree, kids[k], i+2);
+  }
+  
+  inline ostream& operator<<(ostream& out, const BinaryTree& tree) {  // stream a whole tree
+    if (!tree.empty())
+      print_binary_tree_node(out, tree, tree.root(), 0);    
+    else out << "tree is empty";  // nothing to walk
+    return out;
+  }
+  
+}
+
+#endif
diff --git a/branches/sage/cephmds2/crush/Bucket.h b/branches/sage/cephmds2/crush/Bucket.h
new file mode 100644
index 0000000000000..cdae5bfce8ae4
--- /dev/null
+++ b/branches/sage/cephmds2/crush/Bucket.h
@@ -0,0 +1,618 @@
+#ifndef __crush_BUCKET_H
+#define __crush_BUCKET_H
+
+#include "BinaryTree.h"
+#include "Hash.h"
+
+#include <list>
+#include <vector>
+#include <map>
+#include <set>
+using namespace std;
+
+#include <math.h>
+
+#include "include/buffer.h"
+
+namespace crush {
+
+
+  const int CRUSH_BUCKET_UNIFORM = 1;
+  const int CRUSH_BUCKET_TREE = 2;
+  const int CRUSH_BUCKET_LIST = 3;
+  const int CRUSH_BUCKET_STRAW = 4;
+
+  /** abstract bucket **/
+  class Bucket {   // common state and interface of all bucket kinds
+  protected:
+    int         id;
+    int         parent;
+    int         type;
+    float       weight;      // maintained by the subclasses' add_item / adjust_item_weight
+
+  public:
+    Bucket(int _type,
+           float _weight) :
+      id(0), parent(0),      // id and parent start at 0; see set_id / set_parent
+      type(_type),
+      weight(_weight) { }
+
+    Bucket(bufferlist& bl, int& off) {   // reads id, parent, type, weight as raw bytes at off and advances off past them
+      bl.copy(off, sizeof(id), (char*)&id);
+      off += sizeof(id);
+      bl.copy(off, sizeof(parent), (char*)&parent);
+      off += sizeof(parent);
+      bl.copy(off, sizeof(type), (char*)&type);
+      off += sizeof(type);
+      bl.copy(off, sizeof(weight), (char*)&weight);
+      off += sizeof(weight);
+    }
+
+    virtual ~Bucket() { }
+    
+    virtual const char *get_bucket_type() const = 0;   // short name of the concrete kind
+    virtual bool is_uniform() const = 0;
+
+    int          get_id() const { return id; } 
+    int          get_type() const { return type; }
+    float        get_weight() const { return weight; }
+    int          get_parent() const { return parent; }
+    virtual int  get_size() const = 0;                 // number of items
+
+    void         set_id(int i) { id = i; }
+    void         set_parent(int p) { parent = p; }
+    void         set_weight(float w)  { weight = w; }
+
+    virtual void get_items(vector<int>& i) const = 0;
+    virtual float get_item_weight(int item) const = 0;
+    virtual void add_item(int item, float w, bool back=false) = 0;
+    virtual void adjust_item_weight(int item, float w) = 0;   // w is a delta, not the new weight
+    virtual void set_item_weight(int item, float w) {         // absolute form, expressed through the delta form
+      adjust_item_weight(item, w - get_item_weight(item));
+    }
+
+    virtual int choose_r(int x, int r, Hash& h) const = 0;    // pick an item for input x, replica rank r
+
+    virtual void _encode(bufferlist& bl) = 0;
+  };
+
+
+
+
+  /** uniform bucket **/
+  class UniformBucket : public Bucket {    
+  protected:
+  public:
+    vector<int> items;
+    int    item_type;
+    float  item_weight;
+
+    // primes
+    vector<unsigned> primes;
+
+    int get_prime(int j) const {
+      return primes[ j % primes.size() ];
+    }
+    void make_primes() {   // rebuilds primes: items.size() consecutive primes starting at a hash-chosen odd number above the item count
+      if (items.empty()) return;   // primes is left untouched in this case
+
+      //cout << "make_primes " << get_id() << " " << items.size() << endl;
+      Hash h(123+get_id());        // seeded from the id as it is right now, so the result depends on when this runs relative to set_id()
+      primes.clear();
+
+      // start with odd number > num_items
+      unsigned x = items.size() + 1;             // this is the minimum!
+      x += h(items.size()) % (3*items.size());  // bump it up some
+      x |= 1;                               // make it odd
+
+      while (primes.size() < items.size()) {
+        unsigned j;
+        for (j=2; j*j<=x; j++)     // trial division up to sqrt(x)
+          if (x % j == 0) break;
+        if (j*j > x) {             // no divisor found: x is prime
+          primes.push_back(x);
+          //cout << "prime " << x << endl;
+        }
+        x += 2;                    // next odd candidate
+      }
+    }
+
+  public:
+    UniformBucket(int _type, int _item_type) :   // empty bucket; the first add_item() fixes the per-item weight
+      Bucket(_type, 0),
+      item_type(_item_type), item_weight(0) { }  // item_weight used to be left uninitialized, so get_item_weight() could return garbage
+    UniformBucket(int _type, int _item_type,     // pre-populated bucket: every item weighs _item_weight
+                  float _item_weight, vector<int>& _items) :
+      Bucket(_type, _item_weight*_items.size()),
+      item_type(_item_type),
+      item_weight(_item_weight) {
+      items = _items;
+      make_primes();                             // runs while the id is still the 0 set by the Bucket constructor
+    }
+
+    UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) {
+      bl.copy(off, sizeof(item_type), (char*)&item_type);
+      off += sizeof(item_type);
+      bl.copy(off, sizeof(item_weight), (char*)&item_weight);
+      off += sizeof(item_weight);
+      ::_decode(items, bl, off);
+      make_primes();
+    }
+
+    void _encode(bufferlist& bl) {
+      char t = CRUSH_BUCKET_UNIFORM;
+      bl.append((char*)&t, sizeof(t));
+      bl.append((char*)&id, sizeof(id));
+      bl.append((char*)&parent, sizeof(parent));
+      bl.append((char*)&type, sizeof(type));
+      bl.append((char*)&weight, sizeof(weight));
+
+      bl.append((char*)&item_type, sizeof(item_type));
+      bl.append((char*)&item_weight, sizeof(item_weight));
+
+      ::_encode(items, bl);
+    }
+
+    const char *get_bucket_type() const { return "uniform"; }
+    bool is_uniform() const { return true; }
+
+    int get_size() const { return items.size(); }
+
+    // items
+    void get_items(vector<int>& i) const {
+      i = items;
+    }
+    int get_item_type() const { return item_type; }
+    float get_item_weight(int item) const { return item_weight; }
+
+    void add_item(int item, float w, bool back=false) {
+      if (items.empty())
+        item_weight = w;
+      items.push_back(item);
+      weight += item_weight;
+      make_primes();
+    }
+
+    void adjust_item_weight(int item, float w) {
+      assert(0);
+    }
+
+    int choose_r(int x, int r, Hash& hash) const {
+      //cout << "uniformbucket.choose_r(" << x << ", " << r << ")" << endl;
+      //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl;
+      // index = (x + hash offset + (r+1) * prime) mod size; the bucket must not be empty (modulo by get_size())
+      unsigned v = hash(x, get_id());// % get_size();
+      unsigned p = get_prime( hash(get_id(), x) );  // choose a prime based on hash(get_id(), x)
+      unsigned s = (x + v + (r+1)*p) % get_size();
+      return items[s];
+    }
+
+  };
+
+
+
+
+  
+  // list bucket.. RUSH_P sorta
+  
+  class ListBucket : public Bucket {
+  protected:
+    list<int>        items;
+    list<float>      item_weight;
+    list<float>      sum_weight;
+    
+  public:
+    ListBucket(int _type) : Bucket(_type, 0) { }
+
+    ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) {
+      ::_decode(items, bl, off);
+      ::_decode(item_weight, bl, off);
+      ::_decode(sum_weight, bl, off);
+    }
+
+    void _encode(bufferlist& bl) {
+      char t = CRUSH_BUCKET_LIST;
+      bl.append((char*)&t, sizeof(t));
+      bl.append((char*)&id, sizeof(id));
+      bl.append((char*)&parent, sizeof(parent));
+      bl.append((char*)&type, sizeof(type));
+      bl.append((char*)&weight, sizeof(weight));
+
+      ::_encode(items, bl);
+      ::_encode(item_weight, bl);
+      ::_encode(sum_weight, bl);
+    }
+
+    const char *get_bucket_type() const { return "list"; }
+    bool        is_uniform() const { return false; }
+
+    int get_size() const { return items.size(); }
+
+    void get_items(vector<int>& i) const {
+      for (list<int>::const_iterator it = items.begin();
+           it != items.end();
+           it++) 
+        i.push_back(*it);
+    }
+    float get_item_weight(int item) const {
+      list<int>::const_iterator i = items.begin();
+      list<float>::const_iterator w = item_weight.begin();
+      while (i != items.end()) {
+        if (*i == item) return *w;
+        i++; w++;
+      }
+      assert(0);
+      return 0;
+    }
+
+    void add_item(int item, float w, bool back=false) {   // adds item at the front (default) or at the back of the list
+      if (back) {
+        items.push_back(item);
+        item_weight.push_back(w);
+        sum_weight.clear();          // every running sum includes the new tail item, so rebuild them all
+        float s = 0.0;
+        for (list<float>::reverse_iterator i = item_weight.rbegin();
+             i != item_weight.rend();
+             i++) {
+          s += *i;
+          sum_weight.push_front(s);  // sum_weight[k] = weight of entry k plus everything behind it
+        }
+        // take the freshly computed total instead of asserting weight+w == s:
+        weight = s;  // float sums accumulated in a different order need not be bit-equal
+      } else {
+        items.push_front(item);
+        item_weight.push_front(w);
+        weight += w;
+        sum_weight.push_front(weight);
+      }
+    }
+
+    void adjust_item_weight(int item, float dw) {
+      // Adds dw to item's weight.  sum_weight[k] is the total weight of entry k
+      // and everything behind it, so every entry up to and including the item
+      // shifts by dw, and so does the bucket total.
+      list<int>::iterator p = items.begin();
+      list<float>::iterator pw = item_weight.begin();
+      list<float>::iterator ps = sum_weight.begin();
+      for (;;) {
+        assert(p != items.end());   // item must be present; checked before dereferencing
+        *ps += dw;
+        if (*p == item) break;
+        p++; pw++; ps++;  // next!
+      }
+      *pw += dw;
+      weight += dw;                 // was missing: keeps Bucket::weight equal to sum_weight.front()
+    }
+
+    
+    int choose_r(int x, int r, Hash& h) const {
+      //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl;
+      // walk front to back; entry k is taken when a hash-derived point in [0, sum_weight[k]) falls below its own weight
+      list<int>::const_iterator p = items.begin();
+      list<float>::const_iterator pw = item_weight.begin();
+      list<float>::const_iterator ps = sum_weight.begin();
+
+      while (p != items.end()) {
+        const int item = *p;
+        const float iw = *pw;
+        const float tw = *ps;
+        const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0;   // granularity: 1/10000 of tw
+        //cout << "item " << item << "  iw = " << iw << "  tw = " << tw << "  f = " << f << endl;
+        if (f < iw) {
+          //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl;
+          return item;
+        }
+        p++; pw++; ps++;  // next!
+      }
+      assert(0);   // reached only if no entry accepted (e.g. an empty list)
+      return 0;
+    }    
+
+
+  };
+
+
+
+
+  // mixed bucket, based on RUSH_T type binary tree
+  
+  class TreeBucket : public Bucket {
+  protected:
+    //vector<float>  item_weight;
+
+    //  public:
+    BinaryTree     tree;
+    map<int,int>   node_item;     // node id -> item
+    vector<int>    node_item_vec; // fast version of above
+    map<int,int>   item_node;     // item -> node id
+    map<int,float> item_weight;
+
+  public:
+    TreeBucket(int _type) : Bucket(_type, 0) { }
+    
+    TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) {
+      tree._decode(bl, off);
+      
+      ::_decode(node_item, bl, off);
+      ::_decode(node_item_vec, bl, off);
+      ::_decode(item_node, bl, off);
+      ::_decode(item_weight, bl, off);
+    }
+
+    void _encode(bufferlist& bl) {
+      char t = CRUSH_BUCKET_TREE;
+      bl.append((char*)&t, sizeof(t));
+      bl.append((char*)&id, sizeof(id));
+      bl.append((char*)&parent, sizeof(parent));
+      bl.append((char*)&type, sizeof(type));
+      bl.append((char*)&weight, sizeof(weight));
+
+      tree._encode(bl);
+
+      ::_encode(node_item, bl);
+      ::_encode(node_item_vec, bl);
+      ::_encode(item_node, bl);
+      ::_encode(item_weight, bl);
+    }
+
+    const char *get_bucket_type() const { return "tree"; }
+    bool        is_uniform() const { return false; }
+
+    int get_size() const { return node_item.size(); }
+
+    // items
+    void get_items(vector<int>& i) const {
+      for (map<int,int>::const_iterator it = node_item.begin();
+           it != node_item.end();
+           it++) 
+        i.push_back(it->second);    
+    }
+    float get_item_weight(int i) const {   // weight recorded for item i, which must be in this bucket
+      map<int,float>::const_iterator p = item_weight.find(i);  // in-place lookup; the old (map) cast copied the whole map per call
+      assert(p != item_weight.end());
+      return p == item_weight.end() ? 0 : p->second;  // 0 is what the old copy's operator[] gave with asserts compiled out
+    }
+
+    void add_item(int item, float w, bool back=false) {
+      item_weight[item] = w;
+      weight += w;
+
+      unsigned n = tree.add_node(w);
+      node_item[n] = item;
+      item_node[item] = n;
+
+      while (node_item_vec.size() <= n) 
+        node_item_vec.push_back(0);
+      node_item_vec[n] = item;
+    }
+    
+    void adjust_item_weight(int item, float dw) {
+      // adjust my weight
+      weight += dw;
+      item_weight[item] += dw;
+
+      // adjust tree weights
+      tree.adjust_node_weight(item_node[item], dw);
+    }
+    
+    int choose_r(int x, int r, Hash& h) const {
+      //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl;
+      int n = tree.root();
+      while (!tree.terminal(n)) {
+        // pick a point in [0,w)
+        float w = tree.weight(n);
+        float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0;
+
+        // left or right?
+        int l = tree.left(n);
+        if (tree.exists(l) && 
+            f < tree.weight(l))
+          n = l;
+        else
+          n = tree.right(n);
+      }
+      //assert(node_item.count(n));
+      //return ((map<int,int>)node_item)[n];
+      return node_item_vec[n];
+    }
+  };
+
+
+
+
+
+  // straw bucket.. new thing!
+  
+  class StrawBucket : public Bucket {
+  protected:
+    map<int, float>  item_weight;
+    map<int, float>  item_straw;
+
+    list<int>   _items;
+    list<float> _straws;
+
+  public:
+    StrawBucket(int _type) : Bucket(_type, 0) { }
+
+    StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) {
+      ::_decode(item_weight, bl, off);
+      calc_straws();
+    }
+
+    void _encode(bufferlist& bl) {   // type tag, the Bucket header fields, then the item weights
+      char t = CRUSH_BUCKET_STRAW;   // was CRUSH_BUCKET_TREE, which made decode_bucket() rebuild a straw bucket as a TreeBucket
+      bl.append((char*)&t, sizeof(t));
+      bl.append((char*)&id, sizeof(id));
+      bl.append((char*)&parent, sizeof(parent));
+      bl.append((char*)&type, sizeof(type));
+      bl.append((char*)&weight, sizeof(weight));
+
+      ::_encode(item_weight, bl);    // straws are not stored; the decoding constructor recomputes them via calc_straws()
+    }
+
+    const char *get_bucket_type() const { return "straw"; }
+    bool is_uniform() const { return false; }
+
+    int get_size() const { return item_weight.size(); }
+
+
+    // items
+    void get_items(vector<int>& i) const {
+      for (map<int,float>::const_iterator it = item_weight.begin();
+           it != item_weight.end();
+           it++) 
+        i.push_back(it->first);
+    }
+    float get_item_weight(int item) const {   // weight recorded for item, which must be in this bucket
+      map<int,float>::const_iterator p = item_weight.find(item);  // in-place lookup; the old (map) cast copied the whole map per call
+      assert(p != item_weight.end());
+      return p == item_weight.end() ? 0 : p->second;  // 0 is what the old copy's operator[] gave with asserts compiled out
+    }
+    void add_item(int item, float w, bool back=false) {
+      item_weight[item] = w;
+      weight += w;
+      calc_straws();
+    }
+
+    void adjust_item_weight(int item, float dw) {
+      //cout << "adjust " << item << " " << dw << endl;
+      weight += dw;
+      item_weight[item] += dw;
+      calc_straws();
+    }
+    
+    
+    /* calculate straw lengths.
+       this is kind of ugly.  not sure if there's a closed form way to calculate this or not!    
+     */
+    void calc_straws() {
+      // Rebuilds item_straw and the parallel _items/_straws lists from item_weight.
+      // Items with weight <= 0 get no straw at all.
+      item_straw.clear();
+      _items.clear();
+      _straws.clear();
+
+      // reverse sort by weight; skip zero weight items
+      map<float, set<int> > reverse;
+      for (map<int, float>::iterator p = item_weight.begin();
+           p != item_weight.end();
+           p++) {
+        //cout << get_id() << ":" << p->first << " " << p->second << endl;
+        if (p->second > 0) {
+          //p->second /= minw;
+          reverse[p->second].insert(p->first);
+        }
+      }
+
+      /* 1:2:7 
+         item_straw[0] = 1.0;
+         item_straw[1] = item_straw[0]*sqrt(1.0/.6);
+         item_straw[2] = item_straw[1]*2.0;
+      */
+
+      // work from low to high weights
+      float straw = 1.0;
+      float numleft = item_weight.size();  // NOTE(review): also counts the zero-weight items skipped above -- confirm that is intended
+      float wbelow = 0.0;
+      float lastw = 0.0;
+      // no positive-weight item (empty bucket, or all weights zero): no straws, and the loop must not touch reverse.begin()
+      map<float, set<int> >::iterator next = reverse.begin();
+      if (next == reverse.end()) return;
+      while (1) {
+        //cout << "hi " << next->first << endl;
+        map<float, set<int> >::iterator cur = next;
+        
+        // set straw length for this set
+        for (set<int>::iterator s = cur->second.begin();
+             s != cur->second.end();
+             s++) {
+          item_straw[*s] = straw;
+          //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl;
+          _items.push_back(*s);
+          _straws.push_back(straw);
+        }
+        
+        next++;
+        if (next == reverse.end()) break;
+        
+        wbelow += (cur->first-lastw) * numleft;
+        //cout << "wbelow " << wbelow << endl;
+        
+        numleft -= 1.0 * (float)cur->second.size();
+        //cout << "numleft now " << numleft << endl;
+        
+        float wnext = numleft * (next->first - cur->first);
+        //cout << "wnext " << wnext << endl;
+        
+        float pbelow = wbelow / (wbelow+wnext);
+        //cout << "pbelow " << pbelow << endl;
+        
+        straw *= pow((double)(1.0/pbelow), (double)1.0/numleft);
+        
+        lastw = cur->first;
+      }
+      //cout << "============" << endl;
+    }
+
+    int choose_r(int x, int r, Hash& h) const {
+      //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl;
+
+      float high_draw = -1;
+      int high = 0;
+
+      list<int>::const_iterator pi = _items.begin();
+      list<float>::const_iterator ps = _straws.begin();
+      while (pi != _items.end()) {
+        const int item = *pi;
+        const float rnd = (float)(h(x, item, r) % 1000000) / 1000000.0;
+        const float straw = *ps * rnd;
+        
+        if (high_draw < 0 ||
+            straw > high_draw) {
+          high = *pi;
+          high_draw = straw;
+        }
+
+        pi++;
+        ps++;
+      }
+      return high;
+    }    
+  };
+
+
+
+
+
+  inline Bucket* decode_bucket(bufferlist& bl, int& off) {
+    // Reads the one-byte type tag at off, then lets the matching bucket
+    // constructor consume what follows.  An unknown tag trips assert(0);
+    // with asserts compiled out the result is a null pointer.
+    char tag;
+    bl.copy(off, sizeof(tag), (char*)&tag);
+    off += sizeof(tag);
+
+    Bucket *made = 0;
+    if (tag == CRUSH_BUCKET_UNIFORM)
+      made = new UniformBucket(bl, off);
+    else if (tag == CRUSH_BUCKET_LIST)
+      made = new ListBucket(bl, off);
+    else if (tag == CRUSH_BUCKET_TREE)
+      made = new TreeBucket(bl, off);
+    else if (tag == CRUSH_BUCKET_STRAW)
+      made = new StrawBucket(bl, off);
+    else assert(0);
+    return made;
+  }
+
+
+
+}
+
+
+
+
+
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/crush/Hash.h b/branches/sage/cephmds2/crush/Hash.h
new file mode 100644
index 0000000000000..cd3bb0a02cda6
--- /dev/null
+++ b/branches/sage/cephmds2/crush/Hash.h
@@ -0,0 +1,287 @@
+
+// Robert Jenkins' function for mixing 32-bit values
+// http://burtleburtle.net/bob/hash/evahash.html
+// a, b = random bits, c = input and output
+#define hashmix(a,b,c) \
+        a=a-b;  a=a-c;  a=a^(c>>13); \
+        b=b-c;  b=b-a;  b=b^(a<<8);  \
+        c=c-a;  c=c-b;  c=c^(b>>13); \
+        a=a-b;  a=a-c;  a=a^(c>>12); \
+        b=b-c;  b=b-a;  b=b^(a<<16); \
+        c=c-a;  c=c-b;  c=c^(b>>5);  \
+        a=a-b;  a=a-c;  a=a^(c>>3); \
+        b=b-c;  b=b-a;  b=b^(a<<10); \
+        c=c-a;  c=c-b;  c=c^(b>>15); 
+
+namespace crush {
+  
+  class Hash {
+    int seed;
+
+  public:
+    int get_seed() { return seed; }
+    void set_seed(int s) { seed = s; }
+
+    Hash(int s) {                       // derives the stored seed from s
+      unsigned int hash = 1315423911;
+      int x = 231232;
+      int y = 1232;
+      hashmix(s, x, hash);              // hashmix assigns to all three of its arguments, so s itself is scrambled here
+      hashmix(y, s, hash);
+      seed = s;                         // the mixed s is kept; the local hash is not used after this
+    }
+
+    inline int operator()(int a) {
+      unsigned int hash = seed ^ a;
+      int b = a;
+      int x = 231232;
+      int y = 1232;
+      hashmix(b, x, hash);
+      hashmix(y, a, hash);
+      return (hash & 0x7FFFFFFF);
+    }
+
+    inline int operator()(int a, int b) {
+      unsigned int hash = seed ^ a ^ b;
+      int x = 231232;
+      int y = 1232;
+      hashmix(a, b, hash);
+      hashmix(x, a, hash);
+      hashmix(b, y, hash);
+      return (hash & 0x7FFFFFFF);
+    }
+
+    inline int operator()(int a, int b, int c) {
+      unsigned int hash = seed ^ a ^ b ^ c;
+      int x = 231232;
+      int y = 1232;
+      hashmix(a, b, hash);
+      hashmix(c, x, hash);
+      hashmix(y, a, hash);
+      hashmix(b, x, hash);
+      hashmix(y, c, hash);
+      return (hash & 0x7FFFFFFF);
+    }
+
+    inline int operator()(int a, int b, int c, int d) {
+      unsigned int hash = seed ^a ^ b ^ c ^ d;
+      int x = 231232;
+      int y = 1232;
+      hashmix(a, b, hash);
+      hashmix(c, d, hash);
+      hashmix(a, x, hash);
+      hashmix(y, b, hash);
+      hashmix(c, x, hash);
+      hashmix(y, d, hash);
+      return (hash & 0x7FFFFFFF);
+    }
+
+    inline int operator()(int a, int b, int c, int d, int e) {
+      unsigned int hash = seed ^ a ^ b ^ c ^ d ^ e;
+      int x = 231232;
+      int y = 1232;
+      hashmix(a, b, hash);
+      hashmix(c, d, hash);
+      hashmix(e, x, hash);
+      hashmix(y, a, hash);
+      hashmix(b, x, hash);
+      hashmix(y, c, hash);
+      hashmix(d, x, hash);
+      hashmix(y, e, hash);
+      return (hash & 0x7FFFFFFF);
+    }
+  };
+
+}
+
+
+
+#if 0
+
+
+      //return myhash(a) ^ seed;
+      return myhash(a, seed);
+    }
+    int operator()(int a, int b) {
+      //return myhash( myhash(a) ^ myhash(b) ^ seed );
+      return myhash(a, b, seed);
+    }
+    int operator()(int a, int b, int c) {
+      //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ seed );
+      return myhash(a, b, c, seed);
+    }
+    int operator()(int a, int b, int c, int d) {
+      //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ myhash(d ^ seed) ^ seed );
+      return myhash(a, b, c, d, seed);
+    }
+
+      // ethan's rush hash?
+      if (0) 
+        return (n ^ 0xdead1234) * (884811920 * 3  + 1);
+
+      if (1) {
+
+        // before
+        hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+        hashmix(a, b, hash);
+        n = n >> 8;
+        hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+        hashmix(a, b, hash);
+        n = n >> 8;
+        hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+        hashmix(a, b, hash);
+        n = n >> 8;
+        hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+        hashmix(a, b, hash);
+        n = n >> 8;
+
+        //return hash;
+        return (hash & 0x7FFFFFFF);
+      }
+
+      // JS
+      //  a little better than RS
+      //  + jenkin's mixing thing (which sucks on its own but helps tons here)
+      //  best so far
+      if (1) {
+        unsigned int hash = 1315423911;
+        int a = 231232;
+        int b = 1232;
+        
+        for(unsigned int i = 0; i < 4; i++)
+          {
+            hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+            hashmix(a, b, hash);
+            n = n >> 8;
+          }
+        
+        return (hash & 0x7FFFFFFF);
+      }
+
+      
+      // Robert jenkins' 96 bit mix
+      //  sucks
+      if (0) {
+        int c = n;
+        int a = 12378912;
+        int b = 2982827;
+        a=a-b;  a=a-c;  a=a^(c>>13); 
+        b=b-c;  b=b-a;  b=b^(a<<8);  
+        c=c-a;  c=c-b;  c=c^(b>>13); 
+        a=a-b;  a=a-c;  a=a^(c>>12); 
+        b=b-c;  b=b-a;  b=b^(a<<16); 
+        c=c-a;  c=c-b;  c=c^(b>>5);  
+        a=a-b;  a=a-c;  a=a^(c>>3); 
+        b=b-c;  b=b-a;  b=b^(a<<10); 
+        c=c-a;  c=c-b;  c=c^(b>>15); 
+        return c;
+      }
+      // robert jenkins 32-bit
+      //  sucks
+      if (0) {
+        n += (n << 12);
+        n ^= (n >> 22);
+        n += (n << 4);
+        n ^= (n >> 9);
+        n += (n << 10);
+        n ^= (n >> 2);
+        n += (n << 7);
+        n ^= (n >> 12);
+        return n;
+      }
+
+      // djb2
+      if (0) {
+        unsigned int hash = 5381;
+        for (int i=0; i<4; i++) {
+          hash = ((hash << 5) + hash) + ((n&255) ^ 123);
+          n = n >> 8;
+        }
+        return hash;
+      }
+
+
+      // SDBM
+      if (1) {
+        unsigned int hash = 0;
+        
+        for(unsigned int i = 0; i < 4; i++)
+          {
+            hash = (n&255) + (hash << 6) + (hash << 16) - hash;
+            n = n >> 8;
+          }
+        
+        return (hash & 0x7FFFFFFF);
+      }
+
+      // PJW
+      //  horrid
+      if (0) {
+        unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);
+        unsigned int ThreeQuarters     = (unsigned int)((BitsInUnsignedInt  * 3) / 4);
+        unsigned int OneEighth         = (unsigned int)(BitsInUnsignedInt / 8);
+        unsigned int HighBits          = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
+        unsigned int hash              = 0;
+        unsigned int test              = 0;
+        
+        for(unsigned int i = 0; i < 4; i++)
+          {
+            hash = (hash << OneEighth) + (n&255);
+            
+            if((test = hash & HighBits)  != 0)
+              {
+                hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));
+              }
+            n = n >> 8;
+          }
+        
+        return (hash & 0x7FFFFFFF);
+      }
+
+      // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes.
+      if (0) {
+        unsigned int b    = 378551;
+        unsigned int a    = 63689;
+        unsigned int hash = 0;
+        
+        for(unsigned int i=0; i<4; i++)
+          {
+            hash = hash * a + (n&0xff);
+            a    = a * b;
+            n = n >> 8;
+          }
+        
+        return (hash & 0x7FFFFFFF);
+      }
+
+      // DJB
+      //  worse than rs
+      if (0) {
+        unsigned int hash = 5381;
+        
+        for(unsigned int i = 0; i < 4; i++)
+          {
+            hash = ((hash << 5) + hash) + (n&255);
+            n = n >> 8;
+          }
+        
+        return (hash & 0x7FFFFFFF);
+      }
+
+      // AP
+      //  even worse
+      if (1) {
+        unsigned int hash = 0;
+        
+        for(unsigned int i = 0; i < 4; i++)
+          {
+            hash ^= ((i & 1) == 0) ? (  (hash <<  7) ^ (n&255) ^ (hash >> 3)) :
+              (~((hash << 11) ^ (n&255) ^ (hash >> 5)));
+            n = n >> 8;
+          }
+        
+        return (hash & 0x7FFFFFFF);
+      }
+
+
+#endif
diff --git a/branches/sage/cephmds2/crush/crush.h b/branches/sage/cephmds2/crush/crush.h
new file mode 100644
index 0000000000000..b1e245f1b6af6
--- /dev/null
+++ b/branches/sage/cephmds2/crush/crush.h
@@ -0,0 +1,521 @@
+#ifndef __crush_CRUSH_H
+#define __crush_CRUSH_H
+
+#include <iostream>
+#include <list>
+#include <vector>
+#include <set>
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+
+#include "Bucket.h"
+
+#include "include/buffer.h"
+
+
+namespace crush {
+
+
+  // *** RULES ***
+
+  // One instruction of a placement rule: an opcode plus its integer operands.
+  class RuleStep {
+  public:
+    int         cmd;    // opcode; Crush::do_rule() switches on the CRUSH_RULE_* values
+    vector<int> args;   // operands, in the order they were supplied
+
+    // Constructors for steps taking zero to three operands.
+    RuleStep(int c) : cmd(c) {}
+    RuleStep(int c, int a) : cmd(c), args(1, a) {}
+    RuleStep(int c, int a, int b) : cmd(c), args(1, a) {
+      args.push_back(b);
+    }
+    RuleStep(int o, int a, int b, int c) : cmd(o), args(1, a) {
+      args.push_back(b);
+      args.push_back(c);
+    }
+
+    // Append the raw bytes of cmd, then the operand vector via the global _encode().
+    void _encode(bufferlist& bl) {
+      bl.append(reinterpret_cast<char*>(&cmd), sizeof(cmd));
+      ::_encode(args, bl);
+    }
+    // Read cmd back from bl at off, step off past it, then decode the operands.
+    void _decode(bufferlist& bl, int& off) {
+      bl.copy(off, sizeof(cmd), reinterpret_cast<char*>(&cmd));
+      off += sizeof(cmd);
+      ::_decode(args, bl, off);
+    }
+  };
+
+
+  // Rule operations: opcodes stored in RuleStep::cmd and dispatched on by Crush::do_rule().
+  const int CRUSH_RULE_TAKE = 0;           // args[0] becomes the sole entry of the working vector
+  const int CRUSH_RULE_CHOOSE = 1;         // first n by default (same value as CHOOSE_FIRSTN)
+  const int CRUSH_RULE_CHOOSE_FIRSTN = 1;  // choose args[0] items of type args[1], firstn=true
+  const int CRUSH_RULE_CHOOSE_INDEP = 2;   // same operands, but choose() is called with firstn=false
+  const int CRUSH_RULE_EMIT = 3;           // append the working vector to the result and clear it
+
+  class Rule {   // an ordered list of RuleSteps; Crush::do_rule() runs them front to back
+  public:
+    vector< RuleStep > steps;
+
+    void _encode(bufferlist& bl) {   // raw int step count, then every step in order
+      int count = steps.size();
+      bl.append((char*)&count, sizeof(count));
+      for (vector<RuleStep>::iterator s = steps.begin(); s != steps.end(); ++s)
+        s->_encode(bl);
+    }
+    void _decode(bufferlist& bl, int& off) {   // replaces steps with what is read at off
+      int count;
+      steps.clear();
+      bl.copy(off, sizeof(count), (char*)&count);
+      off += sizeof(count);
+      for (int left = count; left > 0; left--) {
+        steps.push_back(RuleStep(0));   // placeholder, filled in by the decode below
+        steps.back()._decode(bl, off);
+      }
+    }
+  };
+
+
+
+
+  // *** CRUSH ***
+
+  class Crush {
+  protected:
+    map<int, Bucket*>  buckets;
+    int bucketno;
+    Hash h;
+
+	hash_map<int, int> parent_map;  // what bucket each leaf/bucket lives in
+
+  public:
+    map<int, Rule>     rules;
+
+    //map<int,int> collisions;
+    //map<int,int> bumps;    
+
+    // Serialize the whole map into bl.  Layout, in order: bucket count, then
+    // (id, bucket) pairs; bucketno; hash seed; rule count, then (id, rule) pairs.
+    // Crush::_decode() reads the same sequence back.
+    void _encode(bufferlist& bl) {
+      // buckets
+      int nbuckets = buckets.size();
+      bl.append((char*)&nbuckets, sizeof(nbuckets));
+      for (map<int, Bucket*>::const_iterator bp = buckets.begin(); bp != buckets.end(); ++bp) {
+        bl.append((char*)&bp->first, sizeof(bp->first));
+        bp->second->_encode(bl);
+      }
+
+      // next id add_bucket() will hand out
+      bl.append((char*)&bucketno, sizeof(bucketno));
+
+      // hash
+      int seed = h.get_seed();
+      bl.append((char*)&seed, sizeof(seed));
+
+      //::_encode(out, bl);
+      //::_encode(overload, bl);
+
+      // rules
+      int nrules = rules.size();
+      bl.append((char*)&nrules, sizeof(nrules));
+      for (map<int, Rule>::iterator rp = rules.begin(); rp != rules.end(); ++rp) {
+        bl.append((char*)&rp->first, sizeof(rp->first));
+        rp->second._encode(bl);
+      }
+    }
+
+    // Rebuild this map from what _encode() wrote, reading bl at off and advancing off.
+    // Buckets/rules already held are dropped first so a repeat decode cannot leak or go stale.
+    void _decode(bufferlist& bl, int& off) {
+      for (map<int, Bucket*>::iterator bp = buckets.begin(); bp != buckets.end(); ++bp)
+        delete bp->second;   // this object owns its buckets, as in ~Crush()
+      buckets.clear();
+      rules.clear();
+      int n;   // element count, reused for buckets and then rules
+      bl.copy(off, sizeof(n), (char*)&n);
+      off += sizeof(n);
+      for (int i=0; i<n; i++) {
+        int bid;
+        bl.copy(off, sizeof(bid), (char*)&bid);
+        off += sizeof(bid);
+        Bucket *b = decode_bucket(bl, off);
+        buckets[bid] = b;
+      }
+      bl.copy(off, sizeof(bucketno), (char*)&bucketno);
+      off += sizeof(bucketno);
+      // hash seed
+      int s;
+      bl.copy(off, sizeof(s), (char*)&s);
+      off += sizeof(s);
+      h.set_seed(s);
+      // rules
+      bl.copy(off, sizeof(n), (char*)&n);
+      off += sizeof(n);
+      for (int i=0; i<n; i++) {
+        int r;
+        bl.copy(off, sizeof(r), (char*)&r);
+        off += sizeof(r);
+        rules[r]._decode(bl,off);
+      }
+      build_parent_map();   // index
+    }
+
+	// Recompute parent_map from scratch: every item a bucket lists is mapped to
+	// that bucket's key in buckets.
+	void build_parent_map() {
+	  parent_map.clear();
+
+	  // index every bucket's items
+	  map<int, Bucket*>::iterator cur = buckets.begin();
+	  while (cur != buckets.end()) {
+		const int owner = cur->first;
+		vector<int> children;
+		cur->second->get_items(children);
+		for (unsigned k = 0; k < children.size(); k++)
+		  parent_map[children[k]] = owner;   // an item listed twice keeps the later bucket
+		++cur;
+	  }
+	}
+	 
+
+
+  public:
+    // Start empty: bucket ids count down from -1 and the hash is seeded with seed.
+    Crush(int seed=123) : bucketno(-1), h(seed) {}
+
+    // The map owns every Bucket registered in buckets; free them all here.
+    ~Crush() {
+      map<int, Bucket*>::iterator doomed = buckets.begin();
+      while (doomed != buckets.end())
+        delete (doomed++)->second;
+    }
+
+    // Print the subtree under bucket id root, one space of indent per level: weight, id
+    // and bucket type, then child buckets recursively or leaf items inline.  Returns 0.
+    int print(ostream& out, int root, int indent=0) {
+      for (int i=0; i<indent; i++) out << " ";
+      map<int, Bucket*>::iterator p = buckets.find(root);  // operator[] would insert NULL
+      assert(p != buckets.end() && p->second);
+      Bucket *b = p->second;
+      out << b->get_weight() << "\t" << b->get_id() << "\t";
+      for (int i=0; i<indent; i++) out << " ";
+      out << b->get_bucket_type() << ": ";
+      vector<int> items;
+      b->get_items(items);
+      // the first item decides the layout; an empty bucket has none and prints "[]"
+      if (!items.empty() && buckets.count(items[0])) {
+        out << endl;
+        for (unsigned i=0; i<items.size(); i++)
+          print(out, items[i], indent+1);
+      } else {
+        out << "[";
+        for (unsigned i=0; i<items.size(); i++)
+          out << (i ? " " : "") << items[i];
+        out << "]";
+      }
+      return 0;
+    }
+
+
+    // Register b under the next id (counting down), stamp the id on b, and return it.
+    int add_bucket( Bucket *b ) {
+      const int id = bucketno--;
+      b->set_id(id);
+      buckets[id] = b;
+      return id;
+    }
+
+    // Add item (weight w) to non-uniform bucket parent and propagate w up through its ancestors.
+    // Uses find(): operator[] would insert NULL entries that later count() checks treat as buckets.
+    void add_item(int parent, int item, float w, bool back=false) {
+      map<int, Bucket*>::iterator pi = buckets.find(parent);
+      assert(pi != buckets.end() && pi->second);
+      Bucket *p = pi->second;
+      assert(!p->is_uniform());
+      p->add_item(item, w, back);
+      // set item's parent (only when item is itself a registered bucket)
+      map<int, Bucket*>::iterator ci = buckets.find(item);
+      if (ci != buckets.end() && ci->second)
+        ci->second->set_parent(parent);
+      // update weights
+      while (buckets.count(p->get_parent())) {
+        int child = p->get_id();
+        p = buckets[p->get_parent()];
+        p->adjust_item_weight(child, w);
+      }
+    }
+
+
+    /*
+     * this is a hack, fix me!  weights should be consistent throughout hierarchy!
+     * Hands (w - current weight of bucket item) to adjust_item_weight() on each ancestor.
+     */
+    void set_bucket_weight(int item, float w) {
+      assert(buckets.count(item));   // else operator[] inserts NULL and we dereference it
+      Bucket *b = buckets[item];
+      float adj = w - b->get_weight();
+      while (buckets.count(b->get_parent())) {
+        Bucket *p = buckets[b->get_parent()];
+        p->adjust_item_weight(b->get_id(), adj);
+        b = p;
+      }
+    }
+
+
+    /* choose numrep distinct items of type type, descending from inbucket and appending
+     * them to outvec.  firstn selects how retries renumber r; outset and overloadmap can
+     * reject a candidate; forcefeed makes forcefeedval the first output unconditionally. */
+    void choose(int x,
+                int numrep,
+                int type,
+                Bucket *inbucket,
+                vector<int>& outvec,
+                bool firstn,
+                set<int>& outset, map<int,float>& overloadmap,
+				bool forcefeed=false,
+				int forcefeedval=-1) {
+      int off = outvec.size();           // this call's results start at outvec[off]
+
+      // for each replica
+      for (int rep=0; rep<numrep; rep++) {
+        int outv = -1;                   // my result
+        
+		// forcefeed?
+		if (forcefeed) {
+		  forcefeed = false;
+		  outvec.push_back(forcefeedval);
+		  continue;
+		}
+
+        // keep trying until we get a non-out, non-colliding item
+        int ftotal = 0;
+        bool skip_rep = false;
+
+        while (1) {
+          // start with the input bucket
+          Bucket *in = inbucket;
+          
+          // choose through intervening buckets
+          int flocal = 0;
+          bool retry_rep = false;
+
+          while (1) {
+            // r may be twiddled to (try to) avoid past collisions
+            int r = rep;
+            if (in->is_uniform()) {
+              // uniform bucket; be careful!
+              if (firstn || numrep >= in->get_size()) {
+                // uniform bucket is too small; just walk thru elements
+                r += ftotal;                    // r' = r + f_total (first n)
+              } else {
+                // make sure numrep is not a multiple of bucket size
+                int add = numrep*flocal;        // r' = r + n*f_local
+                if (in->get_size() % numrep == 0) {
+                  add += add/in->get_size();         // shift seq once per pass through the bucket
+                }
+                r += add;
+              }
+            } else {
+              // mixed bucket; just make a distinct-ish r sequence
+              if (firstn)
+                r += ftotal;          // r' = r + f_total
+              else
+                r += numrep * flocal; // r' = r + n*f_local
+            }
+            
+            // choose
+            outv = in->choose_r(x, r, h);                     
+            
+            // did we get the type we want?
+            int itemtype = 0;          // 0 is terminal type
+            Bucket *newin = 0;         // remember bucket we hit
+            if (in->is_uniform()) {
+              itemtype = ((UniformBucket*)in)->get_item_type();
+            } else {
+              if (buckets.count(outv)) {  // another bucket
+                newin = buckets[outv];
+                itemtype = newin->get_type();
+              } 
+            }
+            if (itemtype == type) { // this is what we want!
+              // collision?  scan what this call really emitted: skipped reps leave fewer than rep entries
+              bool collide = false;
+              for (unsigned prep=off; prep<outvec.size(); prep++) {
+                if (outvec[prep] == outv) {
+                  collide = true;
+                  break;
+                }
+              }
+
+              // ok choice?
+              bool bad = false;
+              if (type == 0 && outset.count(outv)) 
+                bad = true;
+              if (overloadmap.count(outv)) {
+                float f = (float)(h(x, outv) % 1000) / 1000.0;
+                if (f > overloadmap[outv])
+                  bad = true;
+              }
+
+              if (collide || bad) {
+                ftotal++;
+                flocal++;
+                
+                if (collide && flocal < 3) 
+                  continue;  // try locally a few times!
+                
+                if (ftotal >= 10) {
+                  // ok fine, just ignore dup.  FIXME.
+                  skip_rep = true;
+                  break;
+                }
+                
+                retry_rep = true;
+              }
+
+              break;  // ok then!
+            }
+
+            // next.  NOTE(review): newin stays NULL when a wrong-typed item is not a bucket.
+            in = newin;
+          }
+          
+          if (retry_rep) continue;  // try again
+
+          break;
+        }
+
+        // skip this rep? (e.g. too many collisions, we give up)
+        if (skip_rep) continue; 
+
+        // output this value
+        outvec.push_back(outv);
+      } // for rep
+
+      // double check!
+      if (0) {
+        for (unsigned i=1; i<outvec.size(); i++) 
+          for (unsigned j=0; j<i; j++)
+            assert(outvec[i] != outvec[j]);
+      }
+    }
+
+
+    void do_rule(Rule& rule, int x, vector<int>& result,
+                 set<int>& outset, map<int,float>& overloadmap,
+				 int forcefeed=-1) {
+      // Execute rule for input x; result is cleared, then filled by the rule's EMIT steps.
+      result.clear();
+
+	  // determine hierarchical context for first: forcefeed's ancestors via parent_map, root first.
+	  list<int> force_stack;
+	  if (forcefeed >= 0) {
+		int t = forcefeed;
+		while (1) {
+		  force_stack.push_front(t);
+		  if (parent_map.count(t) == 0) break;  // reached root, presumably.
+		  //cout << " " << t << " parent is " << parent_map[t] << endl;
+		  t = parent_map[t];
+		}
+	  }
+
+      // working vector
+      vector<int> w;   // working variable
+
+      // go through each statement
+      for (vector<RuleStep>::iterator pc = rule.steps.begin();
+           pc != rule.steps.end();
+           pc++) {
+        // move input?
+        
+        // do it
+        switch (pc->cmd) {
+        case CRUSH_RULE_TAKE:
+          {
+            assert(!pc->args.empty());   // TAKE needs the id to take
+            const int arg = pc->args[0];
+
+			if (!force_stack.empty()) {
+			  int forceval = force_stack.front();
+			  force_stack.pop_front();
+			  assert(arg == forceval);
+			}
+
+            w.clear();
+            w.push_back(arg);
+          }
+          break;
+          
+        case CRUSH_RULE_CHOOSE_FIRSTN:
+        case CRUSH_RULE_CHOOSE_INDEP:
+          {
+            assert(pc->args.size() >= 2);   // CHOOSE needs (numrep, type)
+            const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN;
+            const int numrep = pc->args[0];
+            const int type = pc->args[1];
+
+            // every bucket currently in w contributes its own numrep choices
+            assert(!w.empty());
+
+            // reset output
+            vector<int> out;
+
+            // forcefeeding?
+			bool forcing = false;
+			int forceval = -1;   // choose()'s default; was passed uninitialized when not forcing
+			if (!force_stack.empty()) {
+			  forceval = force_stack.front();
+			  force_stack.pop_front();
+			  //cout << "priming out with " << forceval << endl;
+			  forcing = true;
+			}
+
+            // do each row independently
+            for (vector<int>::iterator i = w.begin();
+                 i != w.end();
+                 i++) {
+              assert(buckets.count(*i));
+              Bucket *b = buckets[*i];
+			  choose(x, numrep, type, b, out, firstn,
+					 outset, overloadmap,
+					 forcing,
+					 forceval);
+			  forcing = false;  // only once
+            } // for inrow
+            
+            // put back into w
+            w.swap(out);
+            out.clear();
+          }
+          break;
+
+        case CRUSH_RULE_EMIT:
+          {
+            for (unsigned i=0; i<w.size(); i++)
+              result.push_back(w[i]);
+            //result[numresult++] = w[i];
+            w.clear();
+          }
+          break;
+
+        default:
+          assert(0);
+        }
+      }
+
+    }
+
+
+  };
+
+}
+
+#endif
diff --git a/branches/sage/cephmds2/crush/test/bucket_movement.cc b/branches/sage/cephmds2/crush/test/bucket_movement.cc
new file mode 100644
index 0000000000000..6be17356cb64c
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/bucket_movement.cc
@@ -0,0 +1,166 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Run rule for inputs 1..numpg and store each result vector in placement[x].
+// Crush::do_rule() requires an "out" set and an overload map; both are passed empty.
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> v(numrep);
+  map<int,int> ocount;          // times each item was chosen
+  set<int> outset;              // nothing marked out
+  map<int,float> overloadmap;   // nothing overloaded
+
+  for (int x=1; x<=numpg; x++) {
+	
+	//cout << H(x) << "\t" << h(x) << endl;
+	c.do_rule(rule, x, v, outset, overloadmap);
+	//cout << "v = " << v << endl;
+	
+	// do_rule() refills v with what it emitted, which can be fewer than numrep items
+	bool bad = false;
+	for (unsigned i=0; i<v.size(); i++) {
+	  ocount[v[i]]++;
+	  for (unsigned j=i+1; j<v.size(); j++) {
+		if (v[i] == v[j]) 
+		  bad = true;             // duplicate within one placement
+	  }
+	}
+	//if (bad)
+	// cout << "bad set " << x << ": " << v << endl;
+	
+	placement[x] = v;
+  }
+  
+  if (0)   // debug dump of per-item counts
+	for (map<int,int>::iterator it = ocount.begin();
+		 it != ocount.end();
+		 it++) 
+	  cout << it->first << "\t" << it->second << endl;
+}
+
+
+float testmovement(int n, float f, int buckettype)
+{
+  // Place a fixed set of inputs on a single bucket of n items, grow it by one item,
+  // place again, and print/return (fraction of replicas that moved) / (ideal fraction).
+  // buckettype: 0 tree, 1 or 2 list, 3 straw, 4 uniform.  f and h are not used.
+  Hash h(73232313);
+  Crush c;
+  int ndisks = 0;
+
+  // bucket
+  Bucket *b = 0;
+  if (buckettype == 0)
+	b = new TreeBucket(1);
+  else if (buckettype == 1 || buckettype == 2)
+	b = new ListBucket(1);
+  else if (buckettype == 3)
+	b = new StrawBucket(1);
+  else if (buckettype == 4)
+	b = new UniformBucket(0,0);
+  assert(b);   // any other buckettype used to leave b uninitialized
+  for (int i=0; i<n; i++)
+	b->add_item(ndisks++,1);
+
+  c.add_bucket(b);
+  int root = b->get_id();
+  
+  //c.print(cout,root);
+
+  // rule
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+  // number of inputs to place: pg_per per disk, shared among numrep replicas
+  int pg_per = 1000;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);
+
+  /*
+  cout << ndisks << " disks, " << endl;
+  cout << pg_per << " pgs per disk" << endl;
+    cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+  */
+  map<int, vector<int> > placement1, placement2;
+
+  //c.print(cout, root);
+
+  
+  // ORIGINAL
+  place(c, rule, numpg, numrep, placement1);
+  
+  int olddisks = ndisks;
+
+  // add item
+  if (buckettype == 2) {
+	// start over!  a second list bucket with n+1 items replaces the first as the rule's root
+	ndisks = 0;
+	b = new ListBucket(1);
+	for (int i=0; i<=n; i++)
+	  b->add_item(ndisks++,1);
+	c.add_bucket(b);
+	root = b->get_id();
+
+	rule.steps.clear();
+	rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+	rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+	rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  }
+  else
+	b->add_item(ndisks++, 1);
+
+
+  // ADDED
+  //c.print(cout, root);
+  place(c, rule, numpg, numrep, placement2);
+
+  int moved = 0;
+  for (int x=1; x<=numpg; x++) 
+	if (placement1[x] != placement2[x]) 
+	  for (int j=0; j<numrep; j++)
+		if (placement1[x][j] != placement2[x][j]) 
+		  moved++;
+
+  int total = numpg*numrep;
+  float actual = (float)moved / (float)(total);
+  float ideal = (float)(ndisks-olddisks) / (float)(ndisks);
+  float fac = actual/ideal;
+  //cout << add << "\t" << olddisks << "\t" << ndisks << "\t" << moved << "\t" << total << "\t" << actual << "\t" << ideal << "\t" << fac << endl;
+  cout << "\t" << fac;
+  return fac;
+}
+
+
+int main()
+{
+  // One output row per bucket size, one column per bucket type (0..4, in header order).
+  cout << "n\ttree\tlhead\tltail\tstraw\tuniform" << endl;
+  // sizes 2, 3, 4, then every 4th up to 64
+  int size = 2;
+  while (size <= 64) {
+	const float frac = 1.0 / (float)size;
+	cout << size;
+	for (int kind = 0; kind != 5; ++kind)
+	  testmovement(size, frac, kind);   // prints "\t<factor>" itself
+	cout << endl;
+	size += (size < 4) ? 1 : 4;
+  }
+}
+
diff --git a/branches/sage/cephmds2/crush/test/bucket_variance.cc b/branches/sage/cephmds2/crush/test/bucket_variance.cc
new file mode 100644
index 0000000000000..d2f553fb3a730
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/bucket_variance.cc
@@ -0,0 +1,199 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Build level h of the test hierarchy: a UniformBucket of wid[0] new disk ids at h == 0,
+// otherwise a StrawBucket over wid[h] recursively built children.  All are added to c.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  const int width = wid[h];
+  if (h != 0) {
+	// interior level: children are built (and get their ids) before the parent
+	Bucket *parent = new StrawBucket(h+1);
+	for (int k = 0; k < width; ++k) {
+	  Bucket *child = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	}
+	c.add_bucket(parent);
+	return parent;
+  }
+
+  // leaf level: uniform bucket of new disks
+  Hash hash(123);
+  vector<int> disks;
+  for (int k = 0; k < width; ++k)
+	disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  // the top level is the last entry of wid; return the id of the root bucket built for it
+  return make_bucket(c, wid, wid.size()-1, ndisks)->get_id();
+}
+
+
+float go(int dep) 
+{
+  // Build the hierarchy selected by dep (0, 1 or 2), place inputs on it in rounds, and
+  // return the variance of the per-disk placement counts averaged over the rounds.
+  Hash h(73232313);
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  if (0) {
+	for (int d=0; d<dep; d++)
+	  wid.push_back(10);
+  }
+  if (1) {
+	if (dep == 0) 
+	  wid.push_back(1000);
+	if (dep == 1) {
+	  wid.push_back(1);
+	  wid.push_back(1000);
+	}
+	if (dep == 2) {
+	  wid.push_back(5);
+	  wid.push_back(5);
+	  wid.push_back(8);
+	  wid.push_back(5);
+	}	
+  }
+
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+  if (0) {
+	MixedBucket *b = new MixedBucket(1);
+	for (int i=0; i<10000; i++)
+	  b->add_item(ndisks++, 10);
+	root = c.add_bucket(b);
+  }
+  if (0) {
+	vector<int> disks;
+	for (int i=0; i<10000; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 10000, disks);
+	Hash h(123);
+	b->make_primes(h);
+	root = c.add_bucket(b);
+  }
+  
+
+
+  // rule
+  int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);
+  //cout << ndisks << " disks, " << endl;
+  //cout << pg_per << " pgs per disk" << endl;
+  //  cout << numpg << " logical pgs" << endl;
+  //cout << "numrep is " << numrep << endl;
+  set<int> outset;              // Crush::do_rule() inputs, both left empty:
+  map<int,float> overloadmap;   // no item is out, no item is overloaded
+  int place = 100000;
+  int times = place / numpg;
+  if (!times) times = 1;
+
+  cout << "#looping " << times << " times" << endl;
+  
+  float tvar = 0;
+  int tvarnum = 0;
+
+  int x = 0;
+  for (int t=0; t<times; t++) {
+	vector<int> v(numrep);
+	
+	for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+	for (int xx=1; xx<numpg; xx++) {
+	  x++;
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v, outset, overloadmap);
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  bool bad = false;
+	  for (unsigned i=0; i<v.size(); i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+		for (unsigned j=i+1; j<v.size(); j++) {
+		  if (v[i] == v[j]) 
+			bad = true;
+		}
+	  }
+	  if (bad)
+		cout << "bad set " << x << ": " << v << endl;
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+	
+	/*
+	  for (int i=0; i<ocount.size(); i++) {
+	  cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	*/
+	
+	//cout << "collisions: " << c.collisions << endl;
+	//cout << "r bumps: " << c.bumps << endl;
+	
+	// mean and (population) variance of this round's per-disk counts
+	float avg = 0.0;
+	for (unsigned i=0; i<ocount.size(); i++)
+	  avg += ocount[i];
+	avg /= ocount.size();
+	float var = 0.0;
+	for (unsigned i=0; i<ocount.size(); i++)
+	  var += (ocount[i] - avg) * (ocount[i] - avg);
+	var /= ocount.size();
+	
+	//cout << "avg " << avg << "  var " << var << "   sd " << sqrt(var) << endl;
+	//cout << avg << "\t";
+	
+	tvar += var;
+	tvarnum++;
+  }
+
+  tvar /= tvarnum;
+
+  //cout << "total variance " << tvar << endl;
+
+  return tvar;
+}
+
+
+int main()
+{
+  // one row per depth setting: depth, averaged variance from go(), and its square root
+  for (int depth = 0; depth < 3; ++depth) {
+	const float variance = go(depth);
+	cout << depth << "\t" << variance << "\t" << sqrt(variance) << endl;
+  }
+}
diff --git a/branches/sage/cephmds2/crush/test/cluster_movement.cc b/branches/sage/cephmds2/crush/test/cluster_movement.cc
new file mode 100644
index 0000000000000..aa1418a834ce3
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/cluster_movement.cc
@@ -0,0 +1,217 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+int buckettype = 0;
+
+// Build level h of the test hierarchy and register every bucket with c and in buckets[h].
+// Level 0 is a UniformBucket of wid[0] new disk ids; higher levels hold wid[h] children in
+// a bucket picked by the global buckettype (0 tree, 1 or 2 list, 3 straw).
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  if (h == 0) {
+	// uniform
+	Hash hash(123);
+	vector<int> disks;
+	for (int i=0; i<wid[h]; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+	//b->make_primes(hash);  
+	c.add_bucket(b);
+	buckets[h].push_back(b);
+	return b;
+  } else {
+	// mixed
+	Bucket *b = 0;
+	if (buckettype == 0)
+	  b = new TreeBucket(h+1);
+	else if (buckettype == 1 || buckettype == 2)
+	  b = new ListBucket(h+1);
+	else if (buckettype == 3)
+	  b = new StrawBucket(h+1);
+	assert(b);   // an unrecognized buckettype used to leave b uninitialized
+
+	// the parent gets its id before the children are built beneath it
+	c.add_bucket(b);
+	for (int i=0; i<wid[h]; i++) {
+	  Bucket *n = make_bucket(c, wid, h-1, buckets, ndisks);
+	  b->add_item(n->get_id(), n->get_weight());
+	  n->set_parent(b->get_id());
+	}
+	buckets[h].push_back(b);
+	return b;
+  }
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  // the top level is the last entry of wid; return the id of the root bucket built for it
+  return make_bucket(c, wid, wid.size()-1, buckets, ndisks)->get_id();
+}
+
+
+// Run rule for inputs 1..numpg and store each result vector in placement[x].
+// Crush::do_rule() requires an "out" set and an overload map; both are passed empty.
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> v(numrep);
+  map<int,int> ocount;          // times each item was chosen
+  set<int> outset;              // nothing marked out
+  map<int,float> overloadmap;   // nothing overloaded
+
+  for (int x=1; x<=numpg; x++) {
+	
+	//cout << H(x) << "\t" << h(x) << endl;
+	c.do_rule(rule, x, v, outset, overloadmap);
+	//cout << "v = " << v << endl;
+	
+	// do_rule() refills v with what it emitted, which can be fewer than numrep items
+	bool bad = false;
+	for (unsigned i=0; i<v.size(); i++) {
+	  ocount[v[i]]++;
+	  for (unsigned j=i+1; j<v.size(); j++) {
+		if (v[i] == v[j]) 
+		  bad = true;             // duplicate within one placement
+	  }
+	}
+	
+	placement[x] = v;
+  }
+  
+  if (0)   // debug dump of per-item counts
+	for (map<int,int>::iterator it = ocount.begin();
+		 it != ocount.end();
+		 it++) 
+	  cout << it->first << "\t" << it->second << endl;
+}
+
+
+float testmovement(int depth, int branching, int udisks, int add, int modifydepth)
+{
+  Hash h(73232313);
+  // Builds a depth-level tree, places numpg inputs, attaches a new uniform bucket of
+  // `add` disks under a bucket of level modifydepth, places again, and prints/returns
+  Crush c;
+  // (fraction of replicas whose position changed) / (fraction of disks that are new).
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+  // wid[0] = disks per leaf bucket, wid[1..depth-1] = fan-out of each interior level
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int d=1; d<depth; d++)
+	wid.push_back(branching);
+
+  map< int, list<Bucket*> > buckets;
+  // make_hierarchy() fills buckets[level] and advances ndisks for every disk created
+  root = make_hierarchy(c, wid, buckets, ndisks);
+  
+  //c.print(cout,root);
+
+  // rule
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+  // number of inputs to place: pg_per per disk, shared among numrep replicas
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);
+
+  /*
+  cout << ndisks << " disks, " << endl;
+  cout << pg_per << " pgs per disk" << endl;
+    cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+  */
+  map<int, vector<int> > placement1, placement2;
+
+  //c.print(cout, root);
+
+  
+  // ORIGINAL
+  place(c, rule, numpg, numrep, placement1);
+  
+  int olddisks = ndisks;
+
+  // add disks
+  //cout << " adding " << add << " disks" << endl;
+  vector<int> disks;
+  for (int i=0; i<add; i++)
+	disks.push_back(ndisks++);
+  UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+  //b->make_primes(h);
+
+  // attach point: first bucket of the level for buckettype 2, last one otherwise
+  Bucket *o;
+  if (buckettype == 2)
+	o = buckets[modifydepth].front();
+  else
+	o = buckets[modifydepth].back();
+
+  c.add_bucket(b);
+  //cout << " adding under " << o->get_id() << endl;
+  c.add_item(o->get_id(), b->get_id(), b->get_weight());
+  //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight());
+  //newbucket = b;
+
+
+  // ADDED
+  //c.print(cout, root);
+  place(c, rule, numpg, numrep, placement2);
+  // count replica slots whose item differs between the two placements
+  int moved = 0;
+  for (int x=1; x<=numpg; x++) 
+	if (placement1[x] != placement2[x]) 
+	  for (int j=0; j<numrep; j++)
+		if (placement1[x][j] != placement2[x][j]) 
+		  moved++;
+
+  int total = numpg*numrep;
+  float actual = (float)moved / (float)(total);
+  float ideal = (float)(ndisks-olddisks) / (float)(ndisks);
+  float fac = actual/ideal;
+  //cout << add << "\t" << olddisks << "\t" << ndisks << "\t" << moved << "\t" << total << "\t" << actual << "\t" << ideal << "\t" << fac << endl;
+  cout << "\t" << fac;
+  return fac;
+}
+
+
+int main()
+{
+  // fixed test geometry
+  const int udisks = 10;       // disks per leaf (uniform) bucket
+  const int depth = 4;         // levels in the hierarchy
+  const int branching = 9;     // children per interior bucket
+  const int modifydepth = 1;   // level of the bucket that receives the new disks
+
+  // sweep multiplier and the disk count of the initial tree
+  const int step = (int)(sqrt((double)branching));
+  const int total = (int)(udisks * pow((float)branching, (float)depth-1));
+
+  cout << "// depth " << depth << ",  modifydepth " << modifydepth << ",  branching " << branching << ",  disks " << total << endl;
+  cout << "n\ttree\tlhead\tltail\tstraw" << endl;
+  // one row per number of added disks, one column per bucket type 0..3
+  int added = udisks;
+  while (added <= total) {
+	cout << added;
+	for (buckettype = 0; buckettype != 4; ++buckettype)
+	  testmovement(depth, branching, udisks, added, modifydepth);
+	cout << endl;
+	added *= step;
+  }
+}
+
diff --git a/branches/sage/cephmds2/crush/test/cluster_movement_remove.cc b/branches/sage/cephmds2/crush/test/cluster_movement_remove.cc
new file mode 100644
index 0000000000000..4a6560ecdc38b
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/cluster_movement_remove.cc
@@ -0,0 +1,229 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int buckettype = 2;  // interior bucket class used by make_bucket(): 0 = tree, 1 = list, 2 = straw
+
+int big_one_skip = 255;  // leaf countdown: the first leaf built once this is 0 becomes big_one
+int big_one_size;        // disk count given to that oversized leaf (set by testmovement)
+Bucket *big_one = 0;     // the oversized leaf bucket once built, 0 until then
+
+// Build level h of the test hierarchy, registering each bucket with c and in buckets[h].
+// Leaves are UniformBuckets of wid[0] new disks, except that the first leaf built once
+// big_one_skip has counted down to 0 gets big_one_size disks and is kept in big_one.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  if (h == 0) {
+	// uniform
+	Hash hash(123);
+	vector<int> disks;
+	int s = wid[h];
+	if (big_one_skip > 0) 
+	  big_one_skip--;	  
+	if (!big_one_skip && !big_one)
+	  s = big_one_size;
+
+	for (int i=0; i<s; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+	if (!big_one_skip && !big_one) big_one = b;
+	b->make_primes(hash);  
+	c.add_bucket(b);
+	buckets[h].push_back(b);
+	return b;
+  } else {
+	// mixed: class chosen by the global buckettype (0 tree, 1 list, 2 straw)
+	Bucket *b = 0;
+	if (buckettype == 0)
+	  b = new TreeBucket(h+1);
+	else if (buckettype == 1)
+	  b = new ListBucket(h+1);
+	else if (buckettype == 2)
+	  b = new StrawBucket(h+1);
+	assert(b);   // an unrecognized buckettype used to leave b uninitialized
+	c.add_bucket(b);
+	for (int i=0; i<wid[h]; i++) {
+	  Bucket *n = make_bucket(c, wid, h-1, buckets, ndisks);
+	  b->add_item(n->get_id(), n->get_weight());
+	  n->set_parent(b->get_id());
+	}
+	buckets[h].push_back(b);
+	return b;
+  }
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  // the top level is the last entry of wid; return the id of the root bucket built for it
+  return make_bucket(c, wid, wid.size()-1, buckets, ndisks)->get_id();
+}
+
+
+// Run rule for inputs 1..numpg and store each result vector in placement[x].
+// Crush::do_rule() requires an "out" set and an overload map; both are passed empty.
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> v(numrep);
+  map<int,int> ocount;          // times each item was chosen
+  set<int> outset;              // nothing marked out
+  map<int,float> overloadmap;   // nothing overloaded
+
+  for (int x=1; x<=numpg; x++) {
+	
+	//cout << H(x) << "\t" << h(x) << endl;
+	c.do_rule(rule, x, v, outset, overloadmap);
+	//cout << "v = " << v << endl;
+	
+	// do_rule() refills v with what it emitted, which can be fewer than numrep items
+	bool bad = false;
+	for (unsigned i=0; i<v.size(); i++) {
+	  ocount[v[i]]++;
+	  for (unsigned j=i+1; j<v.size(); j++) {
+		if (v[i] == v[j]) 
+		  bad = true;             // duplicate within one placement
+	  }
+	}
+	if (bad)
+	  cout << "bad set " << x << ": " << v << endl;
+	
+	placement[x] = v;
+  }
+  
+  if (0)   // debug dump of per-item counts
+	for (map<int,int>::iterator it = ocount.begin();
+		 it != ocount.end();
+		 it++) 
+	  cout << it->first << "\t" << it->second << endl;
+}
+
+
+float testmovement(int depth, int branching, int udisks, int add)
+{
+  Hash h(73232313);
+  // Builds a tree containing one oversized leaf bucket of `add` disks, places numpg
+  // inputs, adjusts that bucket to 0, places again, and prints a result row; returns
+  Crush c;
+  // (fraction of replicas whose position changed) / (fraction of disks removed).
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+  // wid[0] = disks per ordinary leaf; level 2 gets one extra child
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int d=1; d<depth; d++)
+	wid.push_back(branching + ((d==2)?1:0));
+
+  map< int, list<Bucket*> > buckets;
+  // globals read by make_bucket(): size of the oversized leaf, and "not built yet"
+  big_one_size = add;
+  big_one = 0;
+  
+  //cout << "making tree" << endl;
+  root = make_hierarchy(c, wid, buckets, ndisks);
+  
+  //c.print(cout, root);
+
+
+  // rule
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+  // number of inputs to place: pg_per per disk, shared among numrep replicas
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);
+
+  /*
+  cout << ndisks << " disks, " << endl;
+  cout << pg_per << " pgs per disk" << endl;
+    cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+  */
+  map<int, vector<int> > placement1, placement2;
+
+  //c.print(cout, root);
+
+  int olddisks = ndisks;
+
+  // placement before the removal
+  place(c, rule, numpg, numrep, placement1);
+  
+  if (1) {
+	// remove disks
+	assert(big_one);
+	c.adjust_item(big_one->get_id(), 0);  // NOTE(review): Crush in ../crush.h declares no adjust_item() -- confirm
+  }
+
+  int newdisks = ndisks - add;
+  // placement after the removal
+  //c.print(cout, root);
+  place(c, rule, numpg, numrep, placement2);
+  // count replica slots whose item differs between the two placements
+  int moved = 0;
+  for (int x=1; x<=numpg; x++) 
+	if (placement1[x] != placement2[x]) 
+	  for (int j=0; j<numrep; j++)
+		if (placement1[x][j] != placement2[x][j]) 
+		  moved++;
+
+  int total = numpg*numrep;
+  float actual = (float)moved / (float)(total);
+  //float ideal = (float)(newdisks-olddisks) / (float)(ndisks);
+  float ideal = (float)(olddisks-newdisks) / (float)(olddisks);
+  float fac = actual/ideal;
+  cout << add << "\t" << olddisks << "\t" << ndisks << "\t" << moved << "\t" << total << "\t" << actual << "\t" << ideal << "\t" << fac << endl;
+  return fac;
+}
+
+
+int main()
+{
+  // Driver for the removal experiment: builds the tree repeatedly with one oversized
+  // leaf bucket of growing size and lets testmovement() report the data movement.
+  // tree geometry
+  const int udisks = 10;      // disks per ordinary leaf bucket
+  const int depth = 4;        // levels in the hierarchy
+  const int branching = 9;    // base fan-out of interior buckets
+
+  buckettype = 2;  // 0 = tree, 1 = linear, 2 = straw
+
+  // Sweep the size of the oversized leaf bucket from udisks upward by factors of 3,
+  // bounded by udisks * branching^(depth-1); every run prints its own result row.
+  const int limit = udisks * pow((float)branching, (float)depth-1);
+  int removed = udisks;
+  while (removed <= limit) {
+	big_one_skip = 9;   // re-arm the leaf countdown that make_bucket() consumes
+	testmovement(depth, branching, udisks, removed);
+	removed *= 3;
+  }
+
+  /*
+  cout << "##" << endl;
+  for (map<int, map<float,float> >::iterator i = r.begin();
+	   i != r.end();
+	   i++) {
+	cout << i->first;
+	for (map<float,float>::iterator j = i->second.begin();
+		 j != i->second.end();
+		 j++)
+	  cout << "\t" << j->first << "\t" << j->second;
+	cout << endl;
+  }
+  */
+}
+
diff --git a/branches/sage/cephmds2/crush/test/cluster_movement_rush.cc b/branches/sage/cephmds2/crush/test/cluster_movement_rush.cc
new file mode 100644
index 0000000000000..90cc197c24f65
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/cluster_movement_rush.cc
@@ -0,0 +1,218 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+int buckettype = 0;  // interior bucket class built by make_bucket(): 0 = tree, 1 and 2 = list, 3 = straw
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{ // Recursively builds level h of the hierarchy, records each new bucket in buckets[h], returns the level's bucket.
+  if (h == 0) {
+	// leaf level: a uniform bucket holding the next wid[0] disk ids
+	Hash hash(123);              // only needed by the disabled make_primes() call below
+	vector<int> disks;
+	for (int i=0; i<wid[h]; i++)
+	  disks.push_back(ndisks++); // ndisks doubles as the next unused disk id
+	UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+	//b->make_primes(hash);  
+	c.add_bucket(b);
+	//cout << h << " uniformbucket with " << wid[h] << " disks" << endl;
+	buckets[h].push_back(b);
+	return b;
+  } else {
+	// interior level: the bucket class is selected by the global buckettype
+	//   0 = tree, 1 or 2 = list, 3 = straw.
+	// The last branch is a plain else so that b is always initialized, even
+	// if buckettype holds a value outside 0..3.
+	Bucket *b;
+	if (buckettype == 0)
+	  b = new TreeBucket(h+1);
+	else if (buckettype == 1 || buckettype == 2)
+	  b = new ListBucket(h+1);
+	else
+	  b = new StrawBucket(h+1);
+
+	c.add_bucket(b);             // the parent is registered before its children are built
+	for (int i=0; i<wid[h]; i++) {
+	  Bucket *n = make_bucket(c, wid, h-1, buckets, ndisks);
+	  b->add_item(n->get_id(), n->get_weight());
+	  n->set_parent(b->get_id());
+	}
+	buckets[h].push_back(b);
+	//cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl;
+	return b;
+  }
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  // build from the top level (the last entry of wid) and hand back the root's id
+  return make_bucket(c, wid, wid.size()-1, buckets, ndisks)->get_id();
+}
+
+
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  // Runs the rule for every pg id in 1..numpg and stores the resulting
+  // device vector in placement[pg].
+  vector<int> chosen(numrep);
+  map<int,int> hits;             // device id -> number of times it was chosen
+
+  int pg = 1;
+  while (pg <= numpg) {
+	c.do_rule(rule, pg, chosen);
+
+	// tally the chosen devices and look for one that appears twice
+	bool dup = false;
+	for (int a=0; a<numrep; a++) {
+	  hits[chosen[a]]++;
+	  int b = a+1;
+	  while (b < numrep) {
+		if (chosen[a] == chosen[b])
+		  dup = true;
+		b++;
+	  }
+	}
+
+	placement[pg] = chosen;
+	pg++;
+  }
+
+  // per-device totals; the dump is compiled in but switched off
+  if (0) {
+	map<int,int>::iterator it = hits.begin();
+	for (; it != hits.end(); it++)
+	  cout << it->first << "\t" << it->second << endl;
+  }
+}
+
+
+float testmovement(int depth, int branching, int udisks, int add, int modifydepth)  // Places pgs, adds `add` disks, places again, and reports how much moved.
+{  // Prints "\t<fac>" to cout and returns fac = (fraction of replicas that moved) / (fraction of disks that are new).
+  Hash h(73232313);  // not referenced again except in a commented-out line
+
+  // crush
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+  
+  vector<int> wid;  // wid[0] = udisks, wid[1..depth-1] = branching
+  wid.push_back(udisks);
+  for (int d=1; d<depth; d++)
+	wid.push_back(branching);
+
+  map< int, list<Bucket*> > buckets;  // filled by make_hierarchy(); indexed below by modifydepth
+
+  root = make_hierarchy(c, wid, buckets, ndisks);
+  
+  //c.print(cout,root);
+
+  // rule
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;  // computed from the disk count before any disks are added
+  
+  vector<int> ocount(ndisks);  // not used below
+
+  /*
+  cout << ndisks << " disks, " << endl;
+  cout << pg_per << " pgs per disk" << endl;
+    cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+  */
+  map<int, vector<int> > placement1, placement2;  // placements before / after the addition
+
+  //c.print(cout, root);
+
+  
+  // ORIGINAL
+  place(c, rule, numpg, numrep, placement1);
+  
+  int olddisks = ndisks;
+
+  // add disks
+  //cout << " adding " << add << " disks" << endl;
+  vector<int> disks;
+  for (int i=0; i<add; i++)
+	disks.push_back(ndisks++);
+  UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+  //b->make_primes(h);
+
+  //Bucket *o = buckets[2].back();
+  Bucket *o;  // existing bucket at level modifydepth that receives the new bucket
+  if (buckettype == 2)
+	o = buckets[modifydepth].front();
+  else
+	o = buckets[modifydepth].back();
+
+  c.add_bucket(b);
+  //cout << " adding under " << o->get_id() << endl;
+  c.add_item(o->get_id(), b->get_id(), b->get_weight(), buckettype == 2);  // NOTE(review): meaning of the 4th argument is not visible here; presumably head-vs-tail insertion - confirm in crush.h
+  //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight());
+  //newbucket = b;
+
+
+  // ADDED
+  //c.print(cout, root);
+  place(c, rule, numpg, numrep, placement2);
+
+  int moved = 0;  // number of replica slots whose device differs between the two placements
+  for (int x=1; x<=numpg; x++) 
+	if (placement1[x] != placement2[x]) 
+	  for (int j=0; j<numrep; j++)
+		if (placement1[x][j] != placement2[x][j]) 
+		  moved++;
+
+  int total = numpg*numrep;
+  float actual = (float)moved / (float)(total);
+  float ideal = (float)(ndisks-olddisks) / (float)(ndisks);  // share of the enlarged cluster made up of new disks
+  float fac = actual/ideal;
+  //cout << add << "\t" << olddisks << "\t" << ndisks << "\t" << moved << "\t" << total << "\t" << actual << "\t" << ideal << "\t" << fac << endl;
+  cout << "\t" << fac;
+  return fac;
+}
+
+
+int main() 
+{
+  // Prints a table of movement factors: one row per number of added disks,
+  // one column per bucket type (see the header line printed below).
+  int udisks = 10;
+
+  //int depth = 3;
+  //int branching = 25;
+  int depth = 2;
+  int branching = 9*9*9;
+
+  int modifydepth = 1;
+  int bfac = 3;   // multiplier applied to add after every row
+  int n = (int)(udisks * pow((float)branching, (float)depth-1));
+
+  cout << "// depth " << depth << ",  modifydepth " << modifydepth << ",  branching " << branching << ",  disks " << n << endl;
+  cout << "n\ttree\tlhead\tltail\tstraw" << endl;
+  for (int add = udisks; add <= n; add *= bfac) {
+	cout << add;
+	// buckettype 0..3 = tree, list, list, straw; the bound includes 3 so the "straw" column of the header is filled
+	for (buckettype=0; buckettype<=3; buckettype++)
+	  testmovement(depth, branching, udisks, add, modifydepth);
+	cout << endl;
+  }
+}
+
diff --git a/branches/sage/cephmds2/crush/test/creeping_failure.cc b/branches/sage/cephmds2/crush/test/creeping_failure.cc
new file mode 100644
index 0000000000000..ce27535e61dc8
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/creeping_failure.cc
@@ -0,0 +1,276 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include "../../common/Clock.h"
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Clock g_clock;  // global clock; go() calls g_clock.now() to time its two placement passes
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  // Level 0 yields a uniform bucket of wid[0] fresh disks; any other level
+  // yields a tree bucket with wid[h] recursively built children.
+  if (h != 0) {
+	Bucket *parent = new TreeBucket(h+1);
+	int made = 0;
+	while (made < wid[h]) {
+	  Bucket *child = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	  made++;
+	}
+	c.add_bucket(parent);   // registered after its children
+	return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> ids;
+  for (int k=0; k<wid[h]; k++, ndisks++)
+	ids.push_back(ndisks);
+  float weight = 10;
+  UniformBucket *leaf = new UniformBucket(1, 0, weight, ids);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  // build from the top level (the last entry of wid) and hand back the root's id
+  return make_bucket(c, wid, wid.size()-1, ndisks)->get_id();
+}
+
+
+
+float go(int dep, int failpc)  // One experiment: dep-level hierarchy (10 entries per level) with failpc percent of the disks marked out.
+{  // Prints one tab-separated result row to cout and returns tvar[0].
+  Hash h(73232313);  // not referenced again in this function
+
+  //int overloadcutoff = (int)((float)10000.0 / (float)utilization);
+
+  //cout << "util " << utilization << " cutoff " << overloadcutoff << endl;
+  // crush
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  for (int d=0; d<dep; d++)
+	wid.push_back(10);
+
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+
+  //cout << ndisks << " disks" << endl;
+
+
+  int numf = ndisks * failpc / 100;  // number of disks marked out in every round
+
+
+
+
+  // rule
+  int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+  int pg_per_base = 100;//20;
+  int pg_med = 10*pg_per_base;  // not used below
+  int pg_per = pg_per_base*5.5;//100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);  // per-disk placement counter
+  //cout << ndisks << " disks, " << endl;
+  //cout << pg_per << " pgs per disk" << endl;
+  //  cout << numpg << " logical pgs" << endl;
+  //cout << "numrep is " << numrep << endl;
+
+
+  int place = 1000000;
+  int times = place / numpg;  // number of rounds, at least one
+  if (!times) times = 1;
+  
+
+  //cout << "looping " << times << " times" << endl;
+  
+  float tavg[10];  // per-group sums over all rounds; divided by tvarnum after the loop
+  float tvar[10];
+  for (int j=0;j<10;j++) {
+	tvar[j] = 0;
+	tavg[j] = 0;
+  }
+  int tvarnum = 0;  // number of completed rounds
+  float trvar = 0.0;
+
+  float overloadsum = 0.0;  // the next three sums are never updated in this function
+  float adjustsum = 0.0;
+  float afteroverloadsum = 0.0;
+  float aslowdown = 0.0;
+  int chooses = 0;  // never updated (the increment below is commented out)
+  int xs = 1;  // first input value of the current round; advanced by numpg per round
+  for (int t=0; t<times; t++) {
+	vector<int> v(numrep);
+	
+	c.out.clear();  // every round starts with no disk marked out
+
+	for (int z=0; z<ndisks; z++) ocount[z] = 0;
+	
+	utime_t t1a = g_clock.now();  // pass 1: place with nothing marked out (timed)
+	for (int x=xs; x<numpg+xs; x++) {
+	  c.do_rule(rule, x, v);
+	  //chooses += numrep;
+	  for (int i=0; i<v.size(); i++) {
+		//if (v[i] >= ndisks) cout << "v[i] " << i << " is " << v[i] << "  .. x = " << x << endl;
+		//assert(v[i] < ndisks);
+		ocount[v[i]]++;
+	  }
+	}
+	utime_t t1b = g_clock.now();
+
+	// add in numf failed disks
+	for (int f = 0; f < numf; f++) {
+	  int d = rand() % ndisks;
+	  while (c.out.count(d)) d = rand() % ndisks;  // redraw until a disk that is not yet out is found
+	  c.out.insert(d);
+	}
+
+	utime_t t3a = g_clock.now();  // pass 2: same inputs, now with numf disks out (timed)
+	for (int x=xs; x<numpg+xs; x++) {  // NOTE(review): ocount is not reset before this pass, so the statistics below cover pass 1 + pass 2 - confirm this is intended
+	  c.do_rule(rule, x, v);
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  for (int i=0; i<v.size(); i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+	  }
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+	xs += numpg;
+
+	utime_t t3b = g_clock.now();
+
+	t1b -= t1a;
+	double t1 = (double)t1b;
+	t3b -= t3a;
+	double t3 = (double)t3b;
+	double slowdown = t3/t1;  // duration of pass 2 relative to pass 1
+	//cout << "slowdown " << slowdown << endl;
+	aslowdown += slowdown;
+
+	//cout << "collisions: " << c.collisions << endl;
+	//cout << "r bumps: " << c.bumps << endl;
+	
+	// stair var calc
+	int n = ndisks/10;  // disks are split into 10 consecutive groups of n
+	float avg[10];
+	float var[10];
+	for (int i=0;i<10;i++) {
+	  int s = n*i;  // first disk index of group i
+	  avg[i] = 0.0;
+	  int nf = 0;  // disks of this group that are marked out
+	  for (int j=0; j<n; j++) {
+		if (c.out.count(j+s)) { nf++; continue; }
+		avg[i] += ocount[j+s];
+	  }
+	  avg[i] /= (n-nf);//ocount.size();
+	  var[i] = 0.0;
+	  for (int j=0; j<n; j++) {
+		if (c.out.count(j+s)) continue;
+		var[i] += (ocount[j+s] - avg[i]) * (ocount[j+s] - avg[i]);
+	  }
+	  var[i] /= (n-nf);//ocount.size();
+
+	  tvar[i] += var[i];
+	  tavg[i] += avg[i];
+	}
+	//cout << "avg " << avg << "  var " << var << "   sd " << sqrt(var) << endl;
+	
+	tvarnum++;
+
+	// flat var calc
+	int na = ndisks - numf;  // num active
+	float ravg = 0.0;
+	for (int i=0;i<ndisks;i++) {
+	  if (c.out.count(i)) continue;
+	  ravg += ocount[i];
+	}
+	ravg /= (float)na;
+	float rvar = 0.0;
+	for (int i=0; i<ndisks; i++) {
+	  if (c.out.count(i)) continue;
+	  rvar += (ravg-(float)ocount[i])*(ravg-(float)ocount[i]);
+	}
+	rvar /= (float)na;
+
+	trvar += rvar;
+  }
+
+
+  trvar /= (float)tvarnum;  // from here on: turn the per-round sums into averages
+
+  //overloadsum /= tvarnum;
+  //adjustsum /= tvarnum;
+  float avar = 0.0;
+  for (int j=0;j<10;j++) {
+	tvar[j] /= tvarnum;
+	tavg[j] /= tvarnum;
+	avar += tvar[j];
+  }
+  avar /= 10;
+  avar = sqrt(avar);  // square root of the mean group variance, then scaled by pg_per_base
+  avar /= /*5.5 **/ (float)pg_per_base;
+  //afteroverloadsum /= tvarnum;
+  aslowdown /= tvarnum;
+
+  //int collisions = c.collisions[0] + c.collisions[1] + c.collisions[2] + c.collisions[3];
+  //float crate = (float) collisions / (float)chooses;
+  //cout << "collisions: " << c.collisions << endl;
+
+
+  //cout << "total variance " << tvar << endl;
+  //cout << " overlaod " << overloadsum << endl;
+  
+  cout << failpc 
+	   << "\t" << numf 
+	//<< "\t" << adjustsum 
+	//<< "\t" << afteroverloadsum 
+	   << "\t" << aslowdown 
+	   << "\t" << trvar
+	   << "\t" << sqrt(trvar) / (float)pg_per_base
+	   << "\t..\t" << avar 
+	   << "\t-"; 
+
+  for (int i=0;i<10;i++)
+	cout << "\t" << tavg[i] << "\t" << sqrt(tvar[i]);// << "\t" << tvar[i]/tavg[i];
+
+  cout << endl;
+  return tvar[0];
+}
+
+
+int main() 
+{
+  // sweep the failure percentage 0, 5, ..., 85 on a depth-3 hierarchy;
+  // go() prints one result row per call, its return value is not needed
+  int pc = 0;
+  while (pc < 90) { go(3, pc); pc += 5; }
+  return 0;
+}
diff --git a/branches/sage/cephmds2/crush/test/creeping_failure_variance.cc b/branches/sage/cephmds2/crush/test/creeping_failure_variance.cc
new file mode 100644
index 0000000000000..c7a65a069d9c3
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/creeping_failure_variance.cc
@@ -0,0 +1,281 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  // Level 0 yields a uniform bucket of wid[0] fresh disks; any other level
+  // yields a mixed bucket with wid[h] recursively built children.
+  if (h != 0) {
+	MixedBucket *parent = new MixedBucket(h+1);
+	int made = 0;
+	while (made < wid[h]) {
+	  Bucket *child = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	  made++;
+	}
+	c.add_bucket(parent);   // registered after its children
+	return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> ids;
+  for (int k=0; k<wid[h]; k++, ndisks++)
+	ids.push_back(ndisks);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, ids);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  // build from the top level (the last entry of wid) and hand back the root's id
+  return make_bucket(c, wid, wid.size()-1, ndisks)->get_id();
+}
+
+
+Bucket *make_random(Crush& c, int wid, int height, int& ndisks)
+{
+  // Builds a random subtree: fan-out drawn from 2..wid, interior level from 1..height.
+  const int fanout = 2 + rand() % (wid-1);
+
+  if (height != 0) {
+	// interior bucket whose children are built one level below the drawn level
+	const int level = 1 + rand() % height;
+	MixedBucket *parent = new MixedBucket(level+1);
+	int made = 0;
+	while (made < fanout) {
+	  Bucket *child = make_random(c, wid, level-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	  made++;
+	}
+	c.add_bucket(parent);
+	return parent;
+  }
+
+  // leaf: a uniform bucket over the next `fanout` disk ids
+  Hash hash(123);
+  vector<int> ids;
+  for (int k=0; k<fanout; k++, ndisks++)
+	ids.push_back(ndisks);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, ids);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+
+float go(int dep, int overloadcutoff)  // One experiment: place, mark heavily loaded disks in c.overload, place again, report the statistics.
+{  // Prints one tab-separated result row to cout and returns the variance averaged over all rounds.
+  Hash h(73232313);  // not referenced again (the Hash h inside the disabled branch below is a separate object)
+
+  // crush
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  for (int d=0; d<dep; d++)
+	wid.push_back(10);
+
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+  if (0) {  // the three if (0) branches are alternative, currently disabled cluster layouts
+	Bucket *r = make_random(c, 20,  4, ndisks);
+	root = r->get_id();
+	//c.print(cout, root);
+  }
+  if (0) {
+	MixedBucket *b = new MixedBucket(1);
+	for (int i=0; i<10000; i++)
+	  b->add_item(ndisks++, 10);
+	root = c.add_bucket(b);
+  }
+  if (0) {
+	vector<int> disks;
+	for (int i=0; i<10000; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 10000, disks);
+	Hash h(123);
+	b->make_primes(h);
+	root = c.add_bucket(b);
+  }
+  //cout << ndisks << " disks" << endl;
+  
+
+
+  // rule
+  int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);  // per-disk placement counter
+  //cout << ndisks << " disks, " << endl;
+  //cout << pg_per << " pgs per disk" << endl;
+  //  cout << numpg << " logical pgs" << endl;
+  //cout << "numrep is " << numrep << endl;
+
+
+  int place = 1000000;
+  int times = place / numpg;  // number of rounds, at least one
+  if (!times) times = 1;
+  
+
+  //cout << "looping " << times << " times" << endl;
+  
+  float tvar = 0;
+  int tvarnum = 0;  // number of completed rounds
+
+  float overloadsum = 0.0;  // per-round fractions of disks, summed here and averaged after the loop
+  float adjustsum = 0.0;
+  float afteroverloadsum = 0.0;
+  int chooses = 0;  // counts first-pass choices only; used as the collision-rate denominator
+  int xs = 1;  // first input value of the current round; advanced by numpg per round
+  for (int t=0; t<times; t++) {
+	vector<int> v(numrep);
+	
+	c.overload.clear();  // every round starts without overload adjustments
+
+	for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+	for (int x=xs; x<numpg+xs; x++) {  // pass 1: unadjusted placement
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  chooses += numrep;
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  bool bad = false;  // set when the same device appears twice in v
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+		for (int j=i+1; j<numrep; j++) {
+		  if (v[i] == v[j]) 
+			bad = true;
+		}
+	  }
+	  if (bad)
+		cout << "bad set " << x << ": " << v << endl;
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+
+	// overloaded?
+	int overloaded = 0;
+	int adjusted = 0;
+	for (int i=0; i<ocount.size(); i++) {
+	  if (ocount[i] > overloadcutoff) 
+		overloaded++;
+
+	  if (ocount[i] > 100+(overloadcutoff-100)/2) {  // adjust once a disk is past the midpoint between 100 and the cutoff
+		adjusted++;
+		c.overload[i] = 100.0 / (float)ocount[i];
+		//cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	  ocount[i] = 0;  // counters are cleared here for pass 2
+	}
+	//cout << overloaded << " overloaded" << endl;
+	overloadsum += (float)overloaded / (float)ndisks;
+	adjustsum += (float)adjusted / (float)ndisks;
+
+
+	for (int x=xs; x<numpg+xs; x++) {  // pass 2: same inputs with c.overload set
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  bool bad = false;
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+		for (int j=i+1; j<numrep; j++) {
+		  if (v[i] == v[j]) 
+			bad = true;
+		}
+	  }
+	  if (bad)
+		cout << "bad set " << x << ": " << v << endl;
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+	xs += numpg;
+
+	int still = 0;  // disks above the cutoff after pass 2
+	for (int i=0; i<ocount.size(); i++) {
+	  if (ocount[i] > overloadcutoff) {
+		still++;
+		//c.overload[ocount[i]] = 100.0 / (float)ocount[i];
+		//cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	}
+	//if (still) cout << "overload was " << overloaded << " now " << still << endl;
+	afteroverloadsum += (float)still / (float)ndisks;
+	
+	//cout << "collisions: " << c.collisions << endl;
+	//cout << "r bumps: " << c.bumps << endl;
+	
+	float avg = 0.0;  // mean and variance of the pass-2 per-disk counts
+	for (int i=0; i<ocount.size(); i++)
+	  avg += ocount[i];
+	avg /= ocount.size();
+	float var = 0.0;
+	for (int i=0; i<ocount.size(); i++)
+	  var += (ocount[i] - avg) * (ocount[i] - avg);
+	var /= ocount.size();
+	
+	//cout << "avg " << avg << "  var " << var << "   sd " << sqrt(var) << endl;
+	
+	tvar += var;
+	tvarnum++;
+  }
+
+  overloadsum /= tvarnum;  // turn the per-round sums into averages
+  adjustsum /= tvarnum;
+  tvar /= tvarnum;
+  afteroverloadsum /= tvarnum;
+
+  int collisions = c.collisions[0] + c.collisions[1] + c.collisions[2] + c.collisions[3];
+  float crate = (float) collisions / (float)chooses;
+  //cout << "collisions: " << c.collisions << endl;
+
+
+  //cout << "total variance " << tvar << endl;
+  //cout << " overlaod " << overloadsum << endl;
+
+  cout << overloadcutoff << "\t" << (10000.0 / (float)overloadcutoff) << "\t" << tvar << "\t" << overloadsum << "\t" << adjustsum << "\t" << afteroverloadsum << "\t" << crate << endl;
+  return tvar;
+}
+
+
+int main() 
+{
+  // sweep the overload cutoff downwards: 140, 135, ..., 105, always on a
+  // depth-3 hierarchy; go() prints the result row, its return value is unused
+  int cutoff = 140;
+  while (cutoff > 100) { go(3, cutoff); cutoff -= 5; }
+  return 0;
+}
diff --git a/branches/sage/cephmds2/crush/test/depth_variance.cc b/branches/sage/cephmds2/crush/test/depth_variance.cc
new file mode 100644
index 0000000000000..7d60ebaae9501
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/depth_variance.cc
@@ -0,0 +1,185 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  // Level 0 yields a uniform bucket of wid[0] fresh disks; any other level
+  // yields a tree bucket with wid[h] recursively built children.
+  if (h != 0) {
+	Bucket *parent = new TreeBucket(h+1);
+	int made = 0;
+	while (made < wid[h]) {
+	  Bucket *child = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	  made++;
+	}
+	c.add_bucket(parent);   // registered after its children
+	return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> ids;
+  for (int k=0; k<wid[h]; k++, ndisks++)
+	ids.push_back(ndisks);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, ids);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  // build from the top level (the last entry of wid) and hand back the root's id
+  return make_bucket(c, wid, wid.size()-1, ndisks)->get_id();
+}
+
+
+float go(int dep) 
+{
+  // Builds a dep-level hierarchy with 10 entries per level, places numpg
+  // inputs per round and measures how evenly they spread over the disks.
+  // Prints a summary line and returns the variance averaged over all rounds.
+  Hash h(73232313);  // not referenced again in this function
+  Crush c;
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  if (1) {
+	for (int d=0; d<dep; d++)
+	  wid.push_back(10);
+  }
+  if (0) {  // alternative, currently disabled layouts
+	if (dep == 0) 
+	  wid.push_back(1000);
+	if (dep == 1) {
+	  wid.push_back(1);
+	  wid.push_back(1000);
+	}
+	if (dep == 2) {
+	  wid.push_back(5);
+	  wid.push_back(5);
+	  wid.push_back(8);
+	  wid.push_back(5);
+	}	
+  }
+
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+  
+
+
+  // rule
+  int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);  // per-disk placement counter, cleared every round
+  //cout << ndisks << " disks, " << endl;
+  //cout << pg_per << " pgs per disk" << endl;
+  //  cout << numpg << " logical pgs" << endl;
+  //cout << "numrep is " << numrep << endl;
+
+
+  int place = 100000;
+  int times = place / numpg;  // number of rounds, at least one
+  if (!times) times = 1;
+
+  cout << "#looping " << times << " times" << endl;
+  
+  float tvar = 0;
+  int tvarnum = 0;
+  float tavg = 0;
+
+  int x = 0;  // running input value; keeps increasing across rounds
+  for (int t=0; t<times; t++) {
+	vector<int> v(numrep);
+	
+	for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+	for (int xx=0; xx<numpg; xx++) {  // exactly numpg placements per round (the loop used to start at 1 and did one too few)
+	  x++;
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  bool bad = false;  // set when a device appears twice in v; not reported
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+		for (int j=i+1; j<numrep; j++) {
+		  if (v[i] == v[j]) 
+			bad = true;
+		}
+	  }
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+	
+	/*
+	  for (int i=0; i<ocount.size(); i++) {
+	  cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	*/
+	
+	//cout << "collisions: " << c.collisions << endl;
+	//cout << "r bumps: " << c.bumps << endl;
+	
+	// mean and variance of the per-disk counts of this round
+	float avg = 0.0;
+	for (int i=0; i<ocount.size(); i++)
+	  avg += ocount[i];
+	avg /= ocount.size();
+	float var = 0.0;
+	for (int i=0; i<ocount.size(); i++)
+	  var += (ocount[i] - avg) * (ocount[i] - avg);
+	var /= ocount.size();
+	
+	if (times < 10) 
+	  cout << "avg " << avg << "   evar " << sqrt(avg) << "   sd " << sqrt(var) << endl;
+	//cout << avg << "\t";
+	
+	tvar += var;
+	tavg += avg;
+	tvarnum++;
+  }
+
+  tavg /= tvarnum;  // turn the per-round sums into averages
+  tvar /= tvarnum;
+
+  cout << "total variance " << sqrt(tvar) << "   expected " << sqrt(tavg) << endl;
+
+  return tvar;
+}
+
+
+int main() 
+{
+  // run the experiment for hierarchy depths 2, 3, 4 and 5;
+  // go() prints its own results, so the returned variance is not used
+  int depth = 2;
+  while (depth <= 5) { go(depth); depth++; }
+  return 0;
+}
diff --git a/branches/sage/cephmds2/crush/test/mixed.cc b/branches/sage/cephmds2/crush/test/mixed.cc
new file mode 100644
index 0000000000000..5666f7be4717c
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/mixed.cc
@@ -0,0 +1,300 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  // Level 0 yields a uniform bucket of wid[0] fresh disks; any other level
+  // yields a tree bucket with wid[h] recursively built children.
+  if (h != 0) {
+	Bucket *parent = new TreeBucket(h+1);
+	int made = 0;
+	while (made < wid[h]) {
+	  Bucket *child = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	  made++;
+	}
+	c.add_bucket(parent);   // registered after its children
+	return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> ids;
+  for (int k=0; k<wid[h]; k++, ndisks++)
+	ids.push_back(ndisks);
+  // the weight grows by 10 for every further hundred disk ids (integer division)
+  float weight = ((ndisks-1)/100+1)*10;
+  UniformBucket *leaf = new UniformBucket(1, 0, weight, ids);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  // build from the top level (the last entry of wid) and hand back the root's id
+  return make_bucket(c, wid, wid.size()-1, ndisks)->get_id();
+}
+
+
+
+float go(int dep, int overloadcutoff)  // One experiment: three placement passes per round, with c.overload adjusted after the first two.
+{  // Prints one tab-separated result row to cout and returns tvar[0].
+  Hash h(73232313);  // not referenced again in this function
+
+  // crush
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  for (int d=0; d<dep; d++)
+	wid.push_back(10);
+
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+
+
+  // rule
+  int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per_base = 10;
+  int pg_per = pg_per_base*5.5;//100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);  // per-disk placement counter
+  //cout << ndisks << " disks, " << endl;
+  //cout << pg_per << " pgs per disk" << endl;
+  //  cout << numpg << " logical pgs" << endl;
+  //cout << "numrep is " << numrep << endl;
+
+
+  int place = 100000;
+  int times = place / numpg;  // number of rounds, at least one
+  if (!times) times = 1;
+  
+
+  //cout << "looping " << times << " times" << endl;
+  
+  float tavg[10];  // per-group sums over all rounds; divided by tvarnum after the loop
+  float tvar[10];
+  for (int j=0;j<10;j++) {
+	tvar[j] = 0;
+	tavg[j] = 0;
+  }
+  int tvarnum = 0;  // number of completed rounds
+
+  float overloadsum = 0.0;  // per-round fractions of disks, summed here and averaged after the loop
+  float adjustsum = 0.0;
+  float afteroverloadsum = 0.0;
+  int chooses = 0;  // incremented in pass 1 but only read by commented-out code
+  int xs = 1;  // first input value of the current round; advanced by numpg per round
+  for (int t=0; t<times; t++) {
+	vector<int> v(numrep);
+	
+	c.overload.clear();  // every round starts without overload adjustments
+
+	for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+	for (int x=xs; x<numpg+xs; x++) {  // pass 1: unadjusted placement
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  chooses += numrep;
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  bool bad = false;  // set when a device appears twice in v; the report below is commented out
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+		for (int j=i+1; j<numrep; j++) {
+		  if (v[i] == v[j]) 
+			bad = true;
+		}
+	  }
+	  //if (bad)
+	  //cout << "bad set " << x << ": " << v << endl;
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+
+	// overloaded?
+	int overloaded = 0;
+	int adjusted = 0;
+	for (int i=0; i<ocount.size(); i++) {
+	  int target = (i/100+1)*10;  // same expression shape as the weight formula in make_bucket()
+	  int cutoff = target * overloadcutoff / 100;  // overloadcutoff is a percentage of target
+	  int adjoff = target + (cutoff - target)*3/4;  // adjust once a disk is 3/4 of the way from target to cutoff
+	  if (ocount[i] > cutoff) 
+		overloaded++;
+
+	  if (ocount[i] > adjoff) {
+		adjusted++;
+		c.overload[i] = (float)target / (float)ocount[i];
+		//cout << "setting overload " << i << " to " << c.overload[i] << endl;
+		//cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	  ocount[i] = 0;  // counters are cleared here for the next pass
+	}
+	//cout << overloaded << " overloaded" << endl;
+	overloadsum += (float)overloaded / (float)ndisks;
+	adjustsum += (float)adjusted / (float)ndisks;
+
+
+
+	if (1) {
+	  // second pass
+	  for (int x=xs; x<numpg+xs; x++) {
+		
+		//cout << H(x) << "\t" << h(x) << endl;
+		c.do_rule(rule, x, v);
+		//cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+		
+		bool bad = false;
+		for (int i=0; i<numrep; i++) {
+		  //int d = b.choose_r(x, i, h);
+		  //v[i] = d;
+		  ocount[v[i]]++;
+		  for (int j=i+1; j<numrep; j++) {
+			if (v[i] == v[j]) 
+			  bad = true;
+		  }
+		}
+		
+		//cout << v << "\t" << ocount << endl;
+	  }
+
+	  for (int i=0; i<ocount.size(); i++) {
+		int target = (i/100+1)*10;
+		int cutoff = target * overloadcutoff / 100;
+		int adjoff = cutoff;//target + (cutoff - target)*3/4;
+
+		if (ocount[i] >= adjoff) {
+		  adjusted++;  // no effect on adjustsum, which was already updated above
+		  if (c.overload.count(i) == 0) {
+			c.overload[i] = 1.0;
+			adjusted++;
+		  }
+		  //else cout << "(re)adjusting " << i << endl;
+		  c.overload[i] *= (float)target / (float)ocount[i];  // scales the existing factor (1.0 if the disk was not adjusted before)
+		  //cout << "setting overload " << i << " to " << c.overload[i] << endl;
+		  //cout << "disk " << i << " has " << ocount[i] << endl;
+		}
+		ocount[i] = 0;
+	  }
+	}
+
+	for (int x=xs; x<numpg+xs; x++) {  // final pass: its counts feed the statistics below
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  bool bad = false;
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+		for (int j=i+1; j<numrep; j++) {
+		  if (v[i] == v[j]) 
+			bad = true;
+		}
+	  }
+	  //cout << v << "\t" << ocount << endl;
+	}
+	xs += numpg;
+
+	int still = 0;  // disks above their cutoff after the final pass
+	for (int i=0; i<ocount.size(); i++) {
+	  int target = (i/100+1)*10;
+	  int cutoff = target * overloadcutoff / 100;
+	  int adjoff = target + (cutoff - target)/3;  // not used in this loop
+
+	  if (ocount[i] > cutoff) {
+		still++;
+		//c.overload[ocount[i]] = 100.0 / (float)ocount[i];
+		if (c.overload.count(i)) cout << "[adjusted] ";
+		cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	}
+	//if (still) cout << "overload was " << overloaded << " now " << still << endl;
+	afteroverloadsum += (float)still / (float)ndisks;
+	
+	//cout << "collisions: " << c.collisions << endl;
+	//cout << "r bumps: " << c.bumps << endl;
+	
+	int n = ndisks/10;  // disks are split into 10 consecutive groups of n
+	float avg[10];
+	float var[10];
+	for (int i=0;i<10;i++) {
+	  int s = n*i;  // first disk index of group i
+	  avg[i] = 0.0;
+	  for (int j=0; j<n; j++)
+		avg[i] += ocount[j+s];
+	  avg[i] /= n;//ocount.size();
+	  var[i] = 0.0;
+	  for (int j=0; j<n; j++)
+		var[i] += (ocount[j+s] - avg[i]) * (ocount[j+s] - avg[i]);
+	  var[i] /= n;//ocount.size();
+
+	  tvar[i] += var[i];
+	  tavg[i] += avg[i];
+	}
+	//cout << "avg " << avg << "  var " << var << "   sd " << sqrt(var) << endl;
+	
+	tvarnum++;
+  }
+
+  overloadsum /= tvarnum;  // turn the per-round sums into averages
+  adjustsum /= tvarnum;
+  for (int j=0;j<10;j++) {
+	tvar[j] /= tvarnum;
+	tavg[j] /= tvarnum;
+  }
+  afteroverloadsum /= tvarnum;
+
+  //int collisions = c.collisions[0] + c.collisions[1] + c.collisions[2] + c.collisions[3];
+  //float crate = (float) collisions / (float)chooses;
+  //cout << "collisions: " << c.collisions << endl;
+
+
+  //cout << "total variance " << tvar << endl;
+  //cout << " overlaod " << overloadsum << endl;
+  
+  cout << overloadcutoff << "\t" << (10000.0 / (float)overloadcutoff) << "\t" << overloadsum << "\t" << adjustsum << "\t" << afteroverloadsum;
+  for (int i=0;i<10;i++)
+	cout << "\t" << tavg[i] << "\t" << tvar[i];// << "\t" << tvar[i]/tavg[i];
+  cout << endl;
+  return tvar[0];
+}
+
+
+int main() 
+{
+  // one run with a cutoff of 200, then a downward sweep 140, 135, ..., 105;
+  // every run uses a depth-3 hierarchy and prints its own result row,
+  // so the values returned by go() are not kept
+  go(3, 200);
+  int cutoff = 140;
+  while (cutoff > 100) { go(3, cutoff); cutoff -= 5; }
+}
diff --git a/branches/sage/cephmds2/crush/test/movement.cc b/branches/sage/cephmds2/crush/test/movement.cc
new file mode 100644
index 0000000000000..2621f09457fe6
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/movement.cc
@@ -0,0 +1,223 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int width = wid[h];    // items to create at this level
+
+  if (h != 0) {
+	// interior level: register the mixed bucket first, then build and attach each child
+	MixedBucket *parent = new MixedBucket(h+1);
+	c.add_bucket(parent);
+	for (int k = 0; k < width; k++) {
+	  Bucket *child = make_bucket(c, wid, h-1, buckets, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	  child->set_parent(parent->get_id());
+	}
+	buckets[h].push_back(parent);
+	return parent;
+  }
+
+  // leaf level: a uniform bucket holding the next `width` disk ids
+  Hash hash(123);
+  vector<int> disks;
+  while ((int)disks.size() < width)
+	disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 1, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  buckets[h].push_back(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top = wid.size()-1;   // root height: one wid entry per level
+  return make_bucket(c, wid, top, buckets, ndisks)->get_id();
+}
+
+
+// Run `rule` for inputs 1..numpg and store each resulting device vector in
+// placement[x].  A result that lists the same device twice is reported on
+// stdout as a "bad set" (and is still stored).  numrep is the number of
+// entries examined in each result.
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> result(numrep);
+  map<int,int> per_device;      // device id -> replicas mapped to it so far
+
+  for (int pg = 1; pg <= numpg; pg++) {
+	// the loop index doubles as the rule input
+	c.do_rule(rule, pg, result);
+
+	// tally every replica and look for a device that occurs twice in this set
+	bool duplicate = false;
+	for (int a = 0; a < numrep; a++) {
+	  per_device[result[a]]++;
+	  for (int b = a+1; b < numrep; b++)
+		duplicate = duplicate || (result[a] == result[b]);
+	}
+	if (duplicate)
+	  cout << "bad set " << pg << ": " << result << endl;
+
+	// keep the whole vector so placements can be compared position by position
+	placement[pg] = result;
+  }
+
+  // disabled debugging aid: dump the per-device replica counts
+  if (0) {
+	map<int,int>::iterator p = per_device.begin();
+	while (p != per_device.end()) {
+	  cout << p->first << "\t" << p->second << endl;
+	  ++p;
+	}
+  }
+}
+// testmovement: place numpg inputs, then mark device 0 failed and add one uniform bucket of
+// udisks new disks, place again, and return (fraction of replica slots that changed) / ideal.
+float testmovement(int depth, int branching, int udisks)
+{
+  Hash h(73232313);
+
+  // crush
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+  // wid[0] is the leaf (uniform bucket) size; each of the depth-1 levels above fans out by branching
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int d=1; d<depth; d++)
+	wid.push_back(branching);
+
+  map< int, list<Bucket*> > buckets;
+  // only the first of the three construction variants below is enabled
+  if (1) {
+	root = make_hierarchy(c, wid, buckets, ndisks);
+  }
+  if (0) {
+	MixedBucket *b = new MixedBucket(1);
+	for (int i=0; i<10000; i++)
+	  b->add_item(ndisks++, 10);
+	root = c.add_bucket(b);
+  }
+  if (0) {
+	vector<int> disks;
+	for (int i=0; i<10000; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+	Hash h(123);
+	b->make_primes(h);
+	root = c.add_bucket(b);
+  }
+  
+
+
+  // rule: take the root, choose numrep items of type 0, emit
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);
+
+  /*
+  cout << ndisks << " disks, " << endl;
+  cout << pg_per << " pgs per disk" << endl;
+    cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+  */
+  map<int, vector<int> > placement1, placement2;
+
+  //c.print(cout, root);
+  // baseline placement, taken before the cluster is changed
+  place(c, rule, numpg, numrep, placement1);
+  
+  if (1) {
+	// failed: device 0 is marked failed before the second placement
+
+	//for (int i=500; i<1000; i++)
+	//c.failed.insert(i);
+	c.failed.insert(0);
+  }
+
+  int olddisks = ndisks;
+  // grow: a new uniform bucket of udisks disks, attached under the last level-1 bucket
+  if (1) {
+	int n = udisks;
+	//cout << " adding " << n << " disks" << endl;
+	vector<int> disks;
+	for (int i=0; i<n; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+	Hash h(123);
+	b->make_primes(h);
+	Bucket *o = buckets[1].back();
+	c.add_bucket(b);
+	//cout << " adding under " << o->get_id() << endl;
+	c.add_item(o->get_id(), b->get_id(), b->get_weight());
+	//((MixedBucket*)o)->add_item(b->get_id(), b->get_weight());
+  }
+
+  //c.print(cout, root);
+  place(c, rule, numpg, numrep, placement2);
+  // count replica positions whose device differs between the two placements
+  int moved = 0;
+  for (int x=1; x<=numpg; x++) {
+	if (placement1[x] != placement2[x]) {
+	  for (int j=0; j<numrep; j++)
+		if (placement1[x][j] != placement2[x][j]) 
+		  moved++;
+	  
+	}
+  }
+  // f: fraction of replica slots that changed; ideal: share of all disks that are new
+  float f = (float)moved / (float)(numpg*numrep);
+  float ideal = (float)(ndisks-olddisks) / (float)(ndisks);
+  float fac = f/ideal;
+  //cout << moved << " moved or " << f << ", ideal " << ideal << ", factor of " << fac <<  endl;
+  return fac;
+}
+
+
+int main()
+{
+  // Print one row per hierarchy depth: the depth, then for every branching
+  // factor 3..15 the pair <initial disk count, movement factor>.
+  const int udisks = 10;
+
+  for (int depth = 2; depth <= 4; ++depth) {
+	cout << depth;
+
+	for (int branching = 3; branching <= 15; ++branching) {
+	  const float factor = testmovement(depth, branching, udisks);
+
+	  // disks in the initial hierarchy: udisks * branching^(depth-1)
+	  const int total = udisks * pow((float)branching, (float)depth-1);
+
+	  cout << "\t" << total
+		   << "\t" << factor;
+	}
+
+	cout << endl;
+  }
+}
+
diff --git a/branches/sage/cephmds2/crush/test/movement_failed.cc b/branches/sage/cephmds2/crush/test/movement_failed.cc
new file mode 100644
index 0000000000000..98c34d96e9ac2
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/movement_failed.cc
@@ -0,0 +1,246 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int width = wid[h];    // items to create at this level
+
+  if (h != 0) {
+	// interior level: register the mixed bucket first, then build and attach each child
+	MixedBucket *parent = new MixedBucket(h+1);
+	c.add_bucket(parent);
+	for (int k = 0; k < width; k++) {
+	  Bucket *child = make_bucket(c, wid, h-1, buckets, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	  child->set_parent(parent->get_id());
+	}
+	buckets[h].push_back(parent);
+	return parent;
+  }
+
+  // leaf level: a uniform bucket holding the next `width` disk ids
+  Hash hash(123);
+  vector<int> disks;
+  while ((int)disks.size() < width)
+	disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 1, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  buckets[h].push_back(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top = wid.size()-1;   // root height: one wid entry per level
+  return make_bucket(c, wid, top, buckets, ndisks)->get_id();
+}
+
+
+// Run `rule` for inputs 1..numpg and record, per device, which inputs landed
+// on it: placement[d] collects every x whose result contains device d.
+// A result that lists the same device twice is reported on stdout as a
+// "bad set"; its devices are recorded all the same.
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, set<int> >& placement)
+{
+  vector<int> result(numrep);
+  map<int,int> per_device;      // device id -> replicas mapped to it so far
+
+  for (int pg = 1; pg <= numpg; pg++) {
+	// the loop index doubles as the rule input
+	c.do_rule(rule, pg, result);
+
+	// tally every replica and look for a device that occurs twice in this set
+	bool duplicate = false;
+	for (int a = 0; a < numrep; a++) {
+	  const int device = result[a];
+	  per_device[device]++;
+	  for (int b = a+1; b < numrep; b++)
+		duplicate = duplicate || (device == result[b]);
+	  placement[device].insert(pg);
+	}
+
+	if (duplicate)
+	  cout << "bad set " << pg << ": " << result << endl;
+  }
+
+  // disabled debugging aid: dump the per-device replica counts
+  if (0) {
+	map<int,int>::iterator p = per_device.begin();
+	while (p != per_device.end()) {
+	  cout << p->first << "\t" << p->second << endl;
+	  ++p;
+	}
+  }
+}
+// testmovement: place numpg inputs, set an overload factor of `over` on device 0, place again,
+// print "<ndisks> <expected> <avg> <var>" and return avg (mean number of inputs each disk gained).
+float testmovement(int depth, int branching, int udisks)
+{
+  Hash h(73232313);
+
+  // crush
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+  // wid[0] is the leaf (uniform bucket) size; each of the depth-1 levels above fans out by branching
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int d=1; d<depth; d++)
+	wid.push_back(branching);
+
+  map< int, list<Bucket*> > buckets;
+  // only the first of the three construction variants below is enabled
+  if (1) {
+	root = make_hierarchy(c, wid, buckets, ndisks);
+  }
+  if (0) {
+	MixedBucket *b = new MixedBucket(1);
+	for (int i=0; i<10000; i++)
+	  b->add_item(ndisks++, 10);
+	root = c.add_bucket(b);
+  }
+  if (0) {
+	vector<int> disks;
+	for (int i=0; i<10000; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+	Hash h(123);
+	b->make_primes(h);
+	root = c.add_bucket(b);
+  }
+  
+
+
+  // rule: take the root, choose numrep items of type 0, emit
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);
+
+  /*
+  cout << ndisks << " disks, " << endl;
+  cout << pg_per << " pgs per disk" << endl;
+    cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+  */
+  // placementN[d] = set of inputs that mapped to device d in pass N
+  map<int, set<int> > placement1, placement2;
+
+  //c.print(cout, root);
+
+  place(c, rule, numpg, numrep, placement1);
+
+  float over = .5;
+  
+  if (1) {
+	// overload: device 0 gets the factor `over` before the second placement
+
+	//for (int i=500; i<1000; i++)
+	//c.failed.insert(i);
+	//c.failed.insert(0);
+	c.overload[0] = over;
+  }
+
+  int olddisks = ndisks;
+
+
+  // grow step, disabled in this test: ndisks therefore stays equal to olddisks
+  if (0) {
+	int n = udisks;
+	//cout << " adding " << n << " disks" << endl;
+	vector<int> disks;
+	for (int i=0; i<n; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+	Hash h(123);
+	b->make_primes(h);
+	Bucket *o = buckets[1].back();
+	c.add_bucket(b);
+	//cout << " adding under " << o->get_id() << endl;
+	c.add_item(o->get_id(), b->get_id(), b->get_weight());
+	//((MixedBucket*)o)->add_item(b->get_id(), b->get_weight());
+  }
+
+  //c.print(cout, root);
+  place(c, rule, numpg, numrep, placement2);
+
+  vector<int> moved(ndisks);
+
+  // drop from placement2[d] every input d already held, leaving only what d gained
+  for (int d=0; d<ndisks; d++) {
+	for (set<int>::iterator it = placement1[d].begin();
+		 it != placement1[d].end();
+		 it++) {
+	  placement2[d].erase(*it);
+	}
+  }
+
+  // mean and (population) variance of the per-disk gains
+  float avg = 0;
+  for (int d=0; d<ndisks; d++) {
+	moved[d] = placement2[d].size();
+	avg += moved[d];
+  }
+  avg /= (float)ndisks;
+  float var = 0;
+  for (int d=0; d<ndisks; d++) {
+	var += (moved[d]-avg)*(moved[d]-avg);
+  }
+  var /= (float)ndisks;
+
+  // device 0 sheds `over` of its pg_per inputs, spread over the other ndisks-1 disks
+  float expected = over * (double)pg_per / (float)(ndisks-1);
+
+  cout << ndisks << "\t" << expected << "\t" << avg << "\t" << var << endl;
+
+  // The "factor" computation that used to end this function was disabled together
+  // with its return statement, so control ran off the end of a float function --
+  // undefined behaviour, and main() stores the result.  Return the mean per-disk
+  // gain instead; (ndisks - olddisks) is always 0 here because the grow step above
+  // is disabled, which is why the old ideal-based factor cannot be restored as is.
+  return avg;
+}
+
+
+int main()
+{
+  // testmovement() prints one result line per configuration on its own.
+  // This driver only writes the depth at the start of each sweep; no
+  // newline follows it, so it ends up in front of that sweep's first
+  // result line.
+  const int udisks = 10;
+
+  for (int depth = 2; depth <= 4; ++depth) {
+	cout << depth;
+
+	// branching factors 3..15; the returned value is not used here
+	for (int branching = 3; branching <= 15; ++branching) {
+	  testmovement(depth, branching, udisks);
+	  // disabled output, kept for reference:
+	  //   cout << "\t" << n << "\t" << fac;   (n = udisks * branching^(depth-1))
+	}
+	// disabled: cout << endl;
+  }
+
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/crush/test/overload.cc b/branches/sage/cephmds2/crush/test/overload.cc
new file mode 100644
index 0000000000000..32c667201bca3
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/overload.cc
@@ -0,0 +1,335 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include "../../common/Clock.h"
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Clock g_clock;
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  const int width = wid[h];    // items to create at this level
+  if (h != 0) {
+	// interior level: a tree bucket; its children are built and attached
+	// first, and only then is the bucket itself registered with the map
+	Bucket *parent = new TreeBucket(h+1);
+	for (int k = 0; k < width; k++) {
+	  Bucket *child = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	}
+	c.add_bucket(parent);
+	return parent;
+  }
+
+  // leaf level: a uniform bucket over the next `width` disk ids
+  Hash hash(123);              // constructed as before; make_primes() is not called in this test
+  vector<int> disks;
+  while ((int)disks.size() < width)
+	disks.push_back(ndisks++);
+  // weight comes from the last id handed out: 10 for ids 0-99, 20 for 100-199, and so on
+  const float weight = ((ndisks-1)/100+1)*10;
+  UniformBucket *leaf = new UniformBucket(1, 0, weight, disks);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top = wid.size()-1;   // root height: one wid entry per level
+  return make_bucket(c, wid, top, ndisks)->get_id();
+}
+// go: build a dep-level hierarchy (fan-out 10 per level), place PGs, derive per-device overload
+// factors from the resulting counts, re-place, and print one tab-separated summary row to stdout.
+// Returns tvar[0]: the variance over the first tenth of the disks, averaged over all iterations.
+float go(int dep, int utilization ) 
+{
+  Hash h(73232313);
+
+  int overloadcutoff = (int)((float)10000.0 / (float)utilization);
+  // overloadcutoff is applied below as a percentage of each disk's target (target * cutoff / 100)
+  //cout << "util " << utilization << " cutoff " << overloadcutoff << endl;
+  // crush
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  for (int d=0; d<dep; d++)
+	wid.push_back(10);
+
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+
+  //cout << ndisks << " disks" << endl;
+  
+
+
+  // rule
+  int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+  // pg_per is 5.5 * pg_per_base; the per-disk target used below is (i/100+1)*pg_per_base
+  int pg_per_base = 20;
+  int pg_med = 10*pg_per_base;
+  int pg_per = pg_per_base*5.5;//100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);
+  //cout << ndisks << " disks, " << endl;
+  //cout << pg_per << " pgs per disk" << endl;
+  //  cout << numpg << " logical pgs" << endl;
+  //cout << "numrep is " << numrep << endl;
+
+
+  int place = 100000;
+  int times = place / numpg;
+  if (!times) times = 1;
+  
+
+  //cout << "looping " << times << " times" << endl;
+  
+  float tavg[10];
+  float tvar[10];
+  for (int j=0;j<10;j++) {
+	tvar[j] = 0;
+	tavg[j] = 0;
+  }
+  int tvarnum = 0;
+  // sums accumulated across iterations; divided by tvarnum after the loop
+  float overloadsum = 0.0;
+  float adjustsum = 0.0;
+  float afteroverloadsum = 0.0;
+  float aslowdown = 0.0;
+  int chooses = 0;
+  int xs = 1;
+  for (int t=0; t<times; t++) {
+	vector<int> v(numrep);
+	
+	c.overload.clear();
+
+	for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+	// pass 1: place numpg inputs with no overload factors (cleared above); timed as t1
+	utime_t t1a = g_clock.now();
+
+	for (int x=xs; x<numpg+xs; x++) {
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  chooses += numrep;
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+	  }
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+
+	utime_t t1b = g_clock.now();
+
+	// overloaded?
+	int overloaded = 0;
+	int adjusted = 0;
+	for (int i=0; i<ocount.size(); i++) {
+	  int target = (i/100+1)*pg_per_base;
+	  int cutoff = target * overloadcutoff / 100;
+	  int adjoff = target + (cutoff - target)*3/4;   // adjust threshold: 3/4 of the way from target to cutoff
+	  if (ocount[i] > cutoff) 
+		overloaded++;
+
+	  if (ocount[i] > adjoff) {
+		adjusted++;
+		c.overload[i] = (float)target / (float)ocount[i];
+		//cout << "setting overload " << i << " to " << c.overload[i] << endl;
+		//cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	  ocount[i] = 0;
+	}
+	//cout << overloaded << " overloaded" << endl;
+	overloadsum += (float)overloaded / (float)ndisks;
+	adjustsum += (float)adjusted / (float)ndisks;
+
+
+	// up to 5 rounds: re-place, then scale c.overload[i] by target/count for every disk at or above cutoff
+	// keep adjusting!
+	for (int bla=0; bla<5; bla++) {
+	  utime_t t2a = g_clock.now();
+
+	  // second pass
+	  for (int x=xs; x<numpg+xs; x++) {
+		
+		//cout << H(x) << "\t" << h(x) << endl;
+		c.do_rule(rule, x, v);
+		//cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+		
+		for (int i=0; i<numrep; i++) {
+		  //int d = b.choose_r(x, i, h);
+		  //v[i] = d;
+		  ocount[v[i]]++;
+		}
+		
+		//cout << v << "\t" << ocount << endl;
+	  }
+
+	  utime_t t2b = g_clock.now();
+
+	  int numover = 0;
+	  for (int i=0; i<ocount.size(); i++) {
+		int target = (i/100+1)*pg_per_base;
+		int cutoff = target * overloadcutoff / 100;
+		int adjoff = cutoff;//target + (cutoff - target)*3/4;
+
+		if (ocount[i] >= adjoff) {
+		  numover++;
+		  if (c.overload.count(i) == 0) {
+			c.overload[i] = 1.0;
+			adjusted++;
+		  }
+		  //else cout << "(re)adjusting " << i << endl;
+		  c.overload[i] *= (float)target / (float)ocount[i];
+		  //cout << "setting overload " << i << " to " << c.overload[i] << endl;
+		  //cout << "disk " << i << " has " << ocount[i] << endl;
+		}
+		ocount[i] = 0;
+	  }
+	  if (!numover) break;
+	  cout << "readjusting" << endl;
+	}
+	// final pass with the factors left by the rounds above; timed as t3 and compared with pass 1
+	utime_t t3a = g_clock.now();
+
+	for (int x=xs; x<numpg+xs; x++) {
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+	  }
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+	xs += numpg;
+
+	utime_t t3b = g_clock.now();
+
+	t1b -= t1a;
+	double t1 = (double)t1b;
+	t3b -= t3a;
+	double t3 = (double)t3b;
+	double slowdown = t3/t1;
+	//cout << "slowdown " << slowdown << endl;
+	aslowdown += slowdown;
+	// disks still above cutoff after the adjustment are counted and reported on stdout
+	int still = 0;
+	for (int i=0; i<ocount.size(); i++) {
+	  int target = (i/100+1)*pg_per_base;
+	  int cutoff = target * overloadcutoff / 100;
+	  //int adjoff = target + (cutoff - target)/3;
+
+	  if (ocount[i] > cutoff) {
+		still++;
+		//c.overload[ocount[i]] = 100.0 / (float)ocount[i];
+		if (c.overload.count(i)) cout << "[adjusted] ";
+		cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	}
+	//if (still) cout << "overload was " << overloaded << " now " << still << endl;
+	afteroverloadsum += (float)still / (float)ndisks;
+	
+	//cout << "collisions: " << c.collisions << endl;
+	//cout << "r bumps: " << c.bumps << endl;
+	// mean and variance of the final counts, computed separately for each tenth of the disks
+	int n = ndisks/10;
+	float avg[10];
+	float var[10];
+	for (int i=0;i<10;i++) {
+	  int s = n*i;
+	  avg[i] = 0.0;
+	  for (int j=0; j<n; j++)
+		avg[i] += ocount[j+s];
+	  avg[i] /= n;//ocount.size();
+	  var[i] = 0.0;
+	  for (int j=0; j<n; j++)
+		var[i] += (ocount[j+s] - avg[i]) * (ocount[j+s] - avg[i]);
+	  var[i] /= n;//ocount.size();
+
+	  tvar[i] += var[i];
+	  tavg[i] += avg[i];
+	}
+	//cout << "avg " << avg << "  var " << var << "   sd " << sqrt(var) << endl;
+	
+	tvarnum++;
+  }
+
+  overloadsum /= tvarnum;
+  adjustsum /= tvarnum;
+  float avar = 0.0;
+  for (int j=0;j<10;j++) {
+	tvar[j] /= tvarnum;
+	tavg[j] /= tvarnum;
+	avar += tvar[j];
+  }
+  avar /= 10;
+  avar = sqrt(avar);
+  avar /= 5.5 * (float)pg_per_base;   // i.e. relative to the pgs-per-disk figure (5.5 * pg_per_base)
+  afteroverloadsum /= tvarnum;
+  aslowdown /= tvarnum;
+
+  //int collisions = c.collisions[0] + c.collisions[1] + c.collisions[2] + c.collisions[3];
+  //float crate = (float) collisions / (float)chooses;
+  //cout << "collisions: " << c.collisions << endl;
+
+
+  //cout << "total variance " << tvar << endl;
+  //cout << " overlaod " << overloadsum << endl;
+  
+  cout << overloadcutoff << "\t" << utilization 
+	   << "\t" << overloadsum << "\t" << adjustsum << "\t" << afteroverloadsum 
+	   << "\t" << aslowdown << "\t" << avar << "\t-"; 
+
+  for (int i=0;i<10;i++)
+	cout << "\t" << tavg[i] << "\t" << tvar[i];// << "\t" << tvar[i]/tavg[i];
+  cout << endl;
+  return tvar[0];
+}
+
+
+int main()
+{
+  // Every go() call builds a depth-3 hierarchy for the given utilization
+  // value and prints its own output.
+  static const int utilizations[] = { 50, 96, 97, 98, 99 };
+  const int count = sizeof(utilizations) / sizeof(utilizations[0]);
+
+  for (int k = 0; k < count; k++)
+	go(3, utilizations[k]);
+
+  // (a disabled sweep over 70, 75, ..., 95 used to sit between the first
+  //  and the second run)
+
+  return 0;
+}
diff --git a/branches/sage/cephmds2/crush/test/overload_variance.cc b/branches/sage/cephmds2/crush/test/overload_variance.cc
new file mode 100644
index 0000000000000..b04cae0f2d19d
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/overload_variance.cc
@@ -0,0 +1,281 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  const int width = wid[h];    // items to create at this level
+
+  if (h != 0) {
+	// interior level: build and attach every child, then register the mixed bucket
+	MixedBucket *parent = new MixedBucket(h+1);
+	for (int k = 0; k < width; k++) {
+	  Bucket *child = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	}
+	c.add_bucket(parent);
+	return parent;
+  }
+
+  // leaf level: a uniform bucket over the next `width` disk ids
+  Hash hash(123);
+  vector<int> disks;
+  while ((int)disks.size() < width)
+	disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top = wid.size()-1;   // root height: one wid entry per level
+  return make_bucket(c, wid, top, ndisks)->get_id();
+}
+
+
+Bucket *make_random(Crush& c, int wid, int height, int& ndisks)
+{
+  // number of children (or of disks, at a leaf): a value in [2, wid]
+  const int fanout = rand() % (wid-1) + 2;
+
+  if (height != 0) {
+	// interior node: draw this node's own height from [1, height], so the
+	// subtrees below it may be shallower than the remaining budget
+	const int level = rand() % height + 1;
+	MixedBucket *parent = new MixedBucket(level+1);
+	for (int k = 0; k < fanout; k++) {
+	  Bucket *child = make_random(c, wid, level-1, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	}
+	c.add_bucket(parent);
+	return parent;
+  }
+
+  // leaf: a uniform bucket of `fanout` fresh disk ids
+  Hash hash(123);
+  vector<int> disks;
+  while ((int)disks.size() < fanout)
+	disks.push_back(ndisks++);
+
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+// go: build a dep-level hierarchy (fan-out 10), place PGs, give overload factors to the busiest disks,
+// place again, and print one summary row; returns the average variance of the per-disk counts.
+float go(int dep, int overloadcutoff) 
+{
+  Hash h(73232313);
+
+  // crush
+  Crush c;
+
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  for (int d=0; d<dep; d++)
+	wid.push_back(10);
+  // only the first construction variant below is enabled; the others are disabled alternatives
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+  if (0) {
+	Bucket *r = make_random(c, 20,  4, ndisks);
+	root = r->get_id();
+	//c.print(cout, root);
+  }
+  if (0) {
+	MixedBucket *b = new MixedBucket(1);
+	for (int i=0; i<10000; i++)
+	  b->add_item(ndisks++, 10);
+	root = c.add_bucket(b);
+  }
+  if (0) {
+	vector<int> disks;
+	for (int i=0; i<10000; i++)
+	  disks.push_back(ndisks++);
+	UniformBucket *b = new UniformBucket(1, 0, 10000, disks);
+	Hash h(123);
+	b->make_primes(h);
+	root = c.add_bucket(b);
+  }
+  //cout << ndisks << " disks" << endl;
+  
+
+
+  // rule
+  int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);
+  //cout << ndisks << " disks, " << endl;
+  //cout << pg_per << " pgs per disk" << endl;
+  //  cout << numpg << " logical pgs" << endl;
+  //cout << "numrep is " << numrep << endl;
+
+
+  int place = 1000000;
+  int times = place / numpg;
+  if (!times) times = 1;
+  
+
+  //cout << "looping " << times << " times" << endl;
+  
+  float tvar = 0;
+  int tvarnum = 0;
+
+  float overloadsum = 0.0;
+  float adjustsum = 0.0;
+  float afteroverloadsum = 0.0;
+  int chooses = 0;
+  int xs = 1;
+  for (int t=0; t<times; t++) {
+	vector<int> v(numrep);
+	// first pass: no overload factors are set
+	c.overload.clear();
+
+	for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+	for (int x=xs; x<numpg+xs; x++) {
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  chooses += numrep;
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  bool bad = false;
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+		for (int j=i+1; j<numrep; j++) {
+		  if (v[i] == v[j]) 
+			bad = true;
+		}
+	  }
+	  if (bad)
+		cout << "bad set " << x << ": " << v << endl;
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+
+	// overloaded?
+	int overloaded = 0;
+	int adjusted = 0;
+	for (int i=0; i<ocount.size(); i++) {
+	  if (ocount[i] > overloadcutoff) 
+		overloaded++;
+	  // adjust any disk more than halfway from 100 (the value of pg_per above) to the cutoff
+	  if (ocount[i] > 100+(overloadcutoff-100)/2) {
+		adjusted++;
+		c.overload[i] = 100.0 / (float)ocount[i];
+		//cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	  ocount[i] = 0;
+	}
+	//cout << overloaded << " overloaded" << endl;
+	overloadsum += (float)overloaded / (float)ndisks;
+	adjustsum += (float)adjusted / (float)ndisks;
+
+	// second pass: the same inputs, now with the overload factors in place
+	for (int x=xs; x<numpg+xs; x++) {
+
+	  //cout << H(x) << "\t" << h(x) << endl;
+	  c.do_rule(rule, x, v);
+	  //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+	  
+	  bool bad = false;
+	  for (int i=0; i<numrep; i++) {
+		//int d = b.choose_r(x, i, h);
+		//v[i] = d;
+		ocount[v[i]]++;
+		for (int j=i+1; j<numrep; j++) {
+		  if (v[i] == v[j]) 
+			bad = true;
+		}
+	  }
+	  if (bad)
+		cout << "bad set " << x << ": " << v << endl;
+	  
+	  //cout << v << "\t" << ocount << endl;
+	}
+	xs += numpg;
+	// count the disks that are still above the cutoff
+	int still = 0;
+	for (int i=0; i<ocount.size(); i++) {
+	  if (ocount[i] > overloadcutoff) {
+		still++;
+		//c.overload[ocount[i]] = 100.0 / (float)ocount[i];
+		//cout << "disk " << i << " has " << ocount[i] << endl;
+	  }
+	}
+	//if (still) cout << "overload was " << overloaded << " now " << still << endl;
+	afteroverloadsum += (float)still / (float)ndisks;
+	
+	//cout << "collisions: " << c.collisions << endl;
+	//cout << "r bumps: " << c.bumps << endl;
+	// mean and variance of the second-pass counts across all disks
+	float avg = 0.0;
+	for (int i=0; i<ocount.size(); i++)
+	  avg += ocount[i];
+	avg /= ocount.size();
+	float var = 0.0;
+	for (int i=0; i<ocount.size(); i++)
+	  var += (ocount[i] - avg) * (ocount[i] - avg);
+	var /= ocount.size();
+	
+	//cout << "avg " << avg << "  var " << var << "   sd " << sqrt(var) << endl;
+	
+	tvar += var;
+	tvarnum++;
+  }
+
+  overloadsum /= tvarnum;
+  adjustsum /= tvarnum;
+  tvar /= tvarnum;
+  afteroverloadsum /= tvarnum;
+  // collision rate: the four c.collisions counters summed, per choose performed in the first pass
+  int collisions = c.collisions[0] + c.collisions[1] + c.collisions[2] + c.collisions[3];
+  float crate = (float) collisions / (float)chooses;
+  //cout << "collisions: " << c.collisions << endl;
+
+
+  //cout << "total variance " << tvar << endl;
+  //cout << " overlaod " << overloadsum << endl;
+
+  cout << overloadcutoff << "\t" << (10000.0 / (float)overloadcutoff) << "\t" << tvar << "\t" << overloadsum << "\t" << adjustsum << "\t" << afteroverloadsum << "\t" << crate << endl;
+  return tvar;
+}
+
+
+int main()
+{
+  // Sweep the overload cutoff over 140, 135, ..., 105 on a depth-3 hierarchy;
+  // every go() call prints its own tab-separated result row.
+  int cutoff = 140;
+  while (cutoff > 100) { go(3, cutoff); cutoff -= 5; }
+  return 0;
+}
diff --git a/branches/sage/cephmds2/crush/test/sizes.cc b/branches/sage/cephmds2/crush/test/sizes.cc
new file mode 100644
index 0000000000000..cc5780218210a
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/sizes.cc
@@ -0,0 +1,131 @@
+
+#include "include/types.h"
+#include "include/Distribution.h"
+#include "osd/OSDMap.h"
+
+
+Distribution file_size_distn; //kb
+
+
+list<int> object_queue;
+int max_object_size = 1024*1024*100;  //kb
+
+off_t no;
+
+// Return the size (kb) of the next object, generating a new file when none is queued.
+int get_object()  //kb
+{
+  // Queue drained: draw a new file and cut it into objects of at most max_object_size.
+  if (object_queue.empty()) {
+	const int max = file_size_distn.sample();
+	no++;                               // one more file generated
+	int remaining = max/2 + (rand() % 100) * max/200 + 1;
+	for (; remaining > max_object_size; remaining -= max_object_size)
+	  object_queue.push_back(max_object_size);
+	object_queue.push_back(remaining);  // final (possibly only) piece
+  }
+
+  // Hand out the oldest queued object size.
+  const int next = object_queue.front();
+  object_queue.pop_front();
+  return next;
+}
+
+void getdist(vector<off_t>& v, float& avg, float& var)
+{
+  // population mean and variance of v (both sums are divided by v.size())
+  const vector<off_t>::size_type count = v.size();
+  vector<off_t>::const_iterator p;
+  for (avg = 0.0, p = v.begin(); p != v.end(); ++p)
+	avg += *p;
+  avg /= count;
+  for (var = 0.0, p = v.begin(); p != v.end(); ++p)
+	var += (*p - avg) * (*p - avg);
+  var /= count;
+}
+
+
+// Fill n placement groups with randomly sized objects until at least pggb GB
+// per PG (on average) has been generated, then report through the reference
+// parameters: avg/var = mean and variance of the PG sizes in kb, numo = number
+// of files that were drawn.
+void testpgs(int n, off_t pggb, float& avg, float& var, off_t& numo)
+{
+  const off_t kb_per_gb = 1024LL*1024LL;
+  const off_t target = (off_t)n * kb_per_gb * (off_t)pggb;  // total kb to generate
+  vector<off_t> pgs(n);
+
+  no = 0;   // global file counter, bumped by get_object()
+
+  // each object lands in a randomly chosen PG
+  for (off_t written = 0; written < target; ) {
+	const off_t size = get_object();
+	pgs[rand()%n] += size;
+	written += size;
+  }
+
+  // place the pieces of the last file that are still queued
+  // (statement kept verbatim: the order of rand() and get_object() is left to the compiler)
+  while (!object_queue.empty())
+	pgs[rand()%n] += get_object();
+
+  numo = no;
+
+  // population mean / variance over the n PG totals
+  getdist(pgs, avg, var);
+}
+
+
+
+int main()
+{
+  /*
+   Measured file-size histogram; the third column holds the weights passed to add() below (sizes in kb).
+// File Size 
+//cate   count_mean             size_mean       
+1b      -0.5     0.65434375     0        
+1k      0.5      19.0758125     0.00875          
+512K    1.5      35.6566        2.85875
+1M      2.5      27.7271875     25.0084375       
+2M      3.5      16.63503125    20.8046875       
+4M      4.5      106.82384375   296.053125       
+8M      5.5      81.493375      335.77625        
+16M     6.5      14.13553125    185.9775         
+32M     7.5      2.176          52.921875
+256M    8.5      0.655938       47.8066
+512M    9.5      0.1480625      57.83375 
+2G      10.5     0.020125       19.2888 
+  */
+  file_size_distn.add(1, 19.0758125+0.65434375);   // "1b" and "1k" rows merged
+  file_size_distn.add(512, 35.6566);
+  file_size_distn.add(1024, 27.7271875);
+  file_size_distn.add(2*1024, 16.63503125);
+  file_size_distn.add(4*1024, 106.82384375);
+  file_size_distn.add(8*1024, 81.493375);
+  file_size_distn.add(16*1024, 14.13553125);
+  file_size_distn.add(32*1024, 2.176);
+  file_size_distn.add(256*1024, 0.655938);
+  file_size_distn.add(512*1024, 0.1480625);
+  file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
+  file_size_distn.normalize();
+
+  // one output row per PG size (1..15 GB); per row, one <files, max object MB, std dev>
+  // triple for each maximum object size 1, 2, 4, ..., 1024 MB
+  for (int pggb = 1; pggb < 16; pggb++) {
+	cout << pggb;
+
+	for (int max = 1; max <= 1024; max *= 2) {
+	  max_object_size = max*1024;   // MB -> kb
+
+	  // three trials; only their variances are combined
+	  float avg, var[3];
+	  off_t files;
+	  for (int trial = 0; trial < 3; trial++)
+		testpgs(100, pggb, avg, var[trial], files);
+
+	  const float dev = sqrt((var[0]+var[1]+var[2])/3.0);
+	  cout << "\t" << files << "\t" << max << "\t" << dev;
+	}
+	cout << endl;
+  }
+}
diff --git a/branches/sage/cephmds2/crush/test/smallbucket.cc b/branches/sage/cephmds2/crush/test/smallbucket.cc
new file mode 100644
index 0000000000000..1dbc19b7136cd
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/smallbucket.cc
@@ -0,0 +1,138 @@
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int width = wid[h];    // items to create at this level
+
+  if (h != 0) {
+	// interior level: register the tree bucket first, then build and attach each child
+	Bucket *parent = new TreeBucket(h+1);
+	c.add_bucket(parent);
+	for (int k = 0; k < width; k++) {
+	  Bucket *child = make_bucket(c, wid, h-1, buckets, ndisks);
+	  parent->add_item(child->get_id(), child->get_weight());
+	  child->set_parent(parent->get_id());
+	}
+	buckets[h].push_back(parent);
+	return parent;
+  }
+
+  // leaf level: a uniform bucket holding the next `width` disk ids
+  Hash hash(123);
+  vector<int> disks;
+  while ((int)disks.size() < width)
+	disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 1, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  buckets[h].push_back(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top = wid.size()-1;   // root height: one wid entry per level
+  return make_bucket(c, wid, top, buckets, ndisks)->get_id();
+}
+
+
+// Run `rule` for inputs 1..numpg and add one to ocount[d] for every device d
+// in each result.
+//   c, rule - map and rule to evaluate
+//   numpg   - number of inputs
+//   numrep  - number of entries examined in each result
+//   ocount  - per-device counters indexed by device id; not resized here
+// A result that lists the same device twice is reported on stdout as a
+// "bad set"; its devices are counted all the same.
+void place(Crush& c, Rule& rule, int numpg, int numrep, vector<int>& ocount)
+{
+  vector<int> result(numrep);
+
+  for (int pg = 1; pg <= numpg; pg++) {
+	// the loop index doubles as the rule input
+	c.do_rule(rule, pg, result);
+
+	// tally every replica and look for a device that occurs twice in this set
+	bool duplicate = false;
+	for (int a = 0; a < numrep; a++) {
+	  const int device = result[a];
+	  ocount[device]++;
+	  for (int b = a+1; b < numrep; b++) {
+		if (device == result[b])
+		  duplicate = true;
+	  }
+	}
+
+	if (duplicate)
+	  cout << "bad set " << pg << ": " << result << endl;
+  }
+}
+
+
+int main()
+{
+  // Scenario: a two-level tree (root over two uniform buckets of 10 disks)
+  // plus one extra uniform bucket of only 3 disks attached to the root.
+  // Six replicas are placed per input and the per-disk totals are printed.
+  Hash seed(73232313);
+  Crush cmap;
+
+
+  // base hierarchy: wid = [10 disks per leaf, 2 leaves]
+  int ndisks = 0;
+  vector<int> wid(1, 10);
+  wid.push_back(2);
+
+  map< int, list<Bucket*> > buckets;
+  int root = make_hierarchy(cmap, wid, buckets, ndisks);
+
+
+  // the small bucket: three more disk ids, same constructor arguments as the other leaves
+  vector<int> extra;
+  for (int added = 0; added < 3; ++added)
+	extra.push_back(ndisks++);
+  UniformBucket *tiny = new UniformBucket(1, 0, 1, extra);
+  tiny->make_primes(seed);
+
+  // buckets[1] holds the level-1 buckets; with this wid that is just the root
+  Bucket *top = buckets[1].back();
+  cmap.add_bucket(tiny);
+  cmap.add_item(top->get_id(), tiny->get_id(), tiny->get_weight());
+
+
+  // rule: take the root, choose numrep items of type 0, emit
+  int numrep = 6;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+
+  // 10000 replicas per disk on average
+  const int pg_per = 10000;
+  const int numpg = pg_per*ndisks/numrep;
+
+  cmap.print(cout, root);
+
+
+  // place everything and report how many replicas each disk received
+  vector<int> ocount(ndisks);
+  place(cmap, rule, numpg, numrep, ocount);
+
+  for (vector<int>::size_type d = 0; d != ocount.size(); ++d)
+	cout << "disk " << d << " = " << ocount[d] << endl;
+
+  return 0;
+}
+
+
diff --git a/branches/sage/cephmds2/crush/test/speed_bucket.cc b/branches/sage/cephmds2/crush/test/speed_bucket.cc
new file mode 100644
index 0000000000000..973379f945377
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/speed_bucket.cc
@@ -0,0 +1,86 @@
+
+#include "../../common/Clock.h"
+#include "../crush.h"
+using namespace crush;
+
+
+Clock g_clock;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int numrep = 1;
+
+
+double go(int n, int bucket) 
+{
+  // Times 1,000,000 do_rule() calls against a single bucket of n items.
+  //   n      - number of items added (ids 0..n-1, each with weight 1)
+  //   bucket - bucket class: 0 uniform, 1 tree, 2 list, 3 straw
+  // Returns the elapsed utime_t converted to double, or -1 when 'bucket'
+  // is not one of the four known classes.
+  Hash h(73232313);
+  Crush c;
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  // b starts null so an unknown 'bucket' is caught instead of dereferencing garbage
+  Bucket *b = 0;
+  vector<int> items;
+  switch (bucket) {
+  case 0: b = new UniformBucket(1,0,10,items); break;
+  case 1: b = new TreeBucket(1); break;
+  case 2: b = new ListBucket(1); break;
+  case 3: b = new StrawBucket(1); break;
+  }
+  if (!b) {
+	cerr << "go: unknown bucket type " << bucket << endl;
+	return -1;
+  }
+
+  for (int d=0; d<n; d++)
+	b->add_item(ndisks++, 1);
+
+  //if (!bucket)	((UniformBucket*)b)->make_primes(h);
+  root = c.add_bucket(b);
+
+  // rule
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  int place = 1000000;
+  vector<int> v(numrep);
+  set<int> out;
+  map<int,float> overload;
+
+  // timed region: only the do_rule() loop sits between the two clock reads
+  utime_t start = g_clock.now();
+  for (int x=1; x <= place; x++)
+	c.do_rule(rule, x, v, out, overload);
+  utime_t end = g_clock.now();
+  end -= start;
+  double el = (double)end;
+  return el;
+}
+
+
+int main() 
+{
+  // one row per bucket size; one timing column per bucket type (0..3)
+  for (int size=4; size<=50; size += 4) {
+	cout << size;
+	for (int type=0; type<4; ++type) {
+	  const double elapsed = go(size, type);
+	  cout << "\t" << elapsed;
+	}
+	cout << endl;
+  }
+}
diff --git a/branches/sage/cephmds2/crush/test/speed_depth.cc b/branches/sage/cephmds2/crush/test/speed_depth.cc
new file mode 100644
index 0000000000000..32275d16d2b31
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/speed_depth.cc
@@ -0,0 +1,174 @@
+
+#include "../../common/Clock.h"
+#include "../crush.h"
+using namespace crush;
+
+
+Clock g_clock;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int uniform = 10;
+int branching = 10;
+int buckettype = 0;
+int numrep = 1;
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  // Recursively builds the bucket for level h and everything beneath it,
+  // adds each bucket to c, and returns the bucket created for level h.
+  if (h != 0) {
+	// interior level: class chosen by the global 'buckettype'
+	// NOTE(review): values other than 0..3 leave 'parent' uninitialized
+	Bucket *parent;
+	switch (buckettype) {
+	case 0: parent = new TreeBucket(h+1); break;
+	case 1:
+	case 2: parent = new ListBucket(h+1); break;
+	case 3: parent = new StrawBucket(h+1); break;
+	}
+	for (int child=0; child<wid[h]; ++child) {
+	  Bucket *sub = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(sub->get_id(), sub->get_weight());
+	}
+	c.add_bucket(parent);
+	return parent;
+  }
+  // level 0: uniform bucket over wid[0] consecutively numbered disks
+  Hash hash(123);
+  vector<int> disks;
+  for (int slot=0; slot<wid[h]; ++slot)
+	disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  //leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  // build from the top level (last entry of wid) down; hand back the root's id
+  const int top = wid.size()-1;
+  return make_bucket(c, wid, top, ndisks)->get_id();
+}
+
+
+double go(int dep, int per) 
+{
+  // Benchmark: build a hierarchy and time 1,000,000 do_rule() calls on it.
+  //   dep - number of levels (wid gets dep entries when dep >= 1)
+  //   per - width of each level above level 0 (level 0 uses the global 'uniform')
+  // Returns the elapsed utime_t converted to double.
+  Hash h(73232313);  // not referenced again in this function
+
+  // crush
+  Crush c;
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;  // wid[i] = width make_bucket() uses for level i
+  if (1) {
+	wid.push_back(uniform);
+	for (int d=1; d<dep; d++)
+	  wid.push_back(per);
+  }
+  if (0) {  // disabled alternative layouts, never executed
+	if (dep == 0) 
+	  wid.push_back(1000);
+	if (dep == 1) {
+	  wid.push_back(1);
+	  wid.push_back(1000);
+	}
+	if (dep == 2) {
+	  wid.push_back(5);
+	  wid.push_back(5);
+	  wid.push_back(8);
+	  wid.push_back(5);
+	}	
+  }
+
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+
+  // rule
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  int place = 1000000;  // number of inputs run through do_rule()
+
+  vector<int> v(numrep);
+
+  utime_t start = g_clock.now();
+
+  // NOTE: out and overload are constructed after 'start', i.e. inside the timed region
+  set<int> out;
+  map<int,float> overload;
+
+  for (int x=1; x <= place; x++)
+	c.do_rule(rule, x, v, out, overload);
+
+  utime_t end = g_clock.now();
+
+  end -= start;
+  double el = (double)end;
+
+  //cout << "\t" << ndisks;
+
+  return el;
+}
+
+
+int main() 
+{
+  // One row per depth 2..5: the header columns, go() timings for tree, list
+  // and straw hierarchies, then two "rush" timings on a two-level layout
+  // (widths uniform and n/uniform) built with tree and list buckets.
+  uniform = branching = 8;
+
+  cout << "// dep\tuniform\tbranch\tndisks" << endl;
+
+  // bucket types timed per row (type 2 is skipped) and their column labels
+  static const int types[3] = { 0, 1, 3 };
+  static const char *labels[3] = { "\ttree", "\tlist", "\tstraw" };
+
+  for (int depth=2; depth<=5; ++depth) {
+	cout << depth << "\t" << uniform << "\t" << branching;
+
+	// total = branching^depth, printed in the ndisks column
+	int total = 1;
+	for (int level=0; level<depth; ++level)
+	  total *= branching;
+	cout << "\t" << total;
+
+	numrep = 2;
+
+	// crush
+	for (int k=0; k<3; ++k) {
+	  // make_bucket() reads the global buckettype while go() builds the hierarchy
+	  buckettype = types[k];
+	  cout << labels[k];
+
+	  double elapsed = go(depth, branching);
+	  cout << "\t" << elapsed;
+	}
+
+	// rush
+
+	buckettype = 0;
+	cout << "\trush_T\t" << go(2, total/uniform);
+
+	buckettype = 1;
+	cout << "\trush_P\t" << go(2, total/uniform);
+
+	cout << endl;
+  }
+}
diff --git a/branches/sage/cephmds2/crush/test/speed_rush.cc b/branches/sage/cephmds2/crush/test/speed_rush.cc
new file mode 100644
index 0000000000000..93a5584a2680a
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/speed_rush.cc
@@ -0,0 +1,145 @@
+
+#include "../../common/Clock.h"
+#include "../crush.h"
+using namespace crush;
+
+
+Clock g_clock;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int branching = 10;
+bool linear = false;
+int numrep = 1;
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  // Recursively builds the bucket for level h and everything beneath it,
+  // adds each bucket to c, and returns the bucket created for level h.
+  if (h != 0) {
+	// interior level: list bucket when the global 'linear' is set, else tree
+	Bucket *parent;
+	if (!linear)
+	  parent = new TreeBucket(h+1);
+	else
+	  parent = new ListBucket(h+1);
+	for (int child=0; child<wid[h]; ++child) {
+	  Bucket *sub = make_bucket(c, wid, h-1, ndisks);
+	  parent->add_item(sub->get_id(), sub->get_weight());
+	}
+	c.add_bucket(parent);
+	return parent;
+  }
+
+  // level 0: uniform bucket over wid[0] consecutively numbered disks
+  Hash hash(123);
+  vector<int> disks;
+  for (int slot=0; slot<wid[h]; ++slot)
+	disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  //leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  // build from the top level (last entry of wid) down; hand back the root's id
+  const int top = wid.size()-1;
+  return make_bucket(c, wid, top, ndisks)->get_id();
+}
+
+
+double go(int s) 
+{
+  // Benchmark: build a two-level hierarchy (widths 8 and s/8), time 1,000,000
+  // do_rule() calls on it, print a tab followed by ndisks to cout, and return
+  // the elapsed utime_t converted to double.
+  int dep = 2;  // only read inside the disabled if (0) block below
+  Hash h(73232313);  // not referenced again in this function
+
+  // crush
+  Crush c;
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  if (1) {
+	//for (int d=0; d<dep; d++)
+	wid.push_back(8);
+	wid.push_back(s/8);
+  }
+  if (0) {  // disabled alternative layouts, never executed
+	if (dep == 0) 
+	  wid.push_back(1000);
+	if (dep == 1) {
+	  wid.push_back(1);
+	  wid.push_back(1000);
+	}
+	if (dep == 2) {
+	  wid.push_back(5);
+	  wid.push_back(5);
+	  wid.push_back(8);
+	  wid.push_back(5);
+	}	
+  }
+
+  if (1) {
+	root = make_hierarchy(c, wid, ndisks);
+  }
+
+  // rule
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+
+  int place = 1000000;  // number of inputs run through do_rule()
+
+
+  vector<int> v(numrep);
+
+  utime_t start = g_clock.now();
+
+  for (int x=1; x <= place; x++)
+	c.do_rule(rule, x, v);
+
+  utime_t end = g_clock.now();
+
+  end -= start;
+  double el = (double)end;
+
+  cout << "\t" << ndisks;
+
+  return el;
+}
+
+
+int main() 
+{
+  // For s = 64, 512, 4096, 32768 print one row: a "t" run (linear = false)
+  // and a "p" run (linear = true), each with go()'s own output and its timing.
+  branching = 8;
+  numrep = 2;
+
+  for (int s = 64; s <= 32768; s *= 8) {
+	cout << "t";
+	linear = false;
+	double el = go(s);  // go() takes only s; the old go(s, d) call did not compile
+	cout << "\t" << el;
+
+	cout << "\tp";
+	linear = true;
+	el = go(s);
+	cout << "\t" << el;
+
+	cout << endl;
+  }
+}
diff --git a/branches/sage/cephmds2/crush/test/t.cc b/branches/sage/cephmds2/crush/test/t.cc
new file mode 100644
index 0000000000000..0785ef47d6c04
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/t.cc
@@ -0,0 +1,25 @@
+
+#include "../../common/Clock.h"
+#include "../crush.h"
+using namespace crush;
+
+
+Clock g_clock;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int branching = 10;
+bool linear = false;
+int numrep = 1;
+
+int main() {
+  // Constructs a UniformBucket through a Bucket pointer; it is never used or freed.
+  Bucket *bucket = new UniformBucket(1, 0);
+  //bucket = new TreeBucket(1);
+}
+
diff --git a/branches/sage/cephmds2/crush/test/testbucket.cc b/branches/sage/cephmds2/crush/test/testbucket.cc
new file mode 100644
index 0000000000000..065721c2c1967
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/testbucket.cc
@@ -0,0 +1,61 @@
+
+
+#include "../Bucket.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+ostream& operator<<(ostream& out, vector<int>& v)
+{
+  // writes the elements space-separated inside brackets, e.g. [1 2 3]
+  out << "[";
+  for (vector<int>::iterator it = v.begin(); it != v.end(); ++it) {
+	if (it != v.begin()) out << " ";
+	out << *it;
+  }
+  return out << "]";
+}
+
+
+int main() 
+{
+  // Fills one StrawBucket with six weighted items, asks it for numrep picks
+  // for every x in 1..999999, and prints each item's count and share.
+
+  Hash h(73);
+
+  int ndisks = 0;
+  int numrep = 3;
+
+  // ids are handed out in order 0..5; weights are 1, 1, 10, 10, 100, 1000
+  StrawBucket mb(1);
+  static const int weights[6] = { 1, 1, 10, 10, 100, 1000 };
+  for (int k=0; k<6; ++k)
+	mb.add_item(ndisks++, weights[k]);
+
+  vector<int> ocount(ndisks);   // ocount[d]: how often choose_r() returned d
+  vector<int> v(numrep);        // the picks made for the current x
+  int nplace = 0;               // total number of picks
+
+  // note the exclusive upper bound: x stops at 999999
+  for (int x=1; x<1000000; ++x) {
+	//cout << H(x) << "\t" << h(x) << endl;
+	for (int r=0; r<numrep; ++r) {
+	  const int picked = mb.choose_r(x, r, h);
+	  v[r] = picked;
+	  ocount[picked]++;
+	  nplace++;
+	}
+	//cout << v << "\t" << endl;//ocount << endl;
+  }
+
+  // share = count / total picks
+  for (int d=0; d<ocount.size(); ++d) {
+	const float share = ocount[d] / (float)nplace;
+	cout << "disk " << d << " has " << ocount[d] << "  " << share << endl;
+  }
+
+}
diff --git a/branches/sage/cephmds2/crush/test/testnormal.cc b/branches/sage/cephmds2/crush/test/testnormal.cc
new file mode 100644
index 0000000000000..17c8cbf15b1b4
--- /dev/null
+++ b/branches/sage/cephmds2/crush/test/testnormal.cc
@@ -0,0 +1,51 @@
+
+#include <vector>
+#include <iostream>
+using namespace std;
+
+
+void getdist(vector<int>& v, float& avg, float& var) 
+{
+  // avg <- mean of v; var <- mean squared deviation from avg (divides by v.size())
+  avg = 0.0;
+  for (vector<int>::iterator it = v.begin(); it != v.end(); ++it)
+	avg += *it;
+  avg /= v.size();
+  var = 0.0;
+  for (vector<int>::iterator it = v.begin(); it != v.end(); ++it)
+	var += (*it - avg) * (*it - avg);
+  var /= v.size();
+}
+
+int main() 
+{
+  // Fills two 50-slot histograms with n*n and 7*n*n random increments,
+  // prints mean/variance of each, then of their element-wise product.
+  int n = 50;
+  vector<int> a(n);
+  vector<int> b(n);
+  float aavg, avar, bavg, bvar, cavg, cvar;
+
+  // a is filled before b; both draw from the same rand() stream
+  int draws = n*n;
+  while (draws-- > 0)
+	a[rand()%n]++;
+  getdist(a, aavg, avar);
+
+  draws = 7*n*n;
+  while (draws-- > 0)
+	b[rand()%n]++;
+  getdist(b, bavg, bvar);
+
+  cout << "a avg " << aavg << " var " << avar << endl;
+  cout << "b avg " << bavg << " var " << bvar << endl;
+
+  // c[i] = a[i] * b[i]
+  vector<int> c(n);
+  for (int slot=0; slot<n; ++slot)
+	c[slot] = a[slot] * b[slot];
+  getdist(c, cavg, cvar);
+
+  cout << "c avg " << cavg << " var " << cvar << endl;
+
+}
diff --git a/branches/sage/cephmds2/doc/Commitdir.txt b/branches/sage/cephmds2/doc/Commitdir.txt
new file mode 100644
index 0000000000000..83c89bdcaef4a
--- /dev/null
+++ b/branches/sage/cephmds2/doc/Commitdir.txt
@@ -0,0 +1,22 @@
+
+How Directory Committing Works:
+
+Each CDir has: 
+	 version - current version of directory
+	 committing_version - which version was sent to stable storage
+	 last_committed_version - last version to be safely stored
+
+Each Inode has: 
+	 parent_dir_version - what dir version i was in when i was dirtied.  (*)
+
+	 (*) note that if you change an inode, mark_dirty() again, even if it's already dirty!
+
+
+How committing works:
+
+A call to commit_dir(dir, context) will ensure that the _current_ version is stored safely on disk before the context is finished.
+
+When a commit completes, inodes in the directory are checked.  If they are dirty and belonged to the _committed_ (or earlier) version, then they are marked clean.  If they belong to a newer version, then they are _still dirty_.
+
+
+
diff --git a/branches/sage/cephmds2/doc/Replication.txt b/branches/sage/cephmds2/doc/Replication.txt
new file mode 100644
index 0000000000000..0f8d4c9079e4d
--- /dev/null
+++ b/branches/sage/cephmds2/doc/Replication.txt
@@ -0,0 +1,19 @@
+
+Primary copy replication.
+
+Inodes:
+
+- The primary's list of replicas (cached_by) is inclusive at all times.
+- The primary's list never includes the local node.
+- The primary's list of replicas will only include non-replicas when the relevant CacheExpire notifications are in-flight.
+
+- Replicas can be created in two ways:
+  - via a Discover + DiscoverReply
+  - via an export and import.  (The old auth keeps a copy, and adds itself to the replica list as it exports.)
+
+
+Directories (and their dentries):
+
+- The primary has an open_by list that is inclusive at all times.
+- ..Never includes local node
+- No per-dentry replica lists.  All dentry lock operations (for unlink, etc.) are sent to all nodes in open_by list.
\ No newline at end of file
diff --git a/branches/sage/cephmds2/doc/caching.txt b/branches/sage/cephmds2/doc/caching.txt
new file mode 100644
index 0000000000000..77b02480bcd6e
--- /dev/null
+++ b/branches/sage/cephmds2/doc/caching.txt
@@ -0,0 +1,200 @@
+
+
+AUTHORITY
+
+The authority maintains a list of what nodes cache each inode.
+Additionally, each replica is assigned a serial (normally 0) to
+disambiguate multiple replicas of the same item (see below).
+
+  set<int> cached_by;
+  map<int, int> cached_by_serial;
+
+The cached_by set _always_ includes all nodes that cache the
+particular inode, but may additionally include nodes that used to
+cache it but no longer do.  In those cases, an expire message should
+be in transit.
+
+
+REPLICA
+
+The replica maintains a notion of who it believes is the authority for
+each replicated inode.  There are two possibilities:
+
+ - Ordinarily, this notion is correct.  
+ - If the part of the file system in question was recently exported to
+   a new MDS, the inode's old authority is acting as a CACHEPROXY,
+   and will forward relevant messages on to the authority.
+
+When a replica is expired from cache, an expire is sent to the
+authority.  The expire includes the serial number issued when the
+replica was originally created to disambiguate potentially concurrent
+replication activity.
+
+
+EXPORTS 
+
+- The old authority suddenly becomes a replica.  Its serial is well
+  defined.  It also becomes a CACHEPROXY, which means its cached_by
+  remains defined (with an alternate meaning!).  While a proxy, the
+  node will forward relevant messages from the replica to the
+  authority (but not the other way around--the authority knows all
+  replicas).  
+
+- Once the export is acked, the old authority sends a
+  message to the replica notifying it of the new authority.  As soon
+  as all replicas acknowledge receipt of this notice, the old authority
+  can cease CACHEPROXY responsibilities and become a regular replica.
+  At this point its cached_by is no longer defined.
+
+- Replicas always know who the authority for the inode is, OR they
+  know prior owner acting as a CACHEPROXY.  (They don't know which it
+  is.)
+
+
+CACHED_BY
+
+The authority always has an inclusive list of nodes who cache an item.
+As such it can confidently send updates to replicas for locking,
+invalidating, etc.  When a replica is expired from cache, an expire is
+sent to the authority.  If the serial matches, the node is removed
+from the cached_by list.
+
+
+
+
+
+SUBTREE AUTHORITY DELEGATION: imports versus hashing
+
+Authority is generally defined recursively: an inode's authority
+matches the containing directory, and a directory's authority matches
+the directory inode's.  Thus the authority delegation chain can be
+broken/redefined in two ways:
+
+ - Imports and exports redefine the directory inode -> directory
+   linkage, such that the directory authority is explicitly specified
+   via dir.dir_auth:
+
+      dir.dir_auth == -1  -> directory matches its inode
+      dir.dir_auth >= 0   -> directory authority is dir.dir_auth
+
+ - Hashed directories redefine the directory -> inode linkage.  In
+   non-hashed directories, inodes match their containing directory.
+   In hashed directories, each dentry's authority is defined by a hash
+   function.
+
+      inode.hash_seed == 0  -> inode matches containing directory
+      inode.hash_seed >  0  -> defined by hash(hash_seed, dentry)
+
+A directory's "containing_import" (bad name, FIXME) is either the
+import or hashed directory that is responsible for delegating a
+subtree.  Note that the containing_import of a directory may be itself
+because it is an import, but it cannot be itself because it is hashed.
+
+Thus:
+
+ - Import and export operations' manipulation of dir_auth is
+   completely orthogonal to hashing operations.  Hashing methods can
+   ignore dir_auth, except when they create imports/exports (and break
+   the inode<->dir auth linkage).
+
+ - Hashdirs act sort of like imports in that they bound an
+   authoritative region.  That is, either hashdirs or imports can be
+   the key for nested_exports.  In some cases, a dir may be both an
+   import and a hash.
+
+ - Export_dir won't export a hashdir.  This is because it's tricky
+   (tho not necessarily impossible) due to the way nested_exports is
+   used with imports versus hashdirs.
+
+
+
+
+FREEZING
+
+There are two types of freezing:
+
+ - TREE: recursively freezes everything nested beneath a directory,
+   until an export or edge of cache is reached.  
+ - DIR: freezes the contents of a single directory.
+
+Some notes:
+
+ - Occurs on the authoritative node only.
+
+ - Used for suspending critical operations while migrating authority
+   between nodes or hashing/unhashing directories.
+
+ - Freezes the contents of the cache such that items may not be added,
+   items cannot be auth pinned, and/or subsequently reexported.  The
+   namespace of the affected portions of the hierarchy may not change.
+   The content of inodes and other orthogonal operations
+   (e.g. replication, inode locking and modification) are unaffected.
+
+Two states are defined: freezing and frozen.  The freezing state is
+used while waiting for auth_pins to be removed.  Once all auth_pins
+are gone, the state is changed to frozen.  New auth_pins cannot be
+added while freezing or frozen.
+
+
+AUTH PINS
+
+An auth pin keeps a given item on the authoritative node until it is
+removed.  The pins are tracked recursively, so that a subtree cannot
+be frozen if it contains any auth pins.
+
+If a pin is placed on a non-authoritative item, the item is allowed to
+become authoritative; the specific restriction is it cannot be frozen,
+which only happens during export-type operations.
+
+
+TYPES OF EXPORTS
+
+- Actual export of a subtree from one node to another
+- A rename between directories on different nodes exports the renamed
+_inode_.  (If it is a directory, it becomes an export such that the
+directory itself does not move.)
+- A hash or unhash operation will migrate inodes within the directory
+either to or from the directory's main authority.
+
+EXPORT PROCESS
+
+
+
+
+HASHING
+
+- All nodes discover and open directory
+
+- Prep message distributes subdir inode replicas for exports so that
+  peers can open those dirs.  This is necessary because subdirs are
+  converted into exports or imports as needed to avoid migrating
+  anything except the hashed dir itself.  The prep is needed for the
+  same reasons it's important with exports: the inode authority must
+  always have the exported dir open so that it gets accurate dir
+  authority updates, and can keep the inode->dir_auth up to date.
+
+- MHashDir message distributes the directory contents.
+
+- While auth is frozen_dir, we can't get_or_open_dir.  Otherwise the
+  Prep messages won't be inclusive of all dirs, and the
+  imports/exports won't get set up properly.
+
+TODO
+readdir
+
+
+- subtrees stop at hashed dir.  hashed dir's dir_auth follows parent
+  subtree, unless the dir is also an explicit import.  thus a hashed
+  dir can also be an import dir.  
+
+
+bananas
+apples
+blueberries
+green pepper
+carrots
+celery
+
+
+
+
diff --git a/branches/sage/cephmds2/doc/dentries.txt b/branches/sage/cephmds2/doc/dentries.txt
new file mode 100644
index 0000000000000..ab14765998b2f
--- /dev/null
+++ b/branches/sage/cephmds2/doc/dentries.txt
@@ -0,0 +1,4 @@
+
+null dentries only exist
+  - on auth
+  - on replica, if they are xlock
\ No newline at end of file
diff --git a/branches/sage/cephmds2/doc/file_modes.txt b/branches/sage/cephmds2/doc/file_modes.txt
new file mode 100644
index 0000000000000..d4ceba4034e5f
--- /dev/null
+++ b/branches/sage/cephmds2/doc/file_modes.txt
@@ -0,0 +1,66 @@
+
+underlying client capabilities:
+
+- read + cache
+- read sync
+- write sync
+- write + buffer
+  (...potentially eventually augmented by byte ranges)
+
+whatever system of modes, tokens, etc. has to satisfy the basic
+constraint that no conflicting capabilities are ever in the 
+hands of clients.
+
+
+questions:
+- is there any use to clients writing to a replica?
+  - reading, yes.. 100,000 open same file..
+
+
+------
+
+simplest approach:
+- all readers, writers go through authority
+- all open, close traffic at replicas forwarded to auth
+
+- fh state migrates with exports.
+
+
+
+--------
+
+less simple:
+- all writers go through authority
+  - open, close traffic fw
+- readers from any replica
+  - need token from auth
+- weird auth <-> replica <-> client interactions ensue!
+
+
+--------
+
+even more complex (and totally FLAWED, ignore this!)
+
+- clients can open a file with any replica (for read or write).
+- replica gets a read or write token from the primary
+  - primary thus knows if it's all read, all write, mixed, or none.
+- once replica has a token it can service as many clients (of given type(s)) as it wants.
+- on export, tokens are moved too.
+  - primary give _itself_ a token too!  much simpler.
+
+- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock
+- globally, the mode is controlled by the primary, based on the mixture of 
+  read and write tokens issued
+
+
+
+- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can
+  request to read or write from the mds (which might twiddle the mode for performance
+  reasons.. e.g. lots of ppl rdwr but no actual reading)
+
+
+
+
+--------
+
+
diff --git a/branches/sage/cephmds2/doc/header.txt b/branches/sage/cephmds2/doc/header.txt
new file mode 100644
index 0000000000000..8a3c51280461d
--- /dev/null
+++ b/branches/sage/cephmds2/doc/header.txt
@@ -0,0 +1,12 @@
+// -*- mode:C++; tab-width:4; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
diff --git a/branches/sage/cephmds2/doc/inos.txt b/branches/sage/cephmds2/doc/inos.txt
new file mode 100644
index 0000000000000..b5ab1db25ca60
--- /dev/null
+++ b/branches/sage/cephmds2/doc/inos.txt
@@ -0,0 +1,11 @@
+
+inodeno_t namespace
+ - relevant both for ino's, and for the (ino) input for Filer and object storage namespace...
+
+1       - root inode
+
+100+mds - mds log/journal
+200+mds - mds ino, fh allocation tables
+300+mds - mds inode files (for non-embedded inodes)
+
+1000+   - regular files and directories
\ No newline at end of file
diff --git a/branches/sage/cephmds2/doc/journal.txt b/branches/sage/cephmds2/doc/journal.txt
new file mode 100644
index 0000000000000..12d66f86f00f4
--- /dev/null
+++ b/branches/sage/cephmds2/doc/journal.txt
@@ -0,0 +1,108 @@
+
+
+journal is distributed among different nodes.  because authority changes over time, it's not immediately clear to a recovering node replaying the journal whether the data is "real" or not (it might be exported later in the journal).
+
+
+possibilities:
+
+
+ONE.. bloat the journal!
+
+- journal entry includes full trace of dirty data (dentries, inodes) up until import point
+  - local renames implicit.. cache is reattached on replay
+  - exports are a list of exported dirs.. which are then dumped
+    ...   
+
+recovery phase 1 
+- each entry includes full trace (inodes + dentries) up until the import point
+- cache during recovery is fragmented/dangling beneath import points
+- when export is encountered items are discarded (marked clean)
+
+recovery phase 2
+- import roots ping store to determine attachment points (if not already known)
+  - if it was imported during period, attachment point is already known.  
+  - renames affecting imports are logged too
+- import roots discovered from other nodes, attached to hierarchy
+
+then
+- maybe resume normal operations
+- if recovery is a background process on a takeover mds, "export" everything to that node.
+
+
+-> journal contains lots of clean data.. maybe 5+ times bigger as a result!
+
+possible fixes:
+  - collect dir traces into journal chunks so they aren't repeated as often
+    - each chunk summarizes traces in previous chunk
+    - hopefully next chunk will include many of the same traces
+    - if not, then the entry will include it
+
+
+
+
+=== log entry types ===
+- all inode, dentry, dir items include a dirty flag.
+- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm
+
+ImportPath  - log change in import path
+Import      - log import addition (w/ path, dirino)
+
+InoAlloc    - allocate ino
+InoRelease  - release ino
+
+Inode       - inode info, along with dentry+inode trace up to import point
+Unlink      - (null) dentry + trace, + flag (whether inode/dir is destroyed)
+Link        - (new) dentry + inode + trace
+
+
+-----------------------------
+
+TWO.. 
+- directories in store contain path at time of commit (relative to import, and root)
+- replay without attaching anything to hierarchy
+- after replay, directories pinged in store to attach to hierarchy
+
+-> phase 2 too slow!
+-> and nested dirs may reattach... that won't be apparent from journal.
+  - put just parent dir+dentry in dir store.. even worse on phase 2!
+
+
+THREE
+- 
+
+
+
+
+
+
+
+metadata journal/log
+
+
+event types:
+
+chown, chmod, utime
+  InodeUpdate
+
+mknod, mkdir, symlink
+  Mknod  .. new inode + link
+
+unlink, rmdir
+  Unlink
+
+rename
+   Link + Unlink  (foreign)
+or Rename         (local)
+
+link
+  Link   .. link existing inode 
+
+
+
+
+InodeUpdate
+DentryLink
+DentryUnlink
+InodeCreate
+InodeDestroy
+Mkdir?
diff --git a/branches/sage/cephmds2/doc/lazy_posix.txt b/branches/sage/cephmds2/doc/lazy_posix.txt
new file mode 100644
index 0000000000000..1d226cd03d8e4
--- /dev/null
+++ b/branches/sage/cephmds2/doc/lazy_posix.txt
@@ -0,0 +1,53 @@
+
+http://www.usenix.org/events/fast05/wips/slides/welch.pdf
+
+
+
+-- STATLITE
+  statlite(const char *filename, struct statlite *buf);
+  fstatlite(int fd, struct statlite *buf);
+  lstatlite(const char *filename, struct statlite *buf);
+
+  * file size, mtime are optionally not guaranteed to be correct
+  * mask field to specify which fields you need to be correct
+
+
+-- READDIR+
+
+  struct dirent_plus *readdirplus(DIR *dirp);
+  int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result);
+  struct dirent_lite *readdirlite(DIR *dirp);
+  int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result);
+
+  * plus returns lstat
+  * lite returns lstatlite
+
+
+-- lazy i/o integrity
+
+  O_LAZY to open(2)
+
+  * relax data coherency
+  * writes may not be visible until lazyio_propagate, fsync, close
+
+  lazyio_propagate(int fd, off_t offset, size_t count);
+   * my writes are safe
+
+  lazyio_synchronize(int fd, off_t offset, size_t count);
+   * i will see everyone else's propagated writes
+
+-- read/write non-serial vectors
+
+  ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count);
+  ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count);
+
+ * like readv/writev, but serial
+ * 
+
+
+int lockg(int fd, int cmd, lgid_t *lgid)
+   group locks
+
+int openg(char *path, int mode, fh_t *handle);
+   portable file handle
+int sutoc(fh_t *fh);
\ No newline at end of file
diff --git a/branches/sage/cephmds2/doc/osd_outline.txt b/branches/sage/cephmds2/doc/osd_outline.txt
new file mode 100644
index 0000000000000..2c6f3287aac5f
--- /dev/null
+++ b/branches/sage/cephmds2/doc/osd_outline.txt
@@ -0,0 +1,37 @@
+
+intro
+
+osd cluster map
+ requirements
+ desirable properties
+ (c)rush
+
+failure detection
+ distributed ping or heartbeat
+ central filter, notifier
+
+design
+ placement seed, class/superset, groups
+
+normal operation
+ reads
+ writes
+
+recovery
+ triggers: failed disk, or total cluster reorganization
+
+ notify
+ peering
+ pull
+ push
+ clean
+
+writes during recovery
+
+graceful data loss + recovery?
+
+
+
+
+
+
diff --git a/branches/sage/cephmds2/doc/osd_replication.txt b/branches/sage/cephmds2/doc/osd_replication.txt
new file mode 100644
index 0000000000000..907d00e2050a2
--- /dev/null
+++ b/branches/sage/cephmds2/doc/osd_replication.txt
@@ -0,0 +1,226 @@
+
+
+SOME GENERAL REQUIREMENTS
+
+- cluster expansion: 
+  - any or all of the replicas may move to new OSDs.
+
+- cluster map may change frequently
+  - map change should translate into pending replication/migration
+    state quickly (or better yet, instantly), so that we could push
+    through a series of (say, botched) maps quickly and be fine, so long
+    as the final map is correct.
+
+- ideally, unordered osd<->osd, client<->osd communication
+  (mds<->mds, client<->mds communication is ordered, but file i/o
+  would be too slow that way?) 
+
+
+
+
+PRIMARY ONLY PICTURE
+
+let's completely ignore replication for a while, and see how
+complicated the picture needs to be to reliably support cluster expansion.
+
+typedef __uint64_t version_t;
+
+
+per-Object metadata:
+- version #.  incremented when an object is modified.
+   e.g. version_t version;
+- on primary, keep list of stray replicas
+   e.g. map<int,version_t> stray_replicas;  // osds w/ stray replicas
+  includes old primary osd(s), until deletion is confirmed.  used while rg
+  is importing.
+
+
+per-RG metadata
+- object list.  well, a method to fetch it by querying a collection or whatever.
+- negative <object,version> list
+   e.g. map<object_t, version_t> deleted_objects;
+  - used to enumerate deleted objects, when in "importing" state.
+- a RG "state" (enum/int)
+
+
+
+
+
+
+Normal RG state:
+- role=primary 
+    clean        - i am primary, all is well.  no stray copies.  i can
+                   discard my negative object list, since my local
+				   object store tells me everything.
+
+
+After a map change:
+- new primary
+    undef        - initially; i don't know RG exists.
+- old primary
+    homeless     - i was primary, still have unmolested data.  new primary is not yet migrating 
+                   (presumably its state=undef.)  i need to contact new primary and tell them 
+                   this RG exists.
+
+- new primary
+    importing    - i am migrating data from old primary.  keep negative dir entries for deletions.
+                   write locally.  proxy reads (force immediate migration).  do whole objects 
+                   initially (on write, block until i migrate the object).  later we can do 
+                   sub-object state (where "live" object data is spread across new/old primaries).
+- old primary
+    exporting    - primary is migrating my data.
+    undef        - when it finishes.  (i will forget this RG existed.)
+
+
+After a second map change (scenario 1):  
+ as above, if we were clean again.
+
+After a second map change (scenario 2): 
+ we weren't clean yet.
+- new primary
+    undef        - initially (until i learn RG exists)
+- old primary
+    importing    - i'm still migrating from old old primary
+- old old primary
+    exporting    - ...
+- old primary
+??  importing+exporting - proxy reads as before.  continue migrating from old old primary.
+
+
+After a second map change (scenario 3): 
+ we weren't clean yet, and old old primary is also new primary
+- new primary (and old old primary)
+    exporting    - change state to importing.  be sure to compare object versions, and neg dir 
+                   entries (as we always should do, really!).
+- old primary
+    importing    - note that the old import source matches new primary, and change
+                   state to exporting, and stop importing. (unlike scenario 2)
+
+-> this approach could mean that a series of fast map changes could
+   force data to migrate down a "chain" of old primaries to reach the
+   new one.  maybe old primary should go from importing -> exporting,
+   and pass along old old primary id to new primary such that the
+   import is a many-to-one thing, instead of one-to-one.  version
+   numbers and neg entries will make it easy to pick out correct versions.
+
+
+
+For the importing process on a given RG:
+
+- metadata for each source
+  - each source has a state:
+    'starting'  - don't know anything about source yet.  query source!
+                  this probably induces the source to change from
+                  'homeless' or something similar to 'exporting'.
+    'importing' - i've fetched the source's object list (and neg
+                  object list).  i'm busy reading them!  these lists
+                  will shrink as the process continues.  after i fetch
+                  an object, i will erase it from the source.
+                  (object metadata will include stray copy info
+                  until i confirm that it's removed.)
+    'finishing' - i've read all my data, and i'm telling the old person
+                  to discard any remaining RG metadata (RG contents
+                  should already be gone)
+  - unmigrated object list
+  - migrated but not deleted object list  
+    - stray osd is also listed in per-object MD during this stage
+  - negative object list
+    - i can remove these items if i see a newer object version (say,
+      from a different import source or something).
+    - i can remove any local objects, or ignore imported ones, if they are 
+      older than the deleted version
+
+- the lists should be sets or otherwise queryable so that while i'm
+  importing and a real op comes through I can quickly determine if a
+  given object_id is pending migration etc or if my local store is to 
+  be trusted.
+
+
+
+
+
+SOME CODE BITS
+
+
+typedef __uint64_t version_t;
+class Object {
+  version_t            version;
+  map<int, version_t>  stray_replicas;
+};
+
+
+class ReplicaGroup {
+  int enumerate_objects(list<object_t>& ls);
+  
+  int                       state;
+
+  // for unstable states,
+  map<object_t, version_t>  deleted_objects;  // locally
+  map<int, RGExporter_t>    exporters;        // importing from these guys.
+};
+
+// primary
+#define RG_STATE_CLEAN      1
+#define RG_STATE_IMPORTING  2  // pulling data
+
+// non-primary
+#define RG_STATE_HOMELESS   5  // old primary; new primary not yet
+                               // notified; not yet exporting.
+#define RG_STATE_EXPORTING  6  // a newer primary is extracting my
+                               // data.
+
+
+struct RGExporter_t {
+  int                      import_state;
+
+  set<object_t>            remaining_objects;  // remote object list
+  set<object_t>            stray_objects;      // imported but not deleted. 
+
+};
+
+
+
+
+
+----
+all crap from here on down
+
+
+
+
+REPLICAS
+- 
+
+
+
+
+OSD STATES
+- primary, up to date.
+- replica, up to date.
+
+- primary, proxy to old primary (primaries?)
+
+- replica, not up to date.
+
+
+REPLICATION STUFF
+
+Per-RG metadata
+- primary
+  - per-replica state: clean, catching up?
+- replica
+
+Per-object metadata
+- primary and replica
+  - version number/mtime
+  - rg (reverse indexed)
+- primary
+  - replication level and state.
+    - committed to memory and/or disk, on which replicas (#1, #2, etc.)
+- replica
+
+
+
+
+
+-> 
\ No newline at end of file
diff --git a/branches/sage/cephmds2/doc/performance.txt b/branches/sage/cephmds2/doc/performance.txt
new file mode 100644
index 0000000000000..7ca278bd284b1
--- /dev/null
+++ b/branches/sage/cephmds2/doc/performance.txt
@@ -0,0 +1,36 @@
+
+
+quick performance test 2005-05-11.  fakemds, 100x100, asdf/asdf, debug 13
+ -g marshalling
+real    3m8.697s
+user    2m53.282s
+sys     0m6.291s
+
+real    3m3.337s
+user    2m49.467s
+sys     0m6.243s
+
+ -g no marshalling
+real    2m1.464s
+user    1m42.680s
+sys     0m8.128s
+
+real    1m49.469s
+user    1m34.523s
+sys     0m6.410s
+
+ -O3 marshalling
+real    1m29.833s
+user    1m11.474s
+sys     0m7.588s
+
+real    1m9.439s
+user    0m56.071s
+sys     0m5.643s
+
+
+ -O3 no marshalling
+real    1m2.739s
+user    0m46.578s
+sys     0m7.882s
+
diff --git a/branches/sage/cephmds2/doc/shared_write_states_nogo.txt b/branches/sage/cephmds2/doc/shared_write_states_nogo.txt
new file mode 100644
index 0000000000000..f409617d82681
--- /dev/null
+++ b/branches/sage/cephmds2/doc/shared_write_states_nogo.txt
@@ -0,0 +1,39 @@
+
+// stable states          // ------auth-----    -----replica-----
+#define LOCK_SYNC      0  // R . / .  . . WB    same                ... for stat()
+#define LOCK_LOCK      1  // R W / RC . . .     . . / RC . . .      ... for truncate(), fsync()
+#define LOCK_RDONLY    2  // R . / RC R . .     same
+#define LOCK_MIXED     3  // . . / .  R W .     same
+#define LOCK_WRONLY    4  // . . / .  . W WB    same
+
+// transition states
+#define LOCK_GSYNCR    8  // R . / RC . . .     same
+#define LOCK_GSYNCMW   9  // . . / RC . . WB    same
+#define LOCK_GSYNCMW2  9  // . . / RC . . WB    same
+
+#define LOCK_GLOCKSR   5  // R . / RC . . .     . . / RC . . .
+#define LOCK_GLOCKMW   7  // . . / RC . . .     same
+
+#define LOCK_GRDONLYM  10 // . . / .  R . .     same
+#define LOCK_GRDONLYM2 10 //      ---           . . / .  R . .     
+#define LOCK_GRDONLYW  11 // . . / .  . . .     same
+#define LOCK_GRDONLYW2 11 //      ---           . . / .  . . .     
+#define LOCK_GRDONLYS  12 // R . / RC . . .     same
+#define LOCK_GRDONLYL  13 // R . / RC . . .          ---
+
+#define LOCK_GMIXEDR   14 // R . / .  R . .     . . / .  R . .
+#define LOCK_GMIXEDR2  15 //      ---           . . / .  R . .
+#define LOCK_GMIXEDW   16 // . . / .  . W .     same
+#define LOCK_GMIXEDW2  16 //      ---           . . / .  . W .     
+#define LOCK_GMIXEDS   16 // R . / .  . . .     . . / .  . . .
+#define LOCK_GMIXEDS2  16 //      ---           . . / .  . . .     
+#define LOCK_GMIXEDL   17 // R . / .  . . .          --- 
+
+#define LOCK_GWRONLYR  18 // R . / .  . . .     same
+#define LOCK_GWRONLYR2 18 //      ---           . . / .  . . .
+#define LOCK_GWRONLYM  19 // . . / .  . . .     same
+#define LOCK_GWRONLYM2 19 //      ---           . . / .  . . .
+#define LOCK_GWRONLYS  20 // R . / .  . . WB    same
+#define LOCK_GWRONLYS2 20 //      ---           . . / .  . . .
+#define LOCK_GWRONLYL  21
+
diff --git a/branches/sage/cephmds2/doc/shutdown.txt b/branches/sage/cephmds2/doc/shutdown.txt
new file mode 100644
index 0000000000000..e5ccde3171004
--- /dev/null
+++ b/branches/sage/cephmds2/doc/shutdown.txt
@@ -0,0 +1,13 @@
+
+- mds0 triggers shutdown by sending a shutdown_start to all nodes.  
+
+- from here on out, all client requests are discarded (unless they are a file close?)
+
+- each mds checks for outstanding inter-mds transactions, e.g. imports, discoveries, etc.  once they're all done, send a shutdown_ready to mds0
+
+- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty)
+
+- when the cache is empty, send shutdown_done to mds0 and exit.
+
+- mds0 exits when all mdss have finished.
+
diff --git a/branches/sage/cephmds2/ebofs/Allocator.cc b/branches/sage/cephmds2/ebofs/Allocator.cc
new file mode 100644
index 0000000000000..805957f779a11
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/Allocator.cc
@@ -0,0 +1,692 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "Allocator.h"
+#include "Ebofs.h"
+
+
+#undef dout  // redefine the debug-log macro with this file's "ebofs(<dev>).allocator." prefix
+#define dout(x) if ((x) <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator."
+
+
+// Debug aid: print every free/limbo extent and every alloc_tab entry, asserting
+// that no extent is listed twice and that the bucket total equals fs->free_blocks.
+void Allocator::dump_freelist()
+{
+  interval_set<block_t> free;     // union of all extents seen so far; used to detect overlap
+  block_t n = 0;                  // blocks found in the free buckets (limbo is not counted)
+
+  // index EBOFS_NUM_FREE_BUCKETS is a pseudo-bucket that stands for the limbo table
+  for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
+    Table<block_t,block_t> *tab;
+    if (b < EBOFS_NUM_FREE_BUCKETS) {
+      tab = fs->free_tab[b];
+      dout(0) << "dump bucket " << b << "  " << tab->get_num_keys() << endl;
+    } else {
+      tab = fs->limbo_tab;
+      dout(0) << "dump limbo  " << tab->get_num_keys() << endl;
+    }
+    if (tab->get_num_keys() == 0) continue;
+
+    Table<block_t,block_t>::Cursor cursor(tab);
+    // position the cursor outside assert() so the call still happens under NDEBUG
+    const int r = tab->find(0, cursor);
+    assert(r >= 0); (void)r;
+    while (1) {
+      dout(0) << "dump  ex " << cursor.current().key << "~" << cursor.current().value << endl;
+      assert(cursor.current().value > 0);
+
+      if (b < EBOFS_NUM_FREE_BUCKETS)
+        n += cursor.current().value;
+
+      if (free.contains( cursor.current().key, cursor.current().value )) 
+        dout(0) << "dump   bad " << cursor.current().key << "~" << cursor.current().value << endl;
+      assert(!free.contains( cursor.current().key, cursor.current().value ));
+      free.insert( cursor.current().key, cursor.current().value );
+      if (cursor.move_right() <= 0) break;
+    }
+  }
+
+  assert(n == fs->free_blocks);
+  dout(0) << "dump combined freelist is " << free << endl;
+
+  // alloc_tab: list each extent with its reference count
+  if (fs->alloc_tab->get_num_keys() > 0) {
+    Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+    const int r = fs->alloc_tab->find(0, cursor);
+    assert(r >= 0); (void)r;
+    while (1) {
+      dout(0) << "alloc  ex " << cursor.current().key << "~" << cursor.current().value.first << " ref "
+              << cursor.current().value.second
+              << endl;
+      assert(cursor.current().value.first > 0);
+
+      if (cursor.move_right() <= 0) break;
+    }
+  }
+}
+
+
+// Search one free-extent bucket for an extent of at least num blocks, starting
+// from position near.  dir selects the scan: DIR_FWD walks right only,
+// DIR_BACK walks left only, DIR_ANY tries right first and then left.
+// On success fills ex with the whole extent found and returns 0; else returns -1.
+int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir)
+{
+  Table<block_t,block_t> *tab = fs->free_tab[bucket];
+  Table<block_t,block_t>::Cursor cursor(tab);
+  const bool go_fwd = (dir == DIR_ANY || dir == DIR_FWD);
+  const bool go_back = (dir == DIR_ANY || dir == DIR_BACK);
+  bool hit = false;
+
+  // rightward scan, beginning at the entry find() positions us on
+  if (go_fwd && tab->find( near, cursor ) >= 0) {
+    for (;;) {
+      if (cursor.current().value >= num) { hit = true; break; }
+      if (cursor.move_right() <= 0) break;
+    }
+  }
+
+  // leftward scan: re-seek to near, then step left before each test
+  if (go_back && !hit) {
+    tab->find( near, cursor );
+    while (cursor.move_left() >= 0)
+      if (cursor.current().value >= num) { hit = true; break; }
+  }
+  if (!hit)
+    return -1;
+  ex.start = cursor.current().key;
+  ex.length = cursor.current().value;
+  return 0;
+}
+
+// Allocate up to num blocks near the given position (NEAR_LAST / NEAR_LAST_FWD stand for last_pos).
+// Fills ex and returns its length: num for a full extent, less for a partial one.  Asserts if nothing fits.
+int Allocator::allocate(Extent& ex, block_t num, block_t near)
+{
+  //dump_freelist();
+  int dir = DIR_ANY; // no dir
+  if (near == NEAR_LAST_FWD) {
+    near = last_pos;
+    dir = DIR_FWD;  // fwd
+  }
+  else if (near == NEAR_LAST)
+    near = last_pos;
+
+  int bucket;
+
+  while (1) {  // at most two passes: a DIR_FWD search is retried once as DIR_BACK
+    // look for contiguous extent
+    for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) {
+      if (find(ex, bucket, num, near, dir) >= 0) {
+        // yay!
+        
+        // remove original
+        fs->free_tab[bucket]->remove( ex.start );
+        fs->free_blocks -= ex.length;
+        
+        if (ex.length > num) {
+          if (ex.start < near) {
+            // to the left
+            if (ex.start + ex.length - num <= near) {
+              // by a lot.  take right-most portion.
+              Extent left;
+              left.start = ex.start;
+              left.length = ex.length - num;
+              ex.start += left.length;
+              ex.length -= left.length;
+              assert(ex.length == num);
+              _release_loner(left);
+            } else {
+              // take middle part.
+              Extent left,right;
+              left.start = ex.start;
+              left.length = near - ex.start;
+              ex.start = near;
+              right.start = ex.start + num;
+              right.length = ex.length - left.length - num;
+              ex.length = num;
+              _release_loner(left);
+              _release_loner(right);
+            }
+          }
+          else {
+            // to the right.  take left-most part.
+            Extent right;
+            right.start = ex.start + num;
+            right.length = ex.length - num;
+            ex.length = num;
+            _release_loner(right);
+          }
+        }
+        
+        dout(20) << "allocate " << ex << " near " << near << endl;
+        last_pos = ex.end();
+        //dump_freelist();
+	if (g_conf.ebofs_cloneable)
+	  alloc_inc(ex);
+        return num;
+      }
+    }
+
+    if (dir == DIR_BACK || dir == DIR_ANY) break;
+    dir = DIR_BACK;
+  }
+
+  // ok, find partial extent instead.  halve the request until some bucket has a fit.
+  for (block_t trysize = num/2; trysize >= 1; trysize /= 2) {
+    int bucket = pick_bucket(trysize);
+    if (find(ex, bucket, trysize, near) >= 0) {
+      // yay!
+      assert(ex.length < num);
+      
+      fs->free_tab[bucket]->remove(ex.start);
+      fs->free_blocks -= ex.length;
+      last_pos = ex.end();
+      dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << endl;
+      //dump_freelist();
+      if (g_conf.ebofs_cloneable)
+	alloc_inc(ex);
+      return ex.length;
+    }    
+  }
+
+  dout(1) << "allocate failed, fs completely full!  " << fs->free_blocks << endl;
+  assert(0);
+  //dump_freelist();
+  return -1;
+}
+// Record ex in the in-memory limbo set and add its length to fs->limbo_blocks.  Returns 0.
+int Allocator::_release_into_limbo(Extent& ex)
+{
+  dout(10) << "_release_into_limbo " << ex << endl;
+  dout(10) << "limbo is " << limbo << endl;
+  assert(ex.length > 0);
+  limbo.insert(ex.start, ex.length);
+  fs->limbo_blocks += ex.length;
+  return 0;
+}
+// Give ex back: via alloc_dec() when g_conf.ebofs_cloneable is set, else straight into limbo.
+int Allocator::release(Extent& ex)
+{
+  if (!g_conf.ebofs_cloneable) {
+    _release_into_limbo(ex);
+    return 0;
+  }
+  return alloc_dec(ex);
+}
+
+// Copy every extent of the in-memory limbo set into fs->limbo_tab, then empty the set.
+// fs->free_blocks and fs->limbo_blocks are not adjusted here.  Always returns 0.
+int Allocator::commit_limbo()
+{
+  dout(20) << "commit_limbo" << endl;
+
+  map<block_t,block_t>::iterator cur = limbo.m.begin();
+  while (cur != limbo.m.end()) {
+    fs->limbo_tab->insert(cur->first, cur->second);
+    cur++;
+  }
+  limbo.clear();
+  return 0;
+}
+
+// Return every extent recorded in fs->limbo_tab to the free tables (merging
+// with free neighbours via _release_merge), deducting each from
+// fs->limbo_blocks, then clear the limbo table.  Always returns 0.
+int Allocator::release_limbo()
+{
+  Table<block_t,block_t> *ltab = fs->limbo_tab;
+
+  if (ltab->get_num_keys() > 0) {
+    Table<block_t,block_t>::Cursor cursor(ltab);
+    ltab->find(0, cursor);
+    do {
+      Extent ex(cursor.current().key, cursor.current().value);
+      dout(20) << "release_limbo  ex " << ex << endl;
+      fs->limbo_blocks -= ex.length;
+      _release_merge(ex);
+    } while (cursor.move_right() > 0);
+  }
+  ltab->clear();
+  return 0;
+}
+
+
+
+/*
+int Allocator::_alloc_loner_inc(Extent& ex)
+{
+  Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+  
+  if (fs->alloc_tab->find( ex.start, cursor ) 
+      == Table<block_t,pair<block_t,int> >::Cursor::MATCH) {
+    assert(cursor.current().value.first == ex.length);
+    pair<block_t,int>& v = cursor.dirty_current_value();
+    v.second++;
+    dout(10) << "_alloc_loner_inc " << ex << " "
+             << (v.second-1) << " -> " << v.second 
+             << endl;
+  } else {
+    // insert it, @1
+    fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
+    dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << endl;
+  }
+  return 0;
+}
+
+int Allocator::_alloc_loner_dec(Extent& ex)
+{
+  Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+  
+  if (fs->alloc_tab->find( ex.start, cursor ) 
+      == Table<block_t,pair<block_t,int> >::Cursor::MATCH) {
+    assert(cursor.current().value.first == ex.length);
+    if (cursor.current().value.second == 1) {
+      dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << endl;
+      fs->alloc_tab->remove( cursor.current().key );
+    } else {
+      pair<block_t,int>& v = cursor.dirty_current_value();
+      --v.second;
+      dout(10) << "_alloc_loner_dec " << ex << " "
+               << (v.second+1) << " -> " << v.second 
+               << endl;
+    }
+  } else {
+    assert(0);
+  }
+  return 0;
+}
+*/
+
+
+// Add one reference to every block of ex in fs->alloc_tab: uncovered ranges get new ref-1 entries (or
+// extend an adjacent ref-1 entry at the table's end); overlapped parts of entries go to ref+1.  Returns 0.
+int Allocator::alloc_inc(Extent ex)
+{
+  dout(10) << "alloc_inc " << ex << endl;
+  // empty table?
+  if (fs->alloc_tab->get_num_keys() == 0) {
+    // easy.
+    fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
+    dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << endl;
+    return 0;
+  }
+  Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+
+  // try to move to left (to check for overlap)
+  int r = fs->alloc_tab->find( ex.start, cursor );
+  if (r == Table<block_t,pair<block_t,int> >::Cursor::OOB ||
+      cursor.current().key > ex.start) {
+    r = cursor.move_left();
+    dout(10) << "alloc_inc move_left r = " << r << endl;
+  }
+  
+  // each pass consumes the leading part of ex; ex shrinks until nothing is left
+  while (1) {
+    dout(10) << "alloc_inc loop at " << cursor.current().key 
+	     << "~" << cursor.current().value.first
+	     << " ref " << cursor.current().value.second
+	     << endl;
+
+    // too far left?
+    if (cursor.current().key < ex.start &&
+	cursor.current().key + cursor.current().value.first <= ex.start) {
+      // adjacent?
+      bool adjacent = false;
+      if (cursor.current().key + cursor.current().value.first == ex.start &&
+	  cursor.current().value.second == 1) 
+	adjacent = true;
+
+      // no overlap.
+      r = cursor.move_right();
+      dout(10) << "alloc_inc move_right r = " << r << endl;
+      
+      // at end?
+      if (r <= 0) {
+	// hmm!
+	if (adjacent) {
+	  // adjust previous entry
+	  cursor.move_left();
+	  pair<block_t,int> &v = cursor.dirty_current_value();
+	  v.first += ex.length; // yay!
+	  dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << endl;
+	} else {
+	  // insert at end, finish.
+	  int r = fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
+	  dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << endl;
+	  //dump_freelist();
+	}
+	return 0;
+      }
+    }
+    
+    if (cursor.current().key > ex.start) {
+      // gap.
+      //    oooooo
+      //  nnnnn.....
+      block_t l = MIN(ex.length, cursor.current().key - ex.start);
+      
+      fs->alloc_tab->insert(ex.start, pair<block_t,int>(l,1));
+      dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << endl;
+      ex.start += l;
+      ex.length -= l;
+      if (ex.length == 0) break;
+      fs->alloc_tab->find( ex.start, cursor );
+    } 
+    else if (cursor.current().key < ex.start) {
+      block_t end = cursor.current().value.first + cursor.current().key;
+
+      if (end <= ex.end()) {
+	// single split
+	// oooooo
+	//    nnnnn
+	pair<block_t,int>& v = cursor.dirty_current_value();
+	v.first = ex.start - cursor.current().key;
+	int ref = v.second;
+
+	block_t l = end - ex.start;
+	fs->alloc_tab->insert(ex.start, pair<block_t,int>(l, 1+ref));
+	
+	dout(10) << "alloc_inc   " << ex.start << "~" << l 
+		 << " " << ref << " -> " << ref+1
+		 << " (right split)" << endl;
+	
+	ex.start += l;
+	ex.length -= l;
+	if (ex.length == 0) break;
+	fs->alloc_tab->find( ex.start, cursor );
+
+      } else {
+	// double split, finish.
+	// -------------
+	//    ------
+	pair<block_t,int>& v = cursor.dirty_current_value();
+	v.first = ex.start - cursor.current().key;
+	int ref = v.second;
+	
+	fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length, 1+ref));
+
+	int rl = end - ex.end();
+	fs->alloc_tab->insert(ex.end(), pair<block_t,int>(rl, ref));
+
+	dout(10) << "alloc_inc   " << ex
+		 << " " << ref << " -> " << ref+1
+		 << " (double split finish)"
+		 << endl;
+
+	break;
+      }
+    } 
+    else {
+      assert(cursor.current().key == ex.start);
+      
+      if (cursor.current().value.first <= ex.length) {
+	// existing entry lies wholly inside ex: bump its ref, then continue past it.
+	// oooooo
+	// nnnnnnnn
+	pair<block_t,int>& v = cursor.dirty_current_value();
+	v.second++;
+	dout(10) << "alloc_inc   " << ex.start << "~" << cursor.current().value.first 
+		 << " " << cursor.current().value.second-1 << " -> "
+		 << cursor.current().value.second 
+		 << " (left split)" << endl;
+	ex.start += v.first;
+	ex.length -= v.first;
+	if (ex.length == 0) break;
+	cursor.move_right();
+      } else {
+	// single split, finish.
+	// oooooo
+	// nnn
+	block_t l = cursor.current().value.first - ex.length;
+	int ref = cursor.current().value.second;
+
+	pair<block_t,int>& v = cursor.dirty_current_value();
+	v.first = ex.length;
+	v.second++;
+	
+	fs->alloc_tab->insert(ex.end(), pair<block_t,int>(l, ref));
+
+	dout(10) << "alloc_inc   " << ex
+		 << " " << ref << " -> " << ref+1
+		 << " (left split finish)"
+		 << endl;
+	
+	break;
+      }
+    }
+  }
+
+  return 0;
+}
+
+
+// Drop one reference from every block of ex in fs->alloc_tab, splitting entries that ex only
+// partly covers.  Ranges whose count was 1 leave the table and go to limbo.  Always returns 0.
+int Allocator::alloc_dec(Extent ex)
+{
+  dout(10) << "alloc_dec " << ex << endl;
+
+  assert(fs->alloc_tab->get_num_keys() > 0);  // ex must be covered by existing entries
+  
+  Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+
+  // try to move to left (to check for overlap)
+  int r = fs->alloc_tab->find( ex.start, cursor );
+  dout(10) << "alloc_dec find r = " << r << endl;
+
+  if (r == Table<block_t,pair<block_t,int> >::Cursor::OOB ||
+      cursor.current().key > ex.start) {
+    r = cursor.move_left();
+    dout(10) << "alloc_dec move_left r = " << r << endl;
+
+    // too far left?
+    if (cursor.current().key < ex.start &&
+	cursor.current().key + cursor.current().value.first <= ex.start) {
+      // no overlap.
+      dump_freelist();
+      assert(0);
+    }
+  }
+
+  while (1) {
+    dout(10) << "alloc_dec ? " << cursor.current().key 
+	     << "~" << cursor.current().value.first
+	     << " " << cursor.current().value.second
+	     << ", ex is " << ex
+	     << endl;
+    
+    assert(cursor.current().key <= ex.start);  // no gap allowed.
+    
+    if (cursor.current().key < ex.start) {
+      block_t end = cursor.current().value.first + cursor.current().key;
+      
+      if (end <= ex.end()) {
+	// single split
+	// oooooo
+	//    -----
+	pair<block_t,int>& v = cursor.dirty_current_value();
+	v.first = ex.start - cursor.current().key;
+	int ref = v.second;
+	dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first 
+		 << " " << ref
+		 << " shortened left bit of single" << endl;
+
+	block_t l = end - ex.start;
+	if (ref > 1) {
+	  fs->alloc_tab->insert(ex.start, pair<block_t,int>(l, ref-1));
+	  dout(10) << "alloc_dec . " << ex.start << "~" << l 
+		   << " " << ref << " -> " << ref-1
+		   << endl;
+	} else {
+	  Extent r(ex.start, l);
+	  _release_into_limbo(r);
+	}
+		
+	ex.start += l;
+	ex.length -= l;
+	if (ex.length == 0) break;
+	fs->alloc_tab->find( ex.start, cursor );
+
+      } else {
+	// double split, finish.
+	// ooooooooooooo
+	//    ------
+	pair<block_t,int>& v = cursor.dirty_current_value();
+	v.first = ex.start - cursor.current().key;
+	int ref = v.second;
+	dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first
+		 << " " << ref 
+		 << " shorted left bit of double split" << endl;
+
+	if (ref > 1) {
+	  fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length, ref-1));
+	  dout(10) << "alloc_dec s " << ex
+		   << " " << ref << " -> " << ref-1
+		   << " reinserted middle bit of double split"
+		   << endl;
+	} else {
+	  _release_into_limbo(ex);
+	}
+
+	int rl = end - ex.end();
+	fs->alloc_tab->insert(ex.end(), pair<block_t,int>(rl, ref));
+	dout(10) << "alloc_dec s " << ex.end() << "~" << rl
+		 << " " << ref 
+		 << " reinserted right bit of double split" << endl;
+	break;
+      }
+    } 
+    else {
+      assert(cursor.current().key == ex.start);
+      
+      if (cursor.current().value.first <= ex.length) {
+	// dec.  existing entry lies wholly inside ex.
+	// oooooo
+	// --------
+	if (cursor.current().value.second > 1) {
+	  pair<block_t,int>& v = cursor.dirty_current_value();
+	  v.second--;
+	  dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first 
+		   << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second 
+		   << endl;
+	  ex.start += v.first;
+	  ex.length -= v.first;
+	  if (ex.length == 0) break;
+	  cursor.move_right();
+	} else {
+	  // last reference: the whole entry goes to limbo and leaves the table
+	  Extent r(cursor.current().key, cursor.current().value.first);
+	  _release_into_limbo(r);
+
+	  ex.start += cursor.current().value.first;
+	  ex.length -= cursor.current().value.first;
+	  cursor.remove();
+
+	  if (ex.length == 0) break;
+	  fs->alloc_tab->find( ex.start, cursor );
+	}
+      } else {
+	// single split, finish.
+	// oooooo
+	// ---
+	block_t l = cursor.current().value.first - ex.length;
+	int ref = cursor.current().value.second;
+
+	if (ref > 1) {
+	  pair<block_t,int>& v = cursor.dirty_current_value();
+	  v.first = ex.length;
+	  v.second--;
+	  dout(10) << "alloc_dec . " << ex
+		   << " " << ref << " -> " << ref-1
+		   << endl;
+	} else {
+	  _release_into_limbo(ex);
+	  cursor.remove();
+	}
+	
+	dout(10) << "alloc_dec s " << ex.end() << "~" << l
+		 << " " << ref 
+		 << " reinserted right bit of single split" << endl;
+	fs->alloc_tab->insert(ex.end(), pair<block_t,int>(l, ref));
+	break;
+      }
+    }
+  }
+
+  return 0;
+}
+
+
+/*
+ * Put ex straight into the free table whose bucket matches its length and
+ * add it to fs->free_blocks.  No merging is attempted, so *ONLY* use this
+ * if you _know_ there are no adjacent free extents.  Always returns 0.
+ */
+int Allocator::_release_loner(Extent& ex) 
+{
+  assert(ex.length > 0);
+  fs->free_tab[ pick_bucket(ex.length) ]->insert(ex.start, ex.length);
+  fs->free_blocks += ex.length;
+  return 0;
+}
+
+/*
+ * Release an extent into the freelist, first absorbing a free extent that
+ * starts exactly at its end and/or one that ends exactly at its start.
+ * The caller's extent is not modified; the combined extent is inserted via
+ * _release_loner().  Always returns 0.
+ */
+int Allocator::_release_merge(Extent& orig) 
+{
+  dout(15) << "_release_merge " << orig << endl;
+  assert(orig.length > 0);
+
+  Extent merged = orig;   // grows as neighbours are absorbed
+  int b;
+
+  // right-hand neighbour: an entry keyed exactly at our end, in any bucket
+  for (b = 0; b < EBOFS_NUM_FREE_BUCKETS; b++) {
+    Table<block_t,block_t> *tab = fs->free_tab[b];
+    Table<block_t,block_t>::Cursor cursor(tab);
+    if (tab->find( merged.start+merged.length, cursor ) 
+        != Table<block_t,block_t>::Cursor::MATCH)
+      continue;
+
+    // swallow it, and take it out of the free accounting for now
+    merged.length += cursor.current().value;
+    fs->free_blocks -= cursor.current().value;
+    tab->remove( cursor.current().key );
+    break;
+  }
+
+  // left-hand neighbour: step one entry left of our end in each bucket and
+  // see whether that entry finishes exactly where we begin
+  for (b = 0; b < EBOFS_NUM_FREE_BUCKETS; b++) {
+    Table<block_t,block_t> *tab = fs->free_tab[b];
+    Table<block_t,block_t>::Cursor cursor(tab);
+    tab->find( merged.start+merged.length, cursor );
+    if (cursor.move_left() < 0) continue;
+    if (cursor.current().key + cursor.current().value != merged.start) continue;
+
+    merged.start = cursor.current().key;
+    merged.length += cursor.current().value;
+    fs->free_blocks -= cursor.current().value;
+    tab->remove( cursor.current().key );
+    break;
+  }
+
+  _release_loner(merged);
+  return 0;
+}
diff --git a/branches/sage/cephmds2/ebofs/Allocator.h b/branches/sage/cephmds2/ebofs/Allocator.h
new file mode 100644
index 0000000000000..c53ff2a69fba1
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/Allocator.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __EBOFS_ALLOCATOR_H
+#define __EBOFS_ALLOCATOR_H
+
+#include "types.h"
+
+#include "include/interval_set.h"
+
+class Ebofs;
+
+// Block allocator for Ebofs.  Free extents live in fs->free_tab[] (bucketed by
+// size), released extents pass through limbo, and alloc_inc/alloc_dec keep
+// per-extent reference counts in fs->alloc_tab.
+class Allocator {
+public:
+  // special values for allocate()'s "near" argument; both resolve to last_pos
+  static const block_t NEAR_LAST = 0;     
+  static const block_t NEAR_LAST_FWD = 1;   // ...and search forward first
+  
+  // scan directions for find()
+  static const int DIR_ANY = 0;
+  static const int DIR_FWD = 2;
+  static const int DIR_BACK = 1;
+
+protected:
+  Ebofs *fs;
+  block_t      last_pos;           // end of the most recently allocated extent
+  interval_set<block_t> limbo;     // released extents not yet copied to fs->limbo_tab
+
+  // Map an extent length to a free-table index: one step per
+  // EBOFS_FREE_BUCKET_BITS shifted out of num, capped at the last bucket.
+  static int pick_bucket(block_t num) {
+    int b;
+    for (b = 0; num > 1; num >>= EBOFS_FREE_BUCKET_BITS)
+      b++;
+    return (b >= EBOFS_NUM_FREE_BUCKETS) ? EBOFS_NUM_FREE_BUCKETS-1 : b;
+  }
+
+  int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY);
+
+  void dump_freelist();
+
+ public:
+  int _release_into_limbo(Extent& ex);
+
+  int _release_loner(Extent& ex);  // release loner extent
+  int _release_merge(Extent& ex);  // release any extent (searches for adjacent)
+
+  //int _alloc_loner_inc(Extent& ex);
+  //int _alloc_loner_dec(Extent& ex);
+
+ public:
+  Allocator(Ebofs *f) : fs(f), last_pos(0) {}
+  
+  int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST);
+  // alloc_dec() when g_conf.ebofs_cloneable is set, otherwise straight to limbo
+  int release(Extent& ex);
+
+  int alloc_inc(Extent ex);
+  int alloc_dec(Extent ex);
+
+  /*int unallocate(Extent& ex) {  // skip limbo
+    return _release_merge(ex);
+  }
+  */
+
+  int commit_limbo();  // limbo -> fs->limbo_tab
+  int release_limbo(); // fs->limbo_tab -> free_tabs
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/ebofs/BlockDevice.cc b/branches/sage/cephmds2/ebofs/BlockDevice.cc
new file mode 100644
index 0000000000000..5188946574643
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/BlockDevice.cc
@@ -0,0 +1,769 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "BlockDevice.h"
+
+#include "config.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+
+#include <sys/uio.h>
+
+#include <sys/ioctl.h>
+
+#ifndef __CYGWIN__
+#include <linux/fs.h>
+#endif
+
+
+/*******************************************
+ * biovec
+ */
+
+// debug-print a biovec as "bio(<rd |wr >start~length[ note] address)"
+inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio)
+{
+  const char *kind = "";
+  if (bio.type == BlockDevice::biovec::IO_READ) kind = "rd ";
+  else if (bio.type == BlockDevice::biovec::IO_WRITE) kind = "wr ";
+  out << "bio(" << kind << bio.start << "~" << bio.length;
+  if (bio.note)
+    out << " " << bio.note;
+  return out << " " << &bio << ")";
+}
+
+
+
+/*******************************************
+ * ElevatorQueue
+ */
+
+#undef dout
+#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").elevatorq."
+#define derr(x) if (x <= g_conf.debug_bdev) cerr << "bdev(" << dev << ").elevatorq."
+
+
+int BlockDevice::ElevatorQueue::dequeue_io(list<biovec*>& biols, 
+                                           block_t& start, block_t& length,
+                                           interval_set<block_t>& block_lock)
+{
+  // Dequeue one run of contiguous, same-type bios in elevator order.  Returns the
+  // number of bios added to biols (0 if the very first candidate was refused);
+  // start/length receive the merged block range.  The queue must be non-empty.
+  assert(!io_map.empty());
+  dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << endl;
+  // find our position: i >= pos (forward sweep) or i <= pos (backward sweep)
+  map<block_t,biovec*>::iterator i;
+  // always allow two passes: the first may only wrap/reverse the sweep, and even a
+  int tries = 2;  // forward-only elevator (bdev_el_bidir=0) must re-search after that reset
+  while (tries > 0) {
+    if (el_dir_forward) {
+      i = io_map.lower_bound(el_pos);
+      if (i != io_map.end()) {
+        break;  // not at end.  good.
+      }
+    } else {
+      i = io_map.upper_bound(el_pos);
+      if (i != io_map.begin()) {
+        i--;   // and back down one (to get i <= pos).  good.
+        break;
+      }
+    }
+
+    // reverse (or initial startup)?
+    if (g_conf.bdev_el_bidir || !el_dir_forward) {
+      //      dout(20) << "restart reversing" << endl;
+      el_dir_forward = !el_dir_forward;
+    }
+    
+    if (el_dir_forward) {
+      // forward
+      el_pos = 0;
+      
+      if (g_conf.bdev_el_fw_max_ms) {
+        el_stop = g_clock.now();
+        utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms);  // (s,us), convert ms -> us!
+        el_stop += max;
+        //    dout(20) << "restart forward sweep for " << max << endl;
+      } else {
+        //    dout(20) << "restart fowrard sweep" << endl;
+      }
+    } else {
+      // reverse
+      el_pos = bdev->get_num_blocks();
+      
+      if (g_conf.bdev_el_bw_max_ms) {
+        el_stop = g_clock.now();
+        utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms);  // (s,us), convert ms -> us!
+        el_stop += max;
+        //    dout(20) << "restart reverse sweep for " << max << endl;
+      } else {
+        //    dout(20) << "restart reverse sweep" << endl;
+      }
+    }
+
+    tries--;
+  }
+  
+  assert(tries > 0);  // this shouldn't happen if the queue is non-empty.
+
+  // get some biovecs
+  int num_bio = 0;
+  
+  dout(20) << "dequeue_io  starting with " << i->first << " " << *i->second << endl;
+
+  // merge contiguous ops
+  char type = i->second->type;  // read or write
+  int num_iovs = 0;  // count eventual iov's for readv/writev
+  
+  start = i->first;
+  length = 0;
+
+  if (el_dir_forward)
+    el_pos = start;
+  else
+    el_pos = i->first + i->second->length;
+  
+  // while (contiguous)
+  while ((( el_dir_forward && el_pos == i->first) ||
+          (!el_dir_forward && el_pos == i->first + i->second->length)) && 
+         type == i->second->type) { 
+    biovec *bio = i->second;
+    
+    // allowed?  (not already submitted to kernel?)
+    if (block_lock.intersects(bio->start, bio->length)) {
+      //      dout(20) << "dequeue_io " << bio->start << "~" << bio->length 
+      //               << " intersects block_lock " << block_lock << endl;
+      break;  // stop, or go with what we've got so far
+    }
+    
+    // add to biols
+    int nv = bio->bl.buffers().size();     // how many iov's in this bio's bufferlist?
+    if (num_iovs + nv >= g_conf.bdev_iov_max) break;  // too many!
+    num_iovs += nv;
+    
+    start = MIN(start, bio->start);
+    length += bio->length;
+    
+    if (el_dir_forward) {
+      //dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << endl;
+      biols.push_back(bio);      // add at back
+    } else {
+      //  dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << endl;
+      biols.push_front(bio);     // add at front
+    }
+    num_bio++;
+    
+    // move elevator pointer
+    bool at_end = false; 
+    map<block_t,biovec*>::iterator prev = i;
+    if (el_dir_forward) {
+      el_pos += bio->length;                 // cont. next would start right after us
+      i++;
+      if (i == io_map.end()) {
+        at_end = true;
+      }
+    } else {
+      el_pos -= bio->length;
+      if (i == io_map.begin()) {
+        at_end = true;
+      } else {
+        i--;
+      }
+    }
+    
+    // dequeue
+    io_map.erase(prev);
+    bio->in_queue = 0;
+    
+    if (at_end) break;
+  }
+  
+  return num_bio;
+}
+
+
+
+/*******************************************
+ * BarrierQueue
+ */
+#undef dout
+#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").barrierq."
+
+void BlockDevice::BarrierQueue::barrier()
+{
+  if (qls.empty() || !qls.front()->empty()) {
+    qls.push_back(new ElevatorQueue(bdev, dev));   // submit_io() always targets the back queue
+    dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has "
+             << qls.front()->size() << " ios left" << endl;
+  } else {
+    assert(qls.size() == 1);
+    dout(10) << "barrier not adding new queue, front is empty" << endl;
+  }
+}
+
+// returns true if a drained front queue was discarded, false if nothing changed
+bool BlockDevice::BarrierQueue::bump()
+{
+  assert(!qls.empty());
+
+  // retire the front queue only when it is empty and is not also the last queue
+  Queue *head = qls.front();
+  if (!head->empty() || head == qls.back())
+    return false;
+
+  delete head;
+  qls.pop_front();
+  dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << endl;
+  return true;
+}
+
+int BlockDevice::BarrierQueue::dequeue_io(list<biovec*>& biols,
+                                          block_t& start, block_t& length,
+                                          interval_set<block_t>& locked)
+{
+  assert(!qls.empty());
+  // only the front sub-queue is serviced; bump() afterwards retires it if that drained it
+  const int taken = qls.front()->dequeue_io(biols, start, length, locked);
+  bump();
+  return taken;
+}
+
+
+
+
+/*******************************************
+ * BlockDevice
+ */
+
+#undef dout
+#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ")."
+
+
+
+block_t BlockDevice::get_num_blocks() 
+{
+  // size in EBOFS blocks; probed on the first call (fd must be open), then cached
+  if (!num_blocks) {
+    assert(fd > 0);
+    __uint64_t bytes = 0;   // BLKGETSIZE64 stores a 64-bit byte count
+#ifdef BLKGETSIZE64
+    // block device?  on failure (e.g. a regular file) fall through to fstat.
+    if (ioctl(fd, BLKGETSIZE64, &bytes) < 0) bytes = 0;
+#endif
+    if (!bytes) {
+      // only trust st_size if fstat actually succeeded (st is garbage otherwise)
+      struct stat st;
+      if (fstat(fd, &st) == 0)
+        bytes = st.st_size;
+      else
+        dout(0) << "get_num_blocks fstat failed, errno " << errno << endl;
+    }
+    num_blocks = bytes / (__uint64_t)EBOFS_BLOCK_SIZE;
+    // fake sizes are in MB; widen before multiplying so the product cannot overflow int
+    if (g_conf.bdev_fake_mb) {
+      num_blocks = (block_t)g_conf.bdev_fake_mb * 256ULL;
+      dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << endl;
+    }
+    if (g_conf.bdev_fake_max_mb &&
+        num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) {
+      dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << endl;
+      num_blocks = (block_t)g_conf.bdev_fake_max_mb * 256ULL;
+    }
+  }
+  return num_blocks;
+}
+
+
+
+/** io thread
+ * each worker thread dequeues ios from the root_queue and submits them to the kernel.
+ * `lock` is held throughout, except while do_io() runs and while waiting on io_wakeup.
+ */
+void* BlockDevice::io_thread_entry()
+{
+  lock.Lock();
+  int whoami = io_threads_started++;
+  io_threads_running++;
+  assert(io_threads_running <= g_conf.bdev_iothreads);
+  dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << endl;
+
+  // get my own fd (and file position pointer); this local shadows the member fd
+  int fd = open_fd();
+  assert(fd > 0);
+
+  while (!io_stop) {
+    bool do_sleep = false;
+    
+    // queue empty?
+    if (root_queue.empty()) {
+      // nothing queued: sleep until io_wakeup is signalled
+      do_sleep = true;
+    } else {
+      dout(20) << "io_thread" << whoami << " going" << endl;
+
+      block_t start, length;
+      list<biovec*> biols;
+      int n = root_queue.dequeue_io(biols, start, length, io_block_lock);
+
+      if (n == 0) {
+        // failed to dequeue a do-able op, sleep for now
+        dout(20) << "io_thread" << whoami << " couldn't dequeue doable op, sleeping" << endl;
+        assert(io_threads_running > 1);   // there must be someone else, if we couldn't dequeue something doable.
+        do_sleep = true;
+      } 
+      else {
+        // lock blocks (so overlapping ios are not dispatched concurrently)
+        assert(start == biols.front()->start);
+        io_block_lock.insert(start, length);
+          
+        // drop lock to do the io
+        lock.Unlock();
+        do_io(fd, biols);
+        lock.Lock();
+          
+        // unlock blocks
+        io_block_lock.erase(start, length);
+        
+        // someone might have blocked on our block_lock?
+        if (io_threads_running < g_conf.bdev_iothreads &&
+            (int)root_queue.size() > io_threads_running)   
+          io_wakeup.SignalAll();
+      }
+    }
+
+    if (do_sleep) {
+      do_sleep = false;
+      
+      // sleep (we do not count as running while we wait)
+      io_threads_running--;
+      dout(20) << "io_thread" << whoami << " sleeping, " << io_threads_running << " threads now running," 
+               << " queue has " << root_queue.size()               << endl;
+
+      if (g_conf.bdev_idle_kick_after_ms > 0 &&
+          io_threads_running == 0 && 
+          idle_kicker) {
+        // first wait for signal | timeout
+        io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000));   
+
+        // should we still be sleeping?  (did we get woken up, or did timer expire?)
+        if (root_queue.empty() && io_threads_running == 0) {
+          idle_kicker->kick();          // kick
+          io_wakeup.Wait(lock);          // and wait
+        }
+      } else {
+        // normal, just wait.
+        io_wakeup.Wait(lock);
+      }
+
+      io_threads_running++;
+      assert(io_threads_running <= g_conf.bdev_iothreads);
+      dout(20) << "io_thread" << whoami << " woke up, " << io_threads_running << " threads now running" << endl;
+    }
+  }
+
+  // clean up: close this thread's private fd and stop counting as running
+  ::close(fd);
+  io_threads_running--;
+  
+  lock.Unlock();
+
+  dout(10) << "io_thread" << whoami << " finish" << endl;
+  return 0;
+}
+
+
+
+/** do_io
+ * do a single io operation: the buffers of every bio in biols are chained
+ * into one bufferlist and issued as one _read/_write starting at the first bio.
+ * (lock is NOT held, but we own the *biovec)
+ */
+void BlockDevice::do_io(int fd, list<biovec*>& biols)
+{
+  int r = -EINVAL;   // stays defined if type is bogus and asserts are compiled out
+  assert(!biols.empty());
+  // get full range, type, bl
+  bufferlist bl;
+  bl.claim(biols.front()->bl);
+  block_t start = biols.front()->start;
+  block_t length = biols.front()->length;
+  char type = biols.front()->type;
+
+  list<biovec*>::iterator p = biols.begin();
+  int numbio = 1;
+  for (p++; p != biols.end(); p++) {
+    length += (*p)->length;
+    bl.claim_append((*p)->bl);
+    numbio++;
+  }
+
+  // do it
+  dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read") 
+           << " " << start << "~" << length 
+           << " " << numbio << " bits" << endl;
+  if (type == biovec::IO_WRITE) {
+    r = _write(fd, start, length, bl);
+  } else if (type == biovec::IO_READ) {
+    r = _read(fd, start, length, bl);
+  } else assert(0);
+  dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read") 
+           << " " << start << "~" << length << endl;
+  
+  // set rval: every merged bio reports the result of the combined io
+  for (p = biols.begin(); p != biols.end(); p++)
+    (*p)->rval = r;
+
+  if (1) {
+    // put in completion queue
+    complete_lock.Lock();
+    complete_queue.splice( complete_queue.end(), biols );
+    complete_queue_len += numbio;
+    complete_wakeup.Signal();
+    complete_lock.Unlock();
+  } else {
+    // be slow and finish synchronously
+    for (p = biols.begin(); p != biols.end(); p++)
+      finish_io(*p);
+  }
+}
+
+
+// finish_io: complete one io -- wake the blocked caller's cond, or run and free the callback.
+void BlockDevice::finish_io(biovec *bio)
+{
+  if (bio->cond) {
+    lock.Lock();            // waiters test done under lock: set+signal under it too,
+    bio->done = true;       //  else the wakeup can be lost, or the waiter can return
+    bio->cond->Signal();    //  and destroy its stack bio/cond while we still use them
+    lock.Unlock();
+    return;
+  }
+  bio->done = true;
+  if (bio->cb) {
+    bio->cb->finish((ioh_t)bio, bio->rval);
+    delete bio->cb;
+    delete bio;
+  }
+}
+
+/*** completion_thread
+ * handle Cond signals or callbacks for completed ios
+ */
+void* BlockDevice::complete_thread_entry()
+{
+  // runs with complete_lock held, except while finish_io() is executing
+  complete_lock.Lock();
+  dout(10) << "complete_thread start" << endl;
+
+  for (;;) {
+    if (io_stop) break;
+
+    // take the whole pending list in one swap, then finish those bios with
+    // complete_lock dropped so io threads can keep appending meanwhile
+    while (!complete_queue.empty()) {
+      list<biovec*> batch;
+      batch.swap(complete_queue);
+      dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << endl;
+      complete_queue_len = 0;
+
+      complete_lock.Unlock();
+      while (!batch.empty()) {
+        biovec *bio = batch.front();
+        batch.pop_front();
+        dout(20) << "complete_thread finishing " << *bio << endl;
+        finish_io(bio);
+      }
+      complete_lock.Lock();
+    }
+    if (io_stop) break;    // re-check: the lock was dropped while finishing
+
+    /*
+    if (io_threads_running == 0 && idle_kicker) {
+      complete_lock.Unlock();
+      idle_kicker->kick();
+      complete_lock.Lock();
+      if (!complete_queue.empty() || io_stop) 
+        continue;
+    }
+    */
+
+    dout(25) << "complete_thread sleeping" << endl;
+    complete_wakeup.Wait(complete_lock);
+  }
+
+  dout(10) << "complete_thread finish" << endl;
+  complete_lock.Unlock();
+  return 0;
+}
+
+  
+
+
+// io queue
+
+void BlockDevice::_submit_io(biovec *b) 
+{
+  // NOTE: lock must be held
+  dout(15) << "_submit_io " << *b << endl;
+
+  // wake up io_thread(s)?  (the queue depth is sampled before b is added)
+  const int depth = (int)root_queue.size();
+  if (depth > io_threads_running)
+    io_wakeup.SignalAll();
+  else if (depth == io_threads_running)
+    io_wakeup.SignalOne();
+
+  // queue 
+  root_queue.submit_io(b);
+
+  /*
+  // [DEBUG] check for overlapping ios
+  // BUG: this doesn't detect all overlaps w/ the next queue thing.
+  if (g_conf.bdev_debug_check_io_overlap) {
+      // BUG: this doesn't catch everything!  eg 1~10000000 will be missed....
+      multimap<block_t, biovec*>::iterator p = io_queue.lower_bound(b->start);
+      if ((p != io_queue.end() &&
+           p->first < b->start+b->length) ||
+          (p != io_queue.begin() && 
+           (p--, p->second->start + p->second->length > b->start))) {
+        dout(1) << "_submit_io new io " << *b 
+                << " overlaps with existing " << *p->second << endl;
+        cerr << "_submit_io new io " << *b 
+             << " overlaps with existing " << *p->second << endl;
+      }
+    }
+  */
+}
+
+int BlockDevice::_cancel_io(biovec *bio) 
+{
+  // NOTE: lock must be held
+  // returns 0 if bio was still queued and has been removed, -1 otherwise
+  Queue *q = bio->in_queue;
+  if (!q) {
+    dout(15) << "_cancel_io " << *bio << " FAILED" << endl;
+    return -1;
+  }
+  dout(15) << "_cancel_io " << *bio << endl;
+  q->cancel_io(bio);
+  const bool front_retired = root_queue.bump();
+  if (front_retired) io_wakeup.SignalAll();  // something happened!
+  return 0;
+}
+
+
+
+// low level io
+
+int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl) 
+{
+  // read num blocks at bno into bl with a single readv(); 0 on success, -errno on error
+  dout(10) << "_read " << bno << "~" << num << endl;
+  assert(fd > 0);
+
+  off_t offset = bno * EBOFS_BLOCK_SIZE;
+  off_t actual = lseek(fd, offset, SEEK_SET);
+  assert(actual == offset);
+  size_t len = num*EBOFS_BLOCK_SIZE;
+  assert(bl.length() >= len);
+  struct iovec iov[ bl.buffers().size() ];
+  int n = 0;
+  size_t left = len;
+  for (list<bufferptr>::const_iterator i = bl.buffers().begin();
+       i != bl.buffers().end();
+       i++) {
+    assert(i->length() % EBOFS_BLOCK_SIZE == 0);
+    iov[n].iov_base = (void*)i->c_str();
+    iov[n].iov_len = MIN(left, i->length());   // never hand readv more than len bytes in total
+    left -= iov[n].iov_len;
+    n++;
+    if (left == 0) break;
+  }
+
+  int got = ::readv(fd, iov, n);
+  if (got < 0) {
+    int err = errno;   // snapshot before the stream calls below can clobber it
+    dout(1) << "couldn't read bno " << bno << " num " << num << " (" << len << " bytes) in " << n << " iovs, errno " << err << " " << strerror(err) << endl;
+    return -err;       // previously a failed read was reported as success
+  }
+  assert(got <= (int)len);
+  return 0;
+}
+
+int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) 
+{
+  // write num blocks at bno from bl with a single writev(); 0 on success, -errno on error
+  dout(10) << "_write " << bno << "~" << num << endl;
+  assert(fd > 0);
+  
+  off_t offset = (off_t)bno << EBOFS_BLOCK_BITS;
+  assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset);
+  off_t actual = lseek(fd, offset, SEEK_SET);
+  assert(actual == offset);
+  
+  // write buffers
+  size_t len = num*EBOFS_BLOCK_SIZE;
+
+  struct iovec iov[ bl.buffers().size() ];
+
+  int n = 0;
+  size_t left = len;
+  for (list<bufferptr>::const_iterator i = bl.buffers().begin();
+       i != bl.buffers().end();
+       i++) {
+    assert(i->length() % EBOFS_BLOCK_SIZE == 0);
+
+    iov[n].iov_base = (void*)i->c_str();
+    iov[n].iov_len = MIN(left, i->length());
+
+    // each iovec must have a 4096-aligned base and a length that is a multiple of 4096
+    assert((((unsigned long long)iov[n].iov_base) & 4095ULL) == 0);
+    assert((iov[n].iov_len & 4095) == 0);
+    left -= iov[n].iov_len;
+    n++;
+    if (left == 0) break;
+  }
+
+  int r = ::writev(fd, iov, n);
+  int err = errno;   // snapshot now: the stream calls below may clobber errno
+  if (r < 0) {
+    dout(1) << "couldn't write bno " << bno << " num " << num 
+            << " (" << len << " bytes) in " << n << " iovs,  r=" << r 
+            << " errno " << err << " " << strerror(err) << endl;
+    dout(1) << "bl is " << bl << endl;
+    assert(0);
+    return -err;     // with asserts compiled out, don't report a failed write as success
+  } else {
+    assert(r == (int)len);
+  }
+  return 0;
+}
+
+
+
+// open/close
+int BlockDevice::open_fd()
+{
+  const int flags = O_RDWR|O_SYNC|O_DIRECT;   // read-write, synchronous, direct io
+  return ::open(dev.c_str(), flags, 0);
+}
+
+int BlockDevice::open(kicker *idle) 
+{
+  // open (and optionally flock) the device, then start the io + completion threads.
+  assert(fd == 0);
+  fd = open_fd();
+  if (fd < 0) {
+    dout(1) << "open failed, r = " << fd << " " << strerror(errno) << endl;
+    fd = 0;
+    return -1;
+  }
+
+  // lock
+  if (g_conf.bdev_lock) {
+    int r = ::flock(fd, LOCK_EX|LOCK_NB);
+    if (r < 0) {
+      derr(1) << "open " << dev << " failed to get LOCK_EX" << endl;
+      ::close(fd);   // don't leak the descriptor (or leave fd set) when asserts are off
+      fd = 0;
+      assert(0);
+      return -1;
+    }
+  }
+
+  // figure size (get_num_blocks() counts blocks; convert for the byte figure)
+  __uint64_t bsize = (__uint64_t)get_num_blocks() * EBOFS_BLOCK_SIZE;
+  dout(2) << "open " << bsize << " bytes, " << num_blocks << " blocks" << endl;
+  // start thread
+  io_threads_started = 0;
+  io_threads.clear();
+  for (int i=0; i<g_conf.bdev_iothreads; i++) {
+    io_threads.push_back(new IOThread(this));
+    io_threads.back()->create();
+  }
+  complete_thread.create();
+
+  // idle kicker?  the io threads read this under lock, so publish it under lock
+  lock.Lock();
+  idle_kicker = idle;
+  lock.Unlock();
+  return fd;
+}
+
+
+int BlockDevice::close() 
+{
+  // stop and join the io + completion threads, then unlock and close the device.
+  assert(fd>0);
+
+  // shut down io thread
+  dout(10) << "close stopping io+complete threads" << endl;
+  lock.Lock();
+  idle_kicker = 0;           // the io threads read this under lock
+  complete_lock.Lock();
+  io_stop = true;
+  io_wakeup.SignalAll();
+  complete_wakeup.SignalAll();
+  complete_lock.Unlock();
+  lock.Unlock();    
+
+  // join exactly the threads open() created, even if g_conf.bdev_iothreads
+  // has been changed since.
+  for (unsigned i=0; i<io_threads.size(); i++) {
+    io_threads[i]->join();
+    delete io_threads[i];
+  }
+  io_threads.clear();
+
+  complete_thread.join();
+
+  io_stop = false;   // in case we start again
+  dout(2) << "close " << endl;
+
+  if (g_conf.bdev_lock)
+    ::flock(fd, LOCK_UN);
+
+  ::close(fd);
+  fd = 0;
+
+  return 0;
+}
+
+int BlockDevice::cancel_io(ioh_t ioh) 
+{
+  biovec *bio = (biovec*)ioh;   // the handle is really a biovec*
+  lock.Lock();
+  const int rc = _cancel_io(bio);
+  lock.Unlock();
+  if (rc != 0)
+    return rc;                  // it was no longer queued; nothing was cancelled
+
+  // FIXME?  the callback is discarded without being invoked
+  //   (was: bio->cb->finish(ioh, 0);)
+  if (bio->cb) {
+    delete bio->cb;
+    delete bio;
+  }
+  return rc;
+}
+
diff --git a/branches/sage/cephmds2/ebofs/BlockDevice.h b/branches/sage/cephmds2/ebofs/BlockDevice.h
new file mode 100644
index 0000000000000..25adf62606947
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/BlockDevice.h
@@ -0,0 +1,331 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __EBOFS_BLOCKDEVICE_H
+#define __EBOFS_BLOCKDEVICE_H
+
+#include "include/buffer.h"
+#include "include/interval_set.h"
+#include "include/Context.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "types.h"
+
+
+typedef void *ioh_t;    // opaque handle to an io request.  (in actuality, a biovec*)
+
+
+// BlockDevice: threaded block io on a device file, with blocking and callback interfaces.
+class BlockDevice {
+ public:
+  // callback type for io completion notification
+  class callback {
+  public:
+    virtual ~callback() {}
+    virtual void finish(ioh_t ioh, int rval) = 0;
+  };
+
+  // kicker for idle notification
+  class kicker {
+  public:
+    virtual ~kicker() {}
+    virtual void kick() = 0;
+  };
+  
+  /********************************************************/
+
+  class Queue;
+
+  // io item
+  // two variants: one with Cond*, one with callback*.
+  class biovec {
+  public:
+    static const char IO_WRITE = 1;
+    static const char IO_READ = 2;
+
+    char type;
+    block_t start, length;
+    bufferlist bl;
+    callback *cb;
+    Cond *cond;
+    int rval;
+    char *note;
+    bool done;
+
+    Queue *in_queue;
+
+    biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) :
+      type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {}
+    biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) :
+      type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {}
+  };
+  friend ostream& operator<<(ostream& out, biovec &bio);
+
+
+  /********************************************************/
+
+  /*
+   * Queue -- abstract IO queue interface
+   */
+  class Queue {
+  public:
+    virtual ~Queue() {}
+    virtual void submit_io(biovec *b) = 0;
+    virtual void cancel_io(biovec *b) = 0;
+    virtual int dequeue_io(list<biovec*>& biols, 
+                           block_t& start, block_t& length,
+                           interval_set<block_t>& locked) = 0;
+    virtual int size() = 0;
+    virtual bool empty() { return size() == 0; }
+  };
+  
+  /*
+   * ElevatorQueue - simple elevator scheduler queue
+   */
+  class ElevatorQueue : public Queue {
+    BlockDevice *bdev;
+    const char *dev;
+    map<block_t, biovec*> io_map;
+    bool    el_dir_forward;
+    block_t el_pos;
+    utime_t el_stop;
+    
+  public:
+    ElevatorQueue(BlockDevice *bd, const char *d) :
+      bdev(bd), dev(d), 
+      el_dir_forward(false),
+      el_pos(0) {}
+    void submit_io(biovec *b) {
+      b->in_queue = this;
+      assert(io_map.count(b->start) == 0);
+      io_map[b->start] = b;
+    }
+    void cancel_io(biovec *b) {
+      assert(b->in_queue == this);
+      assert(io_map.count(b->start) &&
+             io_map[b->start] == b);
+      io_map.erase(b->start);
+      b->in_queue = 0;
+    }
+    int dequeue_io(list<biovec*>& biols, 
+                   block_t& start, block_t& length,
+                   interval_set<block_t>& locked);
+    int size() {
+      return io_map.size();
+    }
+  };
+
+  /*
+   * BarrierQueue - lets you specify io "barriers"
+   *  barrier() - force completion of all prior IOs before
+   *    future ios are started.
+   *  bump()    - must be called after cancel_io to properly
+   *    detect empty subqueue.
+   */
+  class BarrierQueue : public Queue {
+    BlockDevice *bdev;    
+    const char *dev;
+    list<Queue*> qls; 
+  public:
+    BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) {
+      barrier();
+    }
+    ~BarrierQueue() { while (!qls.empty()) { delete qls.front(); qls.pop_front(); } }  // we own the sub-queues
+    int size() {
+      // this isn't perfectly accurate.
+      if (!qls.empty())
+        return qls.front()->size();
+      return 0;
+    }
+    void submit_io(biovec *b) {
+      assert(!qls.empty());
+      qls.back()->submit_io(b);
+    }
+    void cancel_io(biovec *b) {
+      assert(0);  // shouldn't happen.
+    }
+    int dequeue_io(list<biovec*>& biols, 
+                   block_t& start, block_t& length,
+                   interval_set<block_t>& locked);
+    void barrier();
+    bool bump();
+  };
+  
+ private:
+  string  dev;           // my device file
+  int     fd;
+  block_t num_blocks;
+
+  Mutex lock;
+
+  /** the root io queue. 
+   * i currently assume it's a barrier queue, but this can be changed
+   * with some minor rearchitecting.
+   */
+  BarrierQueue root_queue;
+
+  kicker *idle_kicker;  // idle notification target (0 if none)
+
+  /* io_block_lock - block ranges currently dispatched to kernel
+   *  once a bio is dispatched, it cannot be canceled, so an overlapping
+   *  io can be submitted.  the overlapping io cannot be dispatched 
+   *  to the kernel, however, until the original io finishes, or else
+   *  there will be a race condition.
+   */
+  interval_set<block_t>      io_block_lock;    // blocks currently dispatched to kernel
+
+  // io threads
+  Cond                       io_wakeup;
+  bool                       io_stop;
+  int                        io_threads_started, io_threads_running;
+  
+  void *io_thread_entry();
+
+  class IOThread : public Thread {
+    BlockDevice *dev;
+  public:
+    IOThread(BlockDevice *d) : dev(d) {}
+    void *entry() { return (void*)dev->io_thread_entry(); }
+  } ;
+
+  vector<IOThread*> io_threads;
+
+  // private io interface
+  int open_fd();  // get an fd (for a thread)
+
+  void _submit_io(biovec *b);
+  int _cancel_io(biovec *bio);
+  void do_io(int fd, list<biovec*>& biols);   // called by an io thread
+
+  // low level io
+  int _read(int fd, block_t bno, unsigned num, bufferlist& bl);
+  int _write(int fd, unsigned bno, unsigned num, bufferlist& bl);
+
+
+  // completion callback queue
+  Mutex          complete_lock;
+  Cond           complete_wakeup;
+  list<biovec*>  complete_queue;
+  int            complete_queue_len;
+  
+  void finish_io(biovec *bio);
+
+  // complete thread
+  void *complete_thread_entry();
+  class CompleteThread : public Thread {
+    BlockDevice *dev;
+  public:
+    CompleteThread(BlockDevice *d) : dev(d) {}
+    void *entry() { return (void*)dev->complete_thread_entry(); }
+  } complete_thread;
+
+
+ public:
+  BlockDevice(const char *d) : 
+    dev(d), fd(0), num_blocks(0),
+    root_queue(this, dev.c_str()),
+    idle_kicker(0),
+    io_stop(false), io_threads_started(0), io_threads_running(0),
+    complete_queue_len(0),
+    complete_thread(this) { }
+  ~BlockDevice() {
+    if (fd > 0) close();
+  }
+
+  // get size in blocks
+  block_t get_num_blocks();
+  const char *get_device_name() const { return dev.c_str(); }
+
+  // open/close
+  int open(kicker *idle = 0);
+  int close();
+
+  // state stuff
+  bool is_idle() {
+    lock.Lock();
+    bool idle = (io_threads_running == 0) && root_queue.empty();
+    lock.Unlock();
+    return idle;
+  }
+  void barrier() {
+    lock.Lock();
+    root_queue.barrier();
+    lock.Unlock();
+  }
+
+  // ** blocking interface **
+
+  // read
+  int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) {
+    bufferlist bl;
+    bl.push_back(bptr);
+    return read(bno, num, bl, n);
+  }
+  int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) {
+    Cond c;
+    biovec bio(biovec::IO_READ, bno, num, bl, &c, n);
+    
+    lock.Lock();
+    _submit_io(&bio);
+    root_queue.barrier();   // need this, to prevent starvation!  (lock is already held here)
+    while (!bio.done) 
+      c.Wait(lock);
+    lock.Unlock();
+    return bio.rval;
+  }
+
+  // write
+  int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) {
+    bufferlist bl;
+    bl.push_back(bptr);
+    return write(bno, num, bl, n);
+  }
+  int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) {
+    Cond c;
+    biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n);
+
+    lock.Lock();
+    _submit_io(&bio);
+    root_queue.barrier();   // need this, to prevent starvation!  (lock is already held here)
+    while (!bio.done) 
+      c.Wait(lock);
+    lock.Unlock();
+    return bio.rval;
+  }
+
+  // ** non-blocking interface **
+  ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) {
+    biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n);
+    lock.Lock();
+    _submit_io(pbio);
+    lock.Unlock();
+    return (ioh_t)pbio;
+  }
+  ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) {
+    biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n);
+    lock.Lock();
+    _submit_io(pbio);
+    lock.Unlock();
+    return (ioh_t)pbio;
+  }
+  int cancel_io(ioh_t ioh);
+
+};
+
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/ebofs/BufferCache.cc b/branches/sage/cephmds2/ebofs/BufferCache.cc
new file mode 100644
index 0000000000000..cee7f2c12ce79
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/BufferCache.cc
@@ -0,0 +1,1045 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "BufferCache.h"
+#include "Onode.h"
+
+
+/*********** BufferHead **************/
+
+// debug output for this section: prints when x <= g_conf.debug_ebofs.  expands to an unbraced `if`, so never put it right before an `else`.
+#undef dout
+#define dout(x)  if (x <= g_conf.debug_ebofs) cout << "ebofs.bh."
+
+
+
+
+
+
+/************ ObjectCache **************/
+
+// debug output for this section: prints when x <= g_conf.debug_ebofs.  expands to an unbraced `if`, so never put it right before an `else`.
+#undef dout
+#define dout(x)  if (x <= g_conf.debug_ebofs) cout << "ebofs.oc."
+
+// rx_finish: a read covering object blocks start~length has completed with its data in bl.
+// settles every bh lying wholly inside that range and completes the readers waiting on them.
+void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl)
+{
+  list<Context*> waiters;   // gathered from every bh visited; completed once, after the loop
+
+  dout(10) << "rx_finish " << start << "~" << length << endl;
+  for (map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+       p != data.end(); 
+       p++) {
+    BufferHead *bh = p->second;
+    dout(10) << "rx_finish ?" << *bh << endl;
+    assert(p->first == bh->start());
+
+    // past?  stop at the first bh that begins beyond, or extends beyond, the range read
+    if (p->first >= start+length) break;
+    if (bh->end() > start+length) break;  // past
+    
+    assert(p->first >= start);
+    assert(bh->end() <= start+length);
+
+    dout(10) << "rx_finish !" << *bh << endl;
+
+    if (bh->rx_ioh == ioh)     // this was the read recorded on the bh; clear the handle
+      bh->rx_ioh = 0;
+
+    if (bh->is_rx()) {
+      assert(bh->get_version() == 0);
+      assert(bh->end() <= start+length);
+      assert(bh->start() >= start);
+      dout(10) << "rx_finish  rx -> clean on " << *bh << endl;
+      bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE);  // this bh's slice of the data read
+      bc->mark_clean(bh);
+    }
+    else if (bh->is_partial()) {
+      dout(10) << "rx_finish  partial -> tx on " << *bh << endl;      
+
+      if (1) {
+        // double-check what block i am
+        vector<Extent> exv;
+        on->map_extents(bh->start(), 1, exv);
+        assert(exv.size() == 1);
+        block_t cur_block = exv[0].start;
+        assert(cur_block == bh->partial_tx_to);
+      }
+      
+      // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves)
+      bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch );
+      
+      // apply partial to myself: fresh one-block buffer, filled from bl, then overlaid with the partial content
+      assert(bh->data.length() == 0);
+      bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+      bh->data.push_back( bp );
+      bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl);
+      bh->apply_partial();
+      
+      // write "normally"
+      bc->mark_dirty(bh);
+      bc->bh_write(on, bh, bh->partial_tx_to);//cur_block);
+
+      // clean up a bit
+      bh->partial_tx_to = 0;
+      bh->partial_tx_epoch = 0;
+      bh->partial.clear();
+    }
+    else {
+      dout(10) << "rx_finish  ignoring status on (dirty|tx|clean) " << *bh << endl;
+      assert(bh->is_dirty() ||  // was overwritten
+             bh->is_tx() ||     // was overwritten and queued
+             bh->is_clean());   // was overwritten, queued, _and_ flushed to disk
+    }
+
+    // trigger waiters: move every waiter list of this bh onto `waiters`, whatever branch was taken above
+    for (map<block_t,list<Context*> >::iterator p = bh->waitfor_read.begin();   // NB: this p shadows the outer loop's p
+         p != bh->waitfor_read.end();
+         p++) {
+      assert(p->first >= bh->start() && p->first < bh->end());
+      waiters.splice(waiters.begin(), p->second);
+    }
+    bh->waitfor_read.clear();
+  }    
+
+  finish_contexts(waiters);
+}
+// tx_finish: a device write covering object blocks start~length, issued for bh version
+// `version`, has completed.  settle each cached bh that begins in that range.  `epoch` is unused here.
+void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, 
+                            version_t version, version_t epoch)
+{
+  dout(10) << "tx_finish " << start << "~" << length << " v" << version << endl;
+  map<block_t, BufferHead*>::iterator cursor = data.lower_bound(start);
+  for ( ; cursor != data.end(); cursor++) {
+    BufferHead *head = cursor->second;
+    dout(30) << "tx_finish ?bh " << *head << endl;
+    assert(cursor->first == head->start());
+
+    // first bh beginning at or beyond the end of the written range: done
+    if (cursor->first >= start+length) break;
+
+    if (head->tx_ioh == ioh) head->tx_ioh = 0;   // this was the write recorded on the bh; clear the handle
+
+    if (!head->is_tx()) {
+      dout(10) << "tx_finish  bh not marked tx, skipping" << endl;
+      continue;
+    }
+    assert(head->is_tx());
+
+    if (version != head->version) {
+      // the cached version is newer than the one just written: it stays tx
+      dout(10) << "tx_finish  leaving tx, " << head->version << " > " << version 
+               << " on " << *head << endl;
+      assert(head->version > version);
+      continue;
+    }
+    // the version just written is the current one
+    dout(10) << "tx_finish  tx -> clean on " << *head << endl;
+    assert(head->end() <= start+length);
+    head->set_last_flushed(version);
+    bc->mark_clean(head);
+  }    
+}
+
+
+
+/*
+ * collect the bh's that are TX and lie _fully_ inside start~len.
+ *
+ * the scan starts at the first bh beginning at or after 'start', and a bh is
+ * only reported if it also ends at or before start+len; a bh straddling either
+ * edge of the range is therefore never returned.  matches are appended to 'tx'
+ * in block order.  always returns 0.
+ */
+int ObjectCache::find_tx(block_t start, block_t len,
+                         list<BufferHead*>& tx)
+{
+  const block_t range_end = start + len;
+  block_t cur = start;    // next block not yet accounted for
+  block_t left = len;     // blocks remaining between cur and range_end
+
+  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+  while (left > 0) {
+    assert(cur+left == range_end);
+
+    // at end?
+    if (p == data.end()) 
+      break;
+
+    if (p->first > cur) {
+      // gap.. miss
+      block_t next = p->first;
+
+      // the gap runs to (or past) the end of the range, so nothing further can
+      // qualify.  stop here: subtracting a gap longer than 'left' would push
+      // 'left' out of range and keep the loop walking the rest of the map
+      // without ever finding a match.
+      if (next >= range_end)
+        break;
+
+      left -= (next-cur);
+      cur = next;
+      continue;
+    }
+
+    // have it (or part of it)
+    BufferHead *e = p->second;
+    if (e->end() <= range_end &&
+        e->is_tx()) 
+      tx.push_back(e);
+
+    block_t lenfromcur = MIN(e->end() - cur, left);
+    cur += lenfromcur;
+    left -= lenfromcur;
+    p++;
+  }
+
+  return 0;  
+}
+
+
+
+/*
+ * map the block range start~len onto buffer_heads, sorting what is found into the four output
+ * maps (each keyed by the block at which that piece begins): hits = clean/dirty/tx, rx, partial,
+ * and missing = bh's created here for uncached blocks, one per disk extent.
+ * always returns 0; on exit the whole range has been accounted for (asserted). */
+int ObjectCache::map_read(block_t start, block_t len, 
+                          map<block_t, BufferHead*>& hits,
+                          map<block_t, BufferHead*>& missing,
+                          map<block_t, BufferHead*>& rx,
+                          map<block_t, BufferHead*>& partial) {
+  
+  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+
+  block_t cur = start;    // next block still to be mapped
+  block_t left = len;     // blocks still to be mapped
+  
+  if (p != data.begin() && 
+      (p == data.end() || p->first > cur)) {
+    p--;     // might overlap!
+    if (p->first + p->second->length() <= cur) 
+      p++;   // doesn't overlap.
+  }
+
+  while (left > 0) {
+    // at end?
+    if (p == data.end()) {
+      // rest is a miss.
+      vector<Extent> exv;
+      //on->map_extents(cur, left, exv);          // we might consider some prefetch here.
+      on->map_extents(cur, 
+                      //MIN(left + g_conf.ebofs_max_prefetch,   // prefetch
+                      //on->object_blocks-cur),  
+                      left,   // no prefetch
+                      exv);
+      for (unsigned i=0; i<exv.size() && left > 0; i++) {   // one new bh per disk extent
+        BufferHead *n = new BufferHead(this);
+        n->set_start( cur );
+        n->set_length( exv[i].length );
+        bc->add_bh(n);
+        missing[cur] = n;
+        dout(20) << "map_read miss " << left << " left, " << *n << endl;
+        cur += MIN(left,exv[i].length);
+        left -= MIN(left,exv[i].length);
+      }
+      assert(left == 0);
+      assert(cur == start+len);
+      break;
+    }
+    
+    if (p->first <= cur) {
+      // have it (or part of it)
+      BufferHead *e = p->second;
+      
+      if (e->is_clean() ||
+          e->is_dirty() ||
+          e->is_tx()) {
+        hits[cur] = e;     // readable!
+        dout(20) << "map_read hit " << *e << endl;
+        bc->touch(e);
+      } 
+      else if (e->is_rx()) {
+        rx[cur] = e;       // missing, not readable.
+        dout(20) << "map_read rx " << *e << endl;
+      }
+      else if (e->is_partial()) {
+        partial[cur] = e;
+        dout(20) << "map_read partial " << *e << endl;
+      }
+      else {
+	dout(0) << "map_read ??? " << *e << endl;
+	assert(0);
+      }
+      
+      block_t lenfromcur = MIN(e->end() - cur, left);   // the part of e at or after cur, capped to what is still wanted
+      cur += lenfromcur;
+      left -= lenfromcur;
+      p++;
+      continue;  // more?
+    } else if (p->first > cur) {
+      // gap.. miss
+      block_t next = p->first;
+      vector<Extent> exv;
+      on->map_extents(cur, 
+                      //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch,   // prefetch
+                      //                on->object_blocks-cur)),  
+                      MIN(next-cur, left),   // no prefetch
+                      exv);
+      
+      for (unsigned i=0; i<exv.size() && left>0; i++) {   // one new bh per disk extent in the gap
+        BufferHead *n = new BufferHead(this);
+        n->set_start( cur );
+        n->set_length( exv[i].length );
+        bc->add_bh(n);
+        missing[cur] = n;
+        cur += MIN(left, n->length());
+        left -= MIN(left, n->length());
+        dout(20) << "map_read gap " << *n << endl;
+      }
+      continue;    // more?
+    }
+    else 
+      assert(0);   // not reachable: the two conditions above are exhaustive
+  }
+
+  assert(left == 0);
+  assert(cur == start+len);
+  return 0;  
+}
+
+
+/*
+ * map a range of pages on an object's buffer cache, for writing.
+ *
+ * - break up bufferheads that don't fall completely within the range
+ * - when splitting a bh, try to cancel its in-flight rx (or tx, unless the piece
+ *   is newly allocated) and reissue it for the parts that stay outside the range
+ * - try to cancel tx ops on the bh's we return that keep their old allocation
+ * - pieces never cross a disk extent boundary (see map_extents below)
+ * - 'hits' receives one bh per piece, keyed by its first block.  always returns 0.
+ */
+int ObjectCache::map_write(block_t start, block_t len,
+                           interval_set<block_t>& alloc,
+                           map<block_t, BufferHead*>& hits,
+                           version_t super_epoch)
+{
+  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+
+  dout(10) << "map_write " << *on << " " << start << "~" << len << " ... alloc " << alloc << endl;
+  // p->first >= start
+  
+  block_t cur = start;    // next block still to be mapped
+  block_t left = len;     // blocks still to be mapped
+  
+  if (p != data.begin() && 
+      (p == data.end() || p->first > cur)) {
+    p--;     // might overlap!
+    if (p->first + p->second->length() <= cur) 
+      p++;   // doesn't overlap.
+  }
+
+  //dump();
+
+  while (left > 0) {
+    // max for this bh (bc of (re)alloc on disk)
+    block_t max = left;
+    bool newalloc = false;
+
+    // based on alloc/no-alloc boundary ...
+    if (alloc.contains(cur, left)) {   // NOTE(review): if this means "all of cur~left is in alloc", the else branch below looks unreachable -- confirm against interval_set
+      if (alloc.contains(cur)) {
+        block_t ends = alloc.end_after(cur);
+        max = MIN(left, ends-cur);
+        newalloc = true;
+      } else {
+        if (alloc.starts_after(cur)) {
+          block_t st = alloc.start_after(cur);
+          max = MIN(left, st-cur);
+        } 
+      }
+    } 
+
+    // based on disk extent boundary ...
+    vector<Extent> exv;
+    on->map_extents(cur, max, exv);
+    if (exv.size() > 1) 
+      max = exv[0].length;
+
+    if (newalloc) {
+      dout(10) << "map_write " << cur << "~" << max << " is new alloc on disk" << endl;
+    } else {
+      dout(10) << "map_write " << cur << "~" << max << " keeps old alloc on disk" << endl;
+    }
+    
+    // at end?
+    if (p == data.end()) {   // nothing cached from here on: create a fresh bh of up to max blocks
+      BufferHead *n = new BufferHead(this);
+      n->set_start( cur );
+      n->set_length( max );
+      bc->add_bh(n);
+      hits[cur] = n;
+      left -= max;
+      cur += max;
+      continue;
+    }
+    
+    dout(10) << "p is " << *p->second << endl;
+
+
+    if (p->first <= cur) {
+      BufferHead *bh = p->second;
+      dout(10) << "map_write bh " << *bh << " intersected" << endl;
+
+      if (p->first < cur) {
+        if (cur+max >= p->first+p->second->length()) {
+          // we want right bit (one splice)
+          if (bh->is_rx() && bc->bh_cancel_read(bh)) {
+            BufferHead *right = bc->split(bh, cur);
+            bc->bh_read(on, bh);          // reread left bit
+            bh = right;
+          } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
+            BufferHead *right = bc->split(bh, cur);
+            bc->bh_write(on, bh);          // rewrite left bit
+            bh = right;
+          } else {
+            bh = bc->split(bh, cur);   // just split it
+          }
+          p++;                         // step onto the right half that split() just added to the map
+          assert(p->second == bh);
+        } else {
+          // we want middle bit (two splices)
+          if (bh->is_rx() && bc->bh_cancel_read(bh)) {
+            BufferHead *middle = bc->split(bh, cur);
+            bc->bh_read(on, bh);                       // reread left
+            p++;
+            assert(p->second == middle);
+            BufferHead *right = bc->split(middle, cur+max);
+            bc->bh_read(on, right);                    // reread right
+            bh = middle;
+          } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
+            BufferHead *middle = bc->split(bh, cur);
+            bc->bh_write(on, bh);                       // redo left
+            p++;
+            assert(p->second == middle);
+            BufferHead *right = bc->split(middle, cur+max);
+            bc->bh_write(on, right);                    // redo right
+            bh = middle;
+          } else {
+            BufferHead *middle = bc->split(bh, cur);
+            p++;
+            assert(p->second == middle);
+            bc->split(middle, cur+max);
+            bh = middle;
+          }
+        }
+      } else if (p->first == cur) {
+        if (p->second->length() <= max) {
+          // whole bufferhead, piece of cake.
+        } else {
+          // we want left bit (one splice)
+          if (bh->is_rx() && bc->bh_cancel_read(bh)) {
+            BufferHead *right = bc->split(bh, cur+max);
+            bc->bh_read(on, right);              // re-rx the right bit
+          } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
+            BufferHead *right = bc->split(bh, cur+max);
+            bc->bh_write(on, right);              // re-tx the right bit
+          } else {
+            bc->split(bh, cur+max);        // just split
+          }
+        }
+      }
+      
+      // try to cancel tx?
+      if (bh->is_tx() && !newalloc) bc->bh_cancel_write(bh, super_epoch);
+            
+      // put in our map
+      hits[cur] = bh;
+
+      // keep going.
+      block_t lenfromcur = bh->end() - cur;
+      cur += lenfromcur;
+      left -= lenfromcur;
+      p++;
+      continue; 
+    } else {
+      // gap!  create a fresh bh reaching to the next cached bh, or max blocks, whichever is shorter
+      block_t next = p->first;
+      block_t glen = MIN(next-cur, max);
+      dout(10) << "map_write gap " << cur << "~" << glen << endl;
+      BufferHead *n = new BufferHead(this);
+      n->set_start( cur );
+      n->set_length( glen );
+      bc->add_bh(n);
+      hits[cur] = n;
+      
+      cur += glen;
+      left -= glen;
+      continue;    // more?
+    }
+  }
+
+  assert(left == 0);
+  assert(cur == start+len);
+  return 0;
+}
+
+/* don't need this.
+int ObjectCache::scan_versions(block_t start, block_t len,
+                               version_t& low, version_t& high)
+{
+  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+  // p->first >= start
+  
+  if (p != data.begin() && p->first > start) {
+    p--;     // might overlap?
+    if (p->first + p->second->length() <= start) 
+      p++;   // doesn't overlap.
+  }
+  if (p->first >= start+len) 
+    return -1;  // to the right.  no hits.
+  
+  // start
+  low = high = p->second->get_version();
+
+  for (p++; p != data.end(); p++) {
+    // past?
+    if (p->first >= start+len) break;
+    
+    const version_t v = p->second->get_version();
+    if (low > v) low = v;
+    if (high < v) high = v;
+  }    
+
+  return 0;
+}
+*/
+// truncate: remove every cached bh (or part of one) lying at or beyond block `blocks`, working backwards from the last bh.
+void ObjectCache::truncate(block_t blocks, version_t super_epoch)
+{
+  dout(7) << "truncate " << object_id 
+           << " " << blocks << " blocks"
+           <<  endl;
+
+  while (!data.empty()) {
+    block_t bhoff = data.rbegin()->first;
+    BufferHead *bh = data.rbegin()->second;
+
+    if (bh->end() <= blocks) break;   // last bh ends within the kept range: nothing more to remove
+
+    bool uncom = on->uncommitted.contains(bh->start(), bh->length());   // queued writes are only cancelled when this is true
+    dout(10) << "truncate " << *bh << " uncom " << uncom 
+             << " of " << on->uncommitted
+             << endl;
+    
+    if (bhoff < blocks) {
+      // we want right bit (one splice)
+      if (bh->is_rx() && bc->bh_cancel_read(bh)) {
+        BufferHead *right = bc->split(bh, blocks);
+        bc->bh_read(on, bh);          // reread left bit
+        bh = right;
+      } else if (bh->is_tx() && uncom && bc->bh_cancel_write(bh, super_epoch)) {
+        BufferHead *right = bc->split(bh, blocks);
+        bc->bh_write(on, bh);          // rewrite left bit
+        bh = right;
+      } else {
+        bh = bc->split(bh, blocks);   // just split it
+      }
+      // no worries about partials up here, they're always 1 block (and thus never split)
+    } else {
+      // whole thing
+      // cancel any pending/queued io, if possible.
+      if (bh->is_rx())
+        bc->bh_cancel_read(bh);
+      if (bh->is_tx() && uncom) 
+        bc->bh_cancel_write(bh, super_epoch);
+      if (bh->shadow_of) {
+	dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << endl;
+	// shadow
+	bh->shadow_of->remove_shadow(bh);
+	if (bh->is_partial()) 
+	  bc->cancel_shadow_partial(bh->rx_from.start, bh);
+      } else {
+	// normal
+	if (bh->is_partial() && uncom)
+	  bc->bh_cancel_partial_write(bh);
+      }
+    }
+    
+    for (map<block_t,list<Context*> >::iterator p = bh->waitfor_read.begin();   // readers waiting on the doomed bh are completed with -1
+         p != bh->waitfor_read.end();
+         p++) {
+      finish_contexts(p->second, -1);
+    }
+
+    bc->remove_bh(bh);   // bh is now the part at or beyond `blocks`: drop it from the cache and free it
+    delete bh;
+  }
+}
+// clone_to: give `other` its own copy of every bh here that is dirty, tx or partial.
+// bh's in any other state are not copied.
+void ObjectCache::clone_to(Onode *other)
+{
+  ObjectCache *dest = 0;   // other's cache; fetched only once some bh needs copying
+
+  map<block_t, BufferHead*>::iterator it = data.begin();
+  for ( ; it != data.end(); it++) {
+    BufferHead *src = it->second;
+    dout(10) << "clone_to ? " << *src << endl;
+    const bool wanted = src->is_dirty() || src->is_tx() || src->is_partial();
+    if (!wanted)
+      continue;
+
+    if (dest == 0)
+      dest = other->get_oc(bc);
+
+    // same block range as the source, sharing refs to the same underlying buffers
+    BufferHead *copy = new BufferHead(dest);
+    copy->set_start( src->start() );
+    copy->set_length( src->length() );
+    copy->data = src->data;
+    bc->add_bh(copy);
+
+    if (!src->is_partial()) {
+      // dirty/tx source: the copy is registered as its shadow and marked clean
+      src->add_shadow(copy);
+      bc->mark_clean(copy);
+    } else {
+      dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << endl;
+      copy->partial = src->partial;
+      bc->mark_partial(copy);
+      bc->add_shadow_partial(src->rx_from.start, copy);   // register as shadow_partial
+    }
+    dout(10) << "clone_to dup " << *src << " -> " << *copy << endl;
+  }
+}
+
+
+
+/************** BufferCache ***************/
+// debug output for this section: prints when x <= g_conf.debug_ebofs.  expands to an unbraced `if`, so never put it right before an `else`.
+#undef dout
+#define dout(x)  if (x <= g_conf.debug_ebofs) cout << "ebofs.bc."
+
+// split: cut `orig` in two at block `after`.  orig keeps the blocks before `after`; a new bh covering the rest is
+// created, added to the cache and returned.  assumes `after` lies strictly inside orig (not checked here).
+BufferHead *BufferCache::split(BufferHead *orig, block_t after) 
+{
+  dout(20) << "split " << *orig << " at " << after << endl;
+
+  // split off right: it inherits version, epoch, flush point and state from orig
+  BufferHead *right = new BufferHead(orig->get_oc());
+  right->set_version(orig->get_version());
+  right->epoch_modified = orig->epoch_modified;
+  right->last_flushed = orig->last_flushed;
+  right->set_state(orig->get_state());
+
+  block_t newleftlen = after - orig->start();
+  right->set_start( after );
+  right->set_length( orig->length() - newleftlen );
+  
+  // shorten left (taken out of the stats and re-added so they see the new length)
+  stat_sub(orig);
+  orig->set_length( newleftlen );
+  stat_add(orig);
+
+  // add right
+  add_bh(right);
+
+  // adjust rx_from: divide the device extent being read between the halves
+  if (orig->is_rx()) {
+    right->rx_from = orig->rx_from;
+    orig->rx_from.length = newleftlen;
+    right->rx_from.length -= newleftlen;
+    right->rx_from.start += newleftlen;
+  }
+
+  // dup shadows
+  for (set<BufferHead*>::iterator p = orig->shadows.begin();
+       p != orig->shadows.end();
+       ++p)
+    right->add_shadow(*p);
+
+  // split buffers too
+  bufferlist bl;
+  bl.claim(orig->data);
+  if (bl.length()) {
+    assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE);
+    right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE);
+    orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE);
+  }
+
+  // move read waiters: every waiter keyed at or beyond the split point now
+  // belongs to the right bh.  lower_bound is used so that the first map entry
+  // is considered as well; a backwards walk that stops on reaching begin()
+  // never examines that entry and would leave it on the left bh even when
+  // its block is in the right half.
+  map<block_t, list<Context*> >::iterator first = orig->waitfor_read.lower_bound(right->start());
+  for (map<block_t, list<Context*> >::iterator p = first;
+       p != orig->waitfor_read.end();
+       ++p) {
+    dout(0) << "split  moving waiters at block " << p->first << " to right bh" << endl;
+    right->waitfor_read[p->first].swap( p->second );
+  }
+  orig->waitfor_read.erase(first, orig->waitfor_read.end());
+  
+  dout(20) << "split    left is " << *orig << endl;
+  dout(20) << "split   right is " << *right << endl;
+  return right;
+}
+// bh_read: issue the device read that will fill bh.  a missing bh becomes rx; otherwise bh must be partial.
+// `from`, when non-zero, forces a one-block read from that device block instead of the mapped extent.
+void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from)
+{
+  dout(10) << "bh_read " << *on << " on " << *bh << endl;
+
+  if (bh->is_missing())    {
+    mark_rx(bh);
+  } else {
+    assert(bh->is_partial());
+  }
+  
+  // get extent.  there should be only one!
+  vector<Extent> exv;
+  on->map_extents(bh->start(), bh->length(), exv);
+  assert(exv.size() == 1);
+  Extent ex = exv[0];
+
+  if (from) {  // force behavior, used for reading partials
+    dout(10) << "bh_read  forcing read from block " << from << " (for a partial)" << endl;
+    ex.start = from;
+    ex.length = 1;
+  }
+    
+  // this should be empty!!
+  assert(bh->rx_ioh == 0);
+  
+  dout(20) << "bh_read  " << *bh << " from " << ex << endl;
+  
+  C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc,   // completion context; it also carries the destination buffer (fin->bl)
+                                         bh->start(), bh->length(),
+                                         ex.start);
+
+  //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl);  // new buffers!
+  fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );  // fresh page-aligned buffer sized for the whole bh
+
+  bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl,
+                        fin);
+  bh->rx_from = ex;      // remember which device blocks are being read
+  on->oc->get();         // take a ref on the object cache for the in-flight read
+
+}
+// bh_cancel_read: try to withdraw the read outstanding on bh.  true if it was cancelled (bh is then missing again).
+bool BufferCache::bh_cancel_read(BufferHead *bh)
+{
+  if (!bh->rx_ioh) return false;                    // no read handle recorded
+  if (dev.cancel_io(bh->rx_ioh) < 0) return false;  // device did not cancel it
+  dout(10) << "bh_cancel_read on " << *bh << endl;
+  bh->rx_ioh = 0;
+  mark_missing(bh);
+  // release the cache ref taken by bh_read; it must not have been the last one
+  int remaining = bh->oc->put();
+  assert(remaining);
+  return true;
+}
+// bh_write: mark a dirty bh tx and issue the device write of its data.  `shouldbe`, when non-zero, is the device block the bh must map to.
+void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe)
+{
+  dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << endl;
+  assert(bh->get_version() > 0);
+
+  assert(bh->is_dirty());
+  mark_tx(bh);
+  
+  // get extents
+  vector<Extent> exv;
+  on->map_extents(bh->start(), bh->length(), exv);
+  assert(exv.size() == 1);   // a bh is expected to map to exactly one disk extent
+  Extent ex = exv[0];
+
+  if (shouldbe)
+    assert(ex.length == 1 && ex.start == shouldbe);
+
+  dout(20) << "bh_write  " << *bh << " to " << ex << endl;
+
+  //assert(bh->tx_ioh == 0);
+
+  assert(bh->get_last_flushed() < bh->get_version());
+
+  bh->tx_block = ex.start;
+  bh->tx_ioh = dev.write(ex.start, ex.length, bh->data,
+                         new C_OC_TxFinish(ebofs_lock, on->oc, 
+                                           bh->start(), bh->length(),
+                                           bh->get_version(),
+                                           bh->epoch_modified),
+                         "bh_write");
+
+  on->oc->get();                                                  // take a ref on the object cache for the in-flight write
+  inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified );    // count this write as unflushed in the bh's epoch
+
+  /*
+  // assert: no partials on the same block
+  // hose any partial on the same block
+  if (bh->partial_write.count(ex.start)) {
+    dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << endl;
+    dec_unflushed( bh->partial_write[ex.start].epoch );
+    bh->partial_write.erase(ex.start);
+  }
+  */
+}
+// bh_cancel_write: try to withdraw the write outstanding on bh.  on success the bh is
+// dirty again, its epoch's unflushed count is reduced, and true is returned.
+bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch)
+{
+  if (!bh->tx_ioh) return false;                    // no write handle recorded
+  if (dev.cancel_io(bh->tx_ioh) < 0) return false;  // device did not cancel it
+
+  dout(10) << "bh_cancel_write on " << *bh << endl;
+  bh->tx_ioh = 0;
+  mark_dirty(bh);
+
+  // the write being withdrawn must belong to the caller's (non-zero) epoch
+  assert(bh->epoch_modified == cur_epoch);
+  assert(bh->epoch_modified > 0);
+  dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified );
+  int remaining = bh->oc->put();   // release the cache ref taken by bh_write; it must not have been the last one
+  assert(remaining);
+  return true;
+}
+// tx_finish: completion of a bh write (counterpart of the EBOFS_BC_FLUSH_BHWRITE count taken in bh_write).  runs under ebofs_lock.
+void BufferCache::tx_finish(ObjectCache *oc, 
+                            ioh_t ioh, block_t start, block_t length, 
+                            version_t version, version_t epoch)
+{
+  ebofs_lock.Lock();
+  // release the cache ref held for this write.  whoever drops the last ref
+  // destroys the cache; otherwise the cache settles its own bh's.
+  const bool last_ref = (oc->put() == 0);
+  if (!last_ref)
+    oc->tx_finish(ioh, start, length, version, epoch);
+  else
+    delete oc;
+
+  // one fewer unflushed bh write in this epoch
+  assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0);
+  dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch);
+  ebofs_lock.Unlock();
+}
+// rx_finish: a device read of `length` blocks at device block `diskstart` (object blocks start~length of oc) has completed; data is in bl.
+void BufferCache::rx_finish(ObjectCache *oc,
+                            ioh_t ioh, block_t start, block_t length,
+                            block_t diskstart, 
+                            bufferlist& bl)
+{
+  ebofs_lock.Lock();
+  dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length
+            << ", at device block " << diskstart << endl;
+
+  // oc: drop the ref taken for this read.  the last holder destroys the cache; otherwise it settles its bh's.
+  if (oc->put() == 0) 
+    delete oc;
+  else
+    oc->rx_finish(ioh, start, length, bl);
+
+  // finish any partials?
+  //  note: these are partials that were re-written after a commit,
+  //        or for whom the OC was destroyed (eg truncated after a commit)
+  map<block_t, map<block_t, PartialWrite> >::iterator sp = partial_write.lower_bound(diskstart);
+  while (sp != partial_write.end()) {
+    if (sp->first >= diskstart+length) break;
+    assert(sp->first >= diskstart);
+
+    block_t pblock = sp->first;
+    map<block_t, PartialWrite> writes;
+    writes.swap( sp->second );
+
+    map<block_t, map<block_t, PartialWrite> >::iterator t = sp;
+    sp++;
+    partial_write.erase(t);
+
+    // select the block of bl that holds device block pblock.  copy_in() takes a destination
+    // offset and a length, so the source block has to be picked out with substr_of() first.
+    bufferlist pdata;
+    pdata.substr_of(bl, (pblock-diskstart)*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE);
+    for (map<block_t, PartialWrite>::iterator p = writes.begin();
+         p != writes.end();
+         p++) {
+      dout(10) << "rx_finish partial from " << pblock << " -> " << p->first
+                << " for epoch " << p->second.epoch
+                << endl;
+      // make the combined block: the data read from disk, overlaid with the queued partial
+      bufferlist combined;
+      bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+      combined.push_back( bp );
+      combined.copy_in(0, EBOFS_BLOCK_SIZE, pdata);
+      BufferHead::apply_partial( combined, p->second.partial );
+
+      // write it!  NOTE(review): this writes to pblock (the "from" block), not p->first (the "to" key) -- confirm
+      dev.write( pblock, 1, combined,
+                 new C_OC_PartialTxFinish( this, p->second.epoch ),
+                 "finish_partials");
+    }
+  }
+
+  // shadow partials?
+  {
+    list<Context*> waiters;
+    map<block_t, set<BufferHead*> >::iterator sp = shadow_partials.lower_bound(diskstart);
+    while (sp != shadow_partials.end()) {
+      if (sp->first >= diskstart+length) break;
+      assert(sp->first >= diskstart);
+      
+      block_t pblock = sp->first;
+      set<BufferHead*> ls;
+      ls.swap( sp->second );
+      
+      map<block_t, set<BufferHead*> >::iterator t = sp;
+      sp++;
+      shadow_partials.erase(t);
+      
+      bufferlist pdata;      // the block of bl holding device block pblock (see above)
+      pdata.substr_of(bl, (pblock-diskstart)*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE);
+      for (set<BufferHead*>::iterator p = ls.begin();
+	   p != ls.end();
+	   ++p) {
+	BufferHead *bh = *p;
+	dout(10) << "rx_finish applying shadow_partial for " << pblock
+		 << " to " << *bh << endl;
+	bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+	bh->data.clear();
+	bh->data.push_back( bp );
+	bh->data.copy_in(0, EBOFS_BLOCK_SIZE, pdata);
+	bh->apply_partial();
+	bh->set_state(BufferHead::STATE_CLEAN);
+	
+	// trigger waiters
+	for (map<block_t,list<Context*> >::iterator w = bh->waitfor_read.begin();
+	     w != bh->waitfor_read.end();
+	     w++) {
+	  assert(w->first >= bh->start() && w->first < bh->end());
+	  waiters.splice(waiters.begin(), w->second);
+	}
+	bh->waitfor_read.clear();
+      }  
+    }
+
+    // kick waiters
+    finish_contexts(waiters);
+  }
+
+  // done.
+  ebofs_lock.Unlock();
+}
+// partial_tx_finish: a partial write counted against `epoch` has completed; reduce that epoch's unflushed count.
+void BufferCache::partial_tx_finish(version_t epoch)
+{
+  ebofs_lock.Lock();
+
+  dout(10) << "partial_tx_finish in epoch " << epoch << endl;
+
+  // update unflushed counter (it must still be positive for this epoch)
+  assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0);
+  dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch);
+
+  ebofs_lock.Unlock();
+}
+
+
+// bh_queue_partial_write: record on bh the device block and epoch its partial content is destined
+// for, then queue that partial content against the device block bh is being read from.
+void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh)
+{
+  assert(bh->get_version() > 0);
+  assert(bh->is_partial());
+  assert(bh->length() == 1);
+
+  // look up the single device block backing this one-block bh
+  vector<Extent> extents;
+  on->map_extents(bh->start(), bh->length(), extents);
+  assert(extents.size() == 1);
+  const block_t diskblock = extents[0].start;
+  assert(extents[0].length == 1);
+
+  bh->partial_tx_to = extents[0].start;
+  bh->partial_tx_epoch = bh->epoch_modified;
+
+  dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << diskblock << " epoch " << bh->epoch_modified << endl;
+
+
+  // hand a copy of the partial map to the queue, keyed by the block being read
+  assert(bh->rx_from.length == 1);
+  queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch );
+}
+// bh_cancel_partial_write: withdraw the partial queued for this one-block partial bh, using the from/to/epoch recorded on it.
+void BufferCache::bh_cancel_partial_write(BufferHead *bh)
+{
+  assert(bh->is_partial());
+  assert(bh->length() == 1);
+
+  cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch );
+}
+// queue_partial: store a copy of `partial` and `epoch` in partial_write under from -> to, and
+// count it as an unflushed partial for `epoch`.  an entry for the same pair must not exist yet.
+void BufferCache::queue_partial(block_t from, block_t to, 
+                                map<off_t, bufferlist>& partial, version_t epoch)
+{
+  dout(10) << "queue_partial " << from << " -> " << to
+           << " in epoch " << epoch 
+           << endl;
+  map<block_t, PartialWrite>& pending = partial_write[from];
+  if (pending.count(to) == 0) {
+    inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
+  } else {
+    // a second partial for the same from -> to pair is never expected
+    assert(pending[to].epoch == epoch);
+    assert(0);
+  }
+  PartialWrite& entry = pending[to];
+  entry.partial = partial;
+  entry.epoch = epoch;
+}
+// cancel_partial: withdraw the partial queued under from -> to for `epoch` and undo its unflushed count.
+void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch)
+{
+  assert(partial_write.count(from));
+  map<block_t, PartialWrite>& pending = partial_write[from];
+  assert(pending.count(to));
+  assert(pending[to].epoch == epoch);
+  dout(10) << "cancel_partial " << from << " -> " << to 
+           << "  (was epoch " << pending[to].epoch << ")"
+           << endl;
+
+  // drop the entry, and the per-block map with it once nothing else is queued there
+  pending.erase(to);
+  if (pending.empty())
+    partial_write.erase(from);
+  dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
+}
+// add_shadow_partial: register bh in shadow_partials under block `from`.
+// (BufferCache::rx_finish looks entries up by the device blocks of a completed read.)
+void BufferCache::add_shadow_partial(block_t from, BufferHead *bh)
+{
+  dout(10) << "add_shadow_partial from " << from << " " << *bh << endl;
+  shadow_partials[from].insert(bh);   // operator[] creates the set for `from` if there is none yet
+}
+// cancel_shadow_partial: remove bh from the shadow_partials set registered under block `from`.
+void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh)
+{
+  dout(10) << "cancel_shadow_partial from " << from << " " << *bh << endl;
+  shadow_partials[from].erase(bh);   // NOTE(review): operator[] creates the entry if absent, and an emptied set stays in the map -- confirm that is acceptable
+}
diff --git a/branches/sage/cephmds2/ebofs/BufferCache.h b/branches/sage/cephmds2/ebofs/BufferCache.h
new file mode 100644
index 0000000000000..922c5e531ee56
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/BufferCache.h
@@ -0,0 +1,681 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __EBOFS_BUFFERCACHE_H
+#define __EBOFS_BUFFERCACHE_H
+
+#include "include/lru.h"
+#include "include/Context.h"
+
+#include "common/Clock.h"
+
+#include "types.h"
+#include "BlockDevice.h"
+
+#include "include/interval_set.h"
+
+class ObjectCache;
+class BufferCache;
+class Onode;
+
+/*
+ * BufferHead - one cached run of blocks of an object.
+ *
+ * Tracks the cached data, the state of the run (missing/clean/dirty/tx/rx/
+ * partial), in-flight io handles, and - for PARTIAL heads - a map of dirty
+ * byte fragments that will be overlayed onto the block once it is read.
+ * Reference counting is tied to LRU pinning: the head is pinned while
+ * ref > 0.
+ */
+class BufferHead : public LRUObject {
+ public:
+  /*
+   * - buffer_heads should always break across disk extent boundaries
+   * - partial buffer_heads are always 1 block.
+   */
+  const static int STATE_MISSING = 0; //     missing; data is on disk, but not loaded.
+  const static int STATE_CLEAN = 1;   // Rw  clean
+  const static int STATE_DIRTY = 2;   // RW  dirty
+  const static int STATE_TX = 3;      // Rw  flushing to disk
+  const static int STATE_RX = 4;      //  w  reading from disk
+  const static int STATE_PARTIAL = 5; // reading from disk, + partial content map.  always 1 block.
+
+ public:
+  ObjectCache *oc;
+
+  bufferlist data;
+
+  ioh_t     rx_ioh;         // 
+  Extent    rx_from;
+  ioh_t     tx_ioh;         // 
+  block_t   tx_block;
+  block_t   partial_tx_to;
+  version_t partial_tx_epoch;
+
+  map<off_t, bufferlist>     partial;   // partial dirty content overlayed onto incoming data
+
+  map< block_t, list<Context*> > waitfor_read;
+  
+  set<BufferHead*>  shadows;     // shadow bh's that clone()ed me.
+  BufferHead*       shadow_of;
+
+ private:
+  int        ref;
+  int        state;
+
+ public:
+  version_t  epoch_modified;
+  
+  version_t  version;        // current version in cache
+  version_t  last_flushed;   // last version flushed to disk
+ 
+  Extent     object_loc;     // block position _in_object_
+
+  utime_t    dirty_stamp;
+
+ public:
+  BufferHead(ObjectCache *o) :
+    oc(o), //cancellable_ioh(0), tx_epoch(0),
+    rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0),
+    shadow_of(0),
+    ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0)
+    {}
+  ~BufferHead() {
+    unpin_shadows();
+  }
+  
+  ObjectCache *get_oc() { return oc; }
+
+  // take a reference; the first reference pins us in the LRU.
+  int get() {
+    assert(ref >= 0);
+    if (ref == 0) lru_pin();
+    return ++ref;
+  }
+  // drop a reference; dropping the last one unpins us.
+  int put() {
+    assert(ref > 0);
+    if (ref == 1) lru_unpin();
+    --ref;
+    return ref;
+  }
+
+  // extent accessors (in blocks, relative to the object)
+  block_t start() { return object_loc.start; }
+  void set_start(block_t s) { object_loc.start = s; }
+  block_t length() { return object_loc.length; }
+  void set_length(block_t l) { object_loc.length = l; }
+  block_t end() { return start() + length(); }
+  block_t last() { return end()-1; }
+  
+  version_t get_version() { return version; }
+  void set_version(version_t v) { version = v; }
+  version_t get_last_flushed() { return last_flushed; }
+  // last_flushed must strictly increase; complain loudly before asserting.
+  void set_last_flushed(version_t v) { 
+    if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << endl;
+    assert(v > last_flushed);
+    last_flushed = v; 
+  }
+
+  utime_t get_dirty_stamp() { return dirty_stamp; }
+  void set_dirty_stamp(utime_t t) { dirty_stamp = t; }
+
+  // Change state.  PARTIAL, RX and TX each hold a reference on the head;
+  // the new state's reference is taken before the old one is dropped.
+  // Leaving TX or PARTIAL releases any shadows.
+  void set_state(int s) {
+    if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get();
+    if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put();
+
+    if ((state == STATE_TX && s != STATE_TX) ||
+        (state == STATE_PARTIAL && s != STATE_PARTIAL)) 
+      unpin_shadows();
+
+    state = s;
+  }
+  int get_state() { return state; }
+
+  bool is_missing() { return state == STATE_MISSING; }
+  bool is_dirty() { return state == STATE_DIRTY; }
+  bool is_clean() { return state == STATE_CLEAN; }
+  bool is_tx() { return state == STATE_TX; }
+  bool is_rx() { return state == STATE_RX; }
+  bool is_partial() { return state == STATE_PARTIAL; }
+  
+  //bool is_partial_writes() { return !partial_write.empty(); }
+  //void finish_partials();
+  //void cancel_partials();
+  //void queue_partial_write(block_t b);
+
+  // shadows: each registered shadow is referenced (get) until removed/unpinned.
+  void add_shadow(BufferHead *dup) {
+    shadows.insert(dup);
+    dup->shadow_of = this;
+    dup->get();
+  }
+  void remove_shadow(BufferHead *dup) {
+    shadows.erase(dup);
+    dup->shadow_of = 0;
+    dup->put();
+  }
+  void unpin_shadows() {
+    for (set<BufferHead*>::iterator p = shadows.begin();
+         p != shadows.end();
+         ++p) {
+      //cout << "unpin shadow " << *p << endl;
+      (*p)->shadow_of = 0;
+      (*p)->put();
+    }
+    shadows.clear();
+  }
+
+  /*
+   * Copy bytes [start,end) out of the partial fragment map into bl.
+   * The range must be fully and contiguously present (see
+   * have_partial_range); this is asserted.  An empty range yields an
+   * empty bl.
+   */
+  void copy_partial_substr(off_t start, off_t end, bufferlist& bl) {
+    if (start >= end) {
+      // nothing to copy; hand back an empty list
+      bufferlist empty;
+      bl.claim(empty);
+      return;
+    }
+
+    map<off_t, bufferlist>::iterator i = partial.begin();
+    
+    // skip fragments that end at or before start.  the end() test must
+    // come first: dereferencing end() is undefined.
+    while (i != partial.end() &&
+           i->first + (off_t)i->second.length() <= start)
+      ++i;
+    assert(i != partial.end());
+    assert(i->first <= start);
+    
+    // first fragment: begin at 'start', and take no more than what the
+    // fragment holds past that offset.
+    unsigned bhoff = start - i->first;
+    unsigned avail = i->second.length() - bhoff;
+    unsigned want = end - start;
+    unsigned bhlen = MIN(want, avail);
+    bl.substr_of( i->second, bhoff, bhlen );
+
+    off_t pos = i->first + i->second.length();
+    
+    // following fragments must be contiguous until we reach end
+    for (++i; i != partial.end(); ++i) {
+      if (pos >= end) break;
+      assert(pos == i->first);
+
+      pos = i->first + i->second.length();
+
+      if (pos <= end) {      // this whole frag
+        bl.append( i->second );
+      } else {            // partial end
+        unsigned taillen = end-start-bl.length();
+        bufferlist frag;
+        frag.substr_of( i->second, 0, taillen );
+        bl.claim_append(frag);
+        break;  // done.
+      }
+    }
+    
+    assert(pos >= end);
+    assert(bl.length() == (unsigned)(end-start));
+  }
+
+  /*
+   * Return true iff the partial fragment map covers every byte of
+   * [start,end) without a gap.
+   */
+  bool have_partial_range(off_t start, off_t end) {
+    map<off_t, bufferlist>::iterator i = partial.begin();
+
+    // skip fragments that end at or before start (check end() first).
+    while (i != partial.end() &&
+           i->first + (off_t)i->second.length() <= start)
+      ++i;
+    if (i == partial.end()) return false;
+
+    // have start?
+    if (i->first > start) return false;
+    off_t pos = i->first + i->second.length();
+
+    // have continuous to end?  stop as soon as we are covered, so that a
+    // gap _after_ the requested range does not count against us.
+    for (++i; pos < end && i != partial.end(); ++i) {
+      assert(pos <= i->first);
+      if (pos < i->first) return false;   // gap
+      pos = i->first + i->second.length();
+    }
+
+    return pos >= end;
+  }
+
+  // true if the fragments cover the first min(size, block size) bytes.
+  bool partial_is_complete(off_t size) {
+    return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) );
+    //(off_t)(start()*EBOFS_BLOCK_SIZE),
+    //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) );
+  }
+  // overlay our fragments onto our data, then forget them.
+  void apply_partial() {
+    apply_partial(data, partial);
+    partial.clear();
+  }
+  // overlay every fragment of pm onto the one-block buffer bl; pm is emptied.
+  static void apply_partial(bufferlist& bl, map<off_t, bufferlist>& pm) {
+    assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE);
+    //assert(partial_is_complete());
+    //cout << "apply_partial" << endl;
+    for (map<off_t, bufferlist>::iterator i = pm.begin();
+         i != pm.end();
+         i++) {
+      int pos = i->first;
+      //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << endl;
+      bl.copy_in(pos, i->second.length(), i->second);
+    }
+    pm.clear();
+  }
+
+  /*
+   * Add fragment p at byte offset off (within the block) to the partial
+   * map.  Any existing bytes in [off, off+len) are replaced; existing bytes
+   * outside that range are kept, including when an existing fragment
+   * extends past both ends of the new one or starts at the same offset.
+   * A zero-length p is a no-op.
+   */
+  void add_partial(off_t off, bufferlist& p) {
+    unsigned len = p.length();
+    assert(len <= (unsigned)EBOFS_BLOCK_SIZE);
+    //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE));
+    //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE));
+    assert(off >= 0);
+    assert(off + len <= EBOFS_BLOCK_SIZE);
+
+    // an empty write changes nothing; inserting it could clobber an
+    // existing fragment that starts at 'off'.
+    if (len == 0) return;
+
+    const off_t new_end = off + len;
+
+    // trim any existing that overlaps
+    map<off_t, bufferlist>::iterator i = partial.begin();
+    while (i != partial.end()) {
+      const off_t frag_start = i->first;
+      const off_t frag_end = frag_start + i->second.length();
+
+      if (frag_end <= off) {         // entirely before
+        ++i;
+        continue;
+      }
+      if (frag_start >= new_end)     // past affected area.
+        break;
+
+      // keep whatever sticks out past the end of the new data.
+      // (inserting into a map does not invalidate i.)
+      if (frag_end > new_end) {
+        unsigned skip = new_end - frag_start;
+        partial[new_end].substr_of(i->second, skip, i->second.length() - skip);
+      }
+
+      if (frag_start < off) {
+        // keep whatever sticks out before the new data: shorten.
+        unsigned newlen = off - frag_start;
+        bufferlist o;
+        o.claim( i->second );
+        i->second.substr_of(o, 0, newlen);
+        ++i;
+      } else {
+        // nothing of this fragment survives before 'off'; drop it.
+        partial.erase(i++);
+      }
+    }
+
+    // insert
+    partial[off] = p;
+  }
+
+
+};
+
+// Print "bufferhead(start~length vVERSION/LASTFLUSHED state address)".
+inline ostream& operator<<(ostream& out, BufferHead& bh)
+{
+  out << "bufferhead(" << bh.start() << "~" << bh.length()
+      << " v" << bh.get_version() << "/" << bh.get_last_flushed();
+
+  // a head is in exactly one state; name it
+  switch (bh.get_state()) {
+  case BufferHead::STATE_MISSING: out << " missing"; break;
+  case BufferHead::STATE_DIRTY:   out << " dirty";   break;
+  case BufferHead::STATE_CLEAN:   out << " clean";   break;
+  case BufferHead::STATE_RX:      out << " rx";      break;
+  case BufferHead::STATE_TX:      out << " tx";      break;
+  case BufferHead::STATE_PARTIAL: out << " partial"; break;
+  }
+
+  //out << " " << bh.data.length();
+  out << " " << &bh << ")";
+  return out;
+}
+
+
+/*
+ * ObjectCache - the cached buffer heads of one object, indexed by their
+ * starting block within the object.  Reference counted; must be empty and
+ * unreferenced when destroyed.
+ */
+class ObjectCache {
+ public:
+  object_t object_id;
+  Onode *on;
+  BufferCache *bc;
+
+ private:
+  map<block_t, BufferHead*>  data;   // start block -> bh
+  int ref;
+
+ public:
+  version_t write_count;
+
+
+ public:
+  ObjectCache(object_t o, Onode *_on, BufferCache *b)
+    : object_id(o), on(_on), bc(b), ref(0), write_count(0)
+  {}
+
+  ~ObjectCache() {
+    assert(data.empty());
+    assert(ref == 0);
+  }
+
+  // reference counting; both return the new count.
+  int get() { 
+    //cout << "oc.get " << object_id << " " << ref+1 << endl;
+    return ++ref;
+  }
+  int put() { 
+    assert(ref > 0); 
+    //cout << "oc.put " << object_id << " " << ref-1 << endl;
+    return --ref;
+  }
+  
+  object_t get_object_id() { return object_id; }
+
+  // index bh under its start block; that slot must be free.
+  void add_bh(BufferHead *bh) {
+    const block_t first = bh->start();
+    assert(data.count(first) == 0);
+
+    if (0) {  // sanity check     FIXME DEBUG
+      // neighbours on either side must not overlap the new bh
+      //cout << "add_bh " << bh->start() << "~" << bh->length() << endl;
+      map<block_t,BufferHead*>::iterator next = data.lower_bound(first);
+      if (next != data.end()) {
+        //cout << " after " << *next->second << endl;
+        //cout << " after starts at " << next->first << endl;
+        assert(next->first >= bh->end());
+      }
+      if (next != data.begin()) {
+        map<block_t,BufferHead*>::iterator prev = next;
+        prev--;
+        //cout << " before starts at " << prev->second->start() 
+        //<< " and ends at " << prev->second->end() << endl;
+        //cout << " before " << *prev->second << endl;
+        assert(prev->second->end() <= first);
+      }
+    }
+
+    data[first] = bh;
+  }
+
+  // drop bh from the index; it must be present.
+  void remove_bh(BufferHead *bh) {
+    const block_t first = bh->start();
+    assert(data.count(first));
+    data.erase(first);
+  }
+
+  bool is_empty() { return data.empty(); }
+
+  int find_tx(block_t start, block_t len,
+              list<BufferHead*>& tx);
+
+  int map_read(block_t start, block_t len, 
+               map<block_t, BufferHead*>& hits,     // hits
+               map<block_t, BufferHead*>& missing,  // read these from disk
+               map<block_t, BufferHead*>& rx,       // wait for these to finish reading from disk
+               map<block_t, BufferHead*>& partial); // (maybe) wait for these to read from disk
+  
+  int map_write(block_t start, block_t len,
+                interval_set<block_t>& alloc,
+                map<block_t, BufferHead*>& hits,
+                version_t super_epoch);   // can write to these.
+
+  BufferHead *split(BufferHead *bh, block_t off);
+
+  /*int scan_versions(block_t start, block_t len,
+                    version_t& low, version_t& high);
+  */
+
+  void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl);
+  void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch);
+
+  void truncate(block_t blocks, version_t super_epoch);
+  //  void tear_down();
+
+  void clone_to(Onode *other);
+
+  // print every indexed bh to stdout, in block order.
+  void dump() {
+    map<block_t,BufferHead*>::iterator p = data.begin();
+    while (p != data.end()) {
+      cout << "dump: " << p->first << ": " << *p->second << endl;
+      ++p;
+    }
+  }
+
+};
+
+
+
+/*
+ * BufferCache - cache-wide bookkeeping for buffer heads.
+ *
+ * Keeps dirty and non-dirty buffer heads on separate LRU lists, maintains
+ * per-state block counters, counts unflushed items per epoch, and holds the
+ * queue of partial writes.  Waiting (waitfor_stat / waitfor_flush) is done
+ * on condition variables paired with ebofs_lock.
+ */
+class BufferCache {
+ public:
+  Mutex             &ebofs_lock;          // hack: this is a ref to global ebofs_lock
+  BlockDevice       &dev;
+
+  set<BufferHead*> dirty_bh;              // every bh currently in STATE_DIRTY
+
+  LRU   lru_dirty, lru_rest;              // dirty bh's / everything else
+
+ private:
+  Cond  stat_cond;                        // signalled from stat_add() when someone waits
+  Cond  flush_cond;                       // signalled when an epoch's unflushed count hits 0
+  int   stat_waiter;                      // number of threads inside waitfor_stat()
+
+  // per-state totals, in blocks (sum of bh->length())
+  off_t stat_clean;
+  off_t stat_dirty;
+  off_t stat_rx;
+  off_t stat_tx;
+  off_t stat_partial;
+  off_t stat_missing;
+
+  // indices into epoch_unflushed[]
+#define EBOFS_BC_FLUSH_BHWRITE 0
+#define EBOFS_BC_FLUSH_PARTIAL 1
+
+  map<version_t, int> epoch_unflushed[2]; // epoch -> outstanding count, per kind above
+  
+  /* partial writes - incomplete blocks that can't be written until
+   *  their prior content is read and overlayed with the new data.
+   *
+   * we put partial block management here because objects may be deleted
+   * before the read completes, but the write may have been committed in a 
+   * prior epoch.
+   *
+   * we map: src block -> dest block -> PartialWrite
+   *
+   * really, at most there will only ever be two of these, for current+previous epochs.
+   */
+  class PartialWrite {
+  public:
+    map<off_t, bufferlist> partial;   // partial dirty content overlayed onto incoming data
+    version_t              epoch;
+  };
+
+  map<block_t, map<block_t, PartialWrite> > partial_write;  // queued writes w/ partial content
+  map<block_t, set<BufferHead*> >           shadow_partials;
+
+ public:
+  BufferCache(BlockDevice& d, Mutex& el) : 
+    ebofs_lock(el), dev(d), 
+    stat_waiter(0),
+    stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0)
+    {}
+
+
+  // total blocks in every state except missing
+  off_t get_size() {
+    return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial;
+  }
+  // only clean blocks are reported as trimmable
+  off_t get_trimmable() {
+    return stat_clean;
+  }
+
+
+  // bh's in cache
+  // add bh to its ObjectCache, the appropriate LRU (and dirty_bh if dirty), and the stats.
+  void add_bh(BufferHead *bh) {
+    bh->get_oc()->add_bh(bh);
+    if (bh->is_dirty()) {
+      lru_dirty.lru_insert_mid(bh);
+      dirty_bh.insert(bh);
+    } else
+      lru_rest.lru_insert_mid(bh);
+    stat_add(bh);
+  }
+  // touch bh on whichever LRU it currently lives on.
+  void touch(BufferHead *bh) {
+    if (bh->is_dirty()) {
+      lru_dirty.lru_touch(bh);
+    } else
+      lru_rest.lru_touch(bh);
+  }
+  // inverse of add_bh().
+  void remove_bh(BufferHead *bh) {
+    bh->get_oc()->remove_bh(bh);
+    stat_sub(bh);
+    if (bh->is_dirty()) {
+      lru_dirty.lru_remove(bh);
+      dirty_bh.erase(bh);
+    } else
+      lru_rest.lru_remove(bh);
+  }
+
+  // stats
+  // add bh's length to the counter for its current state; wake stat waiters, if any.
+  void stat_add(BufferHead *bh) {
+    switch (bh->get_state()) {
+    case BufferHead::STATE_MISSING: stat_missing += bh->length(); break;
+    case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break;
+    case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break;
+    case BufferHead::STATE_TX: stat_tx += bh->length(); break;
+    case BufferHead::STATE_RX: stat_rx += bh->length(); break;
+    case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break;
+    }
+    if (stat_waiter) stat_cond.Signal();
+  }
+  // subtract bh's length from the counter for its current state (no signal).
+  void stat_sub(BufferHead *bh) {
+    switch (bh->get_state()) {
+    case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break;
+    case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break;
+    case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break;
+    case BufferHead::STATE_TX: stat_tx -= bh->length(); break;
+    case BufferHead::STATE_RX: stat_rx -= bh->length(); break;
+    case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break;
+    }
+  }
+  off_t get_stat_tx() { return stat_tx; }
+  off_t get_stat_rx() { return stat_rx; }
+  off_t get_stat_dirty() { return stat_dirty; }
+  off_t get_stat_clean() { return stat_clean; }
+  off_t get_stat_partial() { return stat_partial; }
+
+  
+  // 'what' is EBOFS_BC_FLUSH_BHWRITE or EBOFS_BC_FLUSH_PARTIAL.
+  map<version_t, int> &get_unflushed(int what) {
+    return epoch_unflushed[what];
+  }
+
+  // NOTE: uses map::operator[], so querying an unknown epoch inserts a 0 entry.
+  int get_unflushed(int what, version_t epoch) {
+    return epoch_unflushed[what][epoch];
+  }
+  void inc_unflushed(int what, version_t epoch) {
+    epoch_unflushed[what][epoch]++;
+    //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl;
+  }
+  // decrement; when the count reaches zero, wake anyone in waitfor_flush().
+  // the zero entry itself is left in the map.
+  void dec_unflushed(int what, version_t epoch) {
+    epoch_unflushed[what][epoch]--;
+    //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl;
+    if (epoch_unflushed[what][epoch] == 0) 
+      flush_cond.Signal();
+  }
+
+  // block on stat_cond (with ebofs_lock) until the next stat_add().
+  void waitfor_stat() {
+    stat_waiter++;
+    stat_cond.Wait(ebofs_lock);
+    stat_waiter--;
+  }
+  // block on flush_cond (with ebofs_lock) until some epoch count reaches zero.
+  void waitfor_flush() {
+    flush_cond.Wait(ebofs_lock);
+  }
+
+
+  // bh state
+  // Change bh's state, keeping LRU membership, dirty_bh and the stat
+  // counters consistent.  The counters are adjusted around the actual state
+  // change: subtract under the old state, add under the new one.
+  void set_state(BufferHead *bh, int s) {
+    // move between lru lists?
+    if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) {
+      lru_rest.lru_remove(bh);
+      lru_dirty.lru_insert_top(bh);
+      dirty_bh.insert(bh);
+    }
+    if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) {
+      lru_dirty.lru_remove(bh);
+      lru_rest.lru_insert_mid(bh);
+      dirty_bh.erase(bh);
+    }
+
+    // set state
+    stat_sub(bh);
+    bh->set_state(s);
+    stat_add(bh);
+  }      
+
+  // give bh2 the same state as bh1.
+  void copy_state(BufferHead *bh1, BufferHead *bh2) { 
+    set_state(bh2, bh1->get_state());
+  }
+  
+  void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); };
+  void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); };
+  void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); };
+  void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); };
+  void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); };
+  // marking dirty also (re)stamps the bh with the current time.
+  void mark_dirty(BufferHead *bh) { 
+    set_state(bh, BufferHead::STATE_DIRTY); 
+    bh->set_dirty_stamp(g_clock.now());
+  };
+
+
+  // io
+  void bh_read(Onode *on, BufferHead *bh, block_t from=0);
+  void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0);
+
+  bool bh_cancel_read(BufferHead *bh);
+  bool bh_cancel_write(BufferHead *bh, version_t cur_epoch);
+
+  void bh_queue_partial_write(Onode *on, BufferHead *bh);
+  void bh_cancel_partial_write(BufferHead *bh);
+
+  void queue_partial(block_t from, block_t to, map<off_t, bufferlist>& partial, version_t epoch);
+  void cancel_partial(block_t from, block_t to, version_t epoch);
+
+  void add_shadow_partial(block_t from, BufferHead *bh);
+  void cancel_shadow_partial(block_t from, BufferHead *bh);
+
+  // io completion entry points (invoked via the C_OC_* callbacks)
+  void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl);
+  void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e);
+  void partial_tx_finish(version_t epoch);
+
+  friend class C_E_FlushPartial;
+
+  // bh fun
+  BufferHead *split(BufferHead *orig, block_t after);
+};
+
+
+// Block device completion for a read: forwards the filled buffer (bl) and
+// the extent it belongs to on to BufferCache::rx_finish.
+class C_OC_RxFinish : public BlockDevice::callback {
+  Mutex &lock;
+  ObjectCache *oc;
+  block_t start, length;   // extent within the object
+  block_t diskstart;       // where it was read from on disk
+public:
+  bufferlist bl;           // read data lands here
+
+  C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds)
+    : lock(m), oc(o), start(s), length(l), diskstart(ds)
+  {}
+
+  void finish(ioh_t ioh, int r) {
+    BufferCache *cache = oc->bc;
+    cache->rx_finish(oc, ioh, start, length, diskstart, bl);
+  }
+};
+
+// Block device completion for a write: reports the written extent, its
+// version and epoch to BufferCache::tx_finish.
+class C_OC_TxFinish : public BlockDevice::callback {
+  Mutex &lock;
+  ObjectCache *oc;
+  block_t start, length;   // extent within the object
+  version_t version;
+  version_t epoch;
+ public:
+  C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e)
+    : lock(m), oc(o), start(s), length(l), version(v), epoch(e)
+  {}
+
+  void finish(ioh_t ioh, int r) {
+    BufferCache *cache = oc->bc;
+    cache->tx_finish(oc, ioh, start, length, version, epoch);
+  }  
+};
+
+// Block device completion for a partial-block write: tells the
+// BufferCache which epoch the finished write belonged to.
+class C_OC_PartialTxFinish : public BlockDevice::callback {
+  BufferCache *bc;
+  version_t epoch;
+public:
+  C_OC_PartialTxFinish(BufferCache *b, version_t e)
+    : bc(b), epoch(e)
+  {}
+
+  void finish(ioh_t ioh, int r) {
+    const version_t done_epoch = epoch;
+    bc->partial_tx_finish(done_epoch);
+  }  
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/ebofs/Cnode.h b/branches/sage/cephmds2/ebofs/Cnode.h
new file mode 100644
index 0000000000000..b906a6db24c57
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/Cnode.h
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __EBOFS_CNODE_H
+#define __EBOFS_CNODE_H
+
+#include "Onode.h"
+
+/*
+ * collection node
+ *
+ * holds attribute metadata for collections.
+ * collection membership is stored in b+tree tables, independent of the cnode.
+ */
+
+// In-memory collection node: the collection id, its on-disk location and
+// its attributes.  Pinned in the LRU while referenced; being dirty holds
+// one reference.
+class Cnode : public LRUObject
+{
+ private:
+  int ref;
+  bool dirty;
+
+ public:
+  coll_t coll_id;
+  Extent cnode_loc;
+
+  map<string,bufferptr> attr;
+
+ public:
+  Cnode(coll_t cid)
+    : ref(0), dirty(false), coll_id(cid)
+  {
+    cnode_loc.length = 0;
+  }
+  ~Cnode() {}
+
+  block_t get_cnode_id() { return cnode_loc.start; }
+  int get_cnode_len() { return cnode_loc.length; }
+
+  // first reference pins, last reference unpins
+  void get() {
+    if (!ref) lru_pin();
+    ++ref;
+  }
+  void put() {
+    if (--ref == 0) lru_unpin();
+  }
+  int get_ref_count() { return ref; }
+
+  // clean -> dirty takes a reference; dirty -> clean releases it
+  void mark_dirty() {
+    if (dirty) return;
+    dirty = true;
+    get();
+  }
+  void mark_clean() {
+    if (!dirty) return;
+    dirty = false;
+    put();
+  }
+  bool is_dirty() { return dirty; }
+
+
+  // per attribute: name length + 1, plus value length + sizeof(int)
+  int get_attr_bytes() {
+    int total = 0;
+    map<string, bufferptr>::iterator a = attr.begin();
+    while (a != attr.end()) {
+      total += a->first.length() + 1;
+      total += a->second.length() + sizeof(int);
+      ++a;
+    }
+    return total;
+  }
+  
+  //
+  //???void clear();
+
+  
+};
+
+// Print "cnode(<coll id in hex>[ dirty])".
+inline ostream& operator<<(ostream& out, Cnode& cn)
+{
+  const char *dirty_tag = cn.is_dirty() ? " dirty" : "";
+  out << "cnode(" << hex << cn.coll_id << dec << dirty_tag;
+  //out << " " << &cn;
+  out << ")";
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/ebofs/Ebofs.cc b/branches/sage/cephmds2/ebofs/Ebofs.cc
new file mode 100644
index 0000000000000..520a9c7a00e92
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/Ebofs.cc
@@ -0,0 +1,3169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "Ebofs.h"
+
+#include <errno.h>
+#include <sys/vfs.h>
+
+// *******************
+
+#undef dout
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")."
+#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")."
+
+/*
+ * Format a block count as a human readable size, treating each block as
+ * 4 KB: "N KB", "N.N MB" (above 10 MB) or "N.N GB" (above 10 GB).
+ *
+ * Returns a pointer to a static buffer: the result is overwritten by the
+ * next call, and the function is not safe to call concurrently.
+ * Formatting is bounded by the buffer size (snprintf), so an extreme
+ * block count truncates the text instead of overrunning the buffer.
+ */
+char *nice_blocks(block_t b) 
+{
+  static char s[20];
+  double sz = b*4.0;   // size in KB
+  if (sz > (10 << 20)) 
+    snprintf(s, sizeof(s), "%.1f GB", sz / (1024.0*1024.0));
+  else if (sz > (10 << 10)) 
+    snprintf(s, sizeof(s), "%.1f MB", sz / (1024.0));
+  else 
+    snprintf(s, sizeof(s), "%llu KB", b*4ULL);
+  return s;
+}
+
+/*
+ * Mount the file system: open the device, read both superblock copies
+ * (blocks 0 and 1), load state from the newest valid one, open the tables
+ * and start the commit and finisher threads.
+ *
+ * Returns 0 on success, the (negative) result of dev.open() if the device
+ * cannot be opened, or -EINVAL if neither superblock carries EBOFS_MAGIC
+ * (e.g. the device was never formatted with mkfs).
+ */
+int Ebofs::mount()
+{
+  ebofs_lock.Lock();
+  assert(!mounted);
+
+  int r = dev.open(&idle_kicker);
+  if (r < 0) {
+    ebofs_lock.Unlock();
+    return r;
+  }
+
+  dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
+
+  // read super
+  bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+  bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+  dev.read(0, 1, bp1);
+  dev.read(1, 1, bp2);
+
+  struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str();
+  struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str();
+  dout(3) << "mount super @0 epoch " << sb1->epoch << endl;
+  dout(3) << "mount super @1 epoch " << sb2->epoch << endl;
+
+  // only trust a superblock that carries our magic; without this check an
+  // unformatted device would be mounted using garbage table roots.
+  const bool sb1_ok = (sb1->s_magic == EBOFS_MAGIC);
+  const bool sb2_ok = (sb2->s_magic == EBOFS_MAGIC);
+  if (!sb1_ok && !sb2_ok) {
+    derr(0) << "mount: no valid superblock (bad magic) on " << dev.get_device_name() << endl;
+    dev.close();
+    ebofs_lock.Unlock();
+    return -EINVAL;
+  }
+
+  // pick newest (valid) super
+  struct ebofs_super *sb = 0;
+  if (sb1_ok && (!sb2_ok || sb1->epoch > sb2->epoch))
+    sb = sb1;
+  else
+    sb = sb2;
+  super_epoch = sb->epoch;
+  dout(3) << "mount epoch " << super_epoch << endl;
+  assert(super_epoch == sb->epoch);
+
+  free_blocks = sb->free_blocks;
+  limbo_blocks = sb->limbo_blocks;
+
+  // init node pools
+  dout(3) << "mount nodepool" << endl;
+  nodepool.init( &sb->nodepool );
+  nodepool.read_usemap( dev, super_epoch );
+  nodepool.read_clean_nodes( dev );
+  
+  // open tables
+  dout(3) << "mount opening tables" << endl;
+  object_tab = new Table<object_t, Extent>( nodepool, sb->object_tab );
+  for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
+    free_tab[i] = new Table<block_t, block_t>( nodepool, sb->free_tab[i] );
+  limbo_tab = new Table<block_t, block_t>( nodepool, sb->limbo_tab );
+  alloc_tab = new Table<block_t, pair<block_t,int> >( nodepool, sb->alloc_tab );
+  
+  collection_tab = new Table<coll_t, Extent>( nodepool, sb->collection_tab );
+  co_tab = new Table<coll_object_t, bool>( nodepool, sb->co_tab );
+
+  // space that was in limbo as of the last commit can be reused now
+  allocator.release_limbo();
+
+  dout(3) << "mount starting commit+finisher threads" << endl;
+  commit_thread.create();
+  finisher_thread.create();
+
+  dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
+  mounted = true;
+
+  ebofs_lock.Unlock();
+  return 0;
+}
+
+
+/*
+ * Create a fresh, empty file system on the device.
+ *
+ * Layout produced here: blocks 0 and 1 hold the two superblock copies, the
+ * first node region starts at block 2, the even and odd node usemaps
+ * follow it, and everything after that is handed to the allocator as free
+ * space.  The node pool and superblock are committed twice (epochs 0 and
+ * 1) so that both superblock slots are valid.
+ *
+ * Returns 0 on success or the (negative) result of dev.open().
+ */
+int Ebofs::mkfs()
+{
+  ebofs_lock.Lock();
+  assert(!mounted);
+
+  int r = dev.open();
+  if (r < 0) {
+    ebofs_lock.Unlock();
+    return r;
+  }
+
+  block_t num_blocks = dev.get_num_blocks();
+
+  free_blocks = 0;
+  limbo_blocks = 0;
+
+  // create first noderegion
+  Extent nr;
+  nr.start = 2;
+  nr.length = 20+ (num_blocks / 1000);
+  // NOTE(review): nr.length is at least 20 at this point, so this floor never triggers.
+  if (nr.length < 10) nr.length = 10;
+  nodepool.add_region(nr);
+  dout(10) << "mkfs: first node region at " << nr << endl;
+
+  // allocate two usemaps
+  block_t usemap_len = nodepool.get_usemap_len();
+  nodepool.usemap_even.start = nr.end();
+  nodepool.usemap_even.length = usemap_len;
+  nodepool.usemap_odd.start = nodepool.usemap_even.end();
+  nodepool.usemap_odd.length = usemap_len;
+  dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << endl;
+  dout(10) << "mkfs:  odd usemap at " << nodepool.usemap_odd << endl;
+
+  // init tables
+  struct ebofs_table empty;
+  empty.num_keys = 0;
+  empty.root = -1;
+  empty.depth = 0;
+  
+  object_tab = new Table<object_t, Extent>( nodepool, empty );
+  collection_tab = new Table<coll_t, Extent>( nodepool, empty );
+  
+  for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
+    free_tab[i] = new Table<block_t,block_t>( nodepool, empty );
+  limbo_tab = new Table<block_t,block_t>( nodepool, empty );
+  alloc_tab = new Table<block_t,pair<block_t,int> >( nodepool, empty );
+  
+  co_tab = new Table<coll_object_t, bool>( nodepool, empty );
+
+  // add free space
+  // NOTE(review): no check that the device is larger than left.start; if
+  // block_t is unsigned, a too-small device would make left.length wrap.
+  Extent left;
+  left.start = nodepool.usemap_odd.end();
+  left.length = num_blocks - left.start;
+  dout(10) << "mkfs: free data blocks at " << left << endl;
+  allocator._release_into_limbo( left );
+  if (g_conf.ebofs_cloneable) {
+    allocator.alloc_inc(nr);
+    allocator.alloc_inc(nodepool.usemap_even);
+    allocator.alloc_inc(nodepool.usemap_odd);
+  }
+  allocator.commit_limbo();   // -> limbo_tab
+  allocator.release_limbo();  // -> free_tab
+
+  // write nodes, super, 2x
+  dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << endl;
+
+  // epoch 0 -> superblock slot 0
+  nodepool.commit_start( dev, 0 );
+  nodepool.commit_wait();
+  bufferptr superbp0;
+  prepare_super(0, superbp0);
+  write_super(0, superbp0);
+  
+  // epoch 1 -> superblock slot 1
+  nodepool.commit_start( dev, 1 );
+  nodepool.commit_wait();
+  bufferptr superbp1;
+  prepare_super(1, superbp1);
+  write_super(1, superbp1);
+  
+  // free memory
+  dout(10) << "mkfs: cleaning up" << endl;
+  close_tables();
+
+  dev.close();
+
+  dout(2) << "mkfs: " << dev.get_device_name() << " "  << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
+  ebofs_lock.Unlock();
+  return 0;
+}
+
+// Free every table object allocated by mount()/mkfs(), then close the
+// node pool.
+void Ebofs::close_tables() 
+{
+  // object table first, then the allocator's tables
+  delete object_tab;
+
+  int bucket = 0;
+  while (bucket < EBOFS_NUM_FREE_BUCKETS) {
+    delete free_tab[bucket];
+    ++bucket;
+  }
+  delete limbo_tab;
+  delete alloc_tab;
+
+  // collection tables
+  delete collection_tab;
+  delete co_tab;
+
+  nodepool.close();
+}
+
+/*
+ * Unmount: flag the fs read-only and unmounting, wake the commit thread
+ * and wait for it to exit, stop the finisher thread, trim the buffer and
+ * inode caches, report any onodes still cached, and release the tables and
+ * the device.  Always returns 0.
+ */
+int Ebofs::umount()
+{
+  ebofs_lock.Lock();
+  
+  // mark unmounting
+  dout(1) << "umount start" << endl;
+  readonly = true;
+  unmounting = true;
+  
+  // kick commit thread
+  // ebofs_lock is released around the join: the commit thread takes
+  // ebofs_lock itself (see commit_thread_entry).
+  dout(5) << "umount stopping commit thread" << endl;
+  commit_cond.Signal();
+  ebofs_lock.Unlock();
+  commit_thread.join();
+  ebofs_lock.Lock();
+
+  // kick finisher thread
+  dout(5) << "umount stopping finisher thread" << endl;
+  finisher_lock.Lock();
+  finisher_stop = true;
+  finisher_cond.Signal();
+  finisher_lock.Unlock();
+
+  // NOTE(review): ebofs_lock is held across this join - confirm the
+  // finisher thread never needs ebofs_lock on its way out.
+  finisher_thread.join();
+
+  // drop everything we have cached
+  trim_bc(0);
+  trim_inodes(0);
+
+  // anything still in onode_map after trimming is reported as a leftover
+  for (hash_map<object_t,Onode*>::iterator i = onode_map.begin();
+       i != onode_map.end();
+       i++) {
+    dout(0) << "umount *** leftover: " << i->first << "   " << *(i->second) << endl;
+  }
+
+  // free memory
+  dout(5) << "umount cleaning up" << endl;
+  close_tables();
+  dev.close();
+  readonly = unmounting = mounted = false;
+
+  dout(1) << "umount done on " << dev.get_device_name() << endl;
+  ebofs_lock.Unlock();
+  return 0;
+}
+
+
+
+/*
+ * Build the superblock image for 'epoch' from the current in-memory state
+ * (block counters, table roots, node pool layout) and leave it in a fresh
+ * page-aligned, block-sized buffer assigned to bp.
+ *
+ * The whole buffer is zeroed before the struct is copied in, so the bytes
+ * of the block beyond sizeof(ebofs_super) that reach disk are well defined
+ * rather than whatever the allocation happened to contain.
+ */
+void Ebofs::prepare_super(version_t epoch, bufferptr& bp)
+{
+  struct ebofs_super sb;
+  
+  dout(10) << "prepare_super v" << epoch << endl;
+
+  // fill in super
+  memset(&sb, 0, sizeof(sb));
+  sb.s_magic = EBOFS_MAGIC;
+  sb.epoch = epoch;
+  sb.num_blocks = dev.get_num_blocks();
+
+  sb.free_blocks = free_blocks;
+  sb.limbo_blocks = limbo_blocks;
+
+
+  // tables
+  sb.object_tab.num_keys = object_tab->get_num_keys();
+  sb.object_tab.root = object_tab->get_root();
+  sb.object_tab.depth = object_tab->get_depth();
+
+  for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++) {
+    sb.free_tab[i].num_keys = free_tab[i]->get_num_keys();
+    sb.free_tab[i].root = free_tab[i]->get_root();
+    sb.free_tab[i].depth = free_tab[i]->get_depth();
+  }
+  sb.limbo_tab.num_keys = limbo_tab->get_num_keys();
+  sb.limbo_tab.root = limbo_tab->get_root();
+  sb.limbo_tab.depth = limbo_tab->get_depth();
+
+  sb.alloc_tab.num_keys = alloc_tab->get_num_keys();
+  sb.alloc_tab.root = alloc_tab->get_root();
+  sb.alloc_tab.depth = alloc_tab->get_depth();
+
+  sb.collection_tab.num_keys = collection_tab->get_num_keys();
+  sb.collection_tab.root = collection_tab->get_root();
+  sb.collection_tab.depth = collection_tab->get_depth();
+
+  sb.co_tab.num_keys = co_tab->get_num_keys();
+  sb.co_tab.root = co_tab->get_root();
+  sb.co_tab.depth = co_tab->get_depth();
+
+  // pools
+  // NOTE(review): assumes region_loc.size() fits sb.nodepool.region_loc[] - confirm the bound.
+  sb.nodepool.num_regions = nodepool.region_loc.size();
+  for (unsigned i=0; i<nodepool.region_loc.size(); i++) {
+    sb.nodepool.region_loc[i] = nodepool.region_loc[i];
+  }
+  sb.nodepool.node_usemap_even = nodepool.usemap_even;
+  sb.nodepool.node_usemap_odd = nodepool.usemap_odd;
+  
+  // put in a buffer
+  assert(sizeof(sb) <= (unsigned)EBOFS_BLOCK_SIZE);   // must fit in one block
+  bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+  memset(bp.c_str(), 0, EBOFS_BLOCK_SIZE);            // define the tail of the block
+  memcpy(bp.c_str(), (const char*)&sb, sizeof(sb));
+}
+
+// Write a prepared superblock image to disk.  The low bit of the epoch
+// selects the slot: even epochs go to block 0, odd epochs to block 1.
+void Ebofs::write_super(version_t epoch, bufferptr& bp)
+{
+  const block_t slot = epoch & 1;
+
+  dout(10) << "write_super v" << epoch << " to b" << slot << endl;
+  dev.write(slot, 1, bp, "write_super");
+}
+/*
+ * commit_thread_entry - body of the commit thread.
+ *
+ * Takes ebofs_lock on entry and releases it on exit; in between the lock is
+ * only given up inside the condition waits and around write_super().
+ *
+ * Each loop iteration sleeps until commit_cond is signalled (or a timeout /
+ * idle device ends the sleep).  If anything is dirty or in limbo it then
+ * bumps super_epoch, starts the onode/cnode and btree node writes, prepares
+ * the super, waits for the buffer cache, inodes and nodes to flush, writes
+ * the super, releases limbo space, and queues the commit waiters of epoch
+ * super_epoch-1 for the finisher thread.  The loop ends after the pass in
+ * which 'unmounting' was seen.  Always returns 0.
+ */
+int Ebofs::commit_thread_entry()
+{  
+  ebofs_lock.Lock();
+  dout(10) << "commit_thread start" << endl;
+
+  assert(!commit_thread_started); // there can be only one
+  commit_thread_started = true;
+  sync_cond.Signal();             // sync() waits on sync_cond until we have started
+  // main loop: runs until the unmount pass below clears 'mounted'
+  while (mounted) {
+    
+    // wait for kick, or timeout
+    if (g_conf.ebofs_commit_ms) {
+      if (g_conf.ebofs_idle_commit_ms > 0) {
+        // periodically check for idle block device
+        dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " 
+                 << g_conf.ebofs_idle_commit_ms << " ms if idle" << endl;
+        long left = g_conf.ebofs_commit_ms;
+        while (left > 0) {
+          long next = MIN(left, g_conf.ebofs_idle_commit_ms);   // sleep in slices of at most idle_commit_ms
+          if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, next*1000)) != ETIMEDOUT) 
+            break;   // we got kicked
+          if (dev.is_idle()) {
+            dout(20) << "commit_thread bdev is idle, early commit" << endl;
+            break;  // dev is idle
+          }
+          left -= next;
+          dout(20) << "commit_thread " << left << " ms left" << endl;
+
+          // hack hack
+          //if (!left) g_conf.debug_ebofs = 10;
+          // /hack hack
+        }
+      } else {
+        // normal wait+timeout
+        dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl;
+        commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));   
+      }
+
+    } else {
+      // DEBUG.. wait until kicked
+      dout(10) << "commit_thread no commit_ms, waiting until kicked" << endl;
+      commit_cond.Wait(ebofs_lock);
+    }
+    // unmount requested: force one last commit, and make this the last iteration
+    if (unmounting) {
+      dout(10) << "commit_thread unmounting: final commit pass" << endl;
+      assert(readonly);
+      unmounting = false;
+      mounted = false;   // ends the while loop after this pass
+      dirty = true;      // guarantees the commit branch below runs
+    }
+    // commit only if something changed or there are limbo blocks to release
+    if (!dirty && !limbo_blocks) {
+      dout(10) << "commit_thread not dirty" << endl;
+    }
+    else {
+      super_epoch++;     // changes made from here on belong to the new epoch
+      dirty = false;
+
+      dout(10) << "commit_thread commit start, new epoch " << super_epoch << endl;
+      dout(2) << "commit_thread   data: " 
+              << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, "
+              << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks() 
+              << "%) free in " << get_free_extents() 
+              << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() 
+              << "%) limbo in " << get_limbo_extents() 
+              << endl;
+      dout(2) << "commit_thread  nodes: " 
+              << 100*nodepool.num_used()/nodepool.num_total() << "% used, "
+              << nodepool.num_free() << " (" << 100*nodepool.num_free()/nodepool.num_total() << "%) free, " 
+              << nodepool.num_limbo() << " (" << 100*nodepool.num_limbo()/nodepool.num_total() << "%) limbo, " 
+              << nodepool.num_total() << " total." << endl;
+      dout(2) << "commit_thread    bc: " 
+              << "size " << bc.get_size() 
+              << ", trimmable " << bc.get_trimmable()
+              << ", max " << g_conf.ebofs_bc_size
+              << "; dirty " << bc.get_stat_dirty()
+              << ", tx " << bc.get_stat_tx()
+              << ", max dirty " << g_conf.ebofs_bc_max_dirty
+              << endl;
+      
+      
+      // (async) write onodes+cnodes  (do this first; it currently involves inode reallocation)
+      commit_inodes_start();
+      
+      allocator.commit_limbo();   // limbo -> limbo_tab
+      
+      // (async) write btree nodes
+      nodepool.commit_start( dev, super_epoch );
+      
+      // blockdev barrier (prioritize our writes!)
+      dout(30) << "commit_thread barrier.  flushing inodes " << inodes_flushing << endl;
+      dev.barrier();
+
+      // prepare super (before any changes get made!)
+      bufferptr superbp;
+      prepare_super(super_epoch, superbp);
+      
+      // wait for it all to flush (drops global lock)
+      commit_bc_wait(super_epoch-1);  
+      dout(30) << "commit_thread bc flushed" << endl;
+      commit_inodes_wait();
+      dout(30) << "commit_thread inodes flushed" << endl;
+      nodepool.commit_wait();
+      dout(30) << "commit_thread btree nodes flushed" << endl;
+
+      // ok, now (synchronously) write the prior super!
+      dout(10) << "commit_thread commit flushed, writing super for prior epoch" << endl;
+      ebofs_lock.Unlock();     // don't hold the lock across the super write
+      write_super(super_epoch, superbp);    
+      ebofs_lock.Lock();
+      // NOTE(review): the super just written is stamped with the new super_epoch, although the comment and log above say "prior"
+      dout(10) << "commit_thread wrote super" << endl;
+
+      // free limbo space now 
+      // (since we're done allocating things, 
+      //  AND we've flushed all previous epoch data)
+      allocator.release_limbo();   // limbo_tab -> free_tabs
+      
+      // do we need more node space?
+      if (nodepool.num_free() < nodepool.num_total() / 3) {
+        dout(2) << "commit_thread running low on node space, allocating more." << endl;
+        alloc_more_node_space();
+      }
+      
+      // kick waiters
+      dout(10) << "commit_thread queueing commit + kicking sync waiters" << endl;
+      
+      finisher_lock.Lock();
+      finisher_queue.splice(finisher_queue.end(), commit_waiters[super_epoch-1]);   // hand the prior epoch's waiters to the finisher
+      commit_waiters.erase(super_epoch-1);
+      finisher_cond.Signal();
+      finisher_lock.Unlock();
+
+      sync_cond.Signal();      // wake a caller blocked in sync()
+
+      dout(10) << "commit_thread commit finish" << endl;
+    }
+
+    // trim bc?
+    trim_bc();
+    trim_inodes();
+
+  }
+  
+  dout(10) << "commit_thread finish" << endl;
+  commit_thread_started = false;
+  ebofs_lock.Unlock();
+  return 0;
+}
+
+
+/*
+ * alloc_more_node_space - try to add one more region to the btree node pool.
+ *
+ * Asks the allocator for as many blocks as the pool currently has nodes, plus
+ * a new even/odd pair of usemaps sized for the enlarged pool.  If both usemaps
+ * came back at full length, the old usemaps are released and the new region
+ * and maps are installed; otherwise everything just allocated is released and
+ * the pool is left unchanged.  Asserts if the pool is already at
+ * EBOFS_MAX_NODE_REGIONS.
+ */
+void Ebofs::alloc_more_node_space()
+{
+  dout(1) << "alloc_more_node_space free " << nodepool.num_free() << "/" << nodepool.num_total() << endl;
+
+  // no slot left for another region?
+  if (!(nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS)) {
+    dout(1) << "alloc_more_node_space already have max node regions!" << endl;
+    assert(0);
+    return;
+  }
+
+  // new node region
+  int want = nodepool.num_total();
+  Extent region;
+  allocator.allocate(region, want, 2);
+  dout(1) << "alloc_more_node_space wants " << want << " more, got " << region << endl;
+
+  // new usemaps, big enough for the old nodes plus whatever we just got
+  Extent new_even, new_odd;
+  unsigned maplen = nodepool.get_usemap_len(nodepool.num_total() + region.length);
+  allocator.allocate(new_even, maplen, 2);
+  allocator.allocate(new_odd, maplen, 2);
+  dout(1) << "alloc_more_node_space maps need " << maplen << " x2, got " << new_even << " " << new_odd << endl;
+
+  bool maps_ok = (new_even.length == maplen) && (new_odd.length == maplen);
+  if (!maps_ok) {
+    // back out everything we grabbed
+    dout (1) << "alloc_more_node_space failed to get space for new usemaps" << endl;
+    allocator.release(region);
+    allocator.release(new_even);
+    allocator.release(new_odd);
+    //assert(0);
+    return;
+  }
+
+  // swap in the new maps and add the region
+  dout(1) << "alloc_more_node_space got " << region << ", new usemaps at even " << new_even << " odd " << new_odd << endl;
+  allocator.release(nodepool.usemap_even);
+  allocator.release(nodepool.usemap_odd);
+  nodepool.add_region(region);
+  nodepool.usemap_even = new_even;
+  nodepool.usemap_odd = new_odd;
+}
+
+
+/*
+ * finisher_thread_entry - body of the finisher thread.
+ *
+ * Repeatedly takes everything queued on finisher_queue and completes those
+ * contexts with result 0, with finisher_lock dropped while they run.  Sleeps
+ * on finisher_cond when the queue is empty, and exits once finisher_stop is
+ * set and the queue has been drained.  Always returns 0.
+ */
+void *Ebofs::finisher_thread_entry()
+{
+  finisher_lock.Lock();
+  dout(10) << "finisher_thread start" << endl;
+
+  while (!finisher_stop) {
+    while (!finisher_queue.empty()) {
+      // grab the whole queue so the callbacks can run without the lock
+      list<Context*> ls;
+      ls.swap(finisher_queue);
+
+      finisher_lock.Unlock();
+
+      //ebofs_lock.Lock();            // um.. why lock this?  -sage
+      finish_contexts(ls, 0);
+      //ebofs_lock.Unlock();
+
+      finisher_lock.Lock();
+    }
+    if (finisher_stop) break;   // may have been set while we were unlocked
+    
+    dout(30) << "finisher_thread sleeping" << endl;
+    finisher_cond.Wait(finisher_lock);
+  }
+
+  // (this message used to say "start", which made the exit look like a restart in logs)
+  dout(10) << "finisher_thread finish" << endl;
+  finisher_lock.Unlock();
+  return 0;
+}
+
+
+// *** onodes ***
+
+/*
+ * new_onode - create an onode for an object that does not exist yet.
+ *
+ * The new onode goes into the onode map, the LRU and the object table, has
+ * no on-disk location (start/length 0), is marked dirty, and is returned
+ * with a reference taken for the caller.  Asserts that oid is neither cached
+ * nor already in the object table.
+ */
+Onode* Ebofs::new_onode(object_t oid)
+{
+  Onode *fresh = new Onode(oid);
+
+  // cache
+  assert(onode_map.count(oid) == 0);
+  onode_map[oid] = fresh;
+  onode_lru.lru_insert_top(fresh);
+
+  // object table
+  assert(object_tab->lookup(oid) < 0);
+  object_tab->insert(oid, fresh->onode_loc);  // even tho i'm not placed yet
+
+  // not placed on disk yet
+  fresh->onode_loc.length = 0;
+  fresh->onode_loc.start = 0;
+
+  fresh->get();
+  dirty_onode(fresh);
+
+  dout(7) << "new_onode " << *fresh << endl;
+  return fresh;
+}
+
+
+/*
+ * get_onode - return the onode for oid, reading it from disk if not cached.
+ *
+ * Returns the onode with a reference taken for the caller, or 0 if oid is
+ * not in the object table.  ebofs_lock is dropped around the device read.  If
+ * another caller is already loading the same oid, this one waits to be
+ * signalled and then starts over from the cache lookup.
+ *
+ * On-disk layout parsed here (written by encode_onode): an ebofs_onode
+ * header, then num_collections coll_t values, then num_attr attributes
+ * (NUL-terminated key, int length, value bytes), then num_extents Extents.
+ */
+Onode* Ebofs::get_onode(object_t oid)
+{
+  while (1) {
+    // in cache?
+    if (onode_map.count(oid)) {
+      // yay
+      Onode *on = onode_map[oid];
+      on->get();
+      //cout << "get_onode " << *on << endl;
+      return on;   
+    }
+    
+    // on disk?
+    Extent onode_loc;
+    if (object_tab->lookup(oid, onode_loc) < 0) {
+      dout(10) << "onode lookup failed on " << oid << endl;
+      // object dne.
+      return 0;
+    }
+    
+    // already loading?
+    if (waitfor_onode.count(oid)) {
+      // yep, just wait.
+      Cond c;
+      waitfor_onode[oid].push_back(&c);
+      dout(10) << "get_onode " << oid << " already loading, waiting" << endl;
+      c.Wait(ebofs_lock);
+      continue;
+    }
+
+    dout(10) << "get_onode reading " << oid << " from " << onode_loc << endl;
+
+    // creating the (empty) waiter list is what marks this oid as "loading"
+    assert(waitfor_onode.count(oid) == 0);
+    waitfor_onode[oid].clear();  // this should be empty initially. 
+
+    // read it!
+    bufferlist bl;
+    bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) );
+
+    ebofs_lock.Unlock();
+    dev.read( onode_loc.start, onode_loc.length, bl );
+    ebofs_lock.Lock();
+    
+    // add onode
+    Onode *on = new Onode(oid);
+    onode_map[oid] = on;
+    onode_lru.lru_insert_top(on);
+    
+    // parse data block
+    struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str();
+    if (eo->object_id != oid) {
+      cerr << " wrong oid in onode block: " << eo->object_id << " != " << oid << endl;
+      cerr << " onode_loc is " << eo->onode_loc << endl;
+      cerr << " object_size " << eo->object_size << endl;
+      cerr << " object_blocks " << eo->object_blocks << endl;
+      cerr << " " << eo->num_collections << " coll + " 
+           << eo->num_attr << " attr + " 
+           << eo->num_extents << " extents" << endl;
+      assert(eo->object_id == oid);
+    }
+    on->readonly = eo->readonly;
+    on->onode_loc = eo->onode_loc;
+    on->object_size = eo->object_size;
+    on->object_blocks = eo->object_blocks;
+
+    // parse
+    // the fields after the header are packed back to back, and attribute
+    // keys have arbitrary length, so p is not necessarily aligned for the
+    // type stored there.  copy each value out with memcpy instead of
+    // dereferencing a cast pointer.
+    char *p = bl.c_str() + sizeof(*eo);
+
+    // parse collection list
+    for (int i=0; i<eo->num_collections; i++) {
+      coll_t c;
+      memcpy((char*)&c, p, sizeof(c));
+      p += sizeof(c);
+      on->collections.insert(c);
+    }
+
+    // parse attributes
+    for (int i=0; i<eo->num_attr; i++) {
+      string key = p;
+      p += key.length() + 1;
+      int len;
+      memcpy((char*)&len, p, sizeof(len));
+      p += sizeof(len);
+      on->attr[key] = buffer::copy(p, len);
+      p += len;
+      dout(15) << "get_onode " << *on  << " attr " << key << " len " << len << endl;
+    }
+    
+    // parse extents
+    on->extent_map.clear();
+    block_t n = 0;   // object block offset of the next extent
+    for (int i=0; i<eo->num_extents; i++) {
+      Extent ex;
+      memcpy((char*)&ex, p, sizeof(ex));
+      on->extent_map[n] = ex;
+      dout(15) << "get_onode " << *on  << " ex " << i << ": " << ex << endl;
+      n += ex.length;
+      p += sizeof(Extent);
+    }
+    assert(n == on->object_blocks);
+
+    // wake up other waiters
+    for (list<Cond*>::iterator i = waitfor_onode[oid].begin();
+         i != waitfor_onode[oid].end();
+         i++)
+      (*i)->Signal();
+    waitfor_onode.erase(oid);   // remove Cond list
+    
+    on->get();
+    //cout << "get_onode " << *on << " (loaded)" << endl;
+    return on;
+  }
+}
+
+
+// Block device completion callback attached to onode/cnode writes: when the
+// write finishes it calls Ebofs::flush_inode_finish().  The io handle and
+// result code are ignored.
+class C_E_InodeFlush : public BlockDevice::callback {
+  Ebofs *fs;
+public:
+  C_E_InodeFlush(Ebofs *e) : fs(e) {}
+  void finish(ioh_t, int) {
+    fs->flush_inode_finish();
+  }
+};
+
+
+/*
+ * encode_onode - serialize an onode into bl starting at byte offset off.
+ *
+ *  on  - onode to encode
+ *  bl  - destination; must already be large enough (copy_in overwrites)
+ *  off - (in/out) write position; advanced past everything written
+ *
+ * Layout: ebofs_onode header, the collection ids, the attributes
+ * (NUL-terminated key, int length, value bytes), then the extents in
+ * extent_map order.  get_onode parses the same layout.
+ */
+void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off)
+{
+  // onode
+  // zero the header first so padding and any field not assigned below reach
+  // disk as zeros rather than stack garbage (prepare_super does the same
+  // for the superblock).
+  struct ebofs_onode eo;
+  memset(&eo, 0, sizeof(eo));
+  eo.readonly = on->readonly;
+  eo.onode_loc = on->onode_loc;
+  eo.object_id = on->object_id;
+  eo.object_size = on->object_size;
+  eo.object_blocks = on->object_blocks;
+  eo.num_collections = on->collections.size();
+  eo.num_attr = on->attr.size();
+  eo.num_extents = on->extent_map.size();
+  bl.copy_in(off, sizeof(eo), (char*)&eo);
+  off += sizeof(eo);
+
+  // collections
+  for (set<coll_t>::iterator i = on->collections.begin();
+       i != on->collections.end();
+       ++i) {
+    bl.copy_in(off, sizeof(*i), (char*)&(*i));
+    off += sizeof(*i);
+  }    
+  
+  // attr
+  for (map<string, bufferptr>::iterator i = on->attr.begin();
+       i != on->attr.end();
+       ++i) {
+    // key, including its terminating NUL
+    bl.copy_in(off, i->first.length()+1, i->first.c_str());
+    off += i->first.length()+1;
+    // value length, then value
+    int l = i->second.length();
+    bl.copy_in(off, sizeof(int), (char*)&l);
+    off += sizeof(int);
+    bl.copy_in(off, l, i->second.c_str());
+    off += l;
+    dout(15) << "write_onode " << *on  << " attr " << i->first << " len " << l << endl;
+  }
+  
+  // extents
+  for (map<block_t,Extent>::iterator i = on->extent_map.begin();
+       i != on->extent_map.end();
+       ++i) {
+    bl.copy_in(off, sizeof(Extent), (char*)&(i->second));
+    off += sizeof(Extent);
+    dout(15) << "write_onode " << *on  << " ex " << i->first << ": " << i->second << endl;
+  }
+}
+
+/*
+ * write_onode - encode an onode and queue its write to the block device.
+ *
+ * The onode is given a new on-disk location on every write: any existing
+ * location is released, a new one is allocated, and the object table entry
+ * is replaced.  The write is asynchronous; a C_E_InodeFlush callback is
+ * attached to it.
+ */
+void Ebofs::write_onode(Onode *on)
+{
+  // how much room do we need?  header + collections + attrs + extents
+  unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes();
+  unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1;
+
+  bufferlist bl;
+  bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) );
+
+  // (always) relocate onode
+  {
+    if (on->onode_loc.length) {
+      allocator.release(on->onode_loc);
+    }
+
+    // allocation hint: start of the object's first data extent, or 0 if it has none
+    block_t hint = 0;
+    if (!on->extent_map.empty()) {
+      hint = on->extent_map.begin()->second.start;
+    }
+
+    allocator.allocate(on->onode_loc, blocks, hint);
+
+    // re-point the object table at the new location
+    object_tab->remove( on->object_id );
+    object_tab->insert( on->object_id, on->onode_loc );
+    //object_tab->verify();
+  }
+
+  dout(10) << "write_onode " << *on << " to " << on->onode_loc << endl;
+
+  // encode; we must have produced exactly the size we computed above
+  unsigned pos = 0;
+  encode_onode(on, bl, pos);
+  assert(pos == bytes);
+
+  // write
+  dev.write( on->onode_loc.start, on->onode_loc.length, bl, 
+             new C_E_InodeFlush(this), "write_onode" );
+}
+
+/*
+ * remove_onode - delete an object.
+ *
+ * The caller must hold a reference, which is consumed here via put_onode().
+ * Tears down the object's buffer cache, removes it from the onode map, LRU,
+ * object table and every collection it belongs to, releases its onode and
+ * data extents, and drops it from the dirty set.
+ */
+void Ebofs::remove_onode(Onode *on)
+{
+  dout(8) << "remove_onode " << *on << endl;
+
+  assert(on->get_ref_count() >= 1);  // caller
+
+  // tear down buffer cache
+  if (on->oc) {
+    on->oc->truncate(0, super_epoch);         // this will kick readers along the way.
+    on->close_oc();
+  }
+
+  // remove from onode map, mark dangling/deleted
+  onode_map.erase(on->object_id);
+  onode_lru.lru_remove(on);
+  on->deleted = true;
+  on->dangling = true;
+
+  // remove from object table
+  //dout(0) << "remove_onode on " << *on << endl;
+  object_tab->remove(on->object_id);
+
+  // free onode space
+  if (on->onode_loc.length) {
+    allocator.release(on->onode_loc);
+  }
+
+  // free data space
+  map<block_t,Extent>::iterator ex = on->extent_map.begin();
+  while (ex != on->extent_map.end()) {
+    allocator.release(ex->second);
+    ++ex;
+  }
+  on->extent_map.clear();
+
+  // remove from collections
+  set<coll_t>::iterator c = on->collections.begin();
+  while (c != on->collections.end()) {
+    co_tab->remove(coll_object_t(*c,on->object_id));
+    ++c;
+  }
+  on->collections.clear();
+
+  // dirty -> clean?
+  if (on->is_dirty()) {
+    on->mark_clean();         // this unpins *on
+    dirty_onodes.erase(on);
+  }
+
+  // someone besides the caller still holds a ref?
+  if (on->get_ref_count() > 1) {
+    cout << "remove_onode **** will survive " << *on << endl;
+  }
+  put_onode(on);
+
+  dirty = true;
+}
+
+// Drop one reference on an onode.  If that was the last reference and the
+// onode is dangling, it is deleted.
+void Ebofs::put_onode(Onode *on)
+{
+  on->put();
+  //cout << "put_onode " << *on << endl;
+
+  if (on->get_ref_count() != 0) return;   // still referenced
+  if (!on->dangling) return;              // not dangling; leave it alone
+
+  //cout << " *** hosing on " << *on << endl;
+  delete on;
+}
+
+// Mark an onode dirty and add it to the dirty set (if it isn't already),
+// and flag the filesystem as dirty.
+void Ebofs::dirty_onode(Onode *on)
+{
+  if (on->is_dirty()) {
+    // already tracked; just make sure the fs-wide flag is set
+    dirty = true;
+    return;
+  }
+
+  on->mark_dirty();
+  dirty_onodes.insert(on);
+  dirty = true;
+}
+
+/*
+ * trim_inodes - expire cached onodes and cnodes until each LRU is within
+ * its limit.
+ *
+ *  max - if >= 0, used as the limit for both LRUs; otherwise each LRU's own
+ *        configured maximum is used.
+ *
+ * Stops early for a given LRU if it has nothing left to expire.
+ */
+void Ebofs::trim_inodes(int max)
+{
+  unsigned onode_limit = onode_lru.lru_get_max();
+  unsigned cnode_limit = cnode_lru.lru_get_max();
+  if (max >= 0) {
+    onode_limit = max;
+    cnode_limit = max;
+  }
+  dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << onode_limit << " onodes, " 
+            << cnode_lru.lru_get_size() << " / " << cnode_limit << " cnodes" << endl;
+
+  // onodes
+  for (;;) {
+    if (!(onode_lru.lru_get_size() > onode_limit)) break;
+
+    Onode *victim = (Onode*)onode_lru.lru_expire();
+    if (victim == 0) break;  // nothing to expire
+
+    dout(20) << "trim_inodes removing onode " << *victim << endl;
+    onode_map.erase(victim->object_id);
+    victim->dangling = true;
+
+    if (victim->get_ref_count() != 0) {
+      // an expired onode is not expected to be referenced
+      dout(-20) << "trim_inodes   still active: " << *victim << endl;
+      assert(0); // huh?
+    } else {
+      assert(victim->oc == 0);   // an open oc pins the onode!
+      delete victim;
+    }
+  }
+
+
+  // cnodes
+  for (;;) {
+    if (!(cnode_lru.lru_get_size() > cnode_limit)) break;
+
+    Cnode *victim = (Cnode*)cnode_lru.lru_expire();
+    if (victim == 0) break;  // nothing to expire
+
+    dout(20) << "trim_inodes removing cnode " << *victim << endl;
+    cnode_map.erase(victim->coll_id);
+
+    delete victim;
+  }
+
+  dout(10) << "trim_inodes finish " 
+           << onode_lru.lru_get_size() << " / " << onode_limit << " onodes, " 
+           << cnode_lru.lru_get_size() << " / " << cnode_limit << " cnodes" << endl;
+}
+
+
+
+// *** cnodes ****
+
+/*
+ * new_cnode - create a cnode for a collection that does not exist yet.
+ *
+ * The new cnode goes into the cnode map, the LRU and the collection table,
+ * has no on-disk location (start/length 0), is marked dirty, and is returned
+ * with a reference taken for the caller.  Asserts that cid is neither cached
+ * nor already in the collection table.
+ */
+Cnode* Ebofs::new_cnode(coll_t cid)
+{
+  Cnode *fresh = new Cnode(cid);
+
+  // cache
+  assert(cnode_map.count(cid) == 0);
+  cnode_map[cid] = fresh;
+  cnode_lru.lru_insert_top(fresh);
+
+  // collection table
+  assert(collection_tab->lookup(cid) < 0);
+  collection_tab->insert(cid, fresh->cnode_loc);  // even tho i'm not placed yet
+
+  // not placed on disk yet
+  fresh->cnode_loc.length = 0;
+  fresh->cnode_loc.start = 0;
+
+  fresh->get();
+  dirty_cnode(fresh);
+
+  return fresh;
+}
+
+/*
+ * get_cnode - return the cnode for cid, reading it from disk if not cached.
+ *
+ * Returns the cnode with a reference taken for the caller, or 0 if cid is
+ * not in the collection table.  ebofs_lock is dropped around the device
+ * read.  If another caller is already loading the same cid, this one waits
+ * to be signalled and then starts over from the cache lookup.
+ *
+ * On-disk layout parsed here (written by encode_cnode): an ebofs_cnode
+ * header followed by num_attr attributes (NUL-terminated key, int length,
+ * value bytes).
+ */
+Cnode* Ebofs::get_cnode(coll_t cid)
+{
+  while (1) {
+    // in cache?
+    if (cnode_map.count(cid)) {
+      // yay
+      Cnode *cn = cnode_map[cid];
+      cn->get();
+      return cn;   
+    }
+    
+    // on disk?
+    Extent cnode_loc;
+    if (collection_tab->lookup(cid, cnode_loc) < 0) {
+      // object dne.
+      return 0;
+    }
+    
+    // already loading?
+    if (waitfor_cnode.count(cid)) {
+      // yep, just wait.
+      Cond c;
+      waitfor_cnode[cid].push_back(&c);
+      dout(10) << "get_cnode " << cid << " already loading, waiting" << endl;
+      c.Wait(ebofs_lock);
+      continue;
+    }
+
+    dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << endl;
+
+    // creating the (empty) waiter list is what marks this cid as "loading"
+    assert(waitfor_cnode.count(cid) == 0);
+    waitfor_cnode[cid].clear();  // this should be empty initially. 
+
+    // read it!
+    bufferlist bl;
+    //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl );
+    bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) );
+
+    ebofs_lock.Unlock();
+    dev.read( cnode_loc.start, cnode_loc.length, bl );
+    ebofs_lock.Lock();
+
+    // parse data block
+    Cnode *cn = new Cnode(cid);
+
+    cnode_map[cid] = cn;
+    cnode_lru.lru_insert_top(cn);
+    
+    struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str();
+    cn->cnode_loc = ec->cnode_loc;
+    
+    // parse attributes
+    // keys have arbitrary length, so the int that follows each key is not
+    // necessarily aligned; copy it out with memcpy instead of dereferencing
+    // a cast pointer.
+    char *p = bl.c_str() + sizeof(*ec);
+    for (int i=0; i<ec->num_attr; i++) {
+      string key = p;
+      p += key.length() + 1;
+      int len;
+      memcpy((char*)&len, p, sizeof(len));
+      p += sizeof(len);
+      cn->attr[key] = buffer::copy(p, len);
+      p += len;
+      dout(15) << "get_cnode " << *cn  << " attr " << key << " len " << len << endl;
+    }
+    
+    // wake up other waiters
+    for (list<Cond*>::iterator i = waitfor_cnode[cid].begin();
+         i != waitfor_cnode[cid].end();
+         i++)
+      (*i)->Signal();
+    waitfor_cnode.erase(cid);   // remove Cond list
+
+    cn->get();
+    return cn;
+  }
+}
+
+/*
+ * encode_cnode - serialize a cnode into bl starting at byte offset off.
+ *
+ *  cn  - cnode to encode
+ *  bl  - destination; must already be large enough (copy_in overwrites)
+ *  off - (in/out) write position; advanced past everything written
+ *
+ * Layout: ebofs_cnode header followed by the attributes (NUL-terminated
+ * key, int length, value bytes).  get_cnode parses the same layout.
+ */
+void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off)
+{
+  // cnode
+  // zero the header first so padding and any field not assigned below reach
+  // disk as zeros rather than stack garbage (prepare_super does the same
+  // for the superblock).
+  struct ebofs_cnode ec;
+  memset(&ec, 0, sizeof(ec));
+  ec.cnode_loc = cn->cnode_loc;
+  ec.coll_id = cn->coll_id;
+  ec.num_attr = cn->attr.size();
+  bl.copy_in(off, sizeof(ec), (char*)&ec);
+  off += sizeof(ec);
+  
+  // attr
+  for (map<string, bufferptr >::iterator i = cn->attr.begin();
+       i != cn->attr.end();
+       ++i) {
+    // key, including its terminating NUL
+    bl.copy_in(off, i->first.length()+1, i->first.c_str());
+    off += i->first.length()+1;
+    // value length, then value
+    int len = i->second.length();
+    bl.copy_in(off, sizeof(int), (char*)&len);
+    off += sizeof(int);
+    bl.copy_in(off, len, i->second.c_str());
+    off += len;
+
+    dout(15) << "write_cnode " << *cn  << " attr " << i->first << " len " << len << endl;
+  }
+}
+
+/*
+ * write_cnode - encode a cnode and queue its write to the block device.
+ *
+ * The cnode is given a new on-disk location on every write: any existing
+ * location is released, a new one is allocated, and the collection table
+ * entry is replaced.  The write is asynchronous; a C_E_InodeFlush callback
+ * is attached to it.
+ */
+void Ebofs::write_cnode(Cnode *cn)
+{
+  // how much room do we need?  header + attrs
+  unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes();
+  unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1;
+
+  bufferlist bl;
+  //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl );
+  bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) );
+
+  // (always) relocate cnode!
+  {
+    if (cn->cnode_loc.length) {
+      allocator.release(cn->cnode_loc);
+    }
+
+    allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD);
+
+    // re-point the collection table at the new location
+    collection_tab->remove( cn->coll_id );
+    collection_tab->insert( cn->coll_id, cn->cnode_loc );
+  }
+
+  dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << endl;
+
+  // encode; we must have produced exactly the size we computed above
+  unsigned pos = 0;
+  encode_cnode(cn, bl, pos);
+  assert(pos == bytes);
+
+  // write
+  dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, 
+             new C_E_InodeFlush(this), "write_cnode" );
+}
+
+/*
+ * remove_cnode - delete a collection's cnode.
+ *
+ * Removes it from the collection table, the dirty set, the cnode map and the
+ * LRU, releases its on-disk space, drops one reference, and deletes it.
+ * Asserts that no references remain after that put.
+ */
+void Ebofs::remove_cnode(Cnode *cn)
+{
+  dout(10) << "remove_cnode " << *cn << endl;
+
+  // drop the table entry, and the disk space if it was ever placed
+  collection_tab->remove(cn->coll_id);
+  if (cn->cnode_loc.length) {
+    allocator.release(cn->cnode_loc);
+  }
+
+  // stop tracking it: dirty set, map, lru
+  if (cn->is_dirty()) {
+    dirty_cnodes.erase(cn);
+  }
+  cnode_map.erase(cn->coll_id);
+  cnode_lru.lru_remove(cn);
+
+  // count down refs
+  cn->mark_clean();
+  cn->put();
+  assert(cn->get_ref_count() == 0);
+
+  // hose.
+  delete cn;
+
+  dirty = true;
+}
+
+// Drop one reference on a cnode.  Unlike put_onode(), this never deletes.
+void Ebofs::put_cnode(Cnode *cn)
+{
+  cn->put();
+}
+
+// Mark a cnode dirty and add it to the dirty set (if it isn't already),
+// and flag the filesystem as dirty.
+void Ebofs::dirty_cnode(Cnode *cn)
+{
+  if (cn->is_dirty()) {
+    // already tracked; just make sure the fs-wide flag is set
+    dirty = true;
+    return;
+  }
+
+  cn->mark_dirty();
+  dirty_cnodes.insert(cn);
+  dirty = true;
+}
+
+
+
+
+
+// Completion hook for one onode/cnode write (called from C_E_InodeFlush).
+// Takes ebofs_lock, decrements the count of writes in flight, and signals
+// inode_commit_cond when the count reaches zero.
+void Ebofs::flush_inode_finish()
+{
+  ebofs_lock.Lock();
+
+  --inodes_flushing;
+
+  // only log the tail end of a big flush
+  if (inodes_flushing < 1000) {
+    dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << endl;
+  }
+
+  if (inodes_flushing == 0) {
+    inode_commit_cond.Signal();
+  }
+
+  ebofs_lock.Unlock();
+}
+
+/*
+ * commit_inodes_start - queue writes for every dirty onode and cnode.
+ *
+ * Each one is counted in inodes_flushing, written via write_onode() /
+ * write_cnode(), and marked clean; onodes also have their uncommitted block
+ * set and commit waiters cleared.  Both dirty sets are empty on return.
+ * Asserts that no earlier flush is still in flight.
+ */
+void Ebofs::commit_inodes_start() 
+{
+  dout(10) << "commit_inodes_start" << endl;
+
+  assert(inodes_flushing == 0);
+
+  // onodes
+  set<Onode*>::iterator o = dirty_onodes.begin();
+  while (o != dirty_onodes.end()) {
+    Onode *on = *o;
+    ++inodes_flushing;
+    write_onode(on);
+    on->mark_clean();
+    on->uncommitted.clear();     // commit allocated blocks
+    on->commit_waiters.clear();  // these guys are gonna get taken care of, bc we committed.
+    ++o;
+  }
+  dirty_onodes.clear();
+
+  // cnodes
+  set<Cnode*>::iterator c = dirty_cnodes.begin();
+  while (c != dirty_cnodes.end()) {
+    Cnode *cn = *c;
+    ++inodes_flushing;
+    write_cnode(cn);
+    cn->mark_clean();
+    ++c;
+  }
+  dirty_cnodes.clear();
+
+  dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << endl;
+}
+
+// Block until every onode/cnode write started by commit_inodes_start() has
+// completed.  Caller must hold ebofs_lock; it is handed to the condition
+// wait while sleeping.
+void Ebofs::commit_inodes_wait()
+{
+  for (;;) {
+    if (inodes_flushing <= 0) break;
+    dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << endl;
+    inode_commit_cond.Wait(ebofs_lock);
+  }
+  dout(10) << "commit_inodes_wait all flushed" << endl;
+}
+
+
+
+
+
+
+
+// *** buffer cache ***
+
+// Locked wrapper around trim_bc(): trims the buffer cache toward a target
+// size of zero.
+void Ebofs::trim_buffer_cache()
+{
+  ebofs_lock.Lock();
+  {
+    trim_bc(0);
+  }
+  ebofs_lock.Unlock();
+}
+
+/*
+ * trim_bc - expire clean buffer heads until the buffer cache fits in max.
+ *
+ *  max - target cache size; a negative value means g_conf.ebofs_bc_size.
+ *
+ * Stops when the cache is small enough, nothing is trimmable, or the LRU has
+ * nothing left to expire.  When an object's cache becomes empty its oc is
+ * closed.
+ */
+void Ebofs::trim_bc(off_t max)
+{
+  if (max < 0) {
+    max = g_conf.ebofs_bc_size;
+  }
+  dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl;
+
+  for (;;) {
+    if (!(bc.get_size() > max)) break;   // small enough
+    if (!bc.get_trimmable()) break;      // nothing we can drop
+
+    BufferHead *victim = (BufferHead*) bc.lru_rest.lru_expire();
+    if (!victim) break;
+
+    dout(25) << "trim_bc trimming " << *victim << endl;
+    assert(victim->is_clean());
+
+    // remember the owning cache before the bh goes away
+    ObjectCache *owner = victim->oc;
+    bc.remove_bh(victim);
+    delete victim;
+
+    if (owner->is_empty()) {
+      Onode *on = owner->on;
+      dout(10) << "trim_bc  closing oc on " << *on << endl;
+      on->close_oc();
+    }
+  }
+
+  dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl;
+}
+
+
+// Wake the commit thread by signalling commit_cond.  The signal is sent
+// unconditionally and without taking ebofs_lock.
+void Ebofs::kick_idle()
+{
+  dout(10) << "kick_idle" << endl;
+
+  commit_cond.Signal();
+
+  // Disabled variant, kept for reference: only signal when mounted, not
+  // unmounting, and dirty.
+  /*
+  ebofs_lock.Lock();
+  if (mounted && !unmounting && dirty) {
+    dout(0) << "kick_idle dirty, doing commit" << endl;
+    commit_cond.Signal();
+  } else {
+    dout(0) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << endl;
+  }
+  ebofs_lock.Unlock();
+  */
+}
+
+// Asynchronous sync: register onsafe (if non-null) as a commit waiter for
+// the current super_epoch.  Does not itself trigger a commit or block.
+void Ebofs::sync(Context *onsafe)
+{
+  ebofs_lock.Lock();
+  if (onsafe) {
+    commit_waiters[super_epoch].push_back(onsafe);
+  }
+  ebofs_lock.Unlock();
+}
+
+/*
+ * sync - blocking sync.
+ *
+ * If nothing is dirty this returns immediately.  Otherwise it waits for the
+ * commit thread to have started, waits once if a commit is in progress,
+ * signals commit_cond to trigger a commit, and then blocks on sync_cond,
+ * which the commit thread signals when a commit finishes.
+ */
+void Ebofs::sync()
+{
+  ebofs_lock.Lock();
+  if (!dirty) {
+    dout(7) << "sync in " << super_epoch << ", not dirty" << endl;
+  } else {
+    dout(7) << "sync in " << super_epoch << endl;
+    
+    // loop rather than test once: sync_cond is also signalled at the end of
+    // every commit, and a condition wait is allowed to return spuriously, so
+    // a wakeup alone does not prove the thread has started.
+    while (!commit_thread_started) {
+      dout(10) << "sync waiting for commit thread to start" << endl;
+      sync_cond.Wait(ebofs_lock);
+    }
+    
+    // NOTE(review): left as a single check; nothing in view shows where
+    // mid_commit is set or cleared, so looping on it could hang.
+    if (mid_commit) {
+      dout(10) << "sync waiting for commit in progress" << endl;
+      sync_cond.Wait(ebofs_lock);
+    }
+    
+    commit_cond.Signal();  // trigger a commit
+    
+    sync_cond.Wait(ebofs_lock);  // wait
+    
+    dout(10) << "sync finish in " << super_epoch << endl;
+  }
+  ebofs_lock.Unlock();
+}
+
+
+
+/*
+ * commit_bc_wait - wait until the buffer cache has no unflushed bh writes or
+ * partial writes left for the given epoch, then drop that epoch's entries
+ * from both unflushed maps.
+ */
+void Ebofs::commit_bc_wait(version_t epoch)
+{
+  dout(10) << "commit_bc_wait on epoch " << epoch << endl;  
+
+  for (;;) {
+    bool pending = bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 ||
+                   bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0;
+    if (!pending) break;
+
+    //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch << endl;
+    dout(10) << "commit_bc_wait epoch " << epoch
+              << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) 
+              << ", unflushed partial " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) 
+              << endl;
+    bc.waitfor_flush();
+  }
+
+  // this epoch is done; forget its counters
+  bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch);
+  bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch);
+
+  dout(10) << "commit_bc_wait all flushed for epoch " << epoch
+            << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE)
+            << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL)
+            << endl;  
+}
+
+
+
+/*
+ * statfs - fill in filesystem statistics.
+ *
+ * Block counts come from the device and the free/limbo counters; the "file"
+ * counts reported are the btree node pool totals.  f_fsid is left untouched.
+ * Always returns 0.
+ */
+int Ebofs::statfs(struct statfs *buf)
+{
+  dout(7) << "statfs" << endl;
+
+  // identity / geometry
+  buf->f_type = EBOFS_MAGIC;              // type of filesystem
+  buf->f_bsize = 4096;                    // optimal transfer block size
+  buf->f_blocks = dev.get_num_blocks();   // total data blocks in file system
+
+  // space: limbo blocks count as free, but are not yet available for writing
+  buf->f_bfree = get_free_blocks() + get_limbo_blocks();
+  buf->f_bavail = get_free_blocks();
+
+  // nodes
+  buf->f_files = nodepool.num_total();    // total file nodes in file system
+  buf->f_ffree = nodepool.num_free();     // free file nodes in fs
+  //buf->f_fsid = 0;                      // file system id
+
+  buf->f_namelen = 8;                     // maximum length of filenames
+
+  return 0;
+}
+
+
+
+
+/*
+ * allocate a write to blocks on disk.
+ * - take care to not overwrite any "safe" data blocks.
+ *  - allocate/map new extents on disk as necessary
+ *
+ *  on         - onode being written
+ *  start, len - block range of the write, in object block offsets
+ *  alloc      - (out) the ranges that were given new extents.  start~len is
+ *               inserted into it first, so it is presumably expected to be
+ *               empty on entry (NOTE(review): confirm against callers)
+ *  old_bfirst - (out) set to the previous disk block of the first block in
+ *               the range, if that block was remapped here; otherwise left
+ *               as the caller passed it
+ *  old_blast  - (out) likewise for the last block in the range
+ *
+ * Blocks in the range that are already uncommitted on this onode are left
+ * where they are, unless g_conf.ebofs_realloc is set and they belong to a tx
+ * buffer head lying entirely within the range.
+ */
+void Ebofs::alloc_write(Onode *on, 
+                        block_t start, block_t len,
+                        interval_set<block_t>& alloc,
+                        block_t& old_bfirst, block_t& old_blast)
+{
+  // first decide what pages to (re)allocate 
+  alloc.insert(start, len);   // start with whole range
+
+  // figure out what bits are already uncommitted
+  interval_set<block_t> already_uncom;
+  already_uncom.intersection_of(alloc, on->uncommitted);
+
+  // subtract those off, so we're left with the committed bits (that must be reallocated).
+  alloc.subtract(already_uncom);
+  
+  dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << endl;
+  
+  // release it (into limbo)
+  for (map<block_t,block_t>::iterator i = alloc.m.begin();   // i->first = range start, i->second = range length
+       i != alloc.m.end();
+       i++) {
+    // get old region
+    vector<Extent> old;
+    on->map_extents(i->first, i->second, old);
+    for (unsigned o=0; o<old.size(); o++) 
+      allocator.release(old[o]);
+
+    // take note if first/last blocks in write range are remapped.. in case we need to do a partial read/write thing
+    // these are for partial, so we don't care about TX bh's, so don't worry about bits canceling stuff below.
+    if (!old.empty()) {
+      if (i->first == start) {
+        old_bfirst = old[0].start;
+        dout(20) << "alloc_write  old_bfirst " << old_bfirst << " of " << old[0] << endl;
+      }
+      if (i->first+i->second == start+len) {
+        old_blast = old[old.size()-1].last();
+        dout(20) << "alloc_write  old_blast " << old_blast << " of " << old[old.size()-1] << endl;
+      }
+    }
+  }
+
+  // reallocate uncommitted too?
+  // ( --> yes.  we can always make better allocation decisions later, with more information. )
+  if (g_conf.ebofs_realloc) {
+    list<BufferHead*> tx;
+    
+    ObjectCache *oc = on->get_oc(&bc);
+    oc->find_tx(start, len, tx);
+    
+    for (list<BufferHead*>::reverse_iterator p = tx.rbegin();   // walk the tx list back to front
+         p != tx.rend();
+         p++) {
+      BufferHead *bh = *p;
+
+      // cancelable/moveable?
+      if (alloc.contains(bh->start(), bh->length())) {
+        dout(10) << "alloc_write  " << *bh << " already in " << alloc << endl;
+        continue;
+      }
+
+      vector<Extent> old;
+      on->map_extents(bh->start(), bh->length(), old);
+      assert(old.size() == 1);
+
+      if (bh->start() >= start && bh->end() <= start+len) {   // bh lies entirely inside the write range
+        assert(bh->epoch_modified == super_epoch);
+        if (bc.bh_cancel_write(bh, super_epoch)) {
+          if (bh->length() == 1)   // (this test guards only the dout on the next line)
+          dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << endl;
+	  // no, this isn't compatible with clone() and extent reference counting.
+          //allocator.unallocate(old[0]);  // release (into free)
+	  allocator.release(old[0]);  
+          alloc.insert(bh->start(), bh->length());
+        } else {
+          if (bh->length() == 1)   // (this test guards only the dout on the next line)
+          dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << endl;
+          allocator.release(old[0]);     // release (into limbo)
+          alloc.insert(bh->start(), bh->length());
+        }
+      } else {
+        if (bh->length() == 1)   // (this test guards only the dout below)
+        dout(10) << "alloc_write  skipped tx " << old[0] << ", not entirely within " 
+                 << start << "~" << len 
+                 << " bh " << *bh << endl;
+      }
+    }
+    
+    dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << endl;
+  }
+
+  if (alloc.empty()) return;  // no need to dirty the onode below!
+  
+
+  // merge alloc into onode uncommitted map
+  //dout(10) << " union of " << on->uncommitted << " and " << alloc << endl;
+  interval_set<block_t> old = on->uncommitted;   // snapshot; only read by the disabled verify block below
+  on->uncommitted.union_of(alloc);
+  
+  dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << endl;
+
+  if (0) {
+    // verify (disabled): both alloc and the old uncommitted set should be subsets of the new one
+    interval_set<block_t> ta;
+    ta.intersection_of(on->uncommitted, alloc);
+    cout << " ta " << ta << endl;
+    assert(alloc == ta);
+
+    interval_set<block_t> tb;
+    tb.intersection_of(on->uncommitted, old);
+    cout << " tb " << tb << endl;
+    assert(old == tb);
+  }
+
+  dirty_onode(on);
+
+  // allocate the space
+  for (map<block_t,block_t>::iterator i = alloc.m.begin();
+       i != alloc.m.end();
+       i++) {
+    dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << endl;
+
+    // allocate new space.  one allocate() call may return less than asked for, so loop until the range is covered
+    block_t left = i->second;
+    block_t cur = i->first;
+    while (left > 0) {
+      Extent ex;
+      allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD);
+      dout(10) << "alloc_write got " << ex << " for object offset " << cur << endl;
+      on->set_extent(cur, ex);      // map object to new region
+      left -= ex.length;
+      cur += ex.length;
+    }
+  }
+}
+
+
+
+/*
+ * apply_write - apply a write of len bytes at byte offset off of onode on
+ * to the buffer cache.
+ *
+ *  - If off lies beyond the current object_size, the gap between the old
+ *    end of object and off is written as zeros (tracked in zleft), and the
+ *    affected block range starts at the old end of object instead of off.
+ *  - If the write ends beyond object_size, object_size is extended and the
+ *    onode is dirtied.
+ *  - If bl is empty, all len bytes are written as zeros.
+ *
+ * The block range is first passed to alloc_write(), then mapped onto
+ * BufferHeads with map_write().  Each BufferHead is stamped with a new
+ * version and the current super_epoch, and is then filled either as a
+ * partial block (head and/or tail of the write) or as full block(s).
+ * The trailing asserts check that all zeros and all data were consumed.
+ */
+void Ebofs::apply_write(Onode *on, off_t off, size_t len, bufferlist& bl)
+{
+  ObjectCache *oc = on->get_oc(&bc);
+
+  // map into blocks
+  off_t opos = off;         // byte pos in object
+  size_t zleft = 0;         // zeros left to write
+  size_t left = len;        // bytes left
+
+  block_t bstart = off / EBOFS_BLOCK_SIZE;
+
+  if (off > on->object_size) {  // write starts past EOF: zero-fill from old EOF up to off
+    zleft = off - on->object_size;
+    opos = on->object_size;
+    bstart = on->object_size / EBOFS_BLOCK_SIZE;
+  }
+  if (off+(off_t)len > on->object_size) {
+    dout(10) << "apply_write extending size on " << *on << ": " << on->object_size 
+             << " -> " << off+len << endl;
+    on->object_size = off+len;
+    dirty_onode(on);
+  }
+  if (bl.length() == 0) {  // no payload supplied: the whole range is written as zeros
+    zleft += len;
+    left = 0;
+  }
+  if (zleft)
+    dout(10) << "apply_write zeroing first " << zleft << " bytes of " << *on << endl;
+
+  block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;  // last block touched (inclusive)
+  block_t blen = blast-bstart+1;
+
+  // allocate write on disk.
+  interval_set<block_t> alloc;
+  block_t old_bfirst = 0;  // zero means not defined here (since we ultimately pass to bh_read)
+  block_t old_blast = 0; 
+  alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast);
+  dout(20) << "apply_write  old_bfirst " << old_bfirst << ", old_blast " << old_blast << endl;
+
+  if (fake_writes) {  // fake_writes: return before anything is put in the buffer cache
+    on->uncommitted.clear();   // worst case!
+    return;
+  }    
+
+  // map b range onto buffer_heads
+  map<block_t, BufferHead*> hits;
+  oc->map_write(bstart, blen, alloc, hits, super_epoch);
+  
+  // get current versions
+  //version_t lowv, highv;
+  //oc->scan_versions(bstart, blen, lowv, highv);
+  //highv++;
+  version_t highv = ++oc->write_count;  // every bh touched below gets this same version
+  
+  // copy from bl into buffer cache
+  unsigned blpos = 0;       // byte pos in input buffer
+
+  // write data into buffers
+  for (map<block_t, BufferHead*>::iterator i = hits.begin();
+       i != hits.end(); 
+       i++) {
+    BufferHead *bh = i->second;
+    bh->set_version(highv);
+    bh->epoch_modified = super_epoch;
+    
+    // old write in progress?
+    if (bh->is_tx()) {      // copy the buffer to avoid munging up in-flight write
+      dout(10) << "apply_write tx pending, copying buffer on " << *bh << endl;
+      bufferlist temp;
+      temp.claim(bh->data);
+      //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); 
+      bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+      bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp);
+    }
+
+    // need to split off partial?  (partials can only be ONE block)
+    if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) {
+      if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) {
+        BufferHead *right = bc.split(bh, bh->start()+1);
+        hits[right->start()] = right;  // the remainder is visited later by this same loop
+        dout(10) << "apply_write split off left block for partial write; rest is " << *right << endl;
+      }
+      if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) &&
+          ((off_t)len+off < on->object_size)) {
+        BufferHead *right = bc.split(bh, bh->last());
+        hits[right->start()] = right;
+        dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << endl;
+      }
+    }
+
+    // partial at head or tail?
+    if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) ||   // opos, not off, in case we're zeroing...
+        (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) {
+      // locate ourselves in bh
+      unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE;
+      assert(off_in_bh >= 0);  // NOTE(review): off_in_bh is unsigned, so this can never fire
+      unsigned len_in_bh = MIN( (off_t)(zleft+left),
+                                (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos );
+      
+      if (bh->is_partial() || bh->is_rx() || bh->is_missing()) {
+        assert(bh->is_partial() || bh->is_rx() || bh->is_missing());  // restates the enclosing condition
+        assert(bh->length() == 1);
+
+        // add frag to partial
+        dout(10) << "apply_write writing into partial " << *bh << ":"
+                 << " off_in_bh " << off_in_bh 
+                 << " len_in_bh " << len_in_bh
+                 << endl;
+        unsigned z = MIN( zleft, len_in_bh );
+        if (z) {
+	  bufferptr zp(z);
+	  zp.zero();
+          bufferlist zb;
+          zb.push_back(zp);
+          bh->add_partial(off_in_bh, zb);
+           zleft -= z;
+          opos += z;
+        }
+
+        bufferlist sb;
+        sb.substr_of(bl, blpos, len_in_bh-z);  // substr in existing buffer
+        bufferlist cp;
+        cp.append(sb.c_str(), len_in_bh-z);    // copy the partial bit!
+        bh->add_partial(off_in_bh, cp);  // NOTE(review): offset is off_in_bh even when z zero bytes were just added at the same offset; the clean-buffer path below uses off_in_bh+z -- confirm
+        left -= len_in_bh-z;
+        blpos += len_in_bh-z;
+        opos += len_in_bh-z;
+
+        if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) {
+          dout(10) << "apply_write  completed partial " << *bh << endl;
+          //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);  // new buffers!
+	  bh->data.clear();
+	  bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+          bh->data.zero();
+          bh->apply_partial();
+          bc.mark_dirty(bh);
+          bc.bh_write(on, bh);
+        } 
+        else if (bh->is_rx()) {
+          dout(10) << "apply_write  rx -> partial " << *bh << endl;
+          assert(bh->length() == 1);
+          bc.mark_partial(bh);
+          bc.bh_queue_partial_write(on, bh);          // queue the eventual write
+        }
+        else if (bh->is_missing()) {
+          dout(10) << "apply_write  missing -> partial " << *bh << endl;
+          assert(bh->length() == 1);
+          bc.mark_partial(bh);
+
+          // take care to read from _old_ disk block locations!
+          if (bh->start() == bstart)
+            bc.bh_read(on, bh, old_bfirst);
+          else if (bh->start() == blast)
+            bc.bh_read(on, bh, old_blast);
+          else assert(0);
+
+          bc.bh_queue_partial_write(on, bh);          // queue the eventual write
+        }
+        else if (bh->is_partial()) {
+          dout(10) << "apply_write  already partial, no need to submit rx on " << *bh << endl;
+          if (bh->partial_tx_epoch == super_epoch)
+            bc.bh_cancel_partial_write(bh);
+          bc.bh_queue_partial_write(on, bh);          // queue the eventual write
+        }
+
+
+      } else {
+        assert(bh->is_clean() || bh->is_dirty() || bh->is_tx());
+        
+        // just write into the bh!
+        dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":"
+                 << " off_in_bh " << off_in_bh 
+                 << " len_in_bh " << len_in_bh
+                 << endl;
+
+        // copy data into new buffers first (copy on write!)
+        //  FIXME: only do the modified pages?  this might be a big bh!
+        bufferlist temp;
+        temp.claim(bh->data);
+        //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); 
+	bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+        bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp);
+
+        unsigned z = MIN( zleft, len_in_bh );
+        if (z) {
+	  bufferptr zp(z);
+	  zp.zero();
+          bufferlist zb;
+          zb.push_back(zp);
+          bh->data.copy_in(off_in_bh, z, zb);
+          zleft -= z;
+          opos += z;
+        }
+
+        bufferlist sub;
+        sub.substr_of(bl, blpos, len_in_bh-z);
+        bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub);
+        blpos += len_in_bh-z;
+        left -= len_in_bh-z;
+        opos += len_in_bh-z;
+
+        if (!bh->is_dirty())
+          bc.mark_dirty(bh);
+
+        bc.bh_write(on, bh);
+      }
+      continue;  // partial block handled; move on to the next bh
+    }
+
+    // ok, we're talking full block(s) now (modulo last block of the object)
+    assert(opos % EBOFS_BLOCK_SIZE == 0);
+    assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) ||
+           opos+(off_t)(zleft+left) == on->object_size);
+
+    // alloc new buffers.
+    //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+    bh->data.clear();
+    bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+    
+    // copy!
+    unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left);
+    assert(len_in_bh <= zleft+left);
+    
+    dout(10) << "apply_write writing into " << *bh << ":"
+             << " len_in_bh " << len_in_bh
+             << endl;
+    
+    unsigned z = MIN(len_in_bh, zleft);
+    if (z) {
+      bufferptr zp(z);
+      zp.zero();
+      bufferlist zb;
+      zb.push_back(zp);
+      bh->data.copy_in(0, z, zb);
+      zleft -= z;
+    }
+    
+    bufferlist sub;
+    sub.substr_of(bl, blpos, len_in_bh-z);
+    bh->data.copy_in(z, len_in_bh-z, sub);
+
+    blpos += len_in_bh-z;
+    left -= len_in_bh-z;
+    opos += len_in_bh;  // zeros and data together advance the object position
+
+    // old partial?
+    if (bh->is_partial() &&
+        bh->partial_tx_epoch == super_epoch) 
+      bc.bh_cancel_partial_write(bh);
+
+    // mark dirty
+    if (!bh->is_dirty())
+      bc.mark_dirty(bh);
+
+    bc.bh_write(on, bh);
+  }
+
+  assert(zleft == 0);
+  assert(left == 0);
+  assert(opos == off+(off_t)len);
+  //assert(blpos == bl.length());
+}
+
+
+
+
+// *** file i/o ***
+
+/*
+ * attempt_read - try to satisfy a read of off~len on onode on from the
+ * buffer cache, without blocking.
+ *
+ * Returns true, with bl holding exactly len bytes, if every block in the
+ * range is already available (as a full buffer or a sufficient partial).
+ *
+ * Returns false if something is not there yet.  In that case reads are
+ * submitted for any missing buffers, and a C_Cond built from will_wait_on /
+ * will_wait_on_bool is queued on one of the buffers we need, so the caller
+ * can wait on it and then retry.
+ */
+bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, 
+                         Cond *will_wait_on, bool *will_wait_on_bool)
+{
+  dout(10) << "attempt_read " << *on << " " << off << "~" << len << endl;
+  ObjectCache *oc = on->get_oc(&bc);
+
+  // map
+  block_t bstart = off / EBOFS_BLOCK_SIZE;
+  block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
+  block_t blen = blast-bstart+1;
+
+  map<block_t, BufferHead*> hits;
+  map<block_t, BufferHead*> missing;  // read these
+  map<block_t, BufferHead*> rx;       // wait for these
+  map<block_t, BufferHead*> partials;  // ??
+  oc->map_read(bstart, blen, hits, missing, rx, partials);
+
+  // missing buffers?  start reads on all of them, wait on the first.
+  if (!missing.empty()) {
+    for (map<block_t,BufferHead*>::iterator i = missing.begin();
+         i != missing.end();
+         ++i) {
+      dout(10) << "attempt_read missing buffer " << *(i->second) << endl;
+      bc.bh_read(on, i->second);
+    }
+    BufferHead *wait_on = missing.begin()->second;
+    block_t b = MAX(wait_on->start(), bstart);
+    wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool));
+    return false;
+  }
+  
+  // are partials sufficient?
+  bool partials_ok = true;
+  for (map<block_t,BufferHead*>::iterator i = partials.begin();
+       i != partials.end();
+       ++i) {
+    BufferHead *bh = i->second;
+    off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE);
+    off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE);
+    off_t start = MAX( off, bhstart );
+    off_t end = MIN( off+(off_t)len, bhend );
+    
+    // both offsets are relative to the start of the bh, matching the
+    // copy_partial_substr() call below.  (this used to pass end-bhend,
+    // which is never positive.)
+    if (!bh->have_partial_range(start-bhstart, end-bhstart)) {
+      if (partials_ok) {
+        // wait on this one (only the first insufficient partial gets a waiter)
+        Context *c = new C_Cond(will_wait_on, will_wait_on_bool);
+        dout(10) << "attempt_read insufficient partial buffer " << *bh << " c " << c << endl;
+        bh->waitfor_read[bh->start()].push_back(c);
+      }
+      partials_ok = false;
+    }
+  }
+  if (!partials_ok) return false;
+
+  // wait on rx?
+  if (!rx.empty()) {
+    BufferHead *wait_on = rx.begin()->second;
+    Context *c = new C_Cond(will_wait_on, will_wait_on_bool);
+    dout(1) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << endl;
+    block_t b = MAX(wait_on->start(), bstart);
+    wait_on->waitfor_read[b].push_back(c);
+    return false;
+  }
+
+  // yay, we have it all!
+  // concurrently walk thru hits, partials.
+  map<block_t,BufferHead*>::iterator h = hits.begin();
+  map<block_t,BufferHead*>::iterator p = partials.begin();
+
+  bl.clear();
+  off_t pos = off;
+  block_t curblock = bstart;
+  while (curblock <= blast) {
+    BufferHead *bh = 0;
+    // never dereference an exhausted iterator: either map may run out
+    // before the other one does.
+    if (h != hits.end() && h->first == curblock) {
+      bh = h->second;
+      ++h;
+    } else if (p != partials.end() && p->first == curblock) {
+      bh = p->second;
+      ++p;
+    } else assert(0);
+    
+    off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE);
+    off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE);
+    off_t start = MAX( pos, bhstart );
+    off_t end = MIN( off+(off_t)len, bhend );
+
+    if (bh->is_partial()) {
+      // copy from a partial block.  yuck!
+      bufferlist frag;
+      bh->copy_partial_substr( start-bhstart, end-bhstart, frag );
+      bl.claim_append( frag );
+      pos += frag.length();
+    } else {
+      // copy from a full block.
+      if (bhstart == start && bhend == end) {
+        bl.append( bh->data );
+        pos += bh->data.length();
+      } else {
+        bufferlist frag;
+        dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << endl;
+        frag.substr_of(bh->data, start-bhstart, end-start);
+        pos += frag.length();
+        bl.claim_append( frag );
+      }
+    }
+
+    curblock = bh->end();
+    /* this assert is more trouble than it's worth
+    assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos ||   // should be aligned with next block
+           end != bhend ||                                // or we ended midway through bh
+           (bh->last() == blast && end == bhend));        // ended last block       ** FIXME WRONG???
+    */
+  }
+
+  assert(bl.length() == len);
+  return true;
+}
+
+// Locked entry point for reads: runs _read() with ebofs_lock held and
+// passes its result (byte count or negative errno) straight through.
+int Ebofs::read(object_t oid, off_t off, size_t len, bufferlist& bl)
+{
+  ebofs_lock.Lock();
+  const int result = _read(oid, off, len, bl);
+  ebofs_lock.Unlock();
+  return result;
+}
+
+/*
+ * _read - read from object oid into bl, waiting until the data can be
+ * served by attempt_read().  Waits on ebofs_lock, so the caller is expected
+ * to hold it (see read()).
+ *
+ *  off, len  byte range to read.  len == 0 means "up to the end of the
+ *            object"; the range is always clipped to object_size.
+ *
+ * Returns the number of bytes placed in bl, or a negative errno:
+ *  -ENOENT  the object does not exist, or was deleted while we waited
+ *  -ESPIPE  off is at or beyond the end of the object
+ */
+int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl)
+{
+  dout(7) << "_read " << oid << " " << off << "~" << len << endl;
+
+  Onode *on = get_onode(oid);
+  if (!on) {
+    dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << endl;
+    return -ENOENT;  // object dne?
+  }
+
+  // read data into bl.  block as necessary.
+  Cond cond;
+
+  int r = 0;
+  while (1) {
+    // check size bound (re-checked on every pass; the size may change while we wait)
+    if (off >= on->object_size) {
+      dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << endl;
+      r = -ESPIPE;   // FIXME better errno?
+      break;
+    }
+
+    size_t try_len = len ? len:on->object_size;
+    size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off;
+    
+    // start from a known state; don't depend on the waiter object to
+    // clear the flag before we test it below.
+    bool done = false;
+    if (attempt_read(on, off, will_read, bl, &cond, &done))
+      break;  // yay
+    
+    // wait
+    while (!done) 
+      cond.Wait(ebofs_lock);
+
+    if (on->deleted) {
+      dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << endl;
+      r = -ENOENT;
+      break;
+    }
+  }
+
+  put_onode(on);
+
+  trim_bc();
+
+  if (r < 0) return r;   // return error,
+  dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << endl;
+  return bl.length();    // or bytes read.
+}
+
+
+// True when the buffer cache's dirty and tx stats together exceed
+// g_conf.ebofs_bc_max_dirty.  Unlocked variant; see write_will_block().
+bool Ebofs::_write_will_block()
+{
+  return g_conf.ebofs_bc_max_dirty < (bc.get_stat_dirty()+bc.get_stat_tx());
+}
+
+// Locked wrapper: evaluates _write_will_block() with ebofs_lock held.
+bool Ebofs::write_will_block()
+{
+  ebofs_lock.Lock();
+  const bool would_block = _write_will_block();
+  ebofs_lock.Unlock();
+  return would_block;
+}
+
+
+/*
+ * apply_transaction - run every op queued in t, in order, under ebofs_lock.
+ *
+ * Each op pops its arguments off the front of the per-type argument lists
+ * in t, so t is consumed by this call.
+ *
+ * Returns a bit mask of failed ops: bit N (counting from the least
+ * significant bit) is set if the N'th op returned an error.  Ops beyond
+ * the width of unsigned cannot be represented in the mask.  A failing
+ * OP_COLL_ADD is deliberately not reported.
+ *
+ * onsafe, if given, is queued as a commit waiter for the current
+ * super_epoch whether or not any op failed.
+ */
+unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe)
+{
+  ebofs_lock.Lock();
+  dout(7) << "apply_transaction start (" << t.ops.size() << " ops)" << endl;
+
+  // do ops
+  unsigned r = 0;    // bit fields indicate which ops failed.
+  unsigned bit = 1;  // unsigned, so shifting past the top bit is well defined
+  for (list<int>::iterator p = t.ops.begin();
+       p != t.ops.end();
+       ++p) {
+    switch (*p) {
+    case Transaction::OP_READ:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        off_t offset = t.offsets.front(); t.offsets.pop_front();
+        size_t len = t.lengths.front(); t.lengths.pop_front();
+        bufferlist *pbl = t.pbls.front(); t.pbls.pop_front();
+        if (_read(oid, offset, len, *pbl) < 0) {
+          dout(7) << "apply_transaction fail on _read" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_STAT:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        struct stat *st = t.psts.front(); t.psts.pop_front();
+        if (_stat(oid, st) < 0) {
+          dout(7) << "apply_transaction fail on _stat" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_GETATTR:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+        pair<void*,int*> pattrval = t.pattrvals.front(); t.pattrvals.pop_front();
+        // the result of _getattr is written back through the caller's length pointer
+        if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) {
+          dout(7) << "apply_transaction fail on _getattr" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_GETATTRS:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        map<string,bufferptr> *pset = t.pattrsets.front(); t.pattrsets.pop_front();
+        if (_getattrs(oid, *pset) < 0) {
+          dout(7) << "apply_transaction fail on _getattrs" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+
+    case Transaction::OP_WRITE:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        off_t offset = t.offsets.front(); t.offsets.pop_front();
+        size_t len = t.lengths.front(); t.lengths.pop_front();
+        bufferlist bl = t.bls.front(); t.bls.pop_front();
+        if (_write(oid, offset, len, bl) < 0) {
+          dout(7) << "apply_transaction fail on _write" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_TRUNCATE:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        off_t len = t.offsets.front(); t.offsets.pop_front();
+        if (_truncate(oid, len) < 0) {
+          dout(7) << "apply_transaction fail on _truncate" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_REMOVE:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        if (_remove(oid) < 0) {
+          dout(7) << "apply_transaction fail on _remove" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_SETATTR:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+        //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+        bufferlist bl;
+        bl.claim( t.attrbls.front() );
+        t.attrbls.pop_front();
+        if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) {
+          dout(7) << "apply_transaction fail on _setattr" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_SETATTRS:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        map<string,bufferptr> *pattrset = t.pattrsets.front(); t.pattrsets.pop_front();
+        if (_setattrs(oid, *pattrset) < 0) {
+          dout(7) << "apply_transaction fail on _setattrs" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_RMATTR:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+        if (_rmattr(oid, attrname) < 0) {
+          dout(7) << "apply_transaction fail on _rmattr" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_CLONE:
+      {
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        object_t noid = t.oids.front(); t.oids.pop_front();
+        if (_clone(oid, noid) < 0) {
+          dout(7) << "apply_transaction fail on _clone" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_MKCOLL:
+      {
+        coll_t cid = t.cids.front(); t.cids.pop_front();
+        if (_create_collection(cid) < 0) {
+          dout(7) << "apply_transaction fail on _create_collection" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_RMCOLL:
+      {
+        coll_t cid = t.cids.front(); t.cids.pop_front();
+        if (_destroy_collection(cid) < 0) {
+          dout(7) << "apply_transaction fail on _destroy_collection" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_COLL_ADD:
+      {
+        coll_t cid = t.cids.front(); t.cids.pop_front();
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        if (_collection_add(cid, oid) < 0) {
+          // failure here is intentionally neither logged nor reported
+          //dout(7) << "apply_transaction fail on _collection_add" << endl;
+          //r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_COLL_REMOVE:
+      {
+        coll_t cid = t.cids.front(); t.cids.pop_front();
+        object_t oid = t.oids.front(); t.oids.pop_front();
+        if (_collection_remove(cid, oid) < 0) {
+          dout(7) << "apply_transaction fail on _collection_remove" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_COLL_SETATTR:
+      {
+        coll_t cid = t.cids.front(); t.cids.pop_front();
+        const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+        //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+        bufferlist bl;
+        bl.claim( t.attrbls.front() );
+        t.attrbls.pop_front();
+        if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) {
+          //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) {
+          dout(7) << "apply_transaction fail on _collection_setattr" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    case Transaction::OP_COLL_RMATTR:
+      {
+        coll_t cid = t.cids.front(); t.cids.pop_front();
+        const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+        if (_collection_rmattr(cid, attrname) < 0) {
+          dout(7) << "apply_transaction fail on _collection_rmattr" << endl;
+          r |= bit;
+        }
+      }
+      break;
+
+    default:
+      cerr << "bad op " << *p << endl;
+      assert(0);
+    }
+
+    bit = bit << 1;  // next op, next bit
+  }
+
+  dout(7) << "apply_transaction finish (r = " << r << ")" << endl;
+
+  // set up commit waiter
+  //if (r == 0) {
+  if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  //} else {
+  //if (onsafe) delete onsafe;
+  //}
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+
+
+/*
+ * _write - write bl (which must be exactly length bytes) at offset of
+ * object oid, creating the object if it does not exist yet.
+ *
+ * Waits (via bc.waitfor_stat) while _write_will_block() is true.
+ *
+ * Returns length on success, -ENOSPC if the conservative space estimate
+ * is not below free_blocks, or -EACCES if the onode is readonly.
+ */
+int Ebofs::_write(object_t oid, off_t offset, size_t length, bufferlist& bl)
+{
+  dout(7) << "_write " << oid << " " << offset << "~" << length << endl;
+  assert(bl.length() == length);
+
+  // throttle: too much unflushed dirty data means we wait here.
+  if (_write_will_block()) {
+    dout(10) << "_write blocking " 
+              << oid << " " << offset << "~" << length 
+              << "  bc: " 
+              << "size " << bc.get_size() 
+              << ", trimmable " << bc.get_trimmable()
+              << ", max " << g_conf.ebofs_bc_size
+              << "; dirty " << bc.get_stat_dirty()
+              << ", tx " << bc.get_stat_tx()
+              << ", max dirty " << g_conf.ebofs_bc_max_dirty
+              << endl;
+
+    while (_write_will_block()) {
+      bc.waitfor_stat();  // waits on ebofs_lock
+    }
+
+    dout(10) << "_write unblocked " 
+             << oid << " " << offset << "~" << length 
+              << "  bc: " 
+              << "size " << bc.get_size() 
+              << ", trimmable " << bc.get_trimmable()
+              << ", max " << g_conf.ebofs_bc_size
+              << "; dirty " << bc.get_stat_dirty()
+              << ", tx " << bc.get_stat_tx()
+              << ", max dirty " << g_conf.ebofs_bc_max_dirty
+              << endl;
+  }
+
+  // space check.  very conservative; assumes we have to rewrite.
+  unsigned blocks_needed = (length+offset) / EBOFS_BLOCK_SIZE + 10;
+  blocks_needed += dirty_onodes.size() + dirty_cnodes.size();
+  if (blocks_needed >= free_blocks) {
+    dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << blocks_needed << endl;
+    return -ENOSPC;
+  }
+
+  // look up the onode, making a fresh one if the object is new
+  Onode *on = get_onode(oid);
+  if (on == 0)
+    on = new_onode(oid);
+
+  if (on->readonly) {
+    put_onode(on);
+    return -EACCES;
+  }
+
+  dirty_onode(on);  // onode is dirtied even for a zero-length write
+
+  // push the data into the buffer cache
+  if (length > 0) {
+    apply_write(on, offset, length, bl);
+  }
+
+  // drop our ref and trim the cache
+  put_onode(on);
+  trim_bc();
+
+  return length;
+}
+
+
+/*int Ebofs::write(object_t oid, 
+                 off_t off, size_t len,
+                 bufferlist& bl, bool fsync)
+{
+  // wait?
+  if (fsync) {
+    // wait for flush.
+    Cond cond;
+    bool done;
+    int flush = 1;    // write never returns positive
+    Context *c = new C_Cond(&cond, &done, &flush);
+    int r = write(oid, off, len, bl, c);
+    if (r < 0) return r;
+    
+    ebofs_lock.Lock();
+    {
+      while (!done) 
+        cond.Wait(ebofs_lock);
+      assert(flush <= 0);
+    }
+    ebofs_lock.Unlock();
+    if (flush < 0) return flush;
+    return r;
+  } else {
+    // don't wait for flush.
+    return write(oid, off, len, bl, (Context*)0);
+  }
+}
+*/
+
+/*
+ * write - locked entry point for a single write; len must be non-zero.
+ *
+ * If _write() succeeds, onsafe (when given) is queued as a commit waiter
+ * for the current super_epoch; otherwise onsafe is deleted.  Returns
+ * whatever _write() returned.
+ */
+int Ebofs::write(object_t oid, off_t off, size_t len, bufferlist& bl, Context *onsafe)
+{
+  ebofs_lock.Lock();
+  assert(len > 0);
+
+  const int r = _write(oid, off, len, bl);
+  const bool wrote = (r > 0);
+
+  if (wrote)
+    assert((size_t)r == len);
+
+  // hand the waiter over, or throw it away
+  if (onsafe) {
+    if (wrote)
+      commit_waiters[super_epoch].push_back(onsafe);
+    else
+      delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+
+// Remove object oid.  Returns 0 after handing the onode to remove_onode(),
+// or -ENOENT if no such onode exists.
+int Ebofs::_remove(object_t oid)
+{
+  dout(7) << "_remove " << oid << endl;
+
+  Onode *victim = get_onode(oid);
+  if (victim == 0)
+    return -ENOENT;
+
+  remove_onode(victim);
+  return 0;
+}
+
+
+// Locked wrapper around _remove().  On success onsafe (if any) is queued
+// as a commit waiter for the current super_epoch; on failure it is deleted.
+int Ebofs::remove(object_t oid, Context *onsafe)
+{
+  ebofs_lock.Lock();
+
+  const int r = _remove(oid);
+
+  if (onsafe) {
+    if (r < 0)
+      delete onsafe;
+    else
+      commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+/*
+ * _truncate - shrink object oid to size bytes.
+ *
+ * Returns 0 on success (including when size already equals the object
+ * size), -ENOENT if the object does not exist, -EACCES if it is readonly,
+ * and -EINVAL if size is larger than the current object size.
+ */
+int Ebofs::_truncate(object_t oid, off_t size)
+{
+  dout(7) << "_truncate " << oid << " size " << size << endl;
+
+  Onode *on = get_onode(oid);
+  if (on == 0)
+    return -ENOENT;
+
+  if (on->readonly) {
+    put_onode(on);
+    return -EACCES;
+  }
+
+  // growing is not supported here
+  if (size > on->object_size) {
+    put_onode(on);
+    return -EINVAL;  // whatever
+  }
+
+  // nothing to do
+  if (size == on->object_size) {
+    put_onode(on);
+    return 0;
+  }
+
+  // -- shrinking --
+  on->object_size = size;
+  dirty_onode(on);
+
+  // how many blocks does the new size still need?
+  block_t keep_blocks = 0;
+  if (size != 0)
+    keep_blocks = 1 + (size-1) / EBOFS_BLOCK_SIZE;
+
+  // give back any extents past that point
+  if (on->object_blocks > keep_blocks) {
+    vector<Extent> trimmed;
+    on->truncate_extents(keep_blocks, trimmed);
+    for (vector<Extent>::iterator e = trimmed.begin(); e != trimmed.end(); ++e)
+      allocator.release(*e);
+  }
+
+  // drop cached buffers past the end, and the cache itself if now empty
+  if (on->oc) {
+    on->oc->truncate(on->object_blocks, super_epoch);
+    if (on->oc->is_empty())
+      on->close_oc();
+  }
+
+  // restrict the uncommitted set to the blocks we kept
+  interval_set<block_t> still_uncommitted;
+  if (keep_blocks > 0) {
+    interval_set<block_t> kept;
+    kept.insert(0, keep_blocks);
+    still_uncommitted.intersection_of(kept, on->uncommitted);
+  }
+  dout(10) << "uncommitted was " << on->uncommitted << "  now " << still_uncommitted << endl;
+  on->uncommitted = still_uncommitted;
+
+  put_onode(on);
+  return 0;
+}
+
+
+// Locked wrapper around _truncate().  On success onsafe (if any) is queued
+// as a commit waiter for the current super_epoch; on failure it is deleted.
+int Ebofs::truncate(object_t oid, off_t size, Context *onsafe)
+{
+  ebofs_lock.Lock();
+
+  const int r = _truncate(oid, size);
+
+  if (onsafe) {
+    if (r < 0)
+      delete onsafe;
+    else
+      commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+
+
+// Locked wrapper around _clone().  On success onsafe (if any) is queued
+// as a commit waiter for the current super_epoch; on failure it is deleted.
+int Ebofs::clone(object_t from, object_t to, Context *onsafe)
+{
+  ebofs_lock.Lock();
+
+  const int r = _clone(from, to);
+
+  if (onsafe) {
+    if (r < 0)
+      delete onsafe;
+    else
+      commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+/*
+ * _clone - create object 'to' as a readonly copy of object 'from'.
+ *
+ * The new onode takes a copy of the source's size, block count, attrs and
+ * extent map (bumping the allocator's count on each extent via alloc_inc)
+ * and is added to every collection the source belongs to.
+ *
+ * Returns 0 on success, -1 if g_conf.ebofs_cloneable is off, -ENOENT if
+ * 'from' does not exist, or -EEXIST if 'to' already exists.
+ */
+int Ebofs::_clone(object_t from, object_t to)
+{
+  dout(7) << "_clone " << from << " -> " << to << endl;
+
+  if (!g_conf.ebofs_cloneable) 
+    return -1;  // no!
+
+  Onode *src = get_onode(from);
+  if (src == 0)
+    return -ENOENT;
+
+  Onode *dst = get_onode(to);
+  if (dst != 0) {
+    // target is already there; drop both refs and bail
+    put_onode(src);
+    put_onode(dst);
+    return -EEXIST;
+  }
+
+  dst = new_onode(to);
+  assert(dst);
+
+  // simple fields
+  dst->readonly = true;
+  dst->object_size = src->object_size;
+  dst->object_blocks = src->object_blocks;
+  dst->attr = src->attr;
+
+  // collection membership
+  set<coll_t>::iterator c = src->collections.begin();
+  while (c != src->collections.end()) {
+    _collection_add(*c, to);
+    c++;
+  }
+
+  // extents: copy the map, then take a reference on every extent in it
+  dst->extent_map = src->extent_map;
+  map<block_t, Extent>::iterator x = dst->extent_map.begin();
+  while (x != dst->extent_map.end()) {
+    allocator.alloc_inc(x->second);
+    ++x;
+  }
+
+  // the source's uncommitted set is dropped
+  src->uncommitted.clear();
+
+  // let the source's ObjectCache, if any, do its part
+  if (src->oc) 
+    src->oc->clone_to( dst );
+
+  // done; release in the same order as before
+  put_onode(dst);
+  put_onode(src);
+  return 0;
+}
+
+
+
+
+/*
+ * pick object revision with rev < specified rev.  
+ *  (oid.rev is a noninclusive upper bound.)
+ *
+ * oid is an in/out parameter and must have a non-zero rev.  Under
+ * ebofs_lock, the entry just left of oid's position in object_tab is
+ * examined (walking further left while the ino/bno still match).  On a
+ * match oid is overwritten, either with the key found or with the "live"
+ * id (same oid with rev 0), and 0 is returned.  If nothing is picked oid is
+ * left unchanged and -EEXIST is returned.
+ *
+ * NOTE(review): the comparisons in the loop depend on how object_t keys
+ * are ordered in the table, which is not visible here; see the inline
+ * notes.
+ */
+int Ebofs::pick_object_revision_lt(object_t& oid)
+{
+  assert(oid.rev > 0);   // this is only useful for non-zero oid.rev
+
+  int r = -EEXIST;             // return code
+  ebofs_lock.Lock();
+  {
+    object_t orig = oid;  // kept only for the log line at the end
+    object_t live = oid;
+    live.rev = 0;
+    
+    if (object_tab->get_num_keys() > 0) {
+      Table<object_t, Extent>::Cursor cursor(object_tab);
+      
+      object_tab->find(oid, cursor);  // this will be just _past_ highest eligible rev
+      if (cursor.move_left() > 0) {
+	bool firstpass = true;
+	while (1) {
+	  object_t t = cursor.current().key;
+	  if (t.ino != oid.ino || 
+	      t.bno != oid.bno)                 // passed to previous object
+	    break;
+	  if (oid.rev < t.rev) {                // rev < desired.  possible match.  NOTE(review): the test is t.rev > oid.rev, which reads as the opposite of this comment -- confirm
+	    r = 0;
+	    oid = t;
+	    break;
+	  }
+	  if (firstpass && oid.rev >= t.rev) {  // there is no old rev < desired.  try live.
+	    r = 0;
+	    oid = live;
+	    break;
+	  }
+	  if (cursor.move_left() <= 0) break;
+	  firstpass = false;  // NOTE(review): looks unreachable -- on the first pass one of the two tests above always breaks; confirm
+	}
+      }
+    }
+    
+    dout(8) << "find_object_revision " << orig << " -> " << oid
+	    << "  r=" << r << endl;
+  }
+  ebofs_lock.Unlock();
+  return r;
+}
+
+
+
+
+// True if object_tab->lookup(oid) returns 0.  Takes ebofs_lock.
+bool Ebofs::exists(object_t oid)
+{
+  ebofs_lock.Lock();
+  dout(8) << "exists " << oid << endl;
+  const bool found = (0 == object_tab->lookup(oid));
+  ebofs_lock.Unlock();
+  return found;
+}
+
+// Locked wrapper around _stat().
+int Ebofs::stat(object_t oid, struct stat *st)
+{
+  ebofs_lock.Lock();
+  const int result = _stat(oid, st);
+  ebofs_lock.Unlock();
+  return result;
+}
+
+// Fill in st for object oid.  Only st_size is set (from the onode's
+// object_size).  Returns 0, or -ENOENT if the object does not exist.
+int Ebofs::_stat(object_t oid, struct stat *st)
+{
+  dout(7) << "_stat " << oid << endl;
+
+  Onode *node = get_onode(oid);
+  if (node == 0)
+    return -ENOENT;
+
+  // ??
+  st->st_size = node->object_size;
+
+  put_onode(node);
+  return 0;
+}
+
+
+int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size) 
+{
+  dout(8) << "setattr " << oid << " '" << name << "' len " << size << endl;
+
+  Onode *onode = get_onode(oid);
+  if (!onode) return -ENOENT;
+
+  // a read-only onode rejects the change; our reference is dropped either way
+  const bool writable = !onode->readonly;
+  if (writable) {
+    const string key(name);
+    onode->attr[key] = buffer::copy((char*)value, size);
+    dirty_onode(onode);
+  }
+  put_onode(onode);
+  if (!writable) return -EACCES;
+
+  dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << endl;
+  return 0;
+}
+
+int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe)
+{
+  ebofs_lock.Lock();
+  const int result = _setattr(oid, name, value, size);
+
+  // success: onsafe is queued on commit_waiters for the current super_epoch.
+  // failure: it is deleted here without being run.
+  if (onsafe) {
+    if (result >= 0) commit_waiters[super_epoch].push_back(onsafe);
+    else delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::_setattrs(object_t oid, map<string,bufferptr>& attrset)
+{
+  dout(8) << "setattrs " << oid << endl;
+
+  Onode *onode = get_onode(oid);
+  if (!onode) return -ENOENT;
+  // the whole attr map is replaced by attrset, unless the onode is read-only
+  int result = -EACCES;
+  if (!onode->readonly) {
+    onode->attr = attrset;
+    dirty_onode(onode);
+    result = 0;
+  }
+  put_onode(onode);
+  return result;
+}
+
+int Ebofs::setattrs(object_t oid, map<string,bufferptr>& attrset, Context *onsafe)
+{
+  ebofs_lock.Lock();
+  const int result = _setattrs(oid, attrset);
+
+  // success: onsafe is queued on commit_waiters for the current super_epoch.
+  // failure: it is deleted here without being run.
+  if (onsafe) {
+    if (result >= 0) commit_waiters[super_epoch].push_back(onsafe);
+    else delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size)   // runs _getattr() under ebofs_lock
+{
+  ebofs_lock.Lock();
+  const int result = _getattr(oid, name, value, size);
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size)
+{
+  dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << endl;
+
+  Onode *onode = get_onode(oid);
+  if (!onode) return -ENOENT;
+
+  // result: bytes copied into value (capped at size), or -1 if the attr is absent
+  int copied = -1;
+  map<string,bufferptr>::iterator p = onode->attr.find(string(name));
+  if (p != onode->attr.end()) {
+    copied = MIN( p->second.length(), size );
+    dout(10) << "_getattr " << oid << " '" << name << "' got len " << copied << endl;
+    memcpy(value, p->second.c_str(), copied );
+  } else {
+    dout(10) << "_getattr " << oid << " '" << name << "' dne" << endl;
+  }
+  put_onode(onode);
+  return copied;
+}
+
+int Ebofs::getattrs(object_t oid, map<string,bufferptr> &aset)   // runs _getattrs() under ebofs_lock
+{
+  ebofs_lock.Lock();
+  const int result = _getattrs(oid, aset);
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::_getattrs(object_t oid, map<string,bufferptr> &aset)
+{
+  dout(8) << "_getattrs " << oid << endl;
+
+  Onode *onode = get_onode(oid);
+  if (onode == 0) return -ENOENT;
+  aset = onode->attr;   // assignment: whatever the caller had in aset is replaced
+  put_onode(onode);
+  return 0;
+}
+
+
+
+int Ebofs::_rmattr(object_t oid, const char *name) 
+{
+  dout(8) << "_rmattr " << oid << " '" << name << "'" << endl;
+
+  Onode *onode = get_onode(oid);
+  if (!onode) return -ENOENT;
+
+  // erasing a name that is not set still dirties the onode and returns 0
+  int result = -EACCES;
+  if (!onode->readonly) {
+    onode->attr.erase(string(name));
+    dirty_onode(onode);
+    result = 0;
+  }
+  put_onode(onode);
+  return result;
+}
+
+int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe) 
+{
+  ebofs_lock.Lock();
+
+  const int result = _rmattr(oid, name);
+
+  // success: onsafe is queued on commit_waiters for the current super_epoch.
+  // failure: it is deleted here without being run.
+  if (onsafe) {
+    if (result >= 0) commit_waiters[super_epoch].push_back(onsafe);
+    else delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::listattr(object_t oid, vector<string>& attrs)
+{
+  ebofs_lock.Lock();
+  dout(8) << "listattr " << oid << endl;
+
+  // single exit: result stays -ENOENT unless the onode is found
+  int result = -ENOENT;
+  Onode *onode = get_onode(oid);
+  if (onode) {
+    attrs.clear();   // attrs is replaced with the names, not appended to
+    map<string,bufferptr>::iterator p = onode->attr.begin();
+    while (p != onode->attr.end()) {
+      attrs.push_back(p->first);
+      ++p;
+    }
+    put_onode(onode);
+    result = 0;
+  }
+
+  ebofs_lock.Unlock();
+  return result;
+}
+
+
+
+/***************** collections ******************/
+
+int Ebofs::list_collections(list<coll_t>& ls)   // appends each collection_tab key to ls; returns how many were appended
+{
+  ebofs_lock.Lock();
+  dout(9) << "list_collections " << endl;
+
+  Table<coll_t, Extent>::Cursor cursor(collection_tab);
+
+  int num = 0;                                   // ls is not cleared first, so num counts only this call's additions
+  if (collection_tab->find(0, cursor) >= 0) {    // presumably positions at the smallest key -- confirm Table::find
+    while (1) {
+      ls.push_back(cursor.current().key);
+      num++;
+      if (cursor.move_right() <= 0) break;       // stop when the cursor cannot advance
+    }
+  }
+
+  ebofs_lock.Unlock();
+  return num;
+}
+
+int Ebofs::_create_collection(coll_t cid)
+{
+  dout(9) << "_create_collection " << hex << cid << dec << endl;
+
+  // refuse to recreate a collection that collection_tab already knows
+  const bool already_there = _collection_exists(cid);
+  if (already_there) return -EEXIST;
+
+  // make the cnode, then put it straight back
+  put_cnode(new_cnode(cid));
+  return 0;
+}
+
+int Ebofs::create_collection(coll_t cid, Context *onsafe)
+{
+  ebofs_lock.Lock();
+
+  const int result = _create_collection(cid);
+
+  // success: onsafe is queued on commit_waiters for the current super_epoch.
+  // failure: it is deleted here without being run.
+  if (onsafe) {
+    if (result >= 0) commit_waiters[super_epoch].push_back(onsafe);
+    else delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::_destroy_collection(coll_t cid)   // unlink every member object from cid, then remove its cnode
+{
+  dout(9) << "_destroy_collection " << hex << cid << dec << endl;
+
+  if (!_collection_exists(cid)) 
+    return -ENOENT;
+
+  Cnode *cn = get_cnode(cid);
+  assert(cn);                                 // the table said it exists, so the cnode must load
+
+  // hose mappings
+  list<object_t> objects;
+  collection_list(cid, objects);              // NOTE(review): public variant, takes ebofs_lock again -- relies on Mutex allowing that; confirm
+  for (list<object_t>::iterator i = objects.begin(); 
+       i != objects.end();
+       i++) {
+    co_tab->remove(coll_object_t(cid,*i));    // drop the (collection, object) pair
+
+    Onode *on = get_onode(*i);
+    if (on) {                                 // an object without an onode is skipped silently
+      on->collections.erase(cid);
+      dirty_onode(on);
+      put_onode(on);
+    }
+  }
+
+  remove_cnode(cn);
+  return 0;
+}
+
+int Ebofs::destroy_collection(coll_t cid, Context *onsafe)
+{
+  ebofs_lock.Lock();
+
+  const int result = _destroy_collection(cid);
+
+  // success: onsafe is queued on commit_waiters for the current super_epoch.
+  // failure: it is deleted here without being run.
+  if (onsafe) {
+    if (result >= 0) commit_waiters[super_epoch].push_back(onsafe);
+    else delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return result;
+}
+
+bool Ebofs::collection_exists(coll_t cid)   // runs _collection_exists() under ebofs_lock
+{
+  ebofs_lock.Lock();
+  dout(10) << "collection_exists " << hex << cid << dec << endl;
+  const bool present = _collection_exists(cid);
+  ebofs_lock.Unlock();
+  return present;
+}
+bool Ebofs::_collection_exists(coll_t cid)   // no locking here
+{
+  return 0 == collection_tab->lookup(cid);   // a lookup() result of 0 is treated as "present"
+}
+
+int Ebofs::_collection_add(coll_t cid, object_t oid)
+{
+  dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << endl;
+
+  // both the collection and the object have to exist already
+  if (!_collection_exists(cid)) return -ENOENT;
+  Onode *onode = get_onode(oid);
+  if (onode == 0) return -ENOENT;
+
+  int result;
+  if (onode->collections.count(cid)) {
+    result = -ENOENT;  // FIXME?  already in collection.
+  } else {
+    // record membership on the onode and in the (coll,object) table
+    onode->collections.insert(cid);
+    dirty_onode(onode);
+    co_tab->insert(coll_object_t(cid,oid), true);
+    result = 0;
+  }
+
+  put_onode(onode);
+  return result;
+}
+
+int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe)
+{
+  ebofs_lock.Lock();
+  // add oid to cid under the lock; returns _collection_add()'s result
+  int r = _collection_add(cid, oid);
+
+  // set up commit waiter: queue onsafe on success, delete it unfired on failure
+  if (r >= 0) {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  } else {
+    if (onsafe) delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return r;   // was a hard-coded 0, which hid failures after onsafe had been deleted
+}
+
+int Ebofs::_collection_remove(coll_t cid, object_t oid)
+{
+  dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << endl;
+
+  // both the collection and the object have to exist
+  if (!_collection_exists(cid)) return -ENOENT;
+  Onode *onode = get_onode(oid);
+  if (onode == 0) return -ENOENT;
+
+  int result;
+  if (onode->collections.count(cid) == 0) {
+    result = -ENOENT;  // FIXME?
+  } else {
+    // drop membership from the onode and from the (coll,object) table
+    onode->collections.erase(cid);
+    dirty_onode(onode);
+    co_tab->remove(coll_object_t(cid,oid));
+    result = 0;
+  }
+
+  put_onode(onode);
+  return result;
+}
+
+int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe)
+{
+  ebofs_lock.Lock();
+  // remove oid from cid under the lock; returns _collection_remove()'s result
+  int r = _collection_remove(cid, oid);
+
+  // set up commit waiter: queue onsafe on success, delete it unfired on failure
+  if (r >= 0) {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  } else {
+    if (onsafe) delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return r;   // was a hard-coded 0, which hid failures after onsafe had been deleted
+}
+
+int Ebofs::collection_list(coll_t cid, list<object_t>& ls)   // appends cid's members to ls; returns the count or -ENOENT
+{
+  ebofs_lock.Lock();
+  dout(9) << "collection_list " << hex << cid << dec << endl;
+
+  if (!_collection_exists(cid)) {
+    ebofs_lock.Unlock();
+    return -ENOENT;
+  }
+  
+  Table<coll_object_t, bool>::Cursor cursor(co_tab);
+
+  int num = 0;                                  // ls is not cleared; num counts only what this call appends
+  if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) {   // start at (cid, default object_t)
+    while (1) {
+      const coll_t c = cursor.current().key.first;
+      const object_t o = cursor.current().key.second;
+      if (c != cid) break;   // end!  (walked into another collection's keys)
+      dout(10) << "collection_list  " << hex << cid << " includes " << o << dec << endl;
+      ls.push_back(o);
+      num++;
+      if (cursor.move_right() < 0) break;       // NOTE(review): other scans in this file stop on <= 0 -- confirm
+    }
+  }
+
+  ebofs_lock.Unlock();
+  return num;
+}
+
+
+int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size)
+{
+  dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl;
+
+  Cnode *cnode = get_cnode(cid);
+  if (cnode == 0)
+    return -ENOENT;
+
+  // insert or overwrite the named attr, then mark the cnode dirty
+  cnode->attr[string(name)] = buffer::copy((char*)value, size);
+  dirty_cnode(cnode);
+  put_cnode(cnode);
+  return 0;
+}
+
+int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe)
+{
+  ebofs_lock.Lock();
+  dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl;
+  // set one attr on cid under the lock; returns _collection_setattr()'s result
+  int r = _collection_setattr(cid, name, value, size);
+
+  // set up commit waiter: queue onsafe on success, delete it unfired on failure
+  if (r >= 0) {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  } else {
+    if (onsafe) delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return r;   // was a hard-coded 0, which hid failures after onsafe had been deleted
+}
+
+int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t size)
+{
+  ebofs_lock.Lock();
+  dout(10) << "collection_getattr " << hex << cid << dec << " '" << name << "' maxlen " << size << endl;   // tag used to say "collection_setattr"
+
+  Cnode *cn = get_cnode(cid);
+  if (!cn) {
+    ebofs_lock.Unlock();
+    return -ENOENT;
+  }
+
+  string n(name);
+  int r;                        // bytes copied into value, or -1 if the attr is not set
+  if (cn->attr.count(n) == 0) {
+    r = -1;
+  } else {
+    r = MIN( cn->attr[n].length(), size );   // copy is capped at the caller's buffer size
+    memcpy(value, cn->attr[n].c_str(), r);
+  }
+  
+  put_cnode(cn);
+  ebofs_lock.Unlock();
+  return r;
+}
+
+int Ebofs::_collection_rmattr(coll_t cid, const char *name) 
+{
+  dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << endl;
+
+  Cnode *cnode = get_cnode(cid);
+  if (cnode == 0)
+    return -ENOENT;
+
+  // erasing an attr that is not set is not an error: the cnode is still
+  // dirtied and 0 is returned
+  cnode->attr.erase(string(name));
+  dirty_cnode(cnode);
+  put_cnode(cnode);
+  return 0;
+}
+
+int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe) 
+{
+  ebofs_lock.Lock();
+  // remove one attr from cid under the lock; returns _collection_rmattr()'s result
+  int r = _collection_rmattr(cid, name);
+
+  // set up commit waiter: queue onsafe on success, delete it unfired on failure
+  if (r >= 0) {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  } else {
+    if (onsafe) delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return r;   // was a hard-coded 0, which hid failures after onsafe had been deleted
+}
+
+int Ebofs::collection_listattr(coll_t cid, vector<string>& attrs)
+{
+  ebofs_lock.Lock();
+  dout(10) << "collection_listattr " << hex << cid << dec << endl;
+
+  // single exit: result stays -ENOENT unless the cnode is found
+  int result = -ENOENT;
+  Cnode *cnode = get_cnode(cid);
+  if (cnode) {
+    attrs.clear();   // attrs is replaced with the names, not appended to
+    map<string,bufferptr>::iterator p = cnode->attr.begin();
+    while (p != cnode->attr.end()) {
+      attrs.push_back(p->first);
+      ++p;
+    }
+    put_cnode(cnode);
+    result = 0;
+  }
+
+  ebofs_lock.Unlock();
+  return result;
+}
+
+
+
+void Ebofs::_export_freelist(bufferlist& bl)
+{
+  // Append every free extent, then every limbo extent, to bl as raw Extent
+  // structs (the layout _import_freelist() reads back).  The extra pass
+  // with b == EBOFS_NUM_FREE_BUCKETS stands for the limbo table.
+  for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
+    Table<block_t,block_t> *tab = (b < EBOFS_NUM_FREE_BUCKETS) ? free_tab[b] : limbo_tab;
+    if (tab->get_num_keys() == 0) continue;
+    // Position the cursor outside assert(): an NDEBUG build drops the whole
+    // assert expression, which would otherwise leave the cursor unpositioned.
+    Table<block_t,block_t>::Cursor cursor(tab);
+    const int found = tab->find(0, cursor);
+    assert(found >= 0);
+    (void)found;   // only consumed by the assert
+    while (1) {
+      assert(cursor.current().value > 0);
+
+      Extent ex(cursor.current().key, cursor.current().value);
+      dout(10) << "_export_freelist " << ex << endl;
+      bl.append((char*)&ex, sizeof(ex));
+      if (cursor.move_right() <= 0) break;
+    }
+  }
+}
+
+void Ebofs::_import_freelist(bufferlist& bl)
+{
+  // start from empty free and limbo tables
+  for (int bucket = 0; bucket < EBOFS_NUM_FREE_BUCKETS; ++bucket)
+    free_tab[bucket]->clear();
+  limbo_tab->clear();
+
+  // bl is read as a packed array of Extent; a trailing partial record is ignored
+  const int count = bl.length() / sizeof(Extent);
+  Extent *extents = (Extent*)bl.c_str();
+  for (int i = 0; i < count; ++i) {
+    dout(10) << "_import_freelist " << extents[i] << endl;
+    allocator._release_loner(extents[i]);
+  }
+}
+
+void Ebofs::_get_frag_stat(FragmentationStat& st)   // fills st with free totals and per-object extent statistics; takes ebofs_lock
+{
+  ebofs_lock.Lock();
+
+  // free list is easy
+  st.total = dev.get_num_blocks();
+  st.total_free = get_free_blocks() + get_limbo_blocks();
+  st.free_extent_dist.clear();
+  st.num_free_extent = 0;       // stays 0: the code that would count free extents is disabled below
+  st.avg_free_extent = 0;
+/*
+  __uint64_t tfree = 0;
+  for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
+    Table<block_t,block_t> *tab;
+    if (b < EBOFS_NUM_FREE_BUCKETS) {
+      tab = free_tab[b];
+      dout(30) << "dump bucket " << b << "  " << tab->get_num_keys() << endl;
+    } else {
+      tab = limbo_tab;
+      dout(30) << "dump limbo  " << tab->get_num_keys() << endl;;
+    }
+    
+    if (tab->get_num_keys() > 0) {
+      Table<block_t,block_t>::Cursor cursor(tab);
+      assert(tab->find(0, cursor) >= 0);
+      while (1) {
+        assert(cursor.current().value > 0);
+        
+        block_t l = cursor.current().value;
+        tfree += l;
+        int b = 0;
+        do {
+          l = l >> 1;
+          b++; 
+        } while (l);
+        st.free_extent_dist[b]++;
+        st.free_extent_dist_sum[b] += cursor.current().value;
+        st.num_free_extent++;
+
+        if (cursor.move_right() <= 0) break;
+      }
+    }
+  }
+  st.avg_free_extent = tfree / st.num_free_extent;
+*/
+
+  // used extents is harder.  :(   (walk every object and every extent it owns)
+  st.num_extent = 0;
+  st.avg_extent = 0;            // used as a running sum of lengths until the division at the end
+  st.extent_dist.clear();
+  st.extent_dist_sum.clear();
+  st.avg_extent_per_object = 0; // running sum of extent counts until the division at the end
+  st.avg_extent_jump = 0;       // running sum until the division at the end
+
+  Table<object_t,Extent>::Cursor cursor(object_tab);
+  object_tab->find(object_t(), cursor);        // NOTE(review): issued even when the table is empty; result unchecked
+  int nobj = 0;
+  int njump = 0;
+  while (object_tab->get_num_keys() > 0) {     // acts as an "if non-empty" guard; the loop leaves via the break below
+    Onode *on = get_onode(cursor.current().key);
+    assert(on);
+
+    nobj++;    
+    st.avg_extent_per_object += on->extent_map.size();
+
+    for (map<block_t,Extent>::iterator p = on->extent_map.begin();
+         p != on->extent_map.end();
+         p++) {
+      block_t l = p->second.length;
+
+      st.num_extent++;
+      st.avg_extent += l;
+      if (p->first > 0) {                      // every extent except the one mapped at offset 0
+        njump++;
+        st.avg_extent_jump += l;               // NOTE(review): adds the extent length, not a distance -- confirm intent
+      }
+
+      int b = 0;                               // b = number of significant bits in l (power-of-two bucket)
+      do {
+        l = l >> 1;
+        b++; 
+      } while (l);
+      st.extent_dist[b]++;
+      st.extent_dist_sum[b] += p->second.length;
+    }
+    put_onode(on);
+    if (cursor.move_right() <= 0) break;
+  }
+  if (njump) st.avg_extent_jump /= njump;      // each average is computed only when its count is non-zero
+  if (nobj) st.avg_extent_per_object /= (float)nobj;
+  if (st.num_extent) st.avg_extent /= st.num_extent;
+
+  ebofs_lock.Unlock();
+}
diff --git a/branches/sage/cephmds2/ebofs/Ebofs.h b/branches/sage/cephmds2/ebofs/Ebofs.h
new file mode 100644
index 0000000000000..a8efe3b6a6b4c
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/Ebofs.h
@@ -0,0 +1,323 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include "include/Context.h"
+#include "include/buffer.h"
+
+template<typename First, typename Second>   // streams a pair as "first,second"
+inline ostream& operator<<(ostream& out, const pair<First,Second>& p) {
+  return out << p.first << "," << p.second;
+}
+
+#include "types.h"
+#include "Onode.h"
+#include "Cnode.h"
+#include "BlockDevice.h"
+#include "nodes.h"
+#include "Allocator.h"
+#include "Table.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+
+#include "osd/ObjectStore.h"
+
+//typedef pair<object_t,coll_t> object_coll_t;
+typedef pair<coll_t,object_t> coll_object_t;
+
+
+class Ebofs : public ObjectStore {   // ObjectStore implementation on top of a BlockDevice
+ protected:
+  Mutex        ebofs_lock;    // a beautiful global lock
+
+  // ** debuggy **
+  bool         fake_writes;   // toggled through _fake_writes()
+
+  // ** super **
+  BlockDevice  dev;
+  bool         mounted, unmounting, dirty;
+  bool         readonly;
+  version_t    super_epoch;   // key under which new commit waiters are queued
+  bool         commit_thread_started, mid_commit;
+  Cond         commit_cond;   // to wake up the commit thread
+  Cond         sync_cond;
+
+  map<version_t, list<Context*> > commit_waiters;   // per-epoch onsafe contexts queued by the mutating calls
+
+  void prepare_super(version_t epoch, bufferptr& bp);
+  void write_super(version_t epoch, bufferptr& bp);
+  int commit_thread_entry();
+
+  class CommitThread : public Thread {   // thread whose body is Ebofs::commit_thread_entry()
+    Ebofs *ebofs;
+  public:
+    CommitThread(Ebofs *e) : ebofs(e) {}
+    void *entry() {
+      ebofs->commit_thread_entry();
+      return 0;
+    }
+  } commit_thread;
+
+  
+
+
+  // ** allocator **
+  block_t      free_blocks, limbo_blocks;
+  Allocator    allocator;
+  friend class Allocator;
+  
+  block_t get_free_blocks() { return free_blocks; }
+  block_t get_limbo_blocks() { return limbo_blocks; }
+  block_t get_free_extents() {   // sum of key counts over every free bucket
+    int n = 0;
+    for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++) 
+      n += free_tab[i]->get_num_keys();
+    return n;
+  }
+  block_t get_limbo_extents() { return limbo_tab->get_num_keys(); }
+
+
+  // ** tables and sets **
+  // nodes
+  NodePool     nodepool;   // for all tables...
+
+  // tables
+  Table<object_t, Extent> *object_tab;
+  Table<block_t,block_t>  *free_tab[EBOFS_NUM_FREE_BUCKETS];
+  Table<block_t,block_t>  *limbo_tab;
+  Table<block_t,pair<block_t,int> > *alloc_tab;   // NOTE(review): not in the constructor's initializer list
+
+  // collections
+  Table<coll_t, Extent>  *collection_tab;
+  Table<coll_object_t, bool>  *co_tab;            // keyed by (collection, object) pairs
+
+  void close_tables();
+
+
+  // ** onodes **
+  hash_map<object_t, Onode*>  onode_map;  // onode cache
+  LRU                         onode_lru;
+  set<Onode*>                 dirty_onodes;
+  map<object_t, list<Cond*> > waitfor_onode;
+
+  Onode* new_onode(object_t oid);     // make new onode.  ref++.
+  Onode* get_onode(object_t oid);     // get cached onode, or read from disk.  ref++.
+  void remove_onode(Onode *on);
+  void put_onode(Onode* o);         // put it back down.  ref--.
+  void dirty_onode(Onode* o);
+  void encode_onode(Onode *on, bufferlist& bl, unsigned& off);
+  void write_onode(Onode *on);
+
+  // ** cnodes **
+  hash_map<coll_t, Cnode*>    cnode_map;
+  LRU                         cnode_lru;
+  set<Cnode*>                 dirty_cnodes;
+  map<coll_t, list<Cond*> >   waitfor_cnode;
+
+  Cnode* new_cnode(coll_t cid);
+  Cnode* get_cnode(coll_t cid);
+  void remove_cnode(Cnode *cn);
+  void put_cnode(Cnode *cn);
+  void dirty_cnode(Cnode *cn);
+  void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off);
+  void write_cnode(Cnode *cn);
+
+  // ** onodes+cnodes = inodes **
+  int                         inodes_flushing;
+  Cond                        inode_commit_cond;                    
+
+  void flush_inode_finish();
+  void commit_inodes_start();
+  void commit_inodes_wait();
+  friend class C_E_InodeFlush;
+
+  void trim_inodes(int max = -1);
+
+  // ** buffer cache **
+  BufferCache bc;
+  pthread_t flushd_thread_id;   // NOTE(review): not set by the constructor
+
+  version_t trigger_commit();
+  void commit_bc_wait(version_t epoch);
+  void trim_bc(off_t max = -1);
+
+ public:
+  void kick_idle();
+  void sync();
+  void sync(Context *onsafe);
+  void trim_buffer_cache();
+
+  class IdleKicker : public BlockDevice::kicker {   // forwards kick() to Ebofs::kick_idle()
+    Ebofs *ebo;
+  public:
+    IdleKicker(Ebofs *t) : ebo(t) {}
+    void kick() { ebo->kick_idle(); }
+  } idle_kicker;
+
+
+ protected:
+  //void zero(Onode *on, size_t len, off_t off, off_t write_thru);
+  void alloc_write(Onode *on, 
+                   block_t start, block_t len, 
+                   interval_set<block_t>& alloc,
+                   block_t& old_bfirst, block_t& old_blast);
+  void apply_write(Onode *on, off_t off, size_t len, bufferlist& bl);
+  bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, 
+                    Cond *will_wait_on, bool *will_wait_on_bool);
+
+  // ** finisher **
+  // async write notification to users
+  Mutex          finisher_lock;
+  Cond           finisher_cond;
+  bool           finisher_stop;
+  list<Context*> finisher_queue;
+
+  void *finisher_thread_entry();
+  class FinisherThread : public Thread {   // thread whose body is Ebofs::finisher_thread_entry()
+    Ebofs *ebofs;
+  public:
+    FinisherThread(Ebofs *e) : ebofs(e) {}
+    void* entry() { return (void*)ebofs->finisher_thread_entry(); }
+  } finisher_thread;
+
+
+  void alloc_more_node_space();
+
+  void do_csetattrs(map<coll_t, map<const char*, pair<void*,int> > > &cmods);
+  void do_setattrs(Onode *on, map<const char*, pair<void*,int> > &setattrs);
+
+
+ public:
+  Ebofs(char *devfn) :   // NOTE(review): single-argument constructor is not explicit
+    fake_writes(false),
+    dev(devfn), 
+    mounted(false), unmounting(false), dirty(false), readonly(false), 
+    super_epoch(0), commit_thread_started(false), mid_commit(false),
+    commit_thread(this),
+    free_blocks(0), limbo_blocks(0),
+    allocator(this),
+    nodepool(ebofs_lock),
+    object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0),
+    onode_lru(g_conf.ebofs_oc_size),
+    cnode_lru(g_conf.ebofs_cc_size),
+    inodes_flushing(0),
+    bc(dev, ebofs_lock),
+    idle_kicker(this),
+    finisher_stop(false), finisher_thread(this) {
+    for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
+      free_tab[i] = 0;   // array member, so it is zeroed in the body
+  }
+  ~Ebofs() {
+  }
+
+  int mkfs();
+  int mount();
+  int umount();
+  
+  int statfs(struct statfs *buf);
+
+  // atomic transaction
+  unsigned apply_transaction(Transaction& t, Context *onsafe=0);
+
+  int pick_object_revision_lt(object_t& oid);   // oid is in/out: replaced by the chosen key on success
+
+  // object interface
+  bool exists(object_t);
+  int stat(object_t, struct stat*);
+  int read(object_t, off_t off, size_t len, bufferlist& bl);
+  //int write(object_t oid, off_t off, size_t len, bufferlist& bl, bool fsync=true);
+  int write(object_t oid, off_t off, size_t len, bufferlist& bl, Context *onsafe);
+  int truncate(object_t oid, off_t size, Context *onsafe=0);
+  int truncate_front(object_t oid, off_t size, Context *onsafe=0);
+  int remove(object_t oid, Context *onsafe=0);
+  bool write_will_block();
+
+  int rename(object_t from, object_t to);
+  int clone(object_t from, object_t to, Context *onsafe);
+  
+
+  // object attr  (onsafe is optional on the mutating calls)
+  int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0);
+  int setattrs(object_t oid, map<string,bufferptr>& attrset, Context *onsafe=0);
+  int getattr(object_t oid, const char *name, void *value, size_t size);
+  int getattrs(object_t oid, map<string,bufferptr> &aset);
+  int rmattr(object_t oid, const char *name, Context *onsafe=0);
+  int listattr(object_t oid, vector<string>& attrs);
+
+  // collections  (unlike the object attr calls, onsafe has no default here)
+  int list_collections(list<coll_t>& ls);
+  bool collection_exists(coll_t c);
+
+  int create_collection(coll_t c, Context *onsafe);
+  int destroy_collection(coll_t c, Context *onsafe);
+  int collection_add(coll_t c, object_t o, Context *onsafe);
+  int collection_remove(coll_t c, object_t o, Context *onsafe);
+
+  int collection_list(coll_t c, list<object_t>& o);
+  
+  int collection_setattr(coll_t oid, const char *name, const void *value, size_t size, Context *onsafe);
+  int collection_getattr(coll_t oid, const char *name, void *value, size_t size);
+  int collection_rmattr(coll_t cid, const char *name, Context *onsafe);
+  int collection_listattr(coll_t oid, vector<string>& attrs);
+  
+  // maps
+  int map_lookup(object_t o, bufferlist& key, bufferlist& val);
+  int map_insert(object_t o, bufferlist& key, bufferlist& val);
+  int map_remove(object_t o, bufferlist& key);
+  int map_list(object_t o, list<bufferlist>& keys);
+  int map_list(object_t o, map<bufferlist,bufferlist>& vals);
+  int map_list(object_t o, 
+	       bufferlist& start, bufferlist& end,
+	       map<bufferlist,bufferlist>& vals);
+
+  // crap
+  void _fake_writes(bool b) { fake_writes = b; }
+  void _get_frag_stat(FragmentationStat& st);
+
+  void _import_freelist(bufferlist& bl);
+  void _export_freelist(bufferlist& bl);
+
+
+private:
+  // private interface -- use if caller already holds lock
+  int _read(object_t oid, off_t off, size_t len, bufferlist& bl);
+  int _stat(object_t oid, struct stat *st);
+  int _getattr(object_t oid, const char *name, void *value, size_t size);
+  int _getattrs(object_t oid, map<string,bufferptr> &aset);
+
+  bool _write_will_block();
+  int _write(object_t oid, off_t off, size_t len, bufferlist& bl);
+  int _truncate(object_t oid, off_t size);
+  int _truncate_front(object_t oid, off_t size);
+  int _remove(object_t oid);
+  int _clone(object_t from, object_t to);
+  int _setattr(object_t oid, const char *name, const void *value, size_t size);
+  int _setattrs(object_t oid, map<string,bufferptr>& attrset);
+  int _rmattr(object_t oid, const char *name);
+  bool _collection_exists(coll_t c);
+  int _create_collection(coll_t c);
+  int _destroy_collection(coll_t c);
+  int _collection_add(coll_t c, object_t o);
+  int _collection_remove(coll_t c, object_t o);
+  int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size);
+  int _collection_rmattr(coll_t cid, const char *name);
+
+  
+};
diff --git a/branches/sage/cephmds2/ebofs/Onode.h b/branches/sage/cephmds2/ebofs/Onode.h
new file mode 100644
index 0000000000000..233c97e7ae172
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/Onode.h
@@ -0,0 +1,390 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __EBOFS_ONODE_H
+#define __EBOFS_ONODE_H
+
+#include "include/lru.h"
+
+#include "types.h"
+#include "BufferCache.h"
+
+#include "include/interval_set.h"
+
+
+/*
+ * object node (like an inode)
+ *
+ * holds object metadata, including
+ *  size
+ *  allocation (extent list)
+ *  attributes
+ *
+ */
+
+class Onode : public LRUObject {
+private:
+  int ref;
+
+public:
+  object_t object_id;
+  version_t version;      // incremented on each modify.
+
+  // data
+  bool     readonly;
+  Extent   onode_loc;
+  off_t    object_size;
+  unsigned object_blocks;
+
+  // onode
+  set<coll_t>            collections;
+  map<string, bufferptr> attr;
+  //vector<Extent>        extents;
+  map<block_t, Extent>  extent_map;
+
+  interval_set<block_t> uncommitted;
+
+  ObjectCache  *oc;
+
+  bool          dirty;
+  bool          dangling;   // not in onode_map
+  bool          deleted;    // deleted
+
+  list<Context*>   commit_waiters;
+
+ public:
+  Onode(object_t oid) : ref(0), object_id(oid), version(0),
+			readonly(false),
+			object_size(0), object_blocks(0), oc(0),
+			dirty(false), dangling(false), deleted(false) { 
+    onode_loc.length = 0;
+  }
+  ~Onode() {
+    if (oc) delete oc;
+  }
+
+  block_t get_onode_id() { return onode_loc.start; }
+  int get_onode_len() { return onode_loc.length; }
+
+  int get_ref_count() { return ref; }
+  void get() {
+    if (ref == 0) lru_pin();
+    ref++;
+    //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << endl;
+  }
+  void put() {
+    ref--;
+    if (ref == 0) lru_unpin();
+    //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << endl;
+  }
+
+  void mark_dirty() {
+    if (!dirty) {
+      dirty = true;
+      get();
+    }
+  }
+  void mark_clean() {
+    if (dirty) {
+      dirty = false;
+      put();
+    }
+  }
+  bool is_dirty() { return dirty; }
+  bool is_deleted() { return deleted; }
+  bool is_dangling() { return dangling; }
+
+  
+  bool have_oc() {
+    return oc != 0;
+  }
+  ObjectCache *get_oc(BufferCache *bc) {
+    if (!oc) {
+      oc = new ObjectCache(object_id, this, bc);
+      oc->get();
+      get();
+    }
+    return oc;
+  }
+  void close_oc() {
+    if (oc) {
+      //cout << "close_oc on " << object_id << endl;
+      assert(oc->is_empty());
+      if (oc->put() == 0){
+        //cout << "************************* hosing oc" << endl;
+        delete oc;
+      }
+      oc = 0;
+      put();
+    }
+  }
+
+
+  // allocation
+  void verify_extents() {
+    if (0) {  // do crazy stupid sanity checking
+      block_t count = 0;
+      interval_set<block_t> is;    
+          
+      set<block_t> s;
+      cout << "verifying" << endl;
+
+      for (map<block_t,Extent>::iterator p = extent_map.begin();
+           p != extent_map.end();
+           p++) {
+        cout << " " << p->first << ": " << p->second << endl;
+        assert(count == p->first);
+        count += p->second.length;
+        for (unsigned j=0;j<p->second.length;j++) {
+          assert(s.count(p->second.start+j) == 0);
+          s.insert(p->second.start+j);
+        }
+      }
+
+      assert(s.size() == count);
+      assert(count == object_blocks);
+    }
+  }
+  void set_extent(block_t offset, Extent ex) {
+    //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << endl;
+    assert(offset <= object_blocks);
+    verify_extents();
+
+    // at the end?
+    if (offset == object_blocks) {
+      //cout << " appending " << ex << endl;
+      if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) {
+        //cout << "appending " << ex << " to " << extent_map.rbegin()->second << endl;
+        extent_map.rbegin()->second.length += ex.length;
+      } else
+        extent_map[object_blocks] = ex;
+      object_blocks += ex.length;
+      return;
+    }
+
+    // removing any extent bits we overwrite
+    if (!extent_map.empty()) {
+      // preceeding extent?
+      map<block_t,Extent>::iterator p = extent_map.lower_bound(offset);
+      if (p != extent_map.begin()) {
+        p--;
+        if (p->first + p->second.length > offset) {
+          //cout << " preceeding was " << p->second << endl;
+          if (p->first + p->second.length > offset+ex.length) {
+            // cutting chunk out of middle, add last bit
+            Extent &n = extent_map[offset+ex.length] = p->second;
+            n.start += offset+ex.length - p->first;
+            n.length -= offset+ex.length - p->first;
+            //cout << " tail frag is " << n << endl;
+          } 
+          p->second.length = offset - p->first;     // cut tail off preceeding extent
+          //cout << " preceeding now " << p->second << endl;
+        }
+        p++;
+      }      
+      
+      // overlapping extents
+      while (p != extent_map.end() &&
+             p->first < offset + ex.length) {
+        map<block_t,Extent>::iterator next = p;
+        next++;
+
+        // completely subsumed?
+        if (p->first + p->second.length <= offset+ex.length) {
+          //cout << " erasing " << p->second << endl;
+          extent_map.erase(p);
+          p = next;
+          continue;
+        }
+
+        // spans new extent, cut off head
+        Extent &n = extent_map[ offset+ex.length ] = p->second;
+        //cout << " cut head off " << p->second;
+        n.start += offset+ex.length - p->first;
+        n.length -= offset+ex.length - p->first;
+        extent_map.erase(p);
+        //cout << ", now " << n << endl;
+        break;
+      }
+    }
+
+    extent_map[ offset ] = ex;
+
+    // extend object?
+    if (offset + ex.length > object_blocks)
+      object_blocks = offset+ex.length;
+    
+    verify_extents();
+  }
+  
+
+  /* map_extents(start, len, ls)
+   *  map the given block range onto extents on disk, appended to ls in order.
+   */
+  int map_extents(block_t start, block_t len, vector<Extent>& ls) {
+    //cout << "map_extents " << start << " " << len << endl;
+    verify_extents();
+    // always returns 0; if the map ends before the range does, ls just covers less.
+    //assert(start+len <= object_blocks);
+    // step back one entry if 'start' may fall inside an extent that begins before it
+    map<block_t,Extent>::iterator p = extent_map.lower_bound(start);
+    if (p != extent_map.begin() &&
+        (p == extent_map.end() || p->first > start && p->first)) {
+      p--;
+      if (p->second.length > start - p->first) {
+        Extent ex;
+        ex.start = p->second.start + (start - p->first);
+        ex.length = MIN(len, p->second.length - (start - p->first));
+        ls.push_back(ex);
+        
+        //cout << " got (tail of?) " << p->second << " : " << ex << endl;
+        
+        start += ex.length;
+        len -= ex.length;
+      }
+      p++;
+    }
+    // the rest must be contiguous: every following extent begins exactly at 'start'
+    while (len > 0 &&
+           p != extent_map.end()) {
+      assert(p->first == start);
+      Extent ex = p->second;
+      ex.length = MIN(len, ex.length);
+      ls.push_back(ex);
+      //cout << " got (head of?) " << p->second << " : " << ex << endl;
+      start += ex.length;
+      len -= ex.length;
+      p++;
+    }    
+
+    return 0;
+  }
+  // truncate_extents(len, extra): cut the map at block len; cut-off disk ranges go to extra.
+  int truncate_extents(block_t len, vector<Extent>& extra) {
+    verify_extents();
+    // an extent that begins before len but runs past it gets split; its tail goes to extra
+    map<block_t,Extent>::iterator p = extent_map.lower_bound(len);
+    if (p != extent_map.begin() &&
+        (p == extent_map.end() || p->first > len && p->first)) {
+      p--;
+      if (p->second.length > len - p->first) {
+        Extent ex;
+        ex.start = p->second.start + (len - p->first);
+        ex.length = p->second.length - (len - p->first);
+        extra.push_back(ex);
+
+        p->second.length = len - p->first;
+        assert(p->second.length > 0);
+        
+        //cout << " got (tail of?) " << p->second << " : " << ex << endl;
+      }
+      p++;
+    }
+    // everything from here on begins at or past len: hand it over whole and erase it
+    while (p != extent_map.end()) {
+      assert(p->first >= len);
+      extra.push_back(p->second);
+      map<block_t,Extent>::iterator n = p;
+      n++;
+      extent_map.erase(p);
+      p = n;
+    }    
+    // object_blocks becomes len unconditionally (even if len is past the old end); returns 0
+    object_blocks = len;
+    verify_extents();
+    return 0;
+  }
+  // drop up to len blocks from the front; their disk ranges go to extra.  map keys are left as is.
+  int truncate_front_extents(block_t len, vector<Extent>& extra) {
+    verify_extents();
+    block_t removed = 0;   // blocks actually dropped (len itself is counted down below)
+    while (len > 0 && !extent_map.empty()) {   // also stop if we run out of extents
+      Extent& ex = extent_map.begin()->second;  // look, this is a reference!
+      if (ex.length > len) {
+        // partial first extent
+        Extent frontbit( ex.start, len );
+        extra.push_back(frontbit);
+        ex.length -= len;
+        ex.start += len;
+        removed += len;
+        break;
+      }
+      // pull off entire first extent.
+      assert(ex.length <= len);
+      len -= ex.length;
+      removed += ex.length;
+      extra.push_back(ex);
+      extent_map.erase(extent_map.begin());
+    }
+    object_blocks -= removed;   // was "-= len", but the loop has consumed len by now
+    verify_extents();
+    return 0;
+  }
+
+
+
+  /* map_alloc_regions(start, len, map)
+   *  map range into regions that need to be (re)allocated on disk
+   *  because they overlap "safe" (or unallocated) parts of the object
+   */
+  /*
+  void map_alloc_regions(block_t start, block_t len, 
+                         interval_set<block_t>& alloc) {
+    interval_set<block_t> already_uncom;
+
+    alloc.insert(start, len);   // start with whole range
+    already_uncom.intersection_of(alloc, uncommitted);
+    alloc.subtract(already_uncom);   // take out the bits that aren't yet committed
+  }
+  */
+
+
+
+  // pack/unpack: byte counts for each variable-sized part of the onode
+  int get_collection_bytes() {
+    return sizeof(coll_t) * collections.size();
+  }
+  int get_attr_bytes() {
+    int s = 0;
+    for (map<string, bufferptr>::iterator i = attr.begin();
+         i != attr.end();
+         i++) {
+      s += i->first.length() + 1;              // name plus one byte (presumably a terminating NUL)
+      s += i->second.length() + sizeof(int);   // value plus an int (presumably its length prefix)
+    }
+    return s;
+  }
+  int get_extent_bytes() {
+    return sizeof(Extent) * extent_map.size();
+  }
+
+};
+
+// debug dump of an onode: id (hex), size, ref count, state flags and the uncommitted member.
+inline ostream& operator<<(ostream& out, Onode& on)
+{
+  out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size;
+  out << " ref=" << on.get_ref_count();
+  if (on.is_dirty()) out << " dirty";
+  if (on.is_dangling()) out << " dangling";
+  if (on.is_deleted()) out << " deleted";
+  out << " uncom=" << on.uncommitted;
+  //  out << " " << &on;
+  out << ")";
+  return out;
+}
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/ebofs/Table.h b/branches/sage/cephmds2/ebofs/Table.h
new file mode 100644
index 0000000000000..e6b3fb39660e4
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/Table.h
@@ -0,0 +1,897 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __EBOFS_TABLE_H
+#define __EBOFS_TABLE_H
+
+#include "types.h"
+#include "nodes.h"
+
+/** table **/
+
+#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")."
+
+
+template<class K, class V>
+class Table {
+ private:
+  NodePool &pool;
+  
+  nodeid_t root;
+  int      nkeys;
+  int      depth;
+
+ public:
+  Table(NodePool &p,
+        struct ebofs_table& bts) : 
+    pool(p),     // all nodes are fetched from / allocated in this pool
+    root(bts.root), nkeys(bts.num_keys), depth(bts.depth) {   // start from the state recorded in bts
+    dbtout << "cons" << endl;
+  }
+  
+  nodeid_t get_root() { return root; }
+  int get_num_keys() { return nkeys; }
+  int get_depth() { return depth; }
+
+
+  /* node item layouts.  MAX is how many items of that kind fit in Node::ITEM_LEN bytes.
+   */
+  class _IndexItem {     // i just need a struct size for below
+    K k;
+    nodeid_t n;
+  };
+  class IndexItem {      // index node entry: a key and the child node it refers to
+  public:
+    K        key;
+    nodeid_t node;
+    static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem));
+    static const int MIN = MAX/2;
+  };
+  class _LeafItem {     // i just need a struct size for below
+    K k;
+    V v;
+  };
+  class LeafItem {       // leaf node entry: a key and its value
+  public:
+    K key;
+    V value;
+    static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem));
+    static const int MIN = MAX/2;
+  };
+  // Nodeptr: thin wrapper around a Node* that views the node's item area as btree items.
+  class Nodeptr {
+  public:    
+    Node      *node;
+    // (nothing in this class ever deletes node)
+    Nodeptr() : node(0) {}
+    Nodeptr(Node *n) : node(n) {}
+    Nodeptr& operator=(Node *n) {
+      node = n;
+      return *this;
+    }
+    // item accessors: reinterpret the node's item area; i is not range-checked
+    LeafItem&  leaf_item(int i)  { return (( LeafItem*)(node->item_ptr()))[i]; }
+    IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; }
+    K key(int i) {
+      if (node->is_index()) 
+        return index_item(i).key;
+      else
+        return leaf_item(i).key;
+    }
+    
+    bool is_leaf() { return node->is_leaf(); }
+    bool is_index() { return node->is_index(); }
+    void set_type(int t) { node->set_type(t); }
+    // capacity for this node's type; min_items() is half of it
+    int max_items() const {
+      if (node->is_leaf()) 
+        return LeafItem::MAX;
+      else
+        return IndexItem::MAX;
+    }
+    int min_items() const { return max_items() / 2; }
+    
+    nodeid_t get_id() { return node->get_id(); }
+
+    int size() { return node->size(); }
+    void set_size(int s) { node->set_size(s); }
+    // close the gap at position p by shifting the later items down one
+    void remove_at_pos(int p) {
+      if (node->is_index()) {
+        for (int i=p; i<size()-1; i++)
+          index_item(i) = index_item(i+1);
+      } else {
+        for (int i=p; i<size()-1; i++)
+          leaf_item(i) = leaf_item(i+1);
+      }
+      set_size(size() - 1);
+    }
+    void insert_at_leaf_pos(int p, K key, V value) {   // open a slot at p by shifting [p, size()) up one
+      assert(is_leaf());
+      for (int i=size(); i>p; i--)
+        leaf_item(i) = leaf_item(i-1);
+      leaf_item(p).key = key;
+      leaf_item(p).value = value;
+      set_size(size() + 1);
+    }
+    void insert_at_index_pos(int p, K key, nodeid_t node) {
+      assert(is_index());
+      for (int i=size(); i>p; i--)
+        index_item(i) = index_item(i-1);
+      index_item(p).key = key;
+      index_item(p).node = node;
+      set_size(size() + 1);
+    }
+    // NOTE: neither the inserts above nor the appends below check max_items(); callers must ensure room.
+    void append_item(LeafItem& i) {
+      leaf_item(size()) = i;
+      set_size(size() + 1);
+    }
+    void append_item(IndexItem& i) {
+      index_item(size()) = i;
+      set_size(size() + 1);
+    }
+    // append items [min_items(), size()) to 'right' and keep only the first min_items() here
+    void split(Nodeptr& right) {
+      if (node->is_index()) {
+        for (int i=min_items(); i<size(); i++)
+          right.append_item( index_item(i) );
+      } else {
+        for (int i=min_items(); i<size(); i++)
+          right.append_item( leaf_item(i) );
+      }
+      set_size(min_items());
+    }
+    // append all of right's items to this node and leave right empty
+    void merge(Nodeptr& right) {
+      if (node->is_index()) 
+        for (int i=0; i<right.size(); i++)
+          append_item( right.index_item(i) );
+      else 
+        for (int i=0; i<right.size(); i++)
+          append_item( right.leaf_item(i) );
+      right.set_size(0);
+    }
+
+  };
+
+  /*
+   */
+  class Cursor {
+  protected:
+  public:
+    static const int MATCH = 1;   // on key
+    static const int INSERT = 0;  // before key
+    static const int OOB = -1;    // at end
+
+    Table              *table;
+    vector<Nodeptr>     open;  // open nodes
+    vector<int>         pos;   // position within the node
+    //Nodeptr             open[20];
+    //int                 pos[20];
+    int                 level;
+
+    Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {}
+
+  public:
+    // read-only view of the leaf item under the cursor (asserts the cursor is on a leaf)
+    const LeafItem& current() {
+      assert(open[level].is_leaf());
+      return open[level].leaf_item(pos[level]);
+    }
+    V& dirty_current_value() {   // writable value under the cursor
+      assert(open[level].is_leaf());
+      dirty();                   // the open path is marked dirty before the reference is handed out
+      return open[level].leaf_item(pos[level]).value;
+    }
+
+    // ** read-only bits **
+    int move_left() {
+      // previous leaf item: MATCH (1) if we moved, OOB if there is none.
+      if (table->depth == 0) return OOB;
+
+      // climb until some level has an item to its left
+      int up = level;
+      while (up >= 0 && pos[up] <= 0)
+        up--;
+      if (up < 0) return OOB;   // we are the first item in the btree
+
+      // step left there...
+      pos[up]--;
+
+      // ...then follow the right-most edge back down to the cursor's level
+      for (int d = up+1; d <= level; d++) {
+        open[d] = table->pool.get_node( open[d-1].index_item(pos[d-1]).node );
+        pos[d] = open[d].size() - 1;
+      }
+      return MATCH;
+    }
+    int move_right() {
+      // next leaf item: MATCH (1) if we moved onto an item, INSERT (0) if we
+      // stepped off the last item into the append slot, OOB if nowhere to go.
+      if (table->depth == 0) return OOB;
+      // climb until some level has an item to its right
+      int up = level;
+      while (up >= 0 && pos[up] >= open[up].size() - 1)
+        up--;
+      if (up < 0) {
+        /* we are at last item in btree. */
+        if (pos[level] >= open[level].size())
+          return OOB;
+        pos[level]++;  /* move into add position! */
+        return INSERT;
+      }
+
+      // step right there...
+      assert( pos[up] < open[up].size() );
+      pos[up]++;
+
+      // ...then follow the left-most edge back down to the cursor's level
+      for (int d = up+1; d <= level; d++) {
+        open[d] = table->pool.get_node( open[d-1].index_item(pos[d-1]).node );
+        pos[d] = 0;
+      }
+      return MATCH;
+    }
+
+    // ** modifications **
+    void dirty() {   // mark the open path dirty, from the cursor's level up toward the root
+      for (int l=level; l>=0; l--) {
+        if (open[l].node->is_dirty()) break;  // already dirty!  (and thus parents are too)
+        // dirty it, then re-record its id in the parent link or table root (presumably dirtying can change the id)
+        table->pool.dirty_node(open[l].node);
+        if (l > 0)
+          open[l-1].index_item( pos[l-1] ).node = open[l].get_id();
+        else
+          table->root = open[0].get_id();
+      }
+    }
+  private:
+    void repair_parents() {   // after a change at slot 0, push the node's new first key up into its ancestors
+      // did i make a change at the start of a node?
+      if (pos[level] == 0) {
+        K key = open[level].key(0);  // new key parents should have
+        for (int j=level-1; j>=0; j--) {
+          if (open[j].index_item(pos[j]).key == key)
+            break;  /* it's the same key, we can stop fixing */
+          open[j].index_item(pos[j]).key = key;
+          if (pos[j] > 0) break;  /* last in position 0.. */
+        }
+      }
+    }
+
+  public:
+    void remove() {   // delete the item under the cursor at the current level
+      dirty();
+      // (the whole open path is dirtied before anything is modified)
+      // remove from node
+      open[level].remove_at_pos( pos[level] );
+      repair_parents();
+      
+      // was it a key?  (only removals at the leaf level change the key count)
+      if (level == table->depth-1)
+        table->nkeys--;
+    }
+
+    void insert(K key, V value) {   // add (key, value) at the cursor position; the node must have room
+      dirty();
+      // leaf-only: insert_at_leaf_pos asserts that the open node is a leaf
+      // insert
+      open[level].insert_at_leaf_pos(pos[level], key, value);
+      repair_parents();
+      
+      // was it a key?
+      if (level == table->depth-1)
+        table->nkeys++;
+    }
+    // move this node's first item onto the end of its left sibling.  0 on success, -1 if not possible.
+    int rotate_left() {
+      if (level == 0) return -1;         // i am root
+      if (pos[level-1] == 0) return -1;  // nothing to left
+      
+      Nodeptr here = open[level];
+      Nodeptr parent = open[level-1];
+      Nodeptr left = table->pool.get_node( parent.index_item(pos[level-1] - 1).node );
+      if (left.size() == left.max_items()) return -1;  // it's full
+
+      // make both dirty
+      dirty();
+      if (!left.node->is_dirty()) {
+        table->pool.dirty_node(left.node);
+        parent.index_item(pos[level-1]-1).node = left.get_id();
+      }
+      // NOTE(review): Table::insert() disables its rotation path with "if (false)", citing an unlocated bug.
+      dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << endl;
+      
+      /* add */
+      if (here.node->is_leaf())
+        left.append_item(here.leaf_item(0));
+      else
+        left.append_item(here.index_item(0));
+
+      /* remove */
+      here.remove_at_pos(0);
+
+      /* fix parent index for me */
+      parent.index_item( pos[level-1] ).key = here.key(0);
+      // we never have to update past immediate parent, since we're not at pos 0
+      
+      /* adjust cursor */
+      if (pos[level] > 0) 
+        pos[level]--;  
+      //else
+      //assert(1); /* if we were positioned here, we're equal */
+      /* if it was 0, then the shifted item == our key, and we can stay here safely. */
+      return 0;
+    }
+    int rotate_right() {   // move this node's last item to the front of its right sibling; 0 ok, -1 if not possible
+      if (level == 0) return -1;         // i am root
+      if (pos[level-1] + 1 >= open[level-1].size()) return -1;  // nothing to right
+      
+      Nodeptr here = open[level];
+      Nodeptr parent = open[level-1];
+      Nodeptr right = table->pool.get_node( parent.index_item( pos[level-1] + 1 ).node );
+      if (right.size() == right.max_items()) return -1;  // it's full
+      
+      // make both dirty
+      dirty();
+      if (!right.node->is_dirty()) {
+        table->pool.dirty_node(right.node);
+        parent.index_item( pos[level-1]+1 ).node = right.get_id();
+      }
+      // cursor sitting in the append slot: nothing is moved, the cursor hops to slot 0 of the sibling
+      if (pos[level] == here.size()) {
+        /* let's just move the cursor over! */
+        //if (sizeof(K) == 8)
+          dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << endl;
+        open[level] = right;
+        pos[level] = 0;
+        pos[level-1]++;
+        return 0;
+      }
+
+      //if (sizeof(K) == 8)
+      dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from "
+             << here.get_id() << " to " << right.get_id() << endl;
+      
+      /* add */
+      if (here.is_index())
+        right.insert_at_index_pos(0, 
+                                  here.index_item( here.size()-1 ).key,
+                                  here.index_item( here.size()-1 ).node);
+      else
+        right.insert_at_leaf_pos(0, 
+                                 here.leaf_item( here.size()-1 ).key,
+                                 here.leaf_item( here.size()-1 ).value);
+      
+      /* remove */
+      here.set_size(here.size() - 1);
+
+      /* fix parent index for right */
+      parent.index_item( pos[level-1] + 1 ).key = right.key(0);
+      // (the cursor's own level/pos are left unchanged on this path)
+      return 0;
+    }
+  };
+
+
+ public:
+  bool almost_full() {
+    // insert() may split every level and add a new root (depth+1 nodes);
+    // require twice that many free nodes.  (worst case, plus some.)
+    return 2*(depth+1) > pool.num_free();
+  }
+  // find(key, cursor): walk from the root to the leaf slot for key.  returns Cursor::MATCH, INSERT or OOB.
+  int find(K key, Cursor& cursor) {
+    dbtout << "find " << key << endl;
+    // no tree at all: the cursor is not touched
+    if (depth == 0)
+      return Cursor::OOB;
+
+    // init
+    cursor.level = 0;
+    
+    // start at root
+    Nodeptr curnode( pool.get_node(root) );
+    cursor.open[0] = curnode;
+    // (-1 has the same value as Cursor::OOB; cursor.pos[0] is not written on this path)
+    if (curnode.size() == 0) return -1;  // empty!
+
+    // find leaf
+    for (cursor.level = 0; cursor.level < depth-1; cursor.level++) {
+      /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6  (err to the left) */
+      int left = 0;                        /* i >= left */
+      int right = curnode.size()-1;        /* i < right */
+      while (left < right) {
+        int i = left + (right - left) / 2;
+        if (curnode.index_item(i).key < key) {
+          left = i + 1;
+        } else if (i && curnode.index_item(i-1).key >= key) {
+          right = i;
+        } else {
+          left = right = i;
+          break;
+        }
+      }
+      int i = left;
+      if (i && curnode.index_item(i).key > key) i--;
+      
+#ifdef EBOFS_DEBUG_BTREE
+      int j;
+      for (j=0; j<curnode.size()-1; j++) { 
+        if (curnode.index_item(j).key == key) break;  /* perfect */
+        if (curnode.index_item(j+1).key > key) break;
+      }
+      if (i != j) {
+        dbtout << "btree binary search failed" << endl;
+        i = j;
+      }
+#endif
+
+      cursor.pos[cursor.level] = i;   
+      
+      /* get child node */
+      curnode = pool.get_node( cursor.open[cursor.level].index_item(i).node );
+      cursor.open[cursor.level+1] = curnode;
+    }
+
+    /* search leaf */
+    /*  if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6   (err to the right) */
+    int left = 0;                      /* i >= left */
+    int right = curnode.size();        /* i < right */
+    while (left < right) {
+      int i = left + (right - left) / 2;
+      if (curnode.leaf_item(i).key < key) {
+        left = i + 1;
+      } else if (i && curnode.leaf_item(i-1).key >= key) {
+        right = i;
+      } else {
+        left = right = i;
+        break;
+      }
+    }
+    int i = left;
+    
+#ifdef EBOFS_DEBUG_BTREE
+    int j;
+    for (j=0; j<curnode.size(); j++) {
+      if (curnode.leaf_item(j).key >= key) break; 
+    }
+    if (i != j) {
+      dbtout << "btree binary search failed" << endl;
+      i = j;
+    }
+#endif
+    
+    cursor.pos[cursor.level] = i;   /* first key in this node, or key insertion point */
+    // i == size() means key sorts after every item in this leaf
+    if (curnode.size() >= i+1) {
+      if (curnode.leaf_item(i).key == key) {
+        return Cursor::MATCH;   /* it's the actual key */
+      } else {
+        return Cursor::INSERT;   /* it's an insertion point */
+      }
+    }
+    return Cursor::OOB;  /* it's the end of the btree (also a valid insertion point) */
+  }
+
+  int lookup(K key) {
+    // existence test: 0 if key is present, -1 if not.
+    dbtout << "lookup" << endl;
+    Cursor cursor(this);
+    const bool found = (find(key, cursor) == Cursor::MATCH);
+    return found ? 0 : -1;
+  }
+
+  int lookup(K key, V& value) {
+    // copy key's value into 'value' and return 0; -1 (value untouched) if absent.
+    dbtout << "lookup" << endl;
+    Cursor cursor(this);
+    if (find(key, cursor) != Cursor::MATCH)
+      return -1;
+    value = cursor.current().value;
+    return 0;
+  }
+  // insert(key, value): add an entry, splitting nodes as needed.  returns 0, or -1 if the pool is almost full.
+  int insert(K key, V value) {
+    dbtout << "insert " << key << " -> " << value << endl;
+    if (almost_full()) return -1;
+    
+    // empty?
+    if (nkeys == 0) {
+      if (root == -1) {
+	// create a root node (leaf!)
+	assert(depth == 0);
+	Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) );
+	root = newroot.get_id();
+	depth++;
+      }
+      assert(depth == 1);
+      assert(root >= 0);
+    }
+    // NOTE: find()'s result is ignored below, so a key that already exists gets a second entry.
+    // start at/near key
+    Cursor cursor(this);
+    find(key, cursor);
+    
+    // insert loop
+    nodeid_t nodevalue = 0;
+    while (1) {
+      // each pass places (key,value) in a leaf, or (key,nodevalue) in an index node after a split below
+      /* room in this node? */
+      if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) {
+        if (cursor.open[cursor.level].is_leaf())
+          cursor.insert( key, value );   // will dirty, etc.
+        else {
+          // indices are already dirty
+          cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue);
+        }
+        verify("insert 1");
+        return 0;
+      }
+      
+      /* this node is full. */
+      assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() );
+
+      /* can we rotate? */
+      if (false)      // NO! there's a bug in here somewhere, don't do it.
+      if (cursor.level > 0) {
+        if ((cursor.pos[cursor.level-1] > 0 
+             && cursor.rotate_left() >= 0) ||
+            (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size()
+             && cursor.rotate_right() >= 0)) {
+          
+          if (cursor.open[cursor.level].is_leaf())
+            cursor.insert( key, value );   // will dirty, etc.
+          else {
+            // indices are already dirty
+            cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue);
+          }
+          verify("insert 2");
+          return 0;
+        }
+      }
+
+      /** split node **/
+
+      if (cursor.level == depth-1) {
+        dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << endl;
+      } else {
+        dbtout << "splitting index " << cursor.open[cursor.level].get_id() << endl;
+      }
+      
+      cursor.dirty();
+      
+      // split
+      Nodeptr leftnode = cursor.open[cursor.level];
+      Nodeptr newnode( pool.new_node(leftnode.node->get_type()) );
+      leftnode.split( newnode );
+
+      /* insert our item */
+      if (cursor.pos[cursor.level] > leftnode.size()) {
+        // not with cursor, since this node isn't added yet!
+        if (newnode.is_leaf()) {
+          newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(),
+                                      key, value );
+          nkeys++;
+        } else {
+          newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(),
+                                       key, nodevalue );
+        }
+      } else {
+        // with cursor (if leaf)
+        if (leftnode.is_leaf())
+          cursor.insert( key, value );
+        else 
+          leftnode.insert_at_index_pos( cursor.pos[cursor.level],
+                                        key, nodevalue );
+      }
+
+      /* are we at the root? */
+      if (cursor.level == 0) {
+        /* split root. */
+        dbtout << "that split was the root " << root << endl;
+        Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) );
+        
+        /* new root node */
+        newroot.set_size(2);
+        newroot.index_item(0).key = leftnode.key(0);
+        newroot.index_item(0).node = root;
+        newroot.index_item(1).key = newnode.key(0);
+        newroot.index_item(1).node = newnode.get_id();
+        
+        /* heighten tree */
+        depth++;
+        root = newroot.get_id();
+        verify("insert 3");
+        return 0;
+      }
+
+      /* now insert newindex in level-1 */
+      nodevalue = newnode.get_id();
+      key = newnode.key(0);
+      cursor.level--;
+      cursor.pos[cursor.level]++;   // ...to the right of leftnode!
+    }
+  }
+
+  // remove(key): delete key's entry and rebalance.  returns 0; a missing key or a nearly full pool hits assert(0).
+  int remove(K key) {
+    dbtout << "remove " << key << endl;
+
+    if (almost_full()) {
+      cout << "table almost full, failing" << endl;
+      assert(0);
+      return -1;
+    }
+    
+    Cursor cursor(this);
+    if (find(key, cursor) <= 0) {
+      cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << endl;
+      g_conf.debug_ebofs = 33;
+      g_conf.ebofs_verify = true;   // force the verify() below to run before asserting
+      verify("remove dne"); 
+      assert(0);
+      return -1;  // key dne
+    }
+
+    // each pass deletes the cursor's item at the current level, then fixes up that node
+    while (1) {
+      cursor.remove();
+      
+      // balance + adjust
+      
+      if (cursor.level == 0) {
+        // useless root index?
+        if (cursor.open[0].size() == 1 &&
+            depth > 1) {
+          depth--;
+          root = cursor.open[0].index_item(0).node;
+          pool.release( cursor.open[0].node );
+        }
+
+        // note: root can be small, but not empty
+        else if (nkeys == 0) {
+          assert(cursor.open[cursor.level].size() == 0);
+          assert(depth == 1);
+          root = -1;
+          depth = 0;
+          pool.release(cursor.open[0].node);
+        }
+        verify("remove 1");
+        return 0;
+      }
+      
+      if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) {
+        verify("remove 2");
+        return 0;
+      }
+      // node is at or below min_items: borrow one item from a sibling, else merge with it
+      // borrow from siblings?
+      Nodeptr left;
+      Nodeptr right;
+
+      // left?
+      if (cursor.pos[cursor.level-1] > 0) {
+        int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node;
+        left = pool.get_node( left_loc );
+
+        if (left.size() > left.min_items()) {
+          /* move cursor left, shift right */
+          cursor.pos[cursor.level] = 0;
+          cursor.open[cursor.level] = left;
+          cursor.pos[cursor.level-1]--;
+          cursor.rotate_right();
+          verify("remove 3");
+          return 0;
+        }
+        
+        /* combine to left */
+        right = cursor.open[cursor.level];
+      }
+      else {
+        assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1);
+        int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node;
+        right = pool.get_node( right_loc );
+        
+        if (right.size() > right.min_items()) {
+          /* move cursor right, shift an item left */
+          cursor.pos[cursor.level] = 1;
+          cursor.open[cursor.level] = right;
+          cursor.pos[cursor.level-1]++;
+          cursor.rotate_left();
+          verify("remove 4");
+          return 0;
+        }
+        
+        /* combine to left */
+        left = cursor.open[cursor.level];
+        cursor.pos[cursor.level-1]++;  /* move cursor to (soon-to-be-empty) right side item */
+      }
+
+      // note: cursor now points to _right_ node.
+      
+      /* combine (towards left) 
+       * (this makes it so our next delete will be in the index 
+       * interior, which is less scary.)
+       */
+      dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << endl;
+
+      left.merge(right);
+      // NOTE(review): the merge above writes into left before either node is marked dirty -- confirm that is safe
+      // dirty left + right
+      cursor.dirty();            // right
+      if (!left.node->is_dirty()) {
+        pool.dirty_node(left.node);
+        cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id();
+      }
+
+      pool.release(right.node);
+      
+      cursor.level--;  // now point to the link to the obsolete (right-side) sib */
+    }
+
+  }
+
+  void clear(Cursor& cursor, int node_loc, int level) {
+    // release the subtree rooted at node_loc (which sits at depth 'level'),
+    // children first.  cursor just records the path being walked.
+    dbtout << "clear" << endl;
+
+    Nodeptr node = pool.get_node( node_loc );
+    cursor.open[level] = node;
+
+    // hose children?  (only levels above the leaves have any)
+    const bool has_children = (level < depth-1);
+    for (int i=0; has_children && i<node.size(); i++) {
+      cursor.pos[level] = i;
+      const nodeid_t child = node.index_item(i).node;
+      clear( cursor, child, level+1 );
+    }
+
+    // hose myself
+    pool.release( node.node );
+  }
+  
+  void clear() {
+    // release every node, then reset to the empty state.
+    if (depth == 0 && root == -1) return;   // already empty!
+    Cursor cursor(this);
+    clear(cursor, root, 0);
+    nkeys = depth = 0;
+    root = -1;
+  }
+  // verify_sub: check the subtree at node_loc; adds its leaf items to count, returns the number of problems found.
+  int verify_sub(Cursor& cursor, int node_loc, int level, int& count, K& last, const char *on) {
+    int err = 0;
+
+    Nodeptr node = pool.get_node( node_loc );
+    cursor.open[level] = node;
+    // NOTE: key(0) is read below even if the node holds no items
+    // identify max, min, and validate key range
+    K min = node.key(0);
+    last = min;
+    K max = min;
+    for (int i=0; i<node.size(); i++) {
+      if (i && node.key(i) <= last) {
+        dbtout << ":: key " << i << " " << hex << node.key(i) << dec << " in node " << node_loc 
+               << " is out of order, last is " << hex << last << dec << endl;
+        err++;
+      }
+      if (node.key(i) > max)
+        max = node.key(i);
+      
+      if (level < depth-1) {   
+        // index
+        cursor.pos[level] = i;
+        err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on );
+      } else {
+        // leaf
+        count++;
+        last = node.key(i);
+      }
+    }
+    
+    if (level) {
+      // verify that parent's keys are appropriate
+      if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) {
+        dbtout << ":: key in index node " << cursor.open[level-1].get_id()
+               << " != min in child " << node_loc 
+               << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key
+               << ", min is " << min << ")" << dec << endl;
+        err++;
+      }
+      if (cursor.pos[level-1] < cursor.open[level-1].size()-1) {
+        if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) {
+          dbtout << ":: next key in index node " << cursor.open[level-1].get_id()
+                 << " < max in child " << node_loc 
+                 << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key
+                 << ", max is " << max << ")" << dec << endl;
+          err++;
+        }
+      }
+    }
+    
+    //return err;
+    
+    // print it  (s is an indent prefix of level+1 spaces, capped at the 11 spaces copied in)
+    char s[1000];
+    strcpy(s,"           ");
+    s[level+1] = 0;
+    if (1) {
+      if (root == node_loc) {
+        dbtout << s << "root " << node_loc << ": "
+               << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl;
+      } else if (level == depth-1) {
+        dbtout << s << "leaf " << node_loc << ": "
+               << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl;
+      } else {
+        dbtout << s << "indx " << node_loc << ": "
+               << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl;
+      }
+
+      if (0) {
+        for (int i=0; i<node.size(); i++) {
+          if (level < depth-1) {          // index
+            dbtout << s << "   " << hex << node.key(i) << " [" << node.index_item(i).node << "]" << dec << endl;
+          } else {          // leaf
+            dbtout << s << "   " << hex << node.key(i) << " -> " << node.leaf_item(i).value << dec << endl;
+          }
+        }
+      }
+    }
+    
+    return err;
+  }
+
+  void verify(const char *on) {
+    // walk the whole tree checking key order, parent keys and the key count.
+    // only runs when g_conf.ebofs_verify is set; 'on' names the caller.
+    if (!g_conf.ebofs_verify) return;
+    if (root == -1 && depth == 0) return;   // empty!
+
+    // first pass, with debug output switched off
+    const int saved_debug = g_conf.debug_ebofs;
+    g_conf.debug_ebofs = 0;
+
+    int count = 0;
+    K last;
+    Cursor cursor(this);
+    int err = verify_sub(cursor, root, 0, count, last, on);
+    if (count != nkeys) {
+      cerr << "** count " << count << " != nkeys " << nkeys << endl;
+      err++;
+    }
+
+    g_conf.debug_ebofs = saved_debug;
+
+    // ok?
+    if (err == 0) return;
+
+    cerr << "verify failure, called by '" << on << "'" << endl;
+    g_conf.debug_ebofs = 30;
+
+    // do it again, so we definitely get the dump.
+    int recount = 0;
+    K relast;
+    Cursor recursor(this);
+    verify_sub(recursor, root, 0, recount, relast, on);
+
+    assert(err == 0);
+  }
+
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/ebofs/mkfs.ebofs.cc b/branches/sage/cephmds2/ebofs/mkfs.ebofs.cc
new file mode 100644
index 0000000000000..af5f57842068a
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/mkfs.ebofs.cc
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <iostream>
+#include "ebofs/Ebofs.h"
+
+
+int main(int argc, char **argv)
+{
+  // args: parse_config_options() sees them first; args[0] is then taken as the device file
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  if (args.size() < 1) {
+    cerr << "usage: mkfs.ebofs [options] <device file>" << endl;
+    return -1;
+  }
+  char *filename = args[0];
+
+  // mkfs
+  Ebofs mfs(filename);
+  int r = mfs.mkfs();
+  if (r < 0) exit(r);
+
+  if (args.size() > 1) {   // pass an extra arg of some sort to trigger the test crapola
+    // test-o-rama!
+    Ebofs fs(filename);
+    if (fs.mount() < 0) return -1;   // never sync/umount a filesystem that failed to mount
+    
+    /*
+    if (1) {
+      // partial write tests
+      char crap[1024*1024];
+      memset(crap, 0, 1024*1024);
+
+      bufferlist small;
+      small.append(crap, 10);
+      bufferlist med;
+      med.append(crap, 1000);
+      bufferlist big;
+      big.append(crap, 1024*1024);
+
+      cout << "0" << endl;
+      fs.write(10, 0, 1024*1024, big, (Context*)0);
+      fs.sync();
+      fs.trim_buffer_cache();
+
+      cout << "1" << endl;
+      fs.write(10, 10, 10, small, 0);
+      fs.write(10, 1, 1000, med, 0);
+      fs.sync();
+      fs.trim_buffer_cache();
+
+      cout << "2" << endl;
+      fs.write(10, 10, 10, small, 0);
+      //fs.sync();
+      fs.write(10, 1, 1000, med, 0);
+      fs.sync();
+      fs.trim_buffer_cache();
+
+      cout << "3" << endl;
+      fs.write(10, 1, 1000, med, 0);
+      fs.write(10, 10000, 10, small, 0);
+      fs.truncate(10, 100, 0);
+      fs.sync();
+      fs.trim_buffer_cache();
+
+      cout << "4" << endl;
+      fs.remove(10);
+      fs.sync();
+      fs.write(10, 10, 10, small, 0);
+      fs.sync();
+      fs.write(10, 1, 1000, med, 0);
+      fs.sync();
+      fs.truncate(10, 100, 0);
+      fs.write(10, 10, 10, small, 0);
+      fs.trim_buffer_cache();
+
+      
+
+    }
+
+    if (0) { // onode write+read test
+      bufferlist bl;
+      char crap[1024*1024];
+      memset(crap, 0, 1024*1024);
+      bl.append(crap, 10);
+
+      fs.write(10, 10, 0, bl, (Context*)0);
+      fs.umount();
+
+      Ebofs fs2(filename);
+      fs2.mount();
+      fs2.read(10, 10, 0, bl);
+      fs2.umount();
+
+      return 0;
+    }
+
+
+    if (0) {  // small write + read test
+      bufferlist bl;
+      char crap[1024*1024];
+      memset(crap, 0, 1024*1024);
+
+      object_t oid = 10;
+      int n = 10000;
+      int l = 128;
+      bl.append(crap, l);
+
+
+      char *p = bl.c_str();
+      off_t o = 0;
+      for (int i=0; i<n; i++) {
+        cout << "write at " << o << endl;
+        for (int j=0;j<l;j++) 
+          p[j] = (char)(oid^(o+j));
+        fs.write(oid, l, o, bl, (Context*)0);
+        o += l;
+      }
+
+      fs.sync();
+      fs.trim_buffer_cache();
+
+      o = 0;
+      for (int i=0; i<n; i++) {
+        cout << "read at " << o << endl;
+        bl.clear();
+        fs.read(oid, l, o, bl);
+        
+        char b[l];
+        bl.copy(0, l, b);
+        char *p = b;
+        int left = l;
+        while (left--) {
+          assert(*p == (char)(o ^ oid));
+          o++;
+          p++;
+        }
+      }
+
+    }
+
+    if (0) { // big write speed test
+      bufferlist bl;
+      char crap[1024*1024];
+      memset(crap, 0, 1024*1024);
+      bl.append(crap, 1024*1024);
+      
+      int megs = 1000;
+
+      utime_t start = g_clock.now();
+
+      for (off_t m=0; m<megs; m++) {
+        //if (m%100 == 0)
+          cout << m << " / " << megs << endl;
+        fs.write(10, bl.length(), 1024LL*1024LL*m, bl, (Context*)0);
+      }      
+      fs.sync();
+
+      utime_t end = g_clock.now();
+      end -= start;
+
+      dout(1) << "elapsed " << end << endl;
+      
+      float mbs = (float)megs / (float)end;
+      dout(1) << "mb/s " << mbs << endl;
+    }
+    
+    if (0) {  // test
+      bufferlist bl;
+      char crap[10000];
+      memset(crap, 0, 10000);
+      bl.append(crap, 10000);
+      fs.write(10, bl.length(), 200, bl, (Context*)0);
+      fs.trim_buffer_cache();
+      fs.write(10, bl.length(), 5222, bl, (Context*)0);
+      sleep(1);
+      fs.trim_buffer_cache();
+      fs.write(10, 5000, 3222, bl, (Context*)0);
+    }
+    
+    // test small writes
+    if (0) {
+      char crap[1024*1024];
+      memset(crap, 0, 1024*1024);
+      bufferlist bl;
+      bl.append(crap, 1024*1024);
+      
+      // random write
+      if (1) {
+        srand(0);
+        for (int i=0; i<10000; i++) {
+          off_t off = rand() % 1000000;
+          size_t len = 1+rand() % 10000;
+          cout << endl << i << " writing bit at " << off << " len " << len << endl;
+          fs.write(10, len, off, bl, (Context*)0);
+          //fs.sync();
+          //fs.trim_buffer_cache();
+        }
+        fs.remove(10);
+        for (int i=0; i<100; i++) {
+          off_t off = rand() % 1000000;
+          size_t len = 1+rand() % 10000;
+          cout << endl << i << " writing bit at " << off << " len " << len << endl;
+          fs.write(10, len, off, bl, (Context*)0);
+          //fs.sync();
+          //fs.trim_buffer_cache();
+        }
+      }
+      
+      if (0) {
+        // sequential write
+        srand(0);
+        off_t off = 0;
+        for (int i=0; i<10000; i++) {
+          size_t len = 1024*1024;//1+rand() % 10000;
+          cout << endl << i << " writing bit at " << off << " len " << len << endl;
+          fs.write(10, len, off, bl, (Context*)0);
+          off += len;
+        }
+
+      }
+      
+      
+      if (0) {
+        // read
+        srand(0);
+        for (int i=0; i<100; i++) {
+          bufferlist bl;
+          off_t off = rand() % 1000000;
+          size_t len = rand() % 1000;
+          cout << endl << "read bit at " << off << " len " << len << endl;
+          int r = fs.read(10, len, off, bl);
+          assert(bl.length() == len);
+          assert(r == (int)len);
+        }
+      }
+      
+      // flush
+      fs.sync();
+      fs.trim_buffer_cache();
+      //fs.trim_buffer_cache();
+      
+      if (0) {
+        // read again
+        srand(0);
+        for (int i=0; i<100; i++) {
+          bufferlist bl;
+          off_t off = rand() % 1000000;
+          size_t len = 100;
+          cout << endl << "read bit at " << off << " len " << len << endl;
+          int r = fs.read(10, len, off, bl);
+          assert(bl.length() == len);
+          assert(r == (int)len);
+        }
+        
+        // flush
+        fs.sync();
+        fs.trim_buffer_cache();
+      }
+      
+      if (0) {
+        // write on empty cache
+        srand(0);
+        for (int i=0; i<100; i++) {
+          off_t off = rand() % 1000000;
+          size_t len = 100;
+          cout << endl <<  "writing bit at " << off << " len " << len << endl;
+          fs.write(10, len, off, bl, (Context*)0);
+        }
+      }
+      
+    }
+    */
+    
+    fs.sync();
+    fs.trim_buffer_cache();
+    
+    fs.umount();
+  }
+
+  return 0;
+}
+
+    
diff --git a/branches/sage/cephmds2/ebofs/nodes.h b/branches/sage/cephmds2/ebofs/nodes.h
new file mode 100644
index 0000000000000..01ad53848ca3d
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/nodes.h
@@ -0,0 +1,583 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __EBOFS_NODES_H
+#define __EBOFS_NODES_H
+
+/** nodes, node regions **/
+
+#include "types.h"
+#include "BlockDevice.h"
+
+
+/*
+
+     disk     wire    memory                
+
+     free             free    -> free             can alloc
+     free             used    -> dirty            can modify
+
+     free     used    used    -> tx
+     free     used    free    -> limbo 
+
+     used             used    -> clean
+     used             free    -> limbo
+
+
+        // meaningless
+     used     free    free    -> free             can alloc
+     used     free    used    __DNE__
+
+
+*/
+
+#undef debofs   // redefined with an if/else shape so that an 'else' following a debofs() statement cannot bind to the macro's 'if'
+#define debofs(x) if (!((x) < g_conf.debug_ebofs)) {} else cout << "ebofs.nodepool."
+
+
+class Node {
+ public:
+  // node states (plain values, compared with ==)
+  static const int STATE_CLEAN = 1;
+  static const int STATE_DIRTY = 2;
+  static const int STATE_TX = 3;
+
+  static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int);
+
+  static const int TYPE_INDEX = 1;
+  static const int TYPE_LEAF = 2;
+
+ protected:
+  nodeid_t    id;
+  int         state;     // one of the STATE_* values above
+
+  bufferptr   bptr;          // current node contents
+  bufferptr   shadow_bptr;   // previous buffer, parked here by make_shadow()
+
+  // these point into bptr: record count first, node type right after it
+  int         *type;
+  int         *nrecs;
+
+ private:
+  // aim the header pointers at the start of the current bptr
+  void bind_header() {
+    char *base = bptr.c_str();
+    nrecs = (int*)base;
+    type = (int*)(base + sizeof(*nrecs));
+  }
+
+ public:
+  Node(nodeid_t i, bufferptr& b, int s) : id(i), state(s), bptr(b) { bind_header(); }
+
+  // id
+  nodeid_t get_id() const { return id; }
+  void set_id(nodeid_t n) { id = n; }
+
+  // buffer; items start right after the two header ints
+  bufferptr& get_buffer() { return bptr; }
+  char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); }
+
+  // size (number of records, stored in the buffer)
+  int size() { return *nrecs; }
+  void set_size(int s) { *nrecs = s; }
+
+  // type (stored in the buffer)
+  int& get_type() { return *type; }
+  void set_type(int t) { *type = t; }
+  bool is_index() { return TYPE_INDEX == *type; }
+  bool is_leaf() { return TYPE_LEAF == *type; }
+
+  // state
+  bool is_dirty() { return STATE_DIRTY == state; }
+  bool is_tx() { return STATE_TX == state; }
+  bool is_clean() { return STATE_CLEAN == state; }
+  void set_state(int s) { state = s; }
+
+  // Give a tx node a private copy of its contents: the old buffer is kept
+  // in shadow_bptr and bptr becomes a fresh page-aligned duplicate.
+  void make_shadow() {
+    assert(is_tx());
+
+    shadow_bptr = bptr;
+    bptr = buffer::create_page_aligned(EBOFS_NODE_BYTES);
+    memcpy(bptr.c_str(), shadow_bptr.c_str(), EBOFS_NODE_BYTES);
+
+    // header pointers must follow the new buffer
+    bind_header();
+  }
+
+};
+
+
+
+
+
+class NodePool {
+ protected:
+  map<nodeid_t, Node*>  node_map;      // open node map
+  
+ public:
+  vector<Extent> region_loc;    // region locations
+  Extent         usemap_even;
+  Extent         usemap_odd;
+  
+ protected:
+  // on-disk block states
+  int num_nodes;
+  set<nodeid_t> free;
+  set<nodeid_t> dirty;
+  set<nodeid_t> tx;
+  set<nodeid_t> clean;       // aka used
+  set<nodeid_t> limbo;
+  
+  Mutex        &ebofs_lock;
+  Cond          commit_cond;
+  int           flushing;
+
+  // node id layout: region number above bit 24, offset within the region in the low 24 bits
+  static int make_nodeid(int region, int offset) { return offset | (region << 24); }
+  static int nodeid_region(nodeid_t nid) {
+    return nid >> 24;            // drop the 24 offset bits
+  }
+  static int nodeid_offset(nodeid_t nid) {
+    const int offset_mask = (1 << 24) - 1;
+    return nid & offset_mask;    // keep only the 24 offset bits
+  }
+
+
+ public:
+  NodePool(Mutex &el) :      // el is kept by reference as ebofs_lock
+    num_nodes(0),
+    ebofs_lock(el),
+    flushing(0) {}
+  ~NodePool() {
+    // nodes: release (and delete) whatever is still in node_map
+    release_all();
+  }
+
+  int num_free() { return free.size(); }
+  int num_dirty() { return dirty.size(); }
+  int num_limbo() { return limbo.size(); }
+  int num_tx() { return tx.size(); }
+  int num_clean() { return clean.size(); }
+  int num_total() { return num_nodes; }    // sum of all region lengths
+  int num_used() { return num_clean() + num_dirty() + num_tx(); }
+
+  int get_usemap_len(int n=0) {            // blocks needed for a bitmap of n bits (one bit per node)
+    if (n == 0) n = num_nodes;             // default: size it for the current pool
+    return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1;
+  }
+
+  int num_regions() { return region_loc.size(); }
+
+  // Append a node region; every slot in it starts out free.
+  // the caller had better adjust usemap locations...
+  void add_region(Extent ex) {
+    assert(ex.length <= (1 << 24));   // offsets must fit the 24-bit nodeid field
+    const int new_region = region_loc.size();
+    region_loc.push_back(ex);
+    num_nodes += ex.length;
+    for (unsigned slot = 0; slot < ex.length; ++slot)
+      free.insert(make_nodeid(new_region, slot));
+  }
+  
+  // Load region and usemap locations from a struct ebofs_nodepool.  Always returns 0.
+  int init(struct ebofs_nodepool *np) {
+    // regions: np->region_loc[] has only EBOFS_MAX_NODE_REGIONS slots, so refuse a bogus count
+    assert(region_loc.empty());
+    assert(np->num_regions >= 0 && np->num_regions <= EBOFS_MAX_NODE_REGIONS);
+    num_nodes = 0;
+    for (int i=0; i<np->num_regions; i++) {
+      debofs(3) << "init region " << i << " at " << np->region_loc[i] << endl;
+      region_loc.push_back( np->region_loc[i] );
+      num_nodes += np->region_loc[i].length;
+    }
+
+    usemap_even = np->node_usemap_even;
+    usemap_odd = np->node_usemap_odd;
+    debofs(3) << "init even map at " << usemap_even << endl;
+    debofs(3) << "init  odd map at " << usemap_odd << endl;
+    return 0;
+  }
+
+  // Drop every open node, then forget all region / node-state bookkeeping.
+  void close() {
+    release_all();        // deletes the Node objects and empties node_map
+    node_map.clear();
+    flushing = 0;
+
+    // per-state id sets
+    free.clear();  dirty.clear();  tx.clear();
+    clean.clear(); limbo.clear();
+
+    region_loc.clear();
+  }
+
+
+  // *** blocking i/o routines ***
+
+  // Rebuild the free / clean sets from the stored usemap for this epoch
+  // (odd epochs use usemap_odd, even ones usemap_even).  Always returns 0.
+  int read_usemap(BlockDevice& dev, version_t epoch) {
+    Extent loc;
+    if (epoch & 1) 
+      loc = usemap_odd;
+    else 
+      loc = usemap_even;
+
+    bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length);
+    dev.read(loc.start, loc.length, bp);
+    
+    // parse: one bit per node, most significant bit first.  a region always
+    // starts on a fresh byte; leftover bits of its last byte are ignored.
+    unsigned region = 0;  // current region
+    unsigned roff = 0;    // offset in region
+    for (unsigned byte = 0;
+         byte < bp.length() && region < region_loc.size();   // also covers "no regions at all"
+         byte++) {
+      int x = *(unsigned char*)(bp.c_str() + byte);
+      int mask = 0x80;  // left-most bit
+      for (unsigned bit=0; bit<8; bit++) {
+        nodeid_t nid = make_nodeid(region, roff);
+        if (x & mask)
+          clean.insert(nid);   // bit set: node is in use
+        else
+          free.insert(nid);
+
+        mask = mask >> 1;  // move one bit right.
+        roff++;
+        if (roff == region_loc[region].length) {
+          roff = 0;            // next region!
+          region++;
+          break;
+        }
+      }     
+    }    
+    return 0;
+  }
+
+  int read_clean_nodes(BlockDevice& dev) {
+    /*
+      reads every node listed in the clean set into node_map.  this relies on the
+      clean set being defined, so it only really works when called from mount()!
+    */
+    for (unsigned r=0; r<region_loc.size(); r++) {
+      debofs(3) << "read region " << r << " at " << region_loc[r] << endl;   // debofs() already prints the "ebofs.nodepool." prefix
+      
+      for (block_t boff = 0; boff < region_loc[r].length; boff++) {
+        nodeid_t nid = make_nodeid(r, boff);
+        
+        if (!clean.count(nid)) continue;  
+        debofs(20) << "read  node " << nid << endl;
+
+        bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES);
+        dev.read(region_loc[r].start + (block_t)boff, EBOFS_NODE_BLOCKS, 
+                 bp);
+        
+        Node *n = new Node(nid, bp, Node::STATE_CLEAN);
+        node_map[nid] = n;
+        debofs(10) << "read  node " << nid << " at " << (void*)n << endl;   // id, then the Node's address
+      }
+    }
+    return 0;
+  }
+
+
+
+  // **** non-blocking i/o ****
+
+ private:
+  // block-device completion callback for the usemap write: reports back to the pool
+  class C_NP_FlushUsemap : public BlockDevice::callback {
+    NodePool *owner;
+  public:
+    C_NP_FlushUsemap(NodePool *p) : owner(p) {}
+    void finish(ioh_t ioh, int r) {
+      owner->flushed_usemap();   // ioh and r are not looked at
+    }
+  };
+  
+  // usemap write finished: one fewer outstanding flush; signal commit_cond on the last one
+  void flushed_usemap() {
+    ebofs_lock.Lock();
+    const bool all_done = (--flushing == 0);
+    if (all_done) commit_cond.Signal();
+    ebofs_lock.Unlock();
+  }
+
+ public:
+  // Queue a write of the node usemap (one bit per node, set for clean or
+  // dirty nodes) to the odd or even location chosen by the version's low bit.
+  int write_usemap(BlockDevice& dev, version_t version) {
+    Extent loc;
+    if (version & 1) 
+      loc = usemap_odd;
+    else 
+      loc = usemap_even;
+    
+    // alloc.  zero it first: bytes past the last region are never filled in
+    // below and must not reach disk as uninitialized memory.
+    bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length);
+    bp.zero();
+
+    // fill in
+    unsigned region = 0;  // current region
+    unsigned roff = 0;    // offset in region
+    for (unsigned byte = 0;
+         byte < bp.length() && region < region_loc.size();   // also covers "no regions at all"
+         byte++) {
+      int x = 0;        // start with empty byte
+      int mask = 0x80;  // left-most bit
+      for (unsigned bit=0; bit<8; bit++) {
+        nodeid_t nid = make_nodeid(region, roff);
+        if (clean.count(nid) ||
+            dirty.count(nid))
+          x |= mask;
+        roff++;
+        mask = mask >> 1;
+        if (roff == region_loc[region].length) {
+          roff = 0;     // next region!
+          region++;
+          break;
+        }
+      }
+      *(unsigned char*)(bp.c_str() + byte) = x;
+    }
+
+    // write; the callback reports completion via flushed_usemap()
+    bufferlist bl;
+    bl.append(bp);
+    dev.write(loc.start, loc.length, bl,
+              new C_NP_FlushUsemap(this), "usemap");
+    return 0;
+  }
+
+
+
+  // *** node commit ***
+ private:
+ 
+  // block-device completion callback for one node write: tells the pool which node it was
+  class C_NP_FlushNode : public BlockDevice::callback {
+    NodePool *owner;
+    nodeid_t  flushed;
+  public:
+    C_NP_FlushNode(NodePool *p, nodeid_t n) : owner(p), flushed(n) {}
+    void finish(ioh_t ioh, int r) {
+      owner->flushed_node(flushed);   // ioh and r are not looked at
+    }
+  };
+
+  // Completion of one node write started by commit_start().
+  void flushed_node(nodeid_t nid) {
+    ebofs_lock.Lock();
+
+    if (!tx.count(nid)) {
+      // no longer tx: it was dirtied or released meanwhile, so it sits in limbo
+      assert(limbo.count(nid));
+    } else {
+      // tx -> clean, both in the id sets and on the node itself
+      tx.erase(nid);
+      clean.insert(nid);
+      node_map[nid]->set_state(Node::STATE_CLEAN);
+    }
+
+    // last outstanding flush signals commit_cond, which commit_wait() waits on
+    const bool all_done = (--flushing == 0);
+    if (all_done) commit_cond.Signal();
+
+    ebofs_lock.Unlock();
+  }
+
+ public:
+  // Kick off a commit.  Queues the usemap write plus one write per dirty
+  // node (dirty -> tx) and recycles limbo ids (limbo -> free).  Nothing is
+  // waited for here; commit_wait() blocks until the flush count drains.
+  void commit_start(BlockDevice& dev, version_t version) {
+    dout(20) << "ebofs.nodepool.commit_start start" << endl;
+
+    // the previous commit must have fully drained
+    assert(flushing == 0);
+    /*if (0)
+      for (unsigned i=0; i<region_loc.size(); i++) {
+        int c = dev.count_io(region_loc[i].start, region_loc[i].length);
+        dout(20) << "ebofs.nodepool.commit_start  region " << region_loc[i] << " has " << c << " ios" << endl;
+        assert(c == 0);
+      }
+    */
+
+    // write map (counts as one outstanding flush)
+    flushing++;
+    write_usemap(dev,version & 1);
+
+    // dirty -> tx  (write to disk)
+    assert(tx.empty());
+    set<block_t> didb;   // sanity check debug FIXME: blocks queued so far, none may repeat
+    for (set<nodeid_t>::iterator p = dirty.begin(); p != dirty.end(); ++p) {
+      const nodeid_t nid = *p;
+
+      // flip the in-memory node to tx
+      Node *n = get_node(nid);
+      assert(n);
+      assert(n->is_dirty());
+      n->set_state(Node::STATE_TX);
+
+      // disk block = start of the node's region + its offset within the region
+      const unsigned region = nodeid_region(nid);
+      const block_t b = region_loc[region].start + (block_t)nodeid_offset(nid);
+      assert(didb.count(b) == 0);
+      didb.insert(b);
+
+      // queue the write; flushed_node(nid) runs when it completes
+      bufferlist bl;
+      bl.append(n->get_buffer());
+      dev.write(b, EBOFS_NODE_BLOCKS, 
+                bl,
+                new C_NP_FlushNode(this, nid), "node");
+      flushing++;
+
+      // bookkeeping: the id now lives in tx
+      tx.insert(nid);
+    }
+    dirty.clear();
+
+    // limbo -> free
+    free.insert(limbo.begin(), limbo.end());
+    limbo.clear();
+
+    dout(20) << "ebofs.nodepool.commit_start finish" << endl;
+  }
+
+  // wait on commit_cond (with ebofs_lock) until no flush started by commit_start() is outstanding
+  void commit_wait() {
+    for (; flushing > 0; ) commit_cond.Wait(ebofs_lock);
+    dout(20) << "ebofs.nodepool.commit_wait finish" << endl;
+  }
+
+
+
+
+
+
+   
+
+
+  // *** nodes ***
+  // opened node
+  Node* get_node(nodeid_t nid) {   // look up an already-open node by id
+    //dbtout << "pool.get " << nid << endl;
+    assert(node_map.count(nid));   // only ids already present in node_map may be asked for
+    return node_map[nid];
+  }
+  
+  // unopened node
+  /*  not implemented yet!!
+  Node* open_node(nodeid_t nid) {
+    Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid);
+    dbtout << "pool.open_node " << n->get_id() << endl;
+    node_map[n->get_id()] = n;
+    return n;
+  }
+  */
+  
+  // allocate id/block on disk.  always free -> dirty.
+  nodeid_t alloc_id() {   // always free -> dirty
+    assert(!free.empty());
+    set<nodeid_t>::iterator lowest = free.begin();   // smallest free id
+    const nodeid_t nid = *lowest;
+    free.erase(lowest);
+    dirty.insert(nid);
+    return nid;
+  }
+  
+  // new node
+  Node* new_node(int type) {
+    nodeid_t nid = alloc_id();
+    debofs(15) << "new_node " << nid << endl;   // debofs() already prints the "ebofs.nodepool." prefix
+    // alloc node: page-aligned buffer, zeroed so no stale heap bytes get written out with it
+    bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES);
+    bp.zero();
+    Node *n = new Node(nid, bp, Node::STATE_DIRTY);
+    n->set_type(type);
+    n->set_size(0);
+
+    assert(node_map.count(nid) == 0);
+    node_map[nid] = n;
+    return n;
+  }
+
+  void release(Node *n) {   // forget node n, retire its id according to its state, and delete it
+    const nodeid_t nid = n->get_id();
+    debofs(15) << "release on " << nid << endl;   // debofs() already prints the "ebofs.nodepool." prefix
+    node_map.erase(nid);
+
+    if (n->is_dirty()) {             // dirty -> free
+      assert(dirty.count(nid));
+      dirty.erase(nid);
+      free.insert(nid);
+    } else if (n->is_clean()) {      // clean -> limbo
+      assert(clean.count(nid));
+      clean.erase(nid);
+      limbo.insert(nid);
+    } else if (n->is_tx()) {         // tx -> limbo
+      assert(tx.count(nid));      // i guess this happens? -sage
+      tx.erase(nid);
+      limbo.insert(nid);
+    }
+
+    delete n;
+  }
+
+  void release_all() {   // release() every node still in node_map, lowest id first
+    while (!node_map.empty()) {
+      map<nodeid_t,Node*>::iterator i = node_map.begin();
+      debofs(2) << "release_all leftover " << i->first << " " << i->second << endl;   // macro adds the "ebofs.nodepool." prefix
+      release( i->second );   // erases the entry from node_map, so the loop makes progress
+    }
+    assert(node_map.empty());
+  }
+
+  void dirty_node(Node *n) {
+    // a clean or tx node is about to change: give it a freshly allocated id, send the old id to limbo
+    nodeid_t oldid = n->get_id();
+    nodeid_t newid = alloc_id();
+    debofs(15) << "dirty_node on " << oldid << " now " << newid << endl;   // debofs() already prints the "ebofs.nodepool." prefix
+    
+    // release old block
+    if (n->is_clean()) {
+      assert(clean.count(oldid));
+      clean.erase(oldid);
+    } else {
+      assert(n->is_tx());
+      assert(tx.count(oldid));
+      tx.erase(oldid);
+      
+      // move/copy current -> shadow buffer as necessary
+      n->make_shadow();   
+    }
+    limbo.insert(oldid);
+    node_map.erase(oldid);
+    
+    n->set_state(Node::STATE_DIRTY);
+    
+    // move to new one!
+    n->set_id(newid);
+    node_map[newid] = n;
+  }
+  
+  
+  
+};
+  
+#endif
diff --git a/branches/sage/cephmds2/ebofs/test.ebofs.cc b/branches/sage/cephmds2/ebofs/test.ebofs.cc
new file mode 100644
index 0000000000000..0e6a7625c502a
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/test.ebofs.cc
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <iostream>
+#include "ebofs/Ebofs.h"
+
+bool stop = false;
+
+
+int nt = 0;
+class Tester : public Thread {   // worker thread that fires random operations at a shared Ebofs
+  Ebofs &fs;
+  int t;             // this tester's index, taken from the global counter nt
+  // scratch buffer: write payloads are built here and read results are copied here
+  char b[1024*1024];
+
+public:
+  Tester(Ebofs &e) : fs(e), t(nt) { nt++; }
+  void *entry() {
+    // loop until the global 'stop' flag is set; each pass picks random parameters, then one random operation
+    while (!stop) {
+      object_t oid;
+      oid.ino = (rand() % 10) + 0x10000000;
+      coll_t cid = rand() % 50;
+      off_t off = rand() % 10000;//0;//rand() % 1000000;
+      off_t len = 1+rand() % 100000;
+      char *a = "one";
+      if (rand() % 2) a = "two";
+      int l = 3;//rand() % 10;
+
+      switch (rand() % 10) {
+      case 0:   // read, then check every byte is either 0 or the pattern that case 1 writes
+        {
+	  oid.rev = rand() % 10;
+          cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << endl;
+          bufferlist bl;
+          fs.read(oid, off, len, bl);
+          int l = MIN(len,bl.length());
+          if (l) {
+            cout << t << " got " << l << endl;
+            bl.copy(0, l, b);
+            char *p = b;
+            while (l--) {
+              assert(*p == 0 ||
+                     *p == (char)(off ^ oid.ino));
+              off++;
+              p++;
+            }
+          }
+        }
+        break;
+
+      case 1:   // write: byte j of the payload is (char)(ino ^ (off+j))
+        {
+          cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << endl;
+          for (int j=0;j<len;j++) 
+            b[j] = (char)(oid.ino^(off+j));
+	  bufferptr wp(b, len);
+          bufferlist w;
+          w.append(wp);
+          fs.write(oid, off, len, w, 0);
+        }
+        break;
+
+      case 2:
+        cout << t << " remove " << hex << oid << dec <<  endl;
+        fs.remove(oid);
+        break;
+
+      case 3:
+        cout << t << " collection_add " << hex << oid << dec <<  " to " << cid << endl;
+        fs.collection_add(cid, oid, 0);
+        break;
+
+      case 4:
+        cout << t << " collection_remove " << hex << oid << dec <<  " from " << cid << endl;
+        fs.collection_remove(cid, oid, 0);
+        break;
+
+      case 5:   // the attribute's value is its own name ("one" / "two"), l == 3 bytes
+        cout << t << " setattr " << hex << oid << dec <<  " " << a << " len " << l << endl;
+        fs.setattr(oid, a, (void*)a, l, 0);
+        break;
+        
+      case 6:
+        cout << t << " rmattr " << hex << oid << dec <<  " " << a << endl;
+        fs.rmattr(oid,a);
+        break;
+
+      case 7:   // NOTE(review): the check only runs when getattr() returns 0 -- confirm that is its success value
+        {
+          char v[4];
+          cout << t << " getattr " << hex << oid << dec <<  " " << a << endl;
+          if (fs.getattr(oid,a,(void*)v,3) == 0) {
+            v[3] = 0;
+            assert(strcmp(v,a) == 0);
+          }
+        }
+        break;
+        
+      case 8:   // NOTE(review): the log line prints 'off' but the call passes 0 -- confirm which is intended
+        {
+          cout << t << " truncate " << hex << oid << dec <<  " " << off << endl;
+          fs.truncate(oid, 0);
+        }
+        break;
+
+      case 9:   // clone to the same object with a random rev (last case, so no break)
+	{
+	  object_t newoid = oid;
+	  newoid.rev = rand() % 10;
+	  cout << t << " clone " << oid << " to " << newoid << endl;
+	  fs.clone(oid, newoid, 0);
+	}
+      }
+
+
+    }
+    cout << t << " done" << endl;
+    return 0;
+  }
+};
+
+int main(int argc, char **argv)
+{
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  // args: <device file> <seconds> <threads>; say so instead of exiting silently
+  if (args.size() != 3) { cerr << "usage: test.ebofs [options] <device file> <seconds> <threads>" << endl; return -1; }
+  char *filename = args[0];
+  int seconds = atoi(args[1]);
+  int threads = atoi(args[2]);
+
+  cout << "dev " << filename << " .. " << threads << " threads .. " << seconds << " seconds" << endl;
+
+  Ebofs fs(filename);
+  if (fs.mount() < 0) return -1;
+
+  // the explicit tests below run once, single-threaded, before any Tester thread starts
+  // explicit tests
+  if (1) {
+    // verify that clone() plays nice with partial writes
+    object_t oid(1,1);
+    bufferptr bp(10000);
+    bp.zero();
+    bufferlist bl;
+    bl.push_back(bp);
+    fs.write(oid, 0, 10000, bl, 0);
+
+    fs.sync();
+    fs.trim_buffer_cache();
+
+    // induce a partial write
+    bufferlist bl2;
+    bl2.substr_of(bl, 0, 100);
+    fs.write(oid, 100, 100, bl2, 0);
+
+    // clone it
+    object_t oid2;
+    oid2 = oid;
+    oid2.rev = 1;
+    fs.clone(oid, oid2, 0);
+
+    // ... 
+    if (0) {
+      // make sure partial still behaves after orig is removed...
+      fs.remove(oid, 0);
+
+      // or i read for oid2...
+      bufferlist rbl;
+      fs.read(oid2, 0, 200, rbl);
+    }
+    if (1) {
+      // make sure things behave if we remove the clone
+      fs.remove(oid2,0);
+    }
+  }
+  // /explicit tests
+
+  // start the requested number of Tester threads, all sharing fs
+  list<Tester*> ls;
+  for (int i=0; i<threads; i++) {
+    Tester *t = new Tester(fs);
+    t->create();
+    ls.push_back(t);
+  }
+  // let them run for the requested number of seconds, printing the clock once a second
+  utime_t now = g_clock.now();
+  utime_t dur(seconds,0);
+  utime_t end = now + dur;
+  cout << "stop at " << end << endl;
+  while (now < end) {
+    sleep(1);
+    now = g_clock.now();
+    cout << now << endl;
+  }
+
+  cout << "stopping" << endl;
+  stop = true;
+  // join and free every tester before unmounting
+  while (!ls.empty()) {
+    Tester *t = ls.front();
+    ls.pop_front();
+    t->join();
+    delete t;
+  }
+
+  fs.umount();
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/ebofs/types.h b/branches/sage/cephmds2/ebofs/types.h
new file mode 100644
index 0000000000000..1b85d138ec342
--- /dev/null
+++ b/branches/sage/cephmds2/ebofs/types.h
@@ -0,0 +1,168 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __EBOFS_TYPES_H
+#define __EBOFS_TYPES_H
+
+#include <cassert>
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "common/Cond.h"
+
+#include <ext/hash_map>
+#include <set>
+#include <list>
+#include <vector>
+using namespace std;
+using namespace __gnu_cxx;
+
+
+#include "include/object.h"
+
+
+#ifndef MIN
+# define MIN(a,b)  ((a)<=(b) ? (a):(b))
+#endif
+#ifndef MAX
+# define MAX(a,b)  ((a)>=(b) ? (a):(b))
+#endif
+
+
+/*
+namespace __gnu_cxx {
+  template<> struct hash<unsigned long long> {
+    size_t operator()(unsigned long long __x) const { 
+      static hash<unsigned long> H;
+      return H((__x >> 32) ^ (__x & 0xffffffff)); 
+    }
+  };
+  
+  template<> struct hash< std::string >
+  {
+    size_t operator()( const std::string& x ) const
+    {
+      static hash<const char*> H;
+      return H(x.c_str());
+    }
+  };
+}
+*/
+
+
+// disk
+typedef __uint64_t block_t;        // disk location/sector/block
+
+static const int EBOFS_BLOCK_SIZE = 4096;
+static const int EBOFS_BLOCK_BITS = 12;    // 1<<12 == 4096
+
+// a run of 'length' blocks beginning at block 'start'
+class Extent {
+ public:
+  block_t start, length;
+
+  Extent() : start(0), length(0) {}
+  Extent(block_t s, block_t l) : start(s), length(l) {}
+  block_t end() const { return start + length; }   // one past the final block
+  block_t last() const { return end() - 1; }       // final block; wraps around if length == 0
+};
+
+inline ostream& operator<<(ostream& out, const Extent& ex)   // const&: const extents and temporaries can be streamed too
+{
+  return out << ex.start << "~" << ex.length;   // printed as start~length
+}
+
+
+// tree/set nodes
+typedef int    nodeid_t;
+
+static const int EBOFS_NODE_BLOCKS = 1;
+static const int EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE;
+static const int EBOFS_MAX_NODE_REGIONS = 10;   // pick a better value!
+
+struct ebofs_nodepool {
+  Extent node_usemap_even;   // for even sb versions
+  Extent node_usemap_odd;    // for odd sb versions
+  // node regions: only the first num_regions entries of region_loc are meaningful
+  int    num_regions;
+  Extent region_loc[EBOFS_MAX_NODE_REGIONS];
+};
+
+
+// objects
+
+typedef __uint64_t coll_t;
+
+struct ebofs_onode {
+  Extent     onode_loc;       /* this is actually the block we live in */
+
+  object_t   object_id;       /* for kicks */
+  off_t      object_size;     /* file size in bytes.  should this be 64-bit? */
+  unsigned   object_blocks;
+  bool       readonly;
+  
+  int        num_collections;
+  int        num_attr;        // num attr in onode
+  int        num_extents;     /* number of extents used.  if 0, data is in the onode */
+};
+
+struct ebofs_cnode {
+  Extent     cnode_loc;       /* this is actually the block we live in */
+  coll_t     coll_id;
+  int        num_attr;        // num attr in cnode
+};
+
+
+// table
+struct ebofs_table {
+  nodeid_t root;      /* root node of btree */
+  int      num_keys;  // presumably the number of keys stored in the btree -- TODO confirm
+  int      depth;     // presumably the number of levels in the btree -- TODO confirm
+};
+
+
+// super
+typedef __uint64_t version_t;
+
+static const unsigned EBOFS_MAGIC = 0x000EB0F5;
+
+static const int EBOFS_NUM_FREE_BUCKETS = 5;   /* see alloc.h for bucket constraints */
+static const int EBOFS_FREE_BUCKET_BITS = 2;
+
+
+struct ebofs_super {
+  unsigned s_magic;           // presumably EBOFS_MAGIC for a valid superblock -- TODO confirm
+  
+  unsigned epoch;             // version of this superblock.
+
+  unsigned num_blocks;        /* # blocks in filesystem */
+
+  // some basic stats, for kicks
+  unsigned free_blocks;       /* unused blocks */
+  unsigned limbo_blocks;      /* limbo blocks */
+  //unsigned num_objects;
+  //unsigned num_fragmented;
+  
+  struct ebofs_nodepool nodepool;   // node region and usemap locations
+  
+  // tables (each entry records a btree's root node, key count and depth)
+  struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS];  
+  struct ebofs_table limbo_tab;
+  struct ebofs_table alloc_tab;
+  struct ebofs_table object_tab;      // object directory
+  struct ebofs_table collection_tab;  // collection directory
+  struct ebofs_table co_tab;
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/fakefuse.cc b/branches/sage/cephmds2/fakefuse.cc
new file mode 100644
index 0000000000000..f021d83bac035
--- /dev/null
+++ b/branches/sage/cephmds2/fakefuse.cc
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mon/Monitor.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "client/Client.h"
+#include "client/fuse.h"
+
+#include "common/Timer.h"
+
+#include "msg/FakeMessenger.h"
+
+
+
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+
+// Debug callback: when finished, prints the result code it was given to
+// stdout and does nothing else.
+class C_Test : public Context {
+public:
+  void finish(int r) { cout << "C_Test->finish(" << r << ")" << endl; }
+};
+// Debug callback: prints its result code, then queues a new C_Test on g_timer.
+class C_Test2 : public Context {
+public:
+  void finish(int r) {
+    cout << "C_Test2->finish(" << r << ")" << endl;
+    g_timer.add_event_after(2, new C_Test);   // presumably 2 is the delay before the C_Test fires
+  }
+};
+
+
+// Builds an in-process cluster (monitors, OSDs, MDSs) on FakeMessenger
+// endpoints, then mounts each client and runs FUSE on it.  The args left
+// over after parse_config_options() are what ceph_fuse_main() receives.
+int main(int argc, char **argv) {
+  cerr << "fakefuse starting" << endl;
+
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  // start messenger thread
+  fakemessenger_startthread();
+
+  //g_timer.add_event_after(5.0, new C_Test2);
+  //g_timer.add_event_after(10.0, new C_Test);
+
+  // rebuild argc/argv from args; fuse is given these below
+  vec_to_argv(args, argc, argv);
+
+  MonMap *monmap = new MonMap(g_conf.num_mon);
+
+  // create mon
+  vector<Monitor*> mon(g_conf.num_mon);
+  for (int i=0; i<g_conf.num_mon; i++) {
+    mon[i] = new Monitor(i, new FakeMessenger(MSG_ADDR_MON(i)), monmap);
+  }
+
+  // create osd
+  vector<OSD*> osd(NUMOSD);
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)), monmap);
+  }
+
+  // create mds
+  vector<MDS*> mds(NUMMDS);
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i] = new MDS(i, new FakeMessenger(MSG_ADDR_MDS(i)), monmap);
+  }
+
+  // init: monitors first, then mds, then osd
+  for (int i=0; i<g_conf.num_mon; i++) {
+    mon[i]->init();
+  }
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i]->init();
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i]->init();
+  }
+
+  // create client(s).  Everything for one client happens inside a single
+  // iteration, so client i is shut down before client i+1 is created.
+  vector<Client*> client(NUMCLIENT);
+  for (int i=0; i<NUMCLIENT; i++) {
+    // address the messenger by this client's own index (as fakesyn.cc does);
+    // a constant 0 here would give every client the same address.
+    client[i] = new Client(new FakeMessenger(MSG_ADDR_CLIENT(i)), monmap);
+    client[i]->init();
+
+    // start up fuse
+    // use my argc, argv (make sure you pass a mount point!)
+    cout << "starting fuse on pid " << getpid() << endl;
+    client[i]->mount();
+    ceph_fuse_main(client[i], argc, argv);
+    client[i]->unmount();
+    cout << "fuse finished on pid " << getpid() << endl;
+    client[i]->shutdown();
+  }
+
+
+  // wait for it to finish
+  cout << "DONE -----" << endl;
+  fakemessenger_wait();  // blocks until messenger stops
+
+  // cleanup
+  // NOTE(review): mon[] and monmap are not deleted here; confirm whether that
+  // is intentional before adding deletes.
+  for (int i=0; i<NUMMDS; i++) {
+    delete mds[i];
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    delete osd[i];
+  }
+  for (int i=0; i<NUMCLIENT; i++) {
+    delete client[i];
+  }
+
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/fakemon.cc b/branches/sage/cephmds2/fakemon.cc
new file mode 100644
index 0000000000000..28cad55c071ca
--- /dev/null
+++ b/branches/sage/cephmds2/fakemon.cc
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDCluster.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "mon/Monitor.h"
+#include "client/Client.h"
+
+#include "client/SyntheticClient.h"
+
+#include "msg/FakeMessenger.h"
+
+#include "common/Timer.h"
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// Debug callback: when finished, prints the result code it was given to
+// stdout and does nothing else.
+class C_Test : public Context {
+public:
+  void finish(int r) { cout << "C_Test->finish(" << r << ")" << endl; }
+};
+
+
+// Runs an in-process cluster (monitors, MDSs with optional local OSDs, OSDs)
+// plus NUMCLIENT synthetic clients over FakeMessenger endpoints, waits for the
+// synthetic workloads to complete, then deletes the daemons it created.
+// NOTE(review): the banners say "fakesyn" although this file is fakemon.cc --
+// looks like copy/paste; confirm before renaming them.
+int main(int argc, char **argv) 
+{
+  cerr << "fakesyn start" << endl;
+
+  //cerr << "inode_t " << sizeof(inode_t) << endl;
+
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+
+  parse_config_options(args);
+  parse_syn_options(args);
+
+  // Anything still in args at this point is a stray argument: name each
+  // one, then insist there were none.
+  for (unsigned i=0; i<args.size(); i++) {
+    cerr << " stray arg " << args[i] << endl;
+  }
+  assert(args.empty());
+
+
+  MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
+
+
+  // create mon
+  vector<Monitor*> mon(g_conf.num_mon);
+  for (int i=0; i<g_conf.num_mon; i++) {
+    mon[i] = new Monitor(i, new FakeMessenger(MSG_ADDR_MON(i)));
+  }
+
+
+  // create mds; with mds_local_osd set, each also gets an osd numbered i+10000
+  vector<MDS*> mds(NUMMDS);
+  vector<OSD*> mdsosd(NUMMDS);   // entries stay null unless mds_local_osd is set
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i] = new MDS(mdc, i, new FakeMessenger(MSG_ADDR_MDS(i)));
+    if (g_conf.mds_local_osd)
+      mdsosd[i] = new OSD(i+10000, new FakeMessenger(MSG_ADDR_OSD(i+10000)));
+  }
+
+
+  // create osd
+  vector<OSD*> osd(NUMOSD);
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)));
+  }
+
+
+  // create client
+  vector<Client*> client(NUMCLIENT);
+  vector<SyntheticClient*> syn(NUMCLIENT);
+  for (int i=0; i<NUMCLIENT; i++) {
+    client[i] = new Client(new FakeMessenger(MSG_ADDR_CLIENT(i)));
+  }
+
+
+  // start message loop
+  fakemessenger_startthread();
+
+
+  // init
+  for (int i=0; i<g_conf.num_mon; i++) {
+    mon[i]->init();
+  }
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i]->init();
+    if (g_conf.mds_local_osd)
+      mdsosd[i]->init();
+  }
+
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i]->init();
+  }
+
+
+  // bring up the client(s): init, mount, then start a SyntheticClient
+  // thread on each
+  for (int i=0; i<NUMCLIENT; i++) {
+    client[i]->init();
+
+    //cout << "mounting" << endl;
+    client[i]->mount();
+
+    //cout << "starting synthetic client  " << endl;
+    syn[i] = new SyntheticClient(client[i]);
+
+    syn[i]->start_thread();
+  }
+
+
+  // join the synthetic clients in index order; each client is unmounted and
+  // shut down right after its own synthetic client has been joined
+  for (int i=0; i<NUMCLIENT; i++) {
+
+    cout << "waiting for synthetic client " << i << " to finish" << endl;
+    syn[i]->join_thread();
+    delete syn[i];
+
+    client[i]->unmount();
+    //cout << "unmounted" << endl;
+    client[i]->shutdown();
+  }
+
+
+  // wait for it to finish
+  fakemessenger_wait();
+
+
+  // cleanup
+  // NOTE(review): mon[] and mdsosd[] are not deleted here; confirm whether
+  // that is intentional before adding deletes.
+  for (int i=0; i<NUMMDS; i++) {
+    delete mds[i];
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    delete osd[i];
+  }
+  for (int i=0; i<NUMCLIENT; i++) {
+    delete client[i];
+  }
+  delete mdc;
+
+
+  cout << "fakesyn done" << endl;
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/fakesyn.cc b/branches/sage/cephmds2/fakesyn.cc
new file mode 100644
index 0000000000000..312ad8b345ef8
--- /dev/null
+++ b/branches/sage/cephmds2/fakesyn.cc
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "mon/Monitor.h"
+#include "client/Client.h"
+
+#include "client/SyntheticClient.h"
+
+#include "msg/FakeMessenger.h"
+
+#include "common/Timer.h"
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// Debug callback: when finished, prints the result code it was given to
+// stdout and does nothing else.
+class C_Test : public Context {
+public:
+  void finish(int r) { cout << "C_Test->finish(" << r << ")" << endl; }
+};
+
+
+// Runs an in-process cluster (monitors, MDSs with optional local OSDs, OSDs)
+// plus NUMCLIENT synthetic clients over FakeMessenger endpoints, all given the
+// same MonMap; waits for the synthetic workloads to complete, then deletes
+// the mds, osd and client objects it created.
+int main(int argc, char **argv) 
+{
+  cerr << "fakesyn start" << endl;
+
+  //cerr << "inode_t " << sizeof(inode_t) << endl;
+
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+
+  parse_config_options(args);
+  parse_syn_options(args);
+
+  // Anything still in args at this point is a stray argument: name each
+  // one, then insist there were none.
+  for (unsigned i=0; i<args.size(); i++) {
+    cerr << " stray arg " << args[i] << endl;
+  }
+  assert(args.empty());
+
+
+  g_clock.tare();
+
+  MonMap *monmap = new MonMap(g_conf.num_mon);
+
+
+  // create mon
+  vector<Monitor*> mon(g_conf.num_mon);
+  for (int i=0; i<g_conf.num_mon; i++) {
+    mon[i] = new Monitor(i, new FakeMessenger(MSG_ADDR_MON(i)), monmap);
+  }
+
+
+  // create mds; with mds_local_osd set, each also gets an osd numbered i+10000
+  vector<MDS*> mds(NUMMDS);
+  vector<OSD*> mdsosd(NUMMDS);   // entries stay null unless mds_local_osd is set
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i] = new MDS(i, new FakeMessenger(MSG_ADDR_MDS(i)), monmap);
+    if (g_conf.mds_local_osd)
+      mdsosd[i] = new OSD(i+10000, new FakeMessenger(MSG_ADDR_OSD(i+10000)), monmap);
+  }
+
+
+  // create osd
+  vector<OSD*> osd(NUMOSD);
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)), monmap);
+  }
+
+
+  // create client
+  vector<Client*> client(NUMCLIENT);
+  vector<SyntheticClient*> syn(NUMCLIENT);
+  for (int i=0; i<NUMCLIENT; i++) {
+    client[i] = new Client(new FakeMessenger(MSG_ADDR_CLIENT(i)), monmap);
+  }
+
+
+  // start message loop
+  fakemessenger_startthread();
+
+
+  // init
+  for (int i=0; i<g_conf.num_mon; i++) {
+    mon[i]->init();
+  }
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i]->init();
+    if (g_conf.mds_local_osd)
+      mdsosd[i]->init();
+  }
+
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i]->init();
+  }
+
+
+  // bring up the client(s): init, mount, then start a SyntheticClient
+  // thread on each
+  for (int i=0; i<NUMCLIENT; i++) {
+    client[i]->init();
+
+    //cout << "mounting" << endl;
+    client[i]->mount();
+
+    //cout << "starting synthetic client  " << endl;
+    syn[i] = new SyntheticClient(client[i]);
+
+    syn[i]->start_thread();
+  }
+
+
+  // join the synthetic clients in index order; each client is unmounted and
+  // shut down right after its own synthetic client has been joined
+  for (int i=0; i<NUMCLIENT; i++) {
+
+    cout << "waiting for synthetic client " << i << " to finish" << endl;
+    syn[i]->join_thread();
+    delete syn[i];
+
+    client[i]->unmount();
+    //cout << "unmounted" << endl;
+    client[i]->shutdown();
+  }
+
+
+  // wait for it to finish
+  fakemessenger_wait();
+
+
+  // cleanup
+  // NOTE(review): mon[], mdsosd[] and monmap are not deleted here; confirm
+  // whether that is intentional before adding deletes.
+  for (int i=0; i<NUMMDS; i++) {
+    delete mds[i];
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    delete osd[i];
+  }
+  for (int i=0; i<NUMCLIENT; i++) {
+    delete client[i];
+  }
+
+
+  cout << "fakesyn done" << endl;
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/include/Context.h b/branches/sage/cephmds2/include/Context.h
new file mode 100644
index 0000000000000..78059b8d39d82
--- /dev/null
+++ b/branches/sage/cephmds2/include/Context.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __CONTEXT_H
+#define __CONTEXT_H
+
+#include "config.h"
+
+#include <assert.h>
+#include <list>
+#include <set>
+
+#include <iostream>
+
+
+/*
+ * Context - abstract callback class
+ */
+class Context {
+ public:
+  virtual ~Context() {}       // we want a virtual destructor!!!  (instances are deleted through Context*)
+  virtual void finish(int r) = 0;   // callback hook implemented by subclasses; r is the caller-supplied result
+};
+
+
+/*
+ * finish and destroy a list of Contexts.  the list is empty on return.
+ */
+inline void finish_contexts(std::list<Context*>& finished, 
+                            int result = 0)
+{
+  using std::cout;
+  using std::endl;
+  if (finished.empty()) return;
+  dout(10) << finished.size() << " contexts to finish with " << result << endl;
+  // Unlink each entry before running it: the caller's list then never holds
+  // a pointer to a deleted Context, and a nested call on the same list cannot
+  // finish an entry twice.  Entries a callback appends are drained as well.
+  while (!finished.empty()) {
+    Context *c = finished.front();
+    finished.pop_front();
+    dout(10) << "---- " << c << endl;
+    c->finish(result);
+    delete c;
+  }
+}
+
+/*
+ * C_Contexts - set of Contexts
+ */
+class C_Contexts : public Context {
+  std::list<Context*> clist;   // members, in the order they were queued
+
+public:
+  // queue a single context at the tail
+  void add(Context* c) { clist.insert(clist.end(), c); }
+
+  // move every entry of ls onto our tail (list splice: ls ends up empty)
+  void take(std::list<Context*>& ls) { clist.splice(clist.end(), ls); }
+
+  // hand all members to finish_contexts() with result r; that helper
+  // finishes and deletes each of them
+  void finish(int r) { finish_contexts(clist, r); }
+};
+
+
+/*
+ * C_Gather
+ *
+ * BUG: does not report errors.
+ */
+class C_Gather : public Context {
+public:
+  // One outstanding piece of the gather.  Finishing it reports this sub's
+  // id to the parent; the sub's own result code is not forwarded.
+  class C_GatherSub : public Context {
+    C_Gather *gather;
+    int num;
+  public:
+    C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {}
+    void finish(int r) { gather->finish(num); }
+  };
+private:
+  Context *onfinish;       // finished with 0, then deleted, once no sub is pending
+  std::set<int> waitfor;   // ids of subs that have not finished yet
+  int num;                 // id handed to the most recently created sub
+public:
+  C_Gather(Context *f) : onfinish(f), num(0) {}
+  // Called by a sub; here r is that sub's id, not a result code.
+  void finish(int r) {
+    assert(waitfor.count(r));
+    waitfor.erase(r);
+    if (!waitfor.empty())
+      return;              // other subs are still outstanding
+    onfinish->finish(0);
+    delete onfinish;
+  }
+
+  // Register a new sub and return it; it is heap-allocated and not tracked
+  // here beyond its id.
+  Context *new_sub() {
+    const int id = ++num;
+    waitfor.insert(id);
+    return new C_GatherSub(this, id);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/include/Distribution.h b/branches/sage/cephmds2/include/Distribution.h
new file mode 100644
index 0000000000000..00f352d59efab
--- /dev/null
+++ b/branches/sage/cephmds2/include/Distribution.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __DISTRIBUTION_H
+#define __DISTRIBUTION_H
+
+#include <cassert>
+#include <vector>
+using namespace std;
+
+class Distribution {
+  vector<float> p;   // probability (weight) of each entry
+  vector<int> v;     // value of each entry, indexed like p
+
+ public:
+  unsigned get_width() {   // number of (value, probability) entries
+    return p.size();
+  }
+  void clear() {
+    p.clear();
+    v.clear();
+  }
+  // append one entry; pr is stored as given (see normalize())
+  void add(int val, float pr) {
+    p.push_back(pr);
+    v.push_back(val);
+  }
+  // overwrite every probability with a random one; the results sum to ~1
+  void random() {
+    float sum = 0.0;
+    for (unsigned i=0; i<p.size(); i++) {
+      p[i] = (float)(rand() % 10000);
+      sum += p[i];
+    }
+    // every draw can come up 0 (1 in 10000 for a single entry); dividing by
+    // that sum would turn all p[i] into NaN, so fall back to uniform weights.
+    for (unsigned i=0; i<p.size(); i++) 
+      p[i] = (sum > 0) ? p[i] / sum : 1.0f / p.size();
+  }
+  // draw one value according to p; p must be non-empty and sum to ~1
+  int sample() {
+    float s = (float)(rand() % 10000) / 10000.0;
+    for (unsigned i=0; i<p.size(); i++) {
+      if (s < p[i]) return v[i];
+      s -= p[i];
+    }
+    assert(0);
+    return v[p.size() - 1];  // hmm.  :/
+  }
+  // scale p so it sums to 1 and return the sum it had before.  a zero sum
+  // (e.g. no entries) leaves p untouched instead of dividing by zero.
+  float normalize() {
+    float s = 0.0;
+    for (unsigned i=0; i<p.size(); i++)
+      s += p[i];
+    if (s == 0) return s;
+    for (unsigned i=0; i<p.size(); i++)
+      p[i] /= s;
+    return s;
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/include/buffer.h b/branches/sage/cephmds2/include/buffer.h
new file mode 100644
index 0000000000000..46008de6fc646
--- /dev/null
+++ b/branches/sage/cephmds2/include/buffer.h
@@ -0,0 +1,999 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __BUFFER_H
+#define __BUFFER_H
+
+#include "common/Mutex.h"
+
+#include <iostream>
+#include <list>
+
+using std::cout;
+using std::endl;
+
+#ifndef __CYGWIN__
+# include <sys/mman.h>
+#endif
+
+#define BUFFER_PAGE_SIZE 4096  // fixme.
+
+// <hack>
+//  these are in config.o
+extern Mutex bufferlock;
+extern long buffer_total_alloc;
+// </hack>
+
+class buffer {
+private:
+  
+  /* hack for memory utilization debugging: a global byte counter updated under bufferlock. */
+  static void inc_total_alloc(unsigned len) {   // add len to buffer_total_alloc
+    bufferlock.Lock();
+    buffer_total_alloc += len;
+    bufferlock.Unlock();
+  }
+  static void dec_total_alloc(unsigned len) {   // subtract len from buffer_total_alloc
+    bufferlock.Lock();
+    buffer_total_alloc -= len;
+    bufferlock.Unlock();
+  }
+
+  /*
+   * an abstract raw buffer.  with a reference count.
+   */
+  class raw {
+  public:
+    char *data;      // start of the buffer's bytes
+    unsigned len;    // size of data, in bytes
+    int nref;        // reference count; buffer::ptr increments and decrements it
+    Mutex lock;  // we'll make it non-recursive.
+
+    raw(unsigned l) : len(l), nref(0), lock(false) {}   // NOTE: data is left unset; subclass ctors assign it
+    raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {}
+    virtual ~raw() {};
+
+    // no copying.
+    raw(const raw &other);
+    const raw& operator=(const raw &other);
+
+    virtual raw* clone_empty() = 0;   // subclasses return a newly allocated raw
+    raw *clone() {                    // clone_empty() followed by a memcpy of our len bytes into it
+      raw *c = clone_empty();
+      memcpy(c->data, data, len);
+      return c;
+    }
+  };
+
+  friend std::ostream& operator<<(std::ostream& out, const raw &r);
+
+  /*
+   * primitive buffer types
+   */
+  class raw_char : public raw {      // buffer backed by new char[]
+  public:
+    raw_char(unsigned l) : raw(l) {
+      data = new char[len];          // contents are not initialized
+      inc_total_alloc(len);
+    }
+    ~raw_char() {
+      delete[] data;
+      dec_total_alloc(len);      
+    }
+    raw* clone_empty() {             // another raw_char of the same len
+      return new raw_char(len);
+    }
+  };
+
+  class raw_static : public raw {       // wraps caller-provided memory without copying it
+  public:
+    raw_static(const char *d, unsigned l) : raw((char*)d, l) { }   // note: casts away const
+    ~raw_static() {}                    // does not free data
+    raw* clone_empty() {                // clones are heap-backed raw_char buffers of the same len
+      return new raw_char(len);
+    }
+  };
+
+#ifndef __CYGWIN__
+  class raw_mmap_pages : public raw {   // buffer backed by an anonymous, private mmap
+  public:
+    raw_mmap_pages(unsigned l) : raw(l) {
+      data = (char*)::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);   // NOTE(review): result is not checked against MAP_FAILED
+      inc_total_alloc(len);
+    }
+    ~raw_mmap_pages() {
+      ::munmap(data, len);
+      dec_total_alloc(len);
+    }
+    raw* clone_empty() {                // another mapping of the same len
+      return new raw_mmap_pages(len);
+    }
+  };
+
+  class raw_posix_aligned : public raw {   // heap buffer aligned to BUFFER_PAGE_SIZE
+  public:
+    raw_posix_aligned(unsigned l) : raw(l) {
+      data = 0;   // raw(l) leaves data unset; a failed posix_memalign must not leave it wild
+      int r = ::posix_memalign((void**)&data, BUFFER_PAGE_SIZE, len);
+      assert(r == 0); (void)r;   // (void)r: r is otherwise unused when NDEBUG drops the assert
+      inc_total_alloc(len);
+    }
+    ~raw_posix_aligned() {
+      ::free((void*)data);
+      dec_total_alloc(len);
+    }
+    raw* clone_empty() { return new raw_posix_aligned(len); }
+  };
+#endif
+
+#ifdef __CYGWIN__
+  class raw_hack_aligned : public raw {   // cygwin-only: over-allocate, then round data up to 4096
+    char *realdata;                       // what new[] actually returned; data points inside it
+  public:
+    raw_hack_aligned(unsigned l) : raw(l) {
+      realdata = new char[len+4095];
+      unsigned off = ((unsigned)realdata) % 4096;   // NOTE(review): pointer-to-unsigned cast assumes 32-bit pointers
+      if (off) 
+	data = realdata + 4096 - off;
+      else
+	data = realdata;
+      inc_total_alloc(len+4095);
+      //cout << "hack aligned " << (unsigned)data 
+      //<< " in raw " << (unsigned)realdata
+      //<< " off " << off << endl;
+      assert(((unsigned)data & 4095) == 0);
+    }
+    ~raw_hack_aligned() {
+      delete[] realdata;
+      dec_total_alloc(len+4095);
+    }
+    raw* clone_empty() {
+      return new raw_hack_aligned(len);
+    }
+  };
+#endif
+
+public:
+
+  /*
+   * named constructors 
+   */
+
+  static raw* copy(const char *c, unsigned len) {   // new raw_char holding a copy of len bytes from c
+    raw* r = new raw_char(len);
+    memcpy(r->data, c, len);
+    return r;
+  }
+  static raw* create(unsigned len) {                // new raw_char of len bytes, contents not initialized
+    return new raw_char(len);
+  }
+
+  static raw* create_page_aligned(unsigned len) {   // raw_mmap_pages, or raw_hack_aligned on cygwin
+#ifndef __CYGWIN__
+    return new raw_mmap_pages(len);
+#else
+    return new raw_hack_aligned(len);
+#endif
+  }
+  
+  
+  /*
+   * a buffer pointer.  references (a subsequence of) a raw buffer.
+   */
+  class ptr {
+    raw *_raw;
+    unsigned _off, _len;
+
+  public:
+    ptr() : _raw(0), _off(0), _len(0) {}             // empty: references nothing
+    ptr(raw *r) : _raw(r), _off(0), _len(r->len) {   // no lock needed; this is an unref raw.
+      ++r->nref;
+    }
+    ptr(unsigned l) : _off(0), _len(l) {             // references a new buffer of l bytes from create()
+      _raw = create(l);
+      ++_raw->nref;
+    }
+    ptr(char *d, unsigned l) : _off(0), _len(l) {    // ditto.  (the l bytes at d are copied via copy())
+      _raw = copy(d, l);
+      ++_raw->nref;
+    }
+    ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) {   // shares p's raw; nref is bumped under its lock
+      if (_raw) {
+	_raw->lock.Lock();
+	++_raw->nref;
+	_raw->lock.Unlock();
+      }
+    }
+    ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) {   // sub-range [o, o+l) of p, same raw
+      assert(o+l <= p._len);
+      assert(_raw);
+      _raw->lock.Lock();
+      ++_raw->nref;
+      _raw->lock.Unlock();
+    }
+    ptr& operator= (const ptr& p) {
+      // be careful -- we need to properly handle self-assignment: the new ref is taken before the old one is dropped.
+      if (p._raw) {
+	p._raw->lock.Lock();
+	++p._raw->nref;                              // inc new
+	p._raw->lock.Unlock();
+      }
+      release();                                 // dec (+ dealloc) old (if any)
+      _raw = p._raw;                               // change my ref
+      _off = p._off;
+      _len = p._len;
+      return *this;
+    }
+    ~ptr() {
+      release();
+    }
+
+    void release() {
+      if (!_raw)
+        return;                       // nothing referenced
+
+      // drop our reference under the raw's lock, remembering if it was the last
+      _raw->lock.Lock();
+      const bool last = (--_raw->nref == 0);
+      _raw->lock.Unlock();
+      if (last)
+        delete _raw;                  // dealloc: no references remain
+      _raw = 0;
+    }
+
+    // misc
+    bool at_buffer_head() const { return _off == 0; }
+    bool at_buffer_tail() const { return _off + _len == _raw->len; }
+
+    // accessors
+    const char *c_str() const { assert(_raw); return _raw->data + _off; }
+    char *c_str() { assert(_raw); return _raw->data + _off; }
+    unsigned length() const { return _len; }
+    unsigned offset() const { return _off; }
+    unsigned unused_tail_length() const { return _raw->len - (_off+_len); }
+    const char& operator[](unsigned n) const { 
+      assert(_raw); 
+      assert(n < _len);
+      return _raw->data[_off + n];
+    }
+    char& operator[](unsigned n) { 
+      assert(_raw); 
+      assert(n < _len);
+      return _raw->data[_off + n];
+    }
+
+    const char *raw_c_str() const { assert(_raw); return _raw->data; }
+    unsigned raw_length() const { assert(_raw); return _raw->len; }
+    int raw_nref() const { assert(_raw); return _raw->nref; }
+
+    // copy l bytes, starting o bytes into this ptr's range, out to dest
+    void copy_out(unsigned o, unsigned l, char *dest) const {
+      assert(_raw);
+      assert(o <= _len && o+l <= _len);   // o and l are unsigned: no lower bound to check
+      const char *src = c_str() + o;
+      memcpy(dest, src, l); }
+
+    unsigned wasted() {                 // bytes of the raw buffer that lie outside our range
+      assert(_raw);
+      return _raw->len - _len;
+    }
+
+    // modifiers (plain setters: neither value is checked against the raw buffer's size)
+    void set_offset(unsigned o) { _off = o; }
+    void set_length(unsigned l) { _len = l; }
+
+    void append(const char *p, unsigned l) {   // copy l bytes from p into the unused tail and grow by l
+      assert(_raw && l <= unused_tail_length());
+      char *tail = c_str() + _len;
+      memcpy(tail, p, l);
+      _len += l;
+    }
+
+    // overwrite l bytes of this ptr's range, starting o bytes in, with bytes from src
+    void copy_in(unsigned o, unsigned l, const char *src) {
+      assert(_raw);
+      assert(o <= _len && o+l <= _len);   // o and l are unsigned: no lower bound to check
+      char *dst = c_str() + o;
+      memcpy(dst, src, l); }
+
+    void zero() {                       // set the _len bytes of our range to 0
+      memset(c_str(), 0, _len);
+    }
+
+    void clean() {                      // currently a no-op: the body is commented out
+      //raw *newraw = _raw->makesib(_len);
+    }
+  };
+
+  friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
+
+  /*
+   * list - the useful bit!
+   */
+
+  class list {
+    // my private bits
+    std::list<ptr> _buffers;   // the pieces, in order
+    unsigned _len;             // running total of the pieces' length()s (see the disabled check in length())
+
+  public:
+    // cons/des
+    list() : _len(0) {}
+    list(const list& other) : _buffers(other._buffers), _len(other._len) { }   // copies the ptrs, not the bytes
+    list(unsigned l) : _len(0) {   // starts out with one ptr of length l
+      ptr bp(l);
+      push_back(bp);
+    }
+    ~list() {}
+    
+    list& operator= (const list& other) {   // like the copy constructor: ptrs are copied, bytes are shared
+      _buffers = other._buffers;
+      _len = other._len;
+      return *this;
+    }
+
+    const std::list<ptr>& buffers() const { return _buffers; }   // read-only view of the pieces
+    
+    unsigned length() const {   // total bytes; returns the cached _len
+#if 0
+      // DEBUG: verify _len
+      unsigned len = 0;
+      for (std::list<ptr>::iterator it = _buffers.begin();
+	   it != _buffers.end();
+	   it++) {
+	len += (*it).length();
+      }
+      assert(len == _len);
+#endif
+      return _len;
+    }
+
+
+    // modifiers (each one keeps _len in step with _buffers)
+    void clear() {
+      _buffers.clear();
+      _len = 0;
+    }
+    void push_front(ptr& bp) {       // stores a copy of bp, so the bytes are shared with the caller's ptr
+      _buffers.push_front(bp);
+      _len += bp.length();
+    }
+    void push_front(raw *r) {        // wraps r in a ptr covering all of it
+      ptr bp(r);
+      _buffers.push_front(bp);
+      _len += bp.length();
+    }
+    void push_back(ptr& bp) {        // stores a copy of bp, so the bytes are shared with the caller's ptr
+      _buffers.push_back(bp);
+      _len += bp.length();
+    }
+    void push_back(raw *r) {         // wraps r in a ptr covering all of it
+      ptr bp(r);
+      _buffers.push_back(bp);
+      _len += bp.length();
+    }
+    void zero() {                    // zero the bytes of every piece
+      for (std::list<ptr>::iterator it = _buffers.begin();
+	   it != _buffers.end();
+	   it++)
+        it->zero();
+    }
+
+    // sort-of-like-assignment-op: afterwards we hold bl's pieces and bl holds none
+    void claim(list& bl) {
+      // free my buffers
+      clear();
+      claim_append(bl);
+    }
+    void claim_append(list& bl) {
+      // steal the other guy's buffers (splice moves the list nodes; bl is left with no pieces)
+      _len += bl._len;
+      _buffers.splice( _buffers.end(), bl._buffers );
+      bl._len = 0;
+    }
+    
+    // crope lookalikes
+    // Copy len bytes starting at list offset off into dest, crossing buffer
+    // boundaries as needed.
+    // The list itself is not modified.
+    //   off   byte offset into the list at which to start
+    //   len   number of bytes to copy; off+len must not exceed length()
+    //   dest  destination; must have room for len bytes
+    void copy(unsigned off, unsigned len, char *dest) {
+      // (off is unsigned, so only the upper bound needs checking)
+      assert(off + len <= length());
+
+      // Step 1: find the buffer that contains offset off.  On exit, off is
+      // the offset inside *bp (or 0 if the walk stopped on a boundary).
+      // Buffers lying wholly before off are skipped without being read.
+      std::list<ptr>::iterator bp = _buffers.begin();
+      for (; off > 0; ++bp) {
+        assert(bp != _buffers.end());
+        if (off < bp->length())
+          break;                  // somewhere in this buffer!
+        off -= bp->length();      // skip this buffer
+      }
+
+      // Step 2: copy buffer by buffer until len bytes have been delivered.
+      // Only the first buffer visited can have a non-zero off.
+      while (len > 0) {
+        if (off + len <= bp->length()) {
+          // the rest is ALL in this buffer: last bit!
+          bp->copy_out(off, len, dest);
+          return;
+        }
+
+        // take everything this buffer has from off onward...
+        const unsigned chunk = bp->length() - off;
+        bp->copy_out(off, chunk, dest);
+
+        // ...and continue at the start of the next one.
+        dest += chunk;
+        len -= chunk;
+        off = 0;
+        ++bp;
+        assert(bp != _buffers.end());
+      }
+    }
+    
+    // Overwrite len bytes of the list, starting at list offset off, with
+    // bytes from src, crossing buffer boundaries as needed.
+    //   off   byte offset into the list at which to start
+    //   len   number of bytes to write; off+len must not exceed length()
+    //   src   source; must provide len bytes
+    void copy_in(unsigned off, unsigned len, const char *src) {
+      // (off is unsigned, so only the upper bound needs checking)
+      assert(off + len <= length());
+
+      // Step 1: find the buffer that contains offset off.  On exit, off is
+      // the offset inside *bp (or 0 if the walk stopped on a boundary).
+      std::list<ptr>::iterator bp = _buffers.begin();
+      for (; off > 0; ++bp) {
+        assert(bp != _buffers.end());
+        if (off < bp->length())
+          break;                  // somewhere in this buffer!
+        off -= bp->length();      // skip this buffer
+      }
+
+      // Step 2: write buffer by buffer until len bytes have been consumed.
+      while (len > 0) {
+        if (off + len <= bp->length()) {
+          // the rest ALL fits in this buffer: last bit!
+          bp->copy_in(off, len, src);
+          return;
+        }
+
+        // fill this buffer from off to its end...
+        const unsigned chunk = bp->length() - off;
+        bp->copy_in(off, chunk, src);
+
+        // ...and continue at the start of the next one.
+        src += chunk;
+        len -= chunk;
+        off = 0;
+        ++bp;
+        assert(bp != _buffers.end());
+      }
+    }
+    // copy the first len bytes of bl over our bytes at off, one source piece at a time
+    void copy_in(unsigned off, unsigned len, const list& bl) {
+      unsigned remaining = len;
+      std::list<ptr>::const_iterator src = bl._buffers.begin();
+      while (src != bl._buffers.end()) {
+        const unsigned n = (remaining < src->length()) ? remaining : src->length();
+        copy_in(off, n, src->c_str());
+        remaining -= n;
+        if (remaining == 0) return;
+        off += n;
+        ++src;
+      }
+    }
+
+
+    // Append a copy of len bytes from data: top up the unused tail of the
+    // last buffer first, then put whatever remains into one new buffer.
+    void append(const char *data, unsigned len) {
+      if (len == 0) return;
+      unsigned alen = 0;
+      // copy into the tail buffer?
+      if (!_buffers.empty()) {
+        unsigned avail = _buffers.back().unused_tail_length();
+        if (avail > 0) {
+          if (avail > len) avail = len;
+          _buffers.back().append(data, avail);
+          _len += avail;
+          data += avail;
+          len -= avail;
+        }
+        alen = _buffers.back().length();
+      }
+      if (len == 0) return;
+      // just add another buffer.
+      // alloc a bit extra, in case we do a bunch of appends.   FIXME be smarter!
+      if (alen < 4096) alen = 4096;
+      // ...but never less than len: the raw buffer is alen bytes and all len
+      // bytes are copied into it, so a larger len would overrun the allocation.
+      if (alen < len) alen = len;
+      ptr bp = create(alen);
+      bp.set_length(len);
+      bp.copy_in(0, len, data);
+      push_back(bp);
+    }
+    void append(ptr& bp) {                             // add bp as a new last piece (bytes are shared, not copied)
+      push_back(bp);
+    }
+    void append(ptr& bp, unsigned off, unsigned len) { // add the sub-range [off, off+len) of bp as a new last piece
+      assert(len+off <= bp.length());
+      ptr tempbp(bp, off, len);
+      push_back(tempbp);
+    }
+    void append(const list& bl) {                      // add all of bl's pieces; bl itself is left unchanged
+      list temp(bl);         // copy list
+      claim_append(temp);    // and append
+    }
+    
+    
+    /*
+     * get a char
+     */
+    const char& operator[](unsigned n) {
+      assert(n < _len);
+      // walk the buffers, reducing n until it lands inside one of them
+      std::list<ptr>::iterator bp = _buffers.begin();
+      while (bp != _buffers.end()) {
+        if (n < bp->length())
+          return (*bp)[n];
+        n -= bp->length();
+        ++bp;
+      }
+      // not reached as long as _len agrees with the buffers' lengths
+      assert(0);
+    }
+
+    /*
+     * return a contiguous ptr to whole bufferlist contents.
+     */
+    char *c_str() {
+      if (_buffers.empty())
+        return 0;                         // no buffers
+      if (_buffers.size() == 1)
+        return _buffers.front().c_str();  // good, we're already contiguous.
+
+      // Several buffers: flatten them into one new buffer, make that our
+      // only buffer, and hand back its start.
+      // NOTE: this replaces the list's buffers, so it is not a read-only call.
+      ptr flat = create(length());
+      copy(0, length(), flat.c_str());
+      clear();
+      push_back(flat);
+      return flat.c_str();
+    }
+
+    // Make this list reference (not copy) bytes [off, off+len) of other.
+    // Whatever this list held before is dropped.
+    // NOTE(review): other must not be this list -- clear() below would empty it.
+    void substr_of(list& other, unsigned off, unsigned len) {
+      assert(off + len <= other.length());
+      clear();
+      
+      // skip off
+      std::list<ptr>::iterator curbuf = other._buffers.begin();
+      while (off > 0) {
+        // curbuf walks other's buffers, so it must be checked against other's
+        // end(); our own _buffers was just cleared.
+        assert(curbuf != other._buffers.end());
+        if (off >= (*curbuf).length()) {
+          // skip this buffer
+          off -= (*curbuf).length();
+          curbuf++;
+        } else {
+          // somewhere in this buffer!
+          break;
+        }
+      }
+      
+      while (len > 0) {
+        // partial?
+        if (off + len < (*curbuf).length()) {
+          _buffers.push_back( ptr( *curbuf, off, len ) );
+          _len += len;
+          break;
+        }
+        // through end
+        unsigned howmuch = (*curbuf).length() - off;
+        _buffers.push_back( ptr( *curbuf, off, howmuch ) );
+        _len += howmuch;
+        len -= howmuch;
+        off = 0;
+        curbuf++;
+      }
+    }
+
+
+    // funky modifier: remove bytes [off, off+len) from this list; if claim_by is set, the removed range is appended to it
+    void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) {    // fixme?
+      assert(off < length()); 
+      assert(len > 0);
+      // NOTE(review): off+len is not checked against length(); the removal loop below does not test for end().
+      
+      // skip off
+      std::list<ptr>::iterator curbuf = _buffers.begin();
+      while (off > 0) {
+	assert(curbuf != _buffers.end());
+	if (off >= (*curbuf).length()) {
+	  // skip this buffer
+	  // (it lies wholly before the range being removed)
+	  off -= (*curbuf).length();
+	  curbuf++;
+	} else {
+	  // somewhere in this buffer!
+	  // (off is now the offset within *curbuf)
+	  break;
+	}
+      }
+      assert(off >= 0);
+      
+      if (off) {
+	// add a reference to the front bit
+	//  insert it before curbuf (which we'll hose)
+	// _len grows by off here; the subtraction for *curbuf below takes it back out.
+	_buffers.insert( curbuf, ptr( *curbuf, 0, off ) );
+	_len += off;
+      }
+      
+      while (len > 0) {
+	// partial?
+	if (off + len < (*curbuf).length()) {
+	  // the range ends inside *curbuf: keep its tail by moving the ptr's start past off+len
+	  if (claim_by) 
+	    claim_by->append( *curbuf, off, len );
+	  (*curbuf).set_offset( off+len + (*curbuf).offset() );    // ignore beginning big
+	  (*curbuf).set_length( (*curbuf).length() - (len+off) );
+	  _len -= off+len;
+	  // done: nothing beyond this buffer is touched
+	  break;
+	}
+	
+	// hose though the end
+	unsigned howmuch = (*curbuf).length() - off;
+	// howmuch = bytes of *curbuf from off to its end; the whole ptr is erased from the list
+	if (claim_by) 
+	  claim_by->append( *curbuf, off, howmuch );
+	_len -= (*curbuf).length();
+	_buffers.erase( curbuf++ );
+	len -= howmuch;
+	off = 0;
+      }
+      
+      // splice in *replace (implement me later?)
+    }
+
+  };
+
+};
+
+typedef buffer::ptr bufferptr;
+typedef buffer::list bufferlist;
+
+
+inline bool operator>(bufferlist& l, bufferlist& r) {
+  // Lexicographic byte-wise comparison; a list that extends an equal prefix is the greater one.
+  for (unsigned p = 0; ; p++) {        // the loop header is the only place p advances, so every byte is compared
+    if (l.length() > p && r.length() == p) return true;
+    if (l.length() == p) return false;
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+inline bool operator>=(bufferlist& l, bufferlist& r) {
+  // Lexicographic byte-wise comparison; equal lists and lists extending an equal prefix compare >=.
+  for (unsigned p = 0; ; p++) {        // the loop header is the only place p advances, so every byte is compared
+    if (r.length() == p) return true;    // r exhausted: l is equal to r or longer
+    if (l.length() == p) return false;   // l is a strict prefix of r (this also keeps l[p] in bounds)
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+inline bool operator<(bufferlist& l, bufferlist& r) {
+  return operator>(r, l);    // l < r is r > l with the operands swapped
+}
+inline bool operator<=(bufferlist& l, bufferlist& r) {
+  return operator>=(r, l);   // l <= r is r >= l with the operands swapped
+}
+
+
+inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) {  // debug dump: data address, len and nref
+  return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref << ")";
+}
+
+inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
+  // "offset~length <data address>", then the backing raw buffer's address, length and nref
+  out << "buffer::ptr(" << bp.offset() << "~" << bp.length() << " " << (void*)bp.c_str();
+  out << " in raw " << (void*)bp.raw_c_str();
+  out << " len " << bp.raw_length();
+  out << " nref " << bp.raw_nref();
+  return out << ")";
+}
+
+inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
+  // Header line, then one tab-indented ptr per line (comma-separated), then ")" on its own line.
+  out << "buffer::list(len=" << bl.length() << "," << std::endl;
+  typedef std::list<buffer::ptr>::const_iterator ptr_iter;
+  for (ptr_iter cur = bl.buffers().begin(); cur != bl.buffers().end(); ++cur) {
+    if (cur != bl.buffers().begin())
+      out << "," << std::endl;      // separator goes between entries, never after the last one
+    out << "\t" << *cur;
+  }
+  out << std::endl << ")";
+  return out;
+}
+
+
+
+
+// encoder/decode helpers
+
+// string
+inline void _encode(const std::string& s, bufferlist& bl) {
+  // wire form: the characters followed by the terminating NUL
+  bl.append(s.c_str(), s.size() + 1);
+}
+inline void _decode(std::string& s, bufferlist& bl, int& off) {
+  const char *start = bl.c_str() + off;   // the NUL-terminated string begins at 'off'
+  s.assign(start);
+  off += s.size() + 1;                    // step over the characters and the NUL
+}
+
+// bufferptr (encapsulated)
+inline void _encode(bufferptr& bp, bufferlist& bl) {
+  // wire form: length prefix (a raw size_t), then the payload
+  size_t nbytes = bp.length();
+  bl.append((char*)&nbytes, sizeof(nbytes));
+  bl.append(bp);
+}
+inline void _decode(bufferptr& bp, bufferlist& bl, int& off) {
+  size_t nbytes;                               // length prefix: a raw size_t
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+  bufferlist payload;                          // the payload, as a sub-list of bl
+  payload.substr_of(bl, off, nbytes);
+  off += nbytes;
+
+  // a payload held in exactly one buffer is used as-is; anything else is copied into one new buffer
+  if (payload.buffers().size() != 1)
+    bp = buffer::copy(payload.c_str(), payload.length());
+  else
+    bp = payload.buffers().front();
+}
+
+// bufferlist (encapsulated)
+inline void _encode(const bufferlist& s, bufferlist& bl) {
+  // wire form: length prefix (a raw size_t), then the list's contents
+  size_t nbytes = s.length();
+  bl.append((char*)&nbytes, sizeof(nbytes));
+  bl.append(s);
+}
+inline void _decode(bufferlist& s, bufferlist& bl, int& off) {
+  // read the length prefix (a raw size_t), then take that many bytes of bl
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+  s.substr_of(bl, off, nbytes);   // s ends up as a sub-list of bl
+  off += nbytes;
+}
+
+#include <set>
+#include <map>
+#include <vector>
+#include <string>
+
+// set<T>
+template<class T>
+inline void _encode(std::set<T>& s, bufferlist& bl)
+{
+  // wire form: element count (int), then sizeof(T) raw bytes per element, in iteration order
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  typename std::set<T>::iterator cur = s.begin();
+  while (cur != s.end()) {
+    T elem = *cur++;
+    bl.append((char*)&elem, sizeof(elem));
+    --remaining;
+  }
+  assert(remaining == 0);   // every counted element was written
+}
+template<class T>
+inline void _decode(std::set<T>& s, bufferlist& bl, int& off) 
+{
+  s.clear();                                   // s is replaced, not merged into
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  for (int left = count; left > 0; --left) {
+    T elem;
+    bl.copy(off, sizeof(elem), (char*)&elem);
+    off += sizeof(elem);
+    s.insert(elem);
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// vector<T>
+template<class T>
+inline void _encode(std::vector<T>& s, bufferlist& bl)
+{
+  // wire form: element count (int), then sizeof(T) raw bytes per element, in index order
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  typename std::vector<T>::iterator cur = s.begin();
+  while (cur != s.end()) {
+    T elem = *cur++;
+    bl.append((char*)&elem, sizeof(elem));
+    --remaining;
+  }
+  assert(remaining == 0);   // every counted element was written
+}
+template<class T>
+inline void _decode(std::vector<T>& s, bufferlist& bl, int& off) 
+{
+  s.clear();
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  s = std::vector<T>(count);                   // sized up front, filled slot by slot below
+  for (int slot = 0; slot < count; ++slot) {
+    T elem;
+    bl.copy(off, sizeof(elem), (char*)&elem);
+    off += sizeof(elem);
+    s[slot] = elem;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// list<T>
+template<class T>
+inline void _encode(const std::list<T>& s, bufferlist& bl)
+{
+  // wire form: element count (int), then sizeof(T) raw bytes per element, in list order
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  typename std::list<T>::const_iterator cur = s.begin();
+  while (cur != s.end()) {
+    T elem = *cur++;
+    bl.append((char*)&elem, sizeof(elem));
+    --remaining;
+  }
+  assert(remaining == 0);   // every counted element was written
+}
+template<class T>
+inline void _decode(std::list<T>& s, bufferlist& bl, int& off) 
+{
+  s.clear();                                   // s is replaced, not appended to
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  for (int left = count; left > 0; --left) {
+    T elem;
+    bl.copy(off, sizeof(elem), (char*)&elem);
+    off += sizeof(elem);
+    s.push_back(elem);
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+// map<string,bufferptr>
+inline void _encode(std::map<std::string, bufferptr>& s, bufferlist& bl)
+{
+  // wire form: entry count (int), then each entry as an encoded key string followed by an encoded bufferptr
+  typedef std::map<std::string, bufferptr>::iterator entry_iter;
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  for (entry_iter cur = s.begin(); cur != s.end(); ++cur) {
+    _encode(cur->first, bl);
+    _encode(cur->second, bl);
+    --remaining;
+  }
+  assert(remaining == 0);
+}
+inline void _decode(std::map<std::string,bufferptr>& s, bufferlist& bl, int& off) {
+  s.clear();                                   // s is replaced, not merged into
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  for (int left = count; left > 0; --left) {
+    std::string key;
+    _decode(key, bl, off);
+    bufferptr& slot = s[key];                  // creates the entry, which is then decoded in place
+    _decode(slot, bl, off);
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+// map<T,bufferlist>
+template<class T>
+inline void _encode(const std::map<T, bufferlist>& s, bufferlist& bl)
+{
+  // wire form: entry count (int), then per entry the key's sizeof(T) raw bytes
+  // followed by the encoded bufferlist value.
+  typedef typename std::map<T, bufferlist>::const_iterator entry_iter;
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  for (entry_iter cur = s.begin(); cur != s.end(); ++cur) {
+    T key = cur->first;
+    bl.append((char*)&key, sizeof(key));
+    _encode(cur->second, bl);
+    --remaining;
+  }
+  // the loop must have visited exactly as many entries as were counted
+  assert(remaining == 0);
+}
+template<class T>
+inline void _decode(std::map<T,bufferlist>& s, bufferlist& bl, int& off) 
+{
+  s.clear();                                   // s is replaced, not merged into
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  for (int left = count; left > 0; --left) {
+    T key;
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+    bufferlist value;
+    _decode(value, bl, off);
+    s[key] = value;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// map<T,U>
+template<class T, class U>
+inline void _encode(const std::map<T, U>& s, bufferlist& bl)
+{
+  // wire form: entry count (int), then per entry sizeof(T) raw key bytes and sizeof(U) raw value bytes
+  typedef typename std::map<T, U>::const_iterator entry_iter;
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  for (entry_iter cur = s.begin(); cur != s.end(); ++cur) {
+    T key = cur->first;
+    U value = cur->second;
+    bl.append((char*)&key, sizeof(key));
+    bl.append((char*)&value, sizeof(value));
+    --remaining;
+  }
+  assert(remaining == 0);
+}
+template<class T, class U>
+inline void _decode(std::map<T,U>& s, bufferlist& bl, int& off) 
+{
+  s.clear();                                   // s is replaced, not merged into
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  for (int left = count; left > 0; --left) {
+    T key;
+    U value;
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+    bl.copy(off, sizeof(value), (char*)&value);
+    off += sizeof(value);
+    s[key] = value;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/include/error.h b/branches/sage/cephmds2/include/error.h
new file mode 100644
index 0000000000000..da469e5843d28
--- /dev/null
+++ b/branches/sage/cephmds2/include/error.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include <stdarg.h>
+
+#ifdef    __cplusplus
+extern "C" {
+#endif
+
+#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__)   /* pass the call site to syserror() */
+/* ASSERT(c): when c is false, pass the call site to exiterror(); the trailing ", 1" gives the || an operand value. */
+#define ASSERT(c) \
+  ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1))
+
+/* print usage error message and exit */
+extern void userror(const char *use, const char *fmt, ...);
+
+/* print system error message and exit */
+extern void syserror(const char *fmt, ...);
+
+/* print error message and exit */
+extern void exiterror(const char *fmt, ...);
+
+/* print error message */
+extern void error(const char *fmt, ...);
+
+#ifdef    __cplusplus
+} // extern "C"
+#endif
diff --git a/branches/sage/cephmds2/include/filepath.h b/branches/sage/cephmds2/include/filepath.h
new file mode 100644
index 0000000000000..5585e536b42db
--- /dev/null
+++ b/branches/sage/cephmds2/include/filepath.h
@@ -0,0 +1,206 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __FILEPATH_H
+#define __FILEPATH_H
+
+
+/*
+ * BUG:  /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string.
+ *   -> should it be different?  how?  should this[0] be "", with depth 4?
+ *
+ */
+
+
+#include <iostream>
+#include <string>
+#include <vector>
+using namespace std;
+
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+#include "buffer.h"
+
+
+class filepath {
+  string path;           // the path as a single string
+  vector<string> bits;   // the same path split into its non-empty components
+
+  // Regenerate 'path' from 'bits', keeping a leading "/" if the old string had one.
+  void rebuild() {
+    if (absolute()) 
+      path = "/";
+    else 
+      path.clear();
+    for (unsigned i=0; i<bits.size(); i++) {
+      if (i) path += "/";
+      path += bits[i];
+    }
+  }
+
+  // Split 'path' on '/' into 'bits'; empty components (from leading, trailing
+  // or doubled slashes) are dropped.
+  void parse() {
+    bits.clear();
+    int off = 0;
+    while (off < (int)path.length()) {
+      // skip trailing/duplicate slash(es)
+      int nextslash = path.find('/', off);
+      if (nextslash == off) {
+        off++;
+        continue;
+      }
+      if (nextslash < 0) 
+        nextslash = path.length();  // no more slashes
+      
+      bits.push_back( path.substr(off,nextslash-off) );
+      off = nextslash+1;
+    }
+  }
+
+ public:
+  filepath() {}
+  filepath(const string& s) { set_path(s); }
+  filepath(const char* s) { set_path(s); }
+
+  // A path is absolute when its string form starts with '/'.  Both tests only
+  // read the path, so they are const and can be used on const objects.
+  bool absolute() const { return path[0] == '/'; }
+  bool relative() const { return !absolute(); }
+  
+  // Replace the whole path and re-split it into components.
+  void set_path(const string& s) {
+    path = s;
+    parse();
+  }
+  void set_path(const char *s) {
+    path = s;
+    parse();
+  }
+
+  string& get_path() { return path; }
+  int length() const { return path.length(); }   // length of the string form
+  const char *c_str() const { return path.c_str(); }
+
+  // Path made of the first s components.  The result is built with
+  // add_dentry(), so it does not start with "/".
+  filepath prefixpath(int s) const {
+    filepath t;
+    for (int i=0; i<s; i++)
+      t.add_dentry(bits[i]);
+    return t;
+  }
+  // Path made of the components from index s to the end.
+  filepath postfixpath(int s) const {
+    filepath t;
+    for (unsigned i=s; i<bits.size(); i++)
+      t.add_dentry(bits[i]);
+    return t;
+  }
+
+  // Append one component.  A separator is added only when the string does not
+  // already end in one, so a path that is just "/" (e.g. after pop_dentry()
+  // removed the last component of an absolute path) becomes "/x", not "//x".
+  void add_dentry(const string& s) {
+    bits.push_back(s);
+    if (path.length() && path[path.length()-1] != '/')
+      path += "/";
+    path += s;
+  }
+  // Append every component of 'a'.
+  void append(const filepath& a) {
+    for (unsigned i=0; i<a.depth(); i++) 
+      add_dentry(a[i]);
+  }
+
+  // Drop the last component (there must be one).
+  void pop_dentry() {
+    bits.pop_back();
+    rebuild();
+  }    
+
+  // Reset to the empty path.
+  void clear() {
+    path = "";
+    bits.clear();
+  }
+
+  const string& operator[](int i) const { return bits[i]; }
+  // Last component (there must be one).
+  const string& last_bit() const { return bits[ bits.size()-1 ]; }
+
+  unsigned depth() const { return bits.size(); }   // number of components
+  bool empty() const { return bits.size() == 0; }
+
+  // Serialized form, shared by the rope and bufferlist variants below: one
+  // byte holding the component count, then each component as a NUL-terminated
+  // string.  Only the components are written, so a leading "/" is not kept.
+  void _rope(crope& r) {
+    char n = bits.size();
+    r.append((char*)&n, sizeof(char));
+    for (vector<string>::iterator it = bits.begin();
+         it != bits.end();
+         it++) { 
+      r.append((*it).c_str(), (*it).length()+1);
+    }
+  }
+
+  void _unrope(crope& r, int& off) {
+    clear();
+
+    // The count byte is read as unsigned: with a plain (possibly signed) char,
+    // counts of 128..255 would turn negative and the loop would decode nothing.
+    unsigned char n;
+    r.copy(off, sizeof(char), (char*)&n);
+    off += sizeof(char);
+    for (int i=0; i<n; i++) {
+      string s = r.c_str() + off;
+      off += s.length() + 1;
+      add_dentry(s);
+    }
+  }
+
+  void _encode(bufferlist& bl) {
+    char n = bits.size();
+    bl.append((char*)&n, sizeof(char));
+    for (vector<string>::iterator it = bits.begin();
+         it != bits.end();
+         it++) { 
+      bl.append((*it).c_str(), (*it).length()+1);
+    }
+  }
+
+  void _decode(bufferlist& bl, int& off) {
+    clear();
+
+    // Count byte read as unsigned, for the same reason as in _unrope().
+    unsigned char n;
+    bl.copy(off, sizeof(char), (char*)&n);
+    off += sizeof(char);
+    for (int i=0; i<n; i++) {
+      string s = bl.c_str() + off;
+      off += s.length() + 1;
+      add_dentry(s);
+    }
+  }
+
+};
+
+inline ostream& operator<<(ostream& out, filepath& path) {
+  out << path.get_path();   // the path's string form
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/include/interval_set.h b/branches/sage/cephmds2/include/interval_set.h
new file mode 100644
index 0000000000000..69cd798c8e2ce
--- /dev/null
+++ b/branches/sage/cephmds2/include/interval_set.h
@@ -0,0 +1,305 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __INTERVAL_SET_H
+#define __INTERVAL_SET_H
+
+#include <map>
+#include <ostream>
+#include <cassert>
+using namespace std;
+
+#ifndef MIN
+# define MIN(a,b)  ((a)<=(b) ? (a):(b))
+#endif
+#ifndef MAX
+# define MAX(a,b)  ((a)>=(b) ? (a):(b))
+#endif
+
+
+template<typename T>
+class interval_set {
+ public:
+  map<T,T> m;   // interval start -> interval length
+
+  // lookup helpers: each starts from lower_bound(start) and may step back to the preceding interval
+ private:
+  typename map<T,T>::const_iterator find_inc(T start) const {
+    typename map<T,T>::const_iterator hit = m.lower_bound(start);  // hit->first >= start
+    const bool can_step_back = (hit != m.begin());
+    if (can_step_back && (hit == m.end() || hit->first > start)) {
+      --hit;                                  // the preceding interval might cover start
+      if (hit->first + hit->second <= start)
+        ++hit;                                // it ends at or before start: no overlap
+    }
+    return hit;
+  }
+  // mutable-iterator twin of find_inc()
+  typename map<T,T>::iterator find_inc_m(T start) {
+    typename map<T,T>::iterator hit = m.lower_bound(start);
+    const bool can_step_back = (hit != m.begin());
+    if (can_step_back && (hit == m.end() || hit->first > start)) {
+      --hit;
+      if (hit->first + hit->second <= start)
+        ++hit;
+    }
+    return hit;
+  }
+  // like find_inc(), but a preceding interval that merely touches start (ends exactly there) also counts
+  typename map<T,T>::const_iterator find_adj(T start) const {
+    typename map<T,T>::const_iterator hit = m.lower_bound(start);
+    const bool can_step_back = (hit != m.begin());
+    if (can_step_back && (hit == m.end() || hit->first > start)) {
+      --hit;                                  // the preceding interval might touch start
+      if (hit->first + hit->second < start)
+        ++hit;                                // it ends short of start
+    }
+    return hit;
+  }
+  // mutable-iterator twin of find_adj()
+  typename map<T,T>::iterator find_adj_m(T start) {
+    typename map<T,T>::iterator hit = m.lower_bound(start);
+    const bool can_step_back = (hit != m.begin());
+    if (can_step_back && (hit == m.end() || hit->first > start)) {
+      --hit;
+      if (hit->first + hit->second < start)
+        ++hit;
+    }
+    return hit;
+  }
+  
+ public:
+  bool operator==(const interval_set& other) const {
+    return m == other.m;
+  }
+
+  void clear() {
+    m.clear();
+  }
+
+  bool contains(T i) const {
+    // true when some interval covers the single value i
+    typename map<T,T>::const_iterator hit = find_inc(i);
+    if (hit == m.end() || hit->first > i || hit->first+hit->second <= i)
+      return false;
+    assert(hit->first <= i && hit->first+hit->second > i);
+    return true;
+  }
+  bool contains(T start, T len) const {
+    // true when a single interval covers all of [start, start+len)
+    typename map<T,T>::const_iterator hit = find_inc(start);
+    if (hit == m.end() || hit->first > start || hit->first+hit->second <= start)
+      return false;
+    assert(hit->first <= start && hit->first+hit->second > start);
+    const bool reaches_end = !(hit->first+hit->second < start+len);
+    return reaches_end;
+  }
+  bool intersects(T start, T len) const {
+    // intersect this set with the one-interval set {start~len} and see if anything is left
+    interval_set probe;
+    probe.insert(start, len);
+    interval_set common;
+    common.intersection_of( *this, probe );
+    return !common.empty();
+  }
+
+  // outer range of set
+  bool empty() const {
+    return m.empty();
+  }
+  T start() const {
+    // lowest value covered; the set must not be empty
+    assert(!empty());
+    return m.begin()->first;
+  }
+  T end() const {
+    // one past the highest value covered; the set must not be empty
+    assert(!empty());
+    typename map<T,T>::const_reverse_iterator last = m.rbegin();
+    return last->first+last->second;
+  }
+
+  // start of the next interval after i (i itself must not be in the set)
+  bool starts_after(T i) const {
+    assert(!contains(i));
+    typename map<T,T>::const_iterator next = find_inc(i);
+    const bool found = (next != m.end());
+    return found;
+  }
+  T start_after(T i) const {
+    assert(!contains(i));
+    typename map<T,T>::const_iterator next = find_inc(i);
+    return next->first;     // not checked against end(): see starts_after()
+  }
+
+  // end of the interval that contains start
+  T end_after(T start) const {
+    assert(contains(start));
+    typename map<T,T>::const_iterator owner = find_inc(start);
+    return owner->first+owner->second;
+  }
+  
+  void insert(T val) {
+    insert(val, 1);
+  }
+
+  void insert(T start, T len) {
+    // Add [start, start+len).  It must not overlap anything already present;
+    // it is merged with a neighbour it touches on either side.
+    assert(len > 0);
+    typename map<T,T>::iterator adj = find_adj_m(start);
+    if (adj == m.end()) {
+      m[start] = len;                  // nothing at or after start: new interval
+      return;
+    }
+
+    if (adj->first < start) {
+      // 'adj' begins before start, so it has to end exactly at start
+      if (adj->first + adj->second != start) {
+        assert(0);
+      }
+      assert(adj->first + adj->second == start);
+      adj->second += len;              // extend it over the new range
+
+      typename map<T,T>::iterator after = adj;
+      ++after;
+      if (after != m.end() && start+len == after->first) {
+        adj->second += after->second;  // the gap to the next interval closed: absorb it too
+        m.erase(after);
+      }
+      return;
+    }
+
+    if (start+len == adj->first) {
+      m[start] = len + adj->second;    // new range ends where 'adj' begins: prepend
+      m.erase(adj);
+    } else {
+      assert(adj->first > start+len);
+      m[start] = len;                  // new interval
+    }
+  }
+  
+  void erase(T val) {
+    erase(val, 1);
+  }
+
+  void erase(T start, T len) {
+    // Remove [start, start+len), which must lie inside one existing interval.
+    typename map<T,T>::iterator owner = find_inc_m(start);
+    assert(owner != m.end());
+    assert(owner->first <= start);
+
+    T head = start - owner->first;            // part of the interval left of the hole
+    assert(owner->second >= head+len);
+    T tail = owner->second - head - len;      // part of the interval right of the hole
+    
+    if (head) 
+      owner->second = head;                   // keep only the left part
+    else
+      m.erase(owner);
+    if (tail)
+      m[start+len] = tail;                    // re-add the right part as its own interval
+  }
+
+
+  void subtract(const interval_set &a) {
+    // erase every interval of a; each must lie inside an interval of this set
+    typename map<T,T>::const_iterator cur = a.m.begin();
+    for ( ; cur != a.m.end(); ++cur)
+      erase(cur->first, cur->second);
+  }
+
+  void insert(const interval_set &a) {
+    // insert every interval of a; none may overlap what is already here
+    typename map<T,T>::const_iterator cur = a.m.begin();
+    for ( ; cur != a.m.end(); ++cur)
+      insert(cur->first, cur->second);
+  }
+
+
+  void intersection_of(const interval_set &a, const interval_set &b) {
+    // Replace this set with the overlap of a and b (neither may be this set).
+    assert(&a != this);
+    assert(&b != this);
+    clear();
+
+    typename map<T,T>::const_iterator ia = a.m.begin();
+    typename map<T,T>::const_iterator ib = b.m.begin();
+    while (ia != a.m.end() && ib != b.m.end()) {
+      // no overlap: advance whichever interval lies wholly before the other
+      if (ia->first + ia->second <= ib->first) {
+        ++ia;
+      } else if (ib->first + ib->second <= ia->first) {
+        ++ib;
+      } else {
+        T lo = MAX(ia->first, ib->first);
+        T hi = MIN(ia->first+ia->second, ib->first+ib->second);
+        assert(hi > lo);
+        insert(lo, hi-lo);
+        // move past whichever interval ends first
+        if (ia->first+ia->second > ib->first+ib->second) ++ib; else ++ia;
+      }
+    }
+  }
+
+  void union_of(const interval_set &a, const interval_set &b) {
+    // Replace this set with the union of a and b (neither may be this set).
+    // insert() asserts on overlapping ranges, so the part shared with b is
+    // removed first and b is then inserted whole.
+    assert(&a != this);
+    assert(&b != this);
+    clear();
+
+    // start from a copy of a
+    m = a.m;
+
+    // take out what a and b have in common
+    interval_set shared;
+    shared.intersection_of(a, b);
+    subtract(shared);
+
+    // add all of b
+    insert(b);
+  }
+  void union_of(const interval_set &b) {
+    interval_set mine;
+    mine.m.swap(m);          // move our intervals aside so they can serve as the first operand
+    union_of(mine, b);
+  }
+
+  bool subset_of(const interval_set &big) const {
+    // every interval of this set must be covered by a single interval of big
+    typename map<T,T>::const_iterator cur = m.begin();
+    for ( ; cur != m.end(); ++cur)
+      if (!big.contains(cur->first, cur->second)) return false;
+    return true;
+  }  
+  
+};
+
+template<class T>
+inline ostream& operator<<(ostream& out, const interval_set<T> &s) {
+  // prints "[start~len,start~len,...]"
+  out << "[";
+  typename map<T,T>::const_iterator cur = s.m.begin();
+  while (cur != s.m.end()) {
+    if (cur != s.m.begin()) out << ",";
+    out << cur->first << "~" << cur->second;
+    ++cur;
+  }
+  return out << "]";
+}
+
+
+#endif
diff --git a/branches/sage/cephmds2/include/lru.h b/branches/sage/cephmds2/include/lru.h
new file mode 100644
index 0000000000000..63096d0e32079
--- /dev/null
+++ b/branches/sage/cephmds2/include/lru.h
@@ -0,0 +1,321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __LRU_H
+#define __LRU_H
+
+#include <assert.h>
+#include <iostream>
+using namespace std;
+
+#include "config.h"
+
+
+
+class LRUObject {
+ private:
+  LRUObject *lru_next, *lru_prev;   // neighbours within the list this object sits on
+  bool lru_pinned;                  // pinned objects are passed over by LRU::lru_get_next_expire()
+  class LRU *lru;                   // the LRU this object is in, or 0
+  class LRUList *lru_list;          // the list currently holding this object, or 0
+
+ public:
+  // a new object is unpinned and belongs to no LRU or list
+  LRUObject()
+    : lru_next(NULL), lru_prev(NULL),
+      lru_pinned(false),
+      lru(0), lru_list(0) {
+  }
+
+  // pin/unpin item in cache
+  void lru_pin(); 
+  void lru_unpin();
+  bool lru_is_expireable() { return !lru_pinned; }
+
+  friend class LRU;
+  friend class LRUList;
+};
+
+
+class LRUList {
+ private:
+  LRUObject *head, *tail;   // both 0 when the list is empty
+  __uint32_t len;           // number of linked objects
+
+ public:
+  LRUList()
+    : head(0), tail(0), len(0) {
+  }
+
+  __uint32_t  get_length() { return len; }
+  LRUObject *get_head() { return head; }
+  LRUObject *get_tail() { return tail; }
+
+  // link o in front of the current head
+  void insert_head(LRUObject *o) {
+    LRUObject *old_head = head;
+    o->lru_prev = NULL;
+    o->lru_next = old_head;
+    if (old_head)
+      old_head->lru_prev = o;
+    else
+      tail = o;               // list was empty: o is also the tail
+    head = o;
+    o->lru_list = this;
+    ++len;
+  }
+
+  // link o behind the current tail
+  void insert_tail(LRUObject *o) {
+    LRUObject *old_tail = tail;
+    o->lru_next = NULL;
+    o->lru_prev = old_tail;
+    if (old_tail)
+      old_tail->lru_next = o;
+    else
+      head = o;               // list was empty: o is also the head
+    tail = o;
+    o->lru_list = this;
+    ++len;
+  }
+
+  // unlink o, which must currently be on this list
+  void remove(LRUObject *o) {
+    assert(o->lru_list == this);
+    LRUObject *after = o->lru_next;
+    LRUObject *before = o->lru_prev;
+    if (after)
+      after->lru_prev = before;
+    else
+      tail = before;          // o was the tail
+    if (before)
+      before->lru_next = after;
+    else
+      head = after;           // o was the head
+    o->lru_next = o->lru_prev = NULL;
+    o->lru_list = 0;
+    assert(len>0);
+    len--;
+  }
+  
+};
+
+
+class LRU {
+ protected:
+  LRUList lru_top, lru_bot, lru_pintail;   // top part, bottom part, and pinned items set aside by expiry scans
+  __uint32_t lru_num, lru_num_pinned;      // item count, and how many of those are pinned
+  __uint32_t lru_max;   // max items
+  double lru_midpoint;                     // fraction of lru_max that the top list may hold
+
+  friend class LRUObject;
+  //friend class MDCache; // hack
+  
+ public:
+  LRU(int max = 0)
+    : lru_num(0),
+      lru_num_pinned(0),
+      lru_max(max),
+      lru_midpoint(.9) {
+  }
+
+  __uint32_t lru_get_size() { return lru_num; }
+  __uint32_t lru_get_top() { return lru_top.get_length(); }
+  __uint32_t lru_get_bot() { return lru_bot.get_length(); }
+  __uint32_t lru_get_pintail() { return lru_pintail.get_length(); }
+  __uint32_t lru_get_max() { return lru_max; }
+  __uint32_t lru_get_num_pinned() { return lru_num_pinned; }
+
+  void lru_set_max(__uint32_t m) { lru_max = m; }
+  void lru_set_midpoint(float f) { lru_midpoint = f; }
+  
+
+  // insert at top of lru
+  void lru_insert_top(LRUObject *o) {
+    // o must not already belong to an LRU; link it at the head of the top
+    // list, count it, then rebalance top vs. bot.
+    assert(!o->lru);
+    o->lru = this;
+    lru_top.insert_head( o );
+    ++lru_num;
+    if (o->lru_pinned) ++lru_num_pinned;
+    lru_adjust();
+  }
+
+  // insert at mid point in lru
+  void lru_insert_mid(LRUObject *o) {
+    // "mid point" is the head of the bot list; unlike lru_insert_top() no
+    // rebalancing is done here.
+    assert(!o->lru);
+    o->lru = this;
+    lru_bot.insert_head(o);
+    ++lru_num;
+    if (o->lru_pinned) ++lru_num_pinned;
+  }
+
+  // insert at bottom of lru
+  void lru_insert_bot(LRUObject *o) {
+    assert(!o->lru);
+    o->lru = this;
+    lru_bot.insert_tail(o);      // tail of bot: the first place expiry looks
+    ++lru_num;
+    if (o->lru_pinned) ++lru_num_pinned;
+  }
+
+  /*
+  // insert at bottom of lru
+  void lru_insert_pintail(LRUObject *o) {
+    assert(!o->lru);
+    o->lru = this;
+    
+    assert(o->lru_pinned);
+
+    lru_pintail.insert_head(o);
+    lru_num++;
+    lru_num_pinned += o->lru_pinned;
+  }
+  */
+
+  
+
+
+  // adjust top/bot balance, as necessary
+  void lru_adjust() {
+    // lru_max == 0 switches balancing off
+    if (lru_max == 0)
+      return;
+
+    // the top list may hold at most lru_midpoint * lru_max items (truncated)
+    const unsigned limit = (unsigned)(lru_midpoint * (double)lru_max);
+    unsigned in_top = lru_top.get_length();
+    for ( ; in_top > 0 && in_top > limit; --in_top) {
+      // remove from tail of top, stick at head of bot
+      // FIXME: this could be way more efficient by moving a whole chain of items.
+      LRUObject *demoted = lru_top.get_tail();
+      lru_top.remove(demoted);
+      lru_bot.insert_head(demoted);
+    }
+  }
+
+
+  // remove an item
+  LRUObject *lru_remove(LRUObject *o) {
+    // an object that is in no LRU is handed back untouched
+    if (!o->lru)
+      return o;
+
+    // unlink from whichever of our three lists holds it
+    LRUList *holder = o->lru_list;
+    if (holder == &lru_top)
+      lru_top.remove(o);
+    else if (holder == &lru_bot) 
+      lru_bot.remove(o);
+    else if (holder == &lru_pintail)
+      lru_pintail.remove(o);
+    else
+      assert(0);
+
+    --lru_num;
+    if (o->lru_pinned) --lru_num_pinned;
+    o->lru = 0;
+    return o;
+  }
+
+  // touch item -- move to head of lru
+  bool lru_touch(LRUObject *o) {
+    lru_remove(o);        // does nothing when o is not in an LRU yet
+    lru_insert_top(o);
+    return true;
+  }
+
+  // touch item -- move to midpoint (unless already higher)
+  bool lru_midtouch(LRUObject *o) {
+    const bool already_in_top = (o->lru_list == &lru_top);
+    if (already_in_top) return false;
+    lru_remove(o);
+    lru_insert_mid(o);
+    return true;
+  }
+
+  // touch item -- move to bottom
+  bool lru_bottouch(LRUObject *o) {
+    lru_remove(o);        // does nothing when o is not in an LRU yet
+    lru_insert_bot(o);
+    return true;
+  }
+
+
+  // expire -- expire a single item
+  LRUObject *lru_get_next_expire() {
+    // Returns the next expirable (unpinned) item without unlinking it, or
+    // NULL if there is none.  Pinned items met on the way go to pintail.
+    // look through tail of bot
+    while (lru_bot.get_length()) {
+      LRUObject *cand = lru_bot.get_tail();
+      if (!cand->lru_pinned) return cand;
+
+      // move to pintail
+      lru_bot.remove(cand);
+      lru_pintail.insert_head(cand);
+    }
+
+    // ok, try head then
+    while (lru_top.get_length()) {
+      LRUObject *cand = lru_top.get_tail();
+      if (!cand->lru_pinned) return cand;
+
+      // move to pintail
+      lru_top.remove(cand);
+      lru_pintail.insert_head(cand);
+    }
+    
+    // no luck!
+    return NULL;
+  }
+  
+  LRUObject *lru_expire() {
+    LRUObject *victim = lru_get_next_expire();
+    if (!victim)
+      return NULL;                 // nothing can be expired
+    return lru_remove(victim);     // take it out of the LRU and hand it back
+  }
+
+
+  void lru_status() {
+    dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << endl;
+  }
+
+};
+
+
+inline void LRUObject::lru_pin() {
+  // Count only a real unpinned -> pinned transition, so pinning twice does not inflate lru_num_pinned.
+  if (lru && !lru_pinned) lru->lru_num_pinned++;
+  lru_pinned = true;
+}
+inline void LRUObject::lru_unpin() {
+  if (lru) {
+    // Count only a real pinned -> unpinned transition: unpinning an unpinned object must not wrap the unsigned counter.
+    if (lru_pinned) lru->lru_num_pinned--;
+    // move from pintail -> bot
+    if (lru_list == &lru->lru_pintail) {
+      lru->lru_pintail.remove(this);
+      lru->lru_bot.insert_tail(this);
+    }
+  }
+  lru_pinned = false;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/include/object.h b/branches/sage/cephmds2/include/object.h
new file mode 100644
index 0000000000000..3a66c4ab83d54
--- /dev/null
+++ b/branches/sage/cephmds2/include/object.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __OBJECT_H
+#define __OBJECT_H
+
+#include <iostream>
+#include <iomanip>
+using namespace std;
+
+
+typedef __uint32_t objectrev_t;
+
+struct object_t {
+  static const __uint32_t MAXREV = 0xffffffffU;  // largest value a 32-bit rev can hold
+
+  __uint64_t ino;  // "file" identifier
+  __uint32_t bno;  // "block" in that "file"
+  objectrev_t rev; // revision.  normally ctime (as epoch).
+  // both constructors leave rev at 0; the two-argument form sets ino and bno
+  object_t() : ino(0), bno(0), rev(0) {}
+  object_t(__uint64_t i, __uint32_t b) : ino(i), bno(b), rev(0) {}
+};
+
+
+inline bool operator==(const object_t l, const object_t r) {
+  return !(l.ino != r.ino || l.bno != r.bno || l.rev != r.rev);   // equal when no field differs
+}
+inline bool operator!=(const object_t l, const object_t r) {
+  return !(l.ino == r.ino && l.bno == r.bno && l.rev == r.rev);   // unequal when any field differs
+}
+inline bool operator>(const object_t l, const object_t r) {
+  // lexicographic order on (ino, bno, rev): the first field that differs decides
+  if (l.ino != r.ino)
+    return l.ino > r.ino;
+  if (l.bno != r.bno)
+    return l.bno > r.bno;
+  return l.rev > r.rev;
+}
+inline bool operator<(const object_t l, const object_t r) {
+  // lexicographic order on (ino, bno, rev): the first field that differs decides
+  if (l.ino != r.ino)
+    return l.ino < r.ino;
+  if (l.bno != r.bno)
+    return l.bno < r.bno;
+  return l.rev < r.rev;
+}
+inline bool operator>=(const object_t l, const object_t r) {
+  return (l < r) ? false : true;   // >= is "not less than"
+}
+inline bool operator<=(const object_t l, const object_t r) {
+  return (l > r) ? false : true;   // <= is "not greater than"
+}
+inline ostream& operator<<(ostream& out, const object_t o) {
+  // Format: <ino hex>.<bno hex, zero-padded to 8>[.<rev decimal, only when non-zero>]; the stream is left in dec mode.
+  out << hex << o.ino << '.';
+  out.setf(ios::right);
+  const char saved_fill = out.fill('0');   // fill('0') returns the previous fill character
+  out << setw(8) << o.bno << dec;
+  out.fill(saved_fill); out.unsetf(ios::right);   // restore the fill so the caller's later padded output is not zero-filled
+  if (o.rev) out << '.' << o.rev;
+  return out;
+}
+namespace __gnu_cxx {
+  // 64-bit values: fold the high and low 32-bit halves together, then hash that
+  template<> struct hash<__uint64_t> {
+    size_t operator()(__uint64_t __x) const { 
+      return hash<__uint32_t>()((__x >> 32) ^ (__x & 0xffffffff)); 
+    }
+  };
+  // objects: combine the ino and bno hashes; rev does not take part
+  template<> struct hash<object_t> {
+    size_t operator()(const object_t &r) const { 
+      const size_t ino_hash = hash<__uint64_t>()(r.ino);
+      const size_t bno_hash = hash<__uint32_t>()(r.bno);
+      return ino_hash ^ bno_hash;
+    }
+  };
+}
+
+#endif
diff --git a/branches/sage/cephmds2/include/oldbuffer.h b/branches/sage/cephmds2/include/oldbuffer.h
new file mode 100644
index 0000000000000..fda7336bc6461
--- /dev/null
+++ b/branches/sage/cephmds2/include/oldbuffer.h
@@ -0,0 +1,357 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __BUFFER_H
+#define __BUFFER_H
+
+#include <cassert>
+#include <string.h>
+
+#include <iostream>
+using namespace std;
+
+// bit masks
+#define BUFFER_MODE_NOCOPY 0
+#define BUFFER_MODE_COPY   1    // copy on create, my buffer
+
+#define BUFFER_MODE_NOFREE 0
+#define BUFFER_MODE_FREE   2
+
+#define BUFFER_MODE_CUSTOMFREE 4
+
+#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE)
+
+
+// debug crap
+#include "config.h"
+#define bdbout(x) if (x <= g_conf.debug_buffer) cout
+
+#include "common/Mutex.h"
+
+// HACK: in config.cc
+/*
+ * WARNING: bufferlock placements are tricky for efficiency.  note that only bufferptr and
+ * buffer ever use buffer._ref, and only bufferptr should call ~buffer().
+ *
+ * So, I only need to protect:
+ *  - buffer()'s modification of buffer_total_alloc
+ *  - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc
+ * 
+ * I don't protect
+ *  - buffer._get() .. increment is atomic on any sane architecture
+ *  - buffer._put() .. only called by ~bufferptr.
+ *  - ~buffer       .. only called by ~bufferptr   *** I HOPE!!  
+ */
+extern Mutex bufferlock;
+extern long buffer_total_alloc;
+
+
+typedef void (buffer_free_func_t)(void*,char*,unsigned);
+
+
+/*
+ * buffer  - the underlying buffer container.  with a reference count.
+ * 
+ * the buffer never shrinks.
+ *
+ * some invariants:
+ *  _len never shrinks
+ *  _len <= _alloc_len
+ */
+class buffer {
+ protected:
+  //wtf
+  //static Mutex bufferlock;
+  //static long buffer_total_alloc;// = 0;
+
+ private:
+  // raw buffer alloc
+  char *_dataptr;
+  bool _myptr;
+  unsigned _len;
+  unsigned _alloc_len;
+
+  // ref counts
+  unsigned _ref;
+  int _get() { 
+    bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl;
+    return ++_ref;
+  }
+  int _put() { 
+    bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl;
+    assert(_ref > 0);
+    return --_ref;
+  }
+
+  // custom (de!)allocator
+  buffer_free_func_t *free_func;
+  void *free_func_arg;
+  
+  friend class bufferptr;
+
+ public:
+  // constructors
+  buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) { 
+    bdbout(1) << "buffer.cons " << *this << endl;
+  }
+  buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) {
+    bdbout(1) << "buffer.cons " << *this << endl;
+    _dataptr = new char[a];
+    bufferlock.Lock();
+    buffer_total_alloc += _alloc_len;
+    bufferlock.Unlock();
+    bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
+  }
+  ~buffer() {
+    bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl;
+    if (free_func) {
+      bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl;
+      free_func( free_func_arg, _dataptr, _alloc_len );
+    }
+    else if (_dataptr && _myptr) {
+      bdbout(1) << "buffer.free " << (void*)_dataptr << endl;
+      delete[] _dataptr;
+      buffer_total_alloc -= _alloc_len;
+    }
+  }
+  
+  buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0,
+         buffer_free_func_t free_func=0, void* free_func_arg=0) : 
+    _dataptr(0), 
+    _myptr(false),
+    _len(l), 
+    _ref(0), 
+    free_func(0), free_func_arg(0) {
+    assert(l >= 0 && alloc_len >= 0);      // both end up in unsigned fields
+    if (alloc_len) 
+      _alloc_len = alloc_len;
+    else
+      _alloc_len = l;                      // no explicit size given: allocation is exactly the data
+    assert(_len <= _alloc_len);            // class invariant; also keeps the memcpy below in bounds
+    _myptr = mode & BUFFER_MODE_FREE ? true:false;   // do we delete[] the data in ~buffer?
+    bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl;
+    if (mode & BUFFER_MODE_COPY) {
+      _dataptr = new char[_alloc_len];     // private copy of the caller's bytes
+      bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
+      bufferlock.Lock();
+      buffer_total_alloc += _alloc_len;
+      bufferlock.Unlock();
+      memcpy(_dataptr, p, l);
+      bdbout(1) << "buffer.copy " << *this << endl;
+    } else {
+      _dataptr = (char*)p;                              // ugly: adopt the caller's pointer as-is
+      bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl;
+    }
+    // a custom deallocator is only installed when both the mode bit and a function are given
+    if (mode & BUFFER_MODE_CUSTOMFREE && free_func) {
+      this->free_func = free_func;
+      this->free_func_arg = free_func_arg;
+    }
+  }
+
+  // operators
+  buffer& operator=(buffer& other) {
+    assert(0);  // not implemented, no reasonable assignment semantics.
+    return *this;
+  }
+
+  char *c_str() {
+    return _dataptr;
+  }
+
+  bool has_free_func() { return free_func != 0; }
+  
+  // accessor
+  unsigned alloc_length() {
+    return _alloc_len;
+  }
+  void set_length(unsigned l) {
+    assert(l <= _alloc_len);
+    _len = l;
+  }
+  unsigned length() { return _len; }
+  unsigned unused_tail_length() { return _alloc_len - _len; }
+
+  friend ostream& operator<<(ostream& out, buffer& b);
+};
+
+inline ostream& operator<<(ostream& out, buffer& b) {
+  return out << "buffer(this=" << &b << " len=" << b._len << ", alloc=" << b._alloc_len << ", data=" << (void*)b._dataptr << " ref=" << b._ref << ")";
+}
+
+
+/*
+ * smart pointer class for buffer
+ *
+ * we reference count the actual buffer.
+ * we also let you refer to a subset of a buffer.
+ * we implement the high-level buffer accessor methods.
+ *
+ * some invariants:
+ *  _off        <  _buffer->_len
+ *  _off + _len <= _buffer->_len
+ */
+class bufferptr {
+ private:
+  buffer *_buffer;
+  unsigned _len, _off;
+
+ public:
+  // empty cons
+  bufferptr() :
+    _buffer(0),
+    _len(0),
+    _off(0) { }
+  // main cons - the entire buffer
+  bufferptr(buffer *b) :
+    _buffer(b),
+    _len(b->_len),
+    _off(0) {
+    assert(_buffer->_ref == 0);
+    _buffer->_get();   // this is always the first one.
+  }
+  // subset cons - a subset of another bufferptr (subset)
+  bufferptr(const bufferptr& bp, unsigned len, unsigned off) {
+    bufferlock.Lock();
+    _buffer = bp._buffer;
+    _len = len;
+    _off = bp._off + off;
+    _buffer->_get();
+    assert(_off < _buffer->_len);          // sanity checks
+    assert(_off + _len <= _buffer->_len);
+    bufferlock.Unlock();
+  }
+
+  // copy cons
+  bufferptr(const bufferptr &other) {
+    bufferlock.Lock();
+    _buffer = other._buffer;
+    _len = other._len;
+    _off = other._off;
+    if (_buffer) _buffer->_get();    
+    bufferlock.Unlock();
+  }
+
+  // assignment operator
+  bufferptr& operator=(const bufferptr& other) {
+    // self-assignment: discarding first would drop (and possibly delete) our
+    // buffer, leaving this pointer with a null _buffer and stale _len/_off.
+    if (this == &other) return *this;
+    discard_buffer();                // release our reference to the old buffer
+    // point to other
+    bufferlock.Lock();
+    _buffer = other._buffer;
+    _len = other._len;
+    _off = other._off;
+    if (_buffer) _buffer->_get();    // other may be an empty bufferptr
+    bufferlock.Unlock();
+    return *this;
+  }
+
+  ~bufferptr() {
+    discard_buffer();
+  }
+
+  void discard_buffer() {
+    if (_buffer) {
+      bufferlock.Lock();
+      if (_buffer->_put() == 0) 
+        delete _buffer;
+      _buffer = 0;
+      bufferlock.Unlock();
+    }
+  }
+
+
+  // dereference to get the actual buffer
+  buffer& operator*() { 
+    return *_buffer;
+  }
+
+
+  bool at_buffer_head() const {
+    return _off == 0;
+  }
+  bool at_buffer_tail() const {
+    return _off + _len == _buffer->_len;
+  }
+
+  // accessors for my subset
+  char *c_str() {
+    return _buffer->c_str() + _off;
+  }
+  unsigned length() const {
+    return _len;
+  }
+  unsigned offset() const {
+    return _off;
+  }
+  unsigned unused_tail_length() {
+    if (!at_buffer_tail()) return 0;
+    return _buffer->unused_tail_length();
+  }
+
+
+
+  // modifiers
+  void set_offset(unsigned off) {
+    assert(off <= _buffer->_alloc_len);
+    _off = off;
+  }
+  void set_length(unsigned len) {
+    assert(len >= 0 && _off + len <= _buffer->_alloc_len);
+    if (_buffer->_len < _off + len) 
+      _buffer->_len = _off + len;    // set new buffer len (_IF_ i'm expanding it)
+    _len = len;                      // my len too
+  }
+  void zero() {
+      //bzero((void*)c_str(), _len);
+    memset((void*)c_str(), 0, _len);
+  }
+
+
+  // crope lookalikes
+  void append(const char *p, unsigned len) {
+    assert(len + _len + _off <= _buffer->_alloc_len);  // FIXME later for auto-expansion?
+    // copy the new bytes in right after my current end
+    memcpy(c_str() + _len, p, len);
+    _len += len;
+    // grow the underlying buffer only if I now reach past its end (same rule as set_length())
+    if (_buffer->_len < _off + _len) _buffer->_len = _off + _len;
+  }
+  void copy_out(unsigned off, unsigned len, char *dest) {
+    assert(off >= 0 && off <= _len);
+    assert(len >= 0 && off + len <= _len);
+    memcpy(dest, c_str() + off, len);
+  }
+  void copy_in(unsigned off, unsigned len, const char *src) {
+    assert(off >= 0 && off <= _len);
+    assert(len >= 0 && off + len <= _len);
+    memcpy(c_str() + off, src, len);
+  }
+
+  friend ostream& operator<<(ostream& out, bufferptr& bp);
+};
+
+
+inline ostream& operator<<(ostream& out, bufferptr& bp) {
+  return out << "bufferptr(len=" << bp._len << " off=" << bp._off 
+             << " cstr=" << (void*)bp.c_str()
+             << " buf=" << *bp._buffer 
+             << ")";
+}
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/include/oldbufferlist.h b/branches/sage/cephmds2/include/oldbufferlist.h
new file mode 100644
index 0000000000000..466a5ead25d77
--- /dev/null
+++ b/branches/sage/cephmds2/include/oldbufferlist.h
@@ -0,0 +1,681 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __BUFFERLIST_H
+#define __BUFFERLIST_H
+
+#include "buffer.h"
+
+#include <list>
+#include <map>
+#include <set>
+#include <vector>
+using namespace std;
+
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+
+// debug crap
+#include "config.h"
+#define bdbout(x) if (x <= g_conf.debug_buffer) cout
+
+
+
+class bufferlist {
+ private:
+  /* local state limited to _buffers, and _len.
+   * we maintain _len ourselves, so we must be careful when fiddling with buffers!
+   */
+  list<bufferptr> _buffers;
+  unsigned _len;
+
+ public:
+  // cons/des
+  bufferlist() : _len(0) {
+    bdbout(1) << "bufferlist.cons " << this << endl;
+  }
+  bufferlist(const bufferlist& bl) : _len(0) {
+    //assert(0); // o(n) and stupid
+    bdbout(1) << "bufferlist.cons " << this << endl; 
+    _buffers = bl._buffers;
+    _len = bl._len;
+  }
+  ~bufferlist() {
+    bdbout(1) << "bufferlist.des " << this << endl;
+  }
+  
+  bufferlist& operator=(const bufferlist& bl) {
+    // copies the list of bufferptrs (not the bytes they point at); O(n) in the number of buffers.
+    bdbout(1) << "bufferlist.= " << this << endl; 
+    _buffers = bl._buffers;
+    _len = bl._len;
+    return *this;
+  }
+  
+
+  // accessors
+  list<bufferptr>& buffers() { 
+    return _buffers; 
+  }
+  //list<buffer*>::iterator begin() { return _buffers.begin(); }
+  //list<buffer*>::iterator end() { return _buffers.end(); }
+
+  unsigned length() const {
+#if 0
+    { // DEBUG: verify _len
+      int len = 0;
+      for (list<bufferptr>::iterator it = _buffers.begin();
+           it != _buffers.end();
+           it++) {
+        len += (*it).length();
+      }
+      assert(len == _len);
+    }
+#endif
+    return _len;
+  }
+
+  void _rope(crope& r) {
+    for (list<bufferptr>::iterator it = _buffers.begin();
+         it != _buffers.end();
+         it++)
+      r.append((*it).c_str(), (*it).length());
+  }
+  
+  // modifiers
+  void clear() {
+    _buffers.clear();
+    _len = 0;
+  }
+  void push_front(bufferptr& bp) {
+    _buffers.push_front(bp);
+    _len += bp.length();
+  }
+  void push_front(buffer *b) {
+    bufferptr bp(b);
+    _buffers.push_front(bp);
+    _len += bp.length();
+  }
+  void push_back(bufferptr& bp) {
+    _buffers.push_back(bp);
+    _len += bp.length();
+  }
+  void push_back(buffer *b) {
+    bufferptr bp(b);
+
+    _buffers.push_back(bp);
+    _len += bp.length();
+
+  }
+  void zero() {
+      for (list<bufferptr>::iterator it = _buffers.begin();
+         it != _buffers.end();
+         it++)
+        it->zero();
+    }
+
+  // sort-of-like-assignment-op
+  void claim(bufferlist& bl) {
+    // free my buffers
+    clear();
+    claim_append(bl);
+  }
+  void claim_append(bufferlist& bl) {
+    // steal the other guy's buffers
+    _len += bl._len;
+    _buffers.splice( _buffers.end(), bl._buffers );
+    bl._len = 0;
+  }
+
+
+
+  
+  // crope lookalikes
+  void copy(unsigned off, unsigned len, char *dest) {
+    assert(off >= 0);
+    assert(off + len <= length());
+    /*assert(off < length());
+    if (off + len > length()) 
+      len = length() - off;
+    */
+    // advance to off
+    list<bufferptr>::iterator curbuf = _buffers.begin();
+
+    // skip off
+    while (off > 0) {
+      assert(curbuf != _buffers.end());
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        off -= (*curbuf).length();
+        curbuf++;
+      } else {
+        // somewhere in this buffer!
+        break;
+      }
+    }
+    
+    // copy
+    while (len > 0) {
+      // is the rest ALL in this buffer?
+      if (off + len <= (*curbuf).length()) {
+        (*curbuf).copy_out(off, len, dest);        // yup, last bit!
+        break;
+      }
+
+      // get as much as we can from this buffer.
+      unsigned howmuch = (*curbuf).length() - off;
+      (*curbuf).copy_out(off, howmuch, dest);
+
+      dest += howmuch;
+      len -= howmuch;
+      off = 0;
+      curbuf++;
+      assert(curbuf != _buffers.end());
+    }
+  }
+
+  void copy_in(unsigned off, unsigned len, const char *src) {
+    assert(off >= 0);
+    assert(off + len <= length());
+
+    // advance to off
+    list<bufferptr>::iterator curbuf = _buffers.begin();
+
+    // skip off
+    while (off > 0) {
+      assert(curbuf != _buffers.end());
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        off -= (*curbuf).length();
+        curbuf++;
+      } else {
+        // somewhere in this buffer!
+        break;
+      }
+    }
+    
+    // copy
+    while (len > 0) {
+      // is the rest ALL in this buffer?
+      if (off + len <= (*curbuf).length()) {
+        (*curbuf).copy_in(off, len, src);        // yup, last bit!
+        break;
+      }
+
+      // get as much as we can from this buffer.
+      unsigned howmuch = (*curbuf).length() - off;
+      (*curbuf).copy_in(off, howmuch, src);
+
+      src += howmuch;
+      len -= howmuch;
+      off = 0;
+      curbuf++;
+      assert(curbuf != _buffers.end());
+    }
+  }
+  void copy_in(unsigned off, unsigned len, bufferlist& bl) {
+    unsigned left = len;
+    for (list<bufferptr>::iterator i = bl._buffers.begin();
+         i != bl._buffers.end();
+         i++) {
+      unsigned l = (*i).length();
+      if (left < l) l = left;
+      copy_in(off, l, (*i).c_str());
+      left -= l;
+      if (left == 0) break;
+      off += l;
+    }
+  }
+
+
+  void append(const char *data, unsigned len) {
+    if (len == 0) return;
+
+    unsigned alen = 0;
+    
+    // copy into the tail buffer?
+    if (!_buffers.empty()) {
+      unsigned avail = _buffers.back().unused_tail_length();
+      if (avail > 0) {
+        //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl;
+        if (avail > len) 
+          avail = len;
+        unsigned blen = _buffers.back().length();
+        memcpy(_buffers.back().c_str() + blen, data, avail);
+        blen += avail;
+        _buffers.back().set_length(blen);
+        _len += avail;
+        data += avail;
+        len -= avail;
+      }
+      alen = _buffers.back().length();
+    }
+    if (len == 0) return;
+
+    // just add another buffer.
+    // alloc a bit extra, in case we do a bunch of appends.   FIXME be smarter!
+    if (alen < 1024) alen = 1024;
+    push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen));  
+  }
+  void append(bufferptr& bp) {
+    push_back(bp);
+  }
+  void append(bufferptr& bp, unsigned len, unsigned off) {
+    bufferptr tempbp(bp, len, off);
+    push_back(tempbp);
+  }
+  void append(const bufferlist& bl) {
+    bufferlist temp = bl;  // copy list
+    claim_append(temp);    // and append
+  }
+  
+  
+  /*
+   * return a contiguous ptr to whole bufferlist contents.
+   */
+  char *c_str() {
+    if (_buffers.size() == 1) {
+      return _buffers.front().c_str();  // good, we're already contiguous.
+    }
+    else if (_buffers.size() == 0) {
+      return 0;                         // no buffers
+    } 
+    else {
+      // make one new contiguous buffer.
+      bufferptr newbuf = new buffer(length());
+      unsigned off = 0;
+
+      for (list<bufferptr>::iterator it = _buffers.begin();
+           it != _buffers.end();
+           it++) {
+        //assert((*(*it)).has_free_func() == false);     // not allowed if there's a funky free_func.. -sage   ...for debugging at least!
+        memcpy(newbuf.c_str() + off,
+               (*it).c_str(), (*it).length());
+        off += (*it).length();
+      }
+      assert(off == newbuf.length());
+      
+      _buffers.clear();
+      _buffers.push_back( newbuf );
+
+      // now it'll work.
+      return c_str();
+    }
+  }
+
+
+  void substr_of(bufferlist& other, unsigned off, unsigned len) {
+    // become a view of bytes [off, off+len) of other: the new bufferptrs
+    // reference other's buffers, no data is copied.
+    assert(off + len <= other.length());
+    assert(&other != this);                     // clear() below would destroy the source
+    clear();
+    // skip off
+    list<bufferptr>::iterator curbuf = other._buffers.begin();
+    while (off > 0) {
+      assert(curbuf != other._buffers.end());   // curbuf walks other's list, not mine
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        off -= (*curbuf).length();
+        curbuf++;
+      } else {
+        // somewhere in this buffer!
+        break;
+      }
+    }
+    
+    while (len > 0) {
+      assert(curbuf != other._buffers.end());
+      // partial?
+      if (off + len < (*curbuf).length()) {
+        //cout << "copying partial of " << *curbuf << endl;
+        _buffers.push_back( bufferptr( *curbuf, len, off ) );
+        _len += len;
+        break;
+      }
+      // through end
+      //cout << "copying end (all?) of " << *curbuf << endl;
+      unsigned howmuch = (*curbuf).length() - off;
+      _buffers.push_back( bufferptr( *curbuf, howmuch, off ) );
+      _len += howmuch;
+      len -= howmuch;
+      off = 0;                                  // later buffers are taken from their start
+      curbuf++;
+    }
+  }
+
+  // funky modifer
+  void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) {    // fixme?
+    assert(off < length()); 
+    assert(len > 0);
+    assert(off + len <= length());   // removed span must lie inside the list, or the loop below walks off the end
+
+    // skip off
+    list<bufferptr>::iterator curbuf = _buffers.begin();
+    while (off > 0) {
+      assert(curbuf != _buffers.end());
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        //cout << "off = " << off << " skipping over " << *curbuf << endl;
+        off -= (*curbuf).length();
+        curbuf++;
+      } else {
+        // somewhere in this buffer!
+        //cout << "off = " << off << " somewhere in " << *curbuf << endl;
+        break;
+      }
+    }
+    // off is now the offset of the removed span within *curbuf
+
+    if (off) {
+      // add a reference to the front bit
+      //  insert it before curbuf (which we'll hose)
+      //cout << "keeping front " << off << " of " << *curbuf << endl;
+      _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) );
+      _len += off;                   // temporarily double-counted; subtracted again below
+    }
+
+    while (len > 0) {
+      // partial?
+      if (off + len < (*curbuf).length()) {
+        //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl;
+        if (claim_by) 
+          claim_by->append( *curbuf, len, off );
+        (*curbuf).set_offset( off+len + (*curbuf).offset() );    // ignore beginning big
+        (*curbuf).set_length( (*curbuf).length() - (len+off) );
+        _len -= off+len;
+        //cout << " now " << *curbuf << endl;
+        break;
+      }
+
+      // hose though the end
+      unsigned howmuch = (*curbuf).length() - off;
+      //cout << "discarding " << howmuch << " of " << *curbuf << endl;
+      if (claim_by) 
+        claim_by->append( *curbuf, howmuch, off );
+      _len -= (*curbuf).length();
+      _buffers.erase( curbuf++ );    // post-increment keeps curbuf valid across the erase
+      len -= howmuch;
+      off = 0;
+    }
+
+    // splice in *replace (implement me later?)
+  }
+
+  friend ostream& operator<<(ostream& out, bufferlist& bl);
+
+};
+
+inline ostream& operator<<(ostream& out, bufferlist& bl) {
+  out << "bufferlist(len=" << bl.length() << endl;
+  for (list<bufferptr>::iterator it = bl._buffers.begin();
+       it != bl._buffers.end();
+       it++) 
+    out << "\t" << *it << endl;
+  out << ")" << endl;
+  return out;
+}
+
+
+
+// encoder/decode helpers
+
+// string
+inline void _encode(const string& s, bufferlist& bl) 
+{
+  bl.append(s.c_str(), s.length()+1);
+}
+inline void _decode(string& s, bufferlist& bl, int& off)
+{
+  s = bl.c_str() + off;
+  off += s.length() + 1;
+}
+
+// bufferptr (encapsulated)
+inline void _encode(bufferptr& bp, bufferlist& bl) 
+{
+  size_t len = bp.length();
+  bl.append((char*)&len, sizeof(len));
+  bl.append(bp);
+}
+inline void _decode(bufferptr& bp, bufferlist& bl, int& off)
+{
+  size_t len;
+  bl.copy(off, sizeof(len), (char*)&len);
+  off += sizeof(len);
+  bufferlist s;
+  s.substr_of(bl, off, len);
+  off += len;
+
+  if (s.buffers().size() == 1)
+    bp = s.buffers().front();
+  else
+    bp = new buffer(s.c_str(), s.length());
+}
+
+// bufferlist (encapsulated)
+inline void _encode(const bufferlist& s, bufferlist& bl) 
+{
+  size_t len = s.length();
+  bl.append((char*)&len, sizeof(len));
+  bl.append(s);
+}
+inline void _decode(bufferlist& s, bufferlist& bl, int& off)
+{
+  size_t len;
+  bl.copy(off, sizeof(len), (char*)&len);
+  off += sizeof(len);
+  s.substr_of(bl, off, len);
+  off += len;
+}
+
+
+// set<T>
+template<class T>
+inline void _encode(set<T>& s, bufferlist& bl)
+{
+  int n = s.size();
+  bl.append((char*)&n, sizeof(n));
+  for (typename set<T>::iterator it = s.begin();
+       it != s.end();
+       it++) {
+    T v = *it;
+    bl.append((char*)&v, sizeof(v));
+    n--;
+  }
+  assert(n==0);
+}
+template<class T>
+inline void _decode(set<T>& s, bufferlist& bl, int& off) 
+{
+  s.clear();
+  int n;
+  bl.copy(off, sizeof(n), (char*)&n);
+  off += sizeof(n);
+  for (int i=0; i<n; i++) {
+    T v;
+    bl.copy(off, sizeof(v), (char*)&v);
+    off += sizeof(v);
+    s.insert(v);
+  }
+  assert(s.size() == (unsigned)n);
+}
+
+// vector<T>
+template<class T>
+inline void _encode(vector<T>& s, bufferlist& bl)
+{
+  int n = s.size();
+  bl.append((char*)&n, sizeof(n));
+  for (typename vector<T>::iterator it = s.begin();
+       it != s.end();
+       it++) {
+    T v = *it;
+    bl.append((char*)&v, sizeof(v));
+    n--;
+  }
+  assert(n==0);
+}
+template<class T>
+inline void _decode(vector<T>& s, bufferlist& bl, int& off) 
+{
+  s.clear();
+  int n;
+  bl.copy(off, sizeof(n), (char*)&n);
+  off += sizeof(n);
+  s = vector<T>(n);
+  for (int i=0; i<n; i++) {
+    T v;
+    bl.copy(off, sizeof(v), (char*)&v);
+    off += sizeof(v);
+    s[i] = v;
+  }
+  assert(s.size() == (unsigned)n);
+}
+
+// list<T>
+template<class T>
+inline void _encode(const list<T>& s, bufferlist& bl)
+{
+  int n = s.size();
+  bl.append((char*)&n, sizeof(n));
+  for (typename list<T>::const_iterator it = s.begin();
+       it != s.end();
+       it++) {
+    T v = *it;
+    bl.append((char*)&v, sizeof(v));
+    n--;
+  }
+  assert(n==0);
+}
+template<class T>
+inline void _decode(list<T>& s, bufferlist& bl, int& off) 
+{
+  s.clear();
+  int n;
+  bl.copy(off, sizeof(n), (char*)&n);
+  off += sizeof(n);
+  for (int i=0; i<n; i++) {
+    T v;
+    bl.copy(off, sizeof(v), (char*)&v);
+    off += sizeof(v);
+    s.push_back(v);
+  }
+  assert(s.size() == (unsigned)n);
+}
+
+// map<string,bufferptr>
+inline void _encode(map<string, bufferptr>& s, bufferlist& bl)
+{
+  int n = s.size();
+  bl.append((char*)&n, sizeof(n));
+  for (map<string, bufferptr>::iterator it = s.begin();
+       it != s.end();
+       it++) {
+    _encode(it->first, bl);
+    _encode(it->second, bl);
+    n--;
+  }
+  assert(n==0);
+}
+inline void _decode(map<string,bufferptr>& s, bufferlist& bl, int& off) 
+{
+  s.clear();
+  int n;
+  bl.copy(off, sizeof(n), (char*)&n);
+  off += sizeof(n);
+  for (int i=0; i<n; i++) {
+    string k;
+    _decode(k, bl, off);
+    _decode(s[k], bl, off);
+  }
+  assert(s.size() == (unsigned)n);
+}
+
+
+// map<T,bufferlist>
+template<class T>
+inline void _encode(const map<T, bufferlist>& s, bufferlist& bl)
+{
+  int n = s.size();
+  bl.append((char*)&n, sizeof(n));
+  for (typename map<T, bufferlist>::const_iterator it = s.begin();
+       it != s.end();
+       it++) {
+    T k = it->first;
+    bl.append((char*)&k, sizeof(k));
+    _encode(it->second, bl);
+    n--;
+  }
+  assert(n==0);
+}
+template<class T>
+inline void _decode(map<T,bufferlist>& s, bufferlist& bl, int& off) 
+{
+  s.clear();
+  int n;
+  bl.copy(off, sizeof(n), (char*)&n);
+  off += sizeof(n);
+  for (int i=0; i<n; i++) {
+    T k;
+    bl.copy(off, sizeof(k), (char*)&k);
+    off += sizeof(k);
+    bufferlist b;
+    _decode(b, bl, off);
+    s[k] = b;
+  }
+  assert(s.size() == (unsigned)n);
+}
+
+// map<T,U>
+template<class T, class U>
+inline void _encode(const map<T, U>& s, bufferlist& bl)
+{
+  int n = s.size();
+  bl.append((char*)&n, sizeof(n));
+  for (typename map<T, U>::const_iterator it = s.begin();
+       it != s.end();
+       it++) {
+    T k = it->first;
+    U v = it->second;
+    bl.append((char*)&k, sizeof(k));
+    bl.append((char*)&v, sizeof(v));
+    n--;
+  }
+  assert(n==0);
+}
+template<class T, class U>
+inline void _decode(map<T,U>& s, bufferlist& bl, int& off) 
+{
+  s.clear();
+  int n;
+  bl.copy(off, sizeof(n), (char*)&n);
+  off += sizeof(n);
+  for (int i=0; i<n; i++) {
+    T k;
+    U v;
+    bl.copy(off, sizeof(k), (char*)&k);
+    off += sizeof(k);
+    bl.copy(off, sizeof(v), (char*)&v);
+    off += sizeof(v);
+    s[k] = v;
+  }
+  assert(s.size() == (unsigned)n);
+}
+
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/include/rangeset.h b/branches/sage/cephmds2/include/rangeset.h
new file mode 100644
index 0000000000000..d166bbaf23bbf
--- /dev/null
+++ b/branches/sage/cephmds2/include/rangeset.h
@@ -0,0 +1,252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __RANGESET_H
+#define __RANGESET_H
+
+/*
+ *
+ * my first container with iterator!   it's pretty ugly.
+ *
+ */
+
+#include <map>
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+//typedef int T;
+
+template <class T>
+struct _rangeset_base {
+  map<T,T> ranges;  // pair(first,last) (inclusive, e.g. [first,last])
+                    
+  typedef typename map<T,T>::iterator mapit;
+
+  // get iterator for range including val.  or ranges.end().
+  //
+  // ranges are keyed by their first element, so the only candidate is
+  // the last range whose key is <= val, i.e. the entry just before
+  // upper_bound(val).  val is included iff that range's last element
+  // is >= val.
+  //
+  // e.g. with ranges {1-5, 10-20}: 3 -> 1-5, 10 -> 10-20,
+  //      7 -> end(), 0 -> end(), 21 -> end().
+  mapit get_range_for(T val) {
+    mapit it = ranges.upper_bound(val);   // first range starting after val
+    if (it == ranges.begin())
+      return ranges.end();                // nothing starts at or before val (or map is empty)
+    --it;                                 // last range with first <= val
+    if (it->second >= val)
+      return it;                          // val lies within [first,last]
+    return ranges.end();                  // val falls in the gap after this range
+  }
+
+};
+
+
+template <class T>
+class rangeset_iterator :
+  public std::iterator<std::input_iterator_tag, T>
+{
+  //typedef typename map<T,T>::iterator mapit;
+
+  map<T,T> *ranges;                 // map being walked; not owned, must outlive the iterator
+  typename map<T,T>::iterator it;   // range that current lies in (ranges->end() when done)
+  T current;                        // value we are at; T() once past the last range
+
+public:
+  // cons
+  rangeset_iterator() : ranges(0), current() {}
+
+  rangeset_iterator(typename map<T,T>::iterator& it, map<T,T>& ranges) :
+    ranges(&ranges), it(it), current() {     // point at the caller's map; 'it' belongs to it
+    if (this->it != ranges.end())
+      current = it->first;
+  }
+
+  bool operator==(rangeset_iterator<T> rit) {
+    return (it == rit.it && rit.current == current);
+  }
+  bool operator!=(rangeset_iterator<T> rit) {
+    return (it != rit.it) || (rit.current != current);
+  }
+  
+  T& operator*() {
+    return current;
+  }
+
+  rangeset_iterator<T> operator++(int) {   // NOTE: returns the advanced iterator, not the old one
+    if (current < it->second)
+      current++;                    // still inside the current range
+    else {
+      it++;                         // move on to the next range, if any
+      if (it != ranges->end())
+        current = it->first;
+      else current = T();           // normalise so every end iterator compares equal
+    }
+    return *this;
+  }
+};
+
+
+template <class T>
+class rangeset
+{
+  typedef typename map<T,T>::iterator map_iterator;
+
+  _rangeset_base<T> theset;
+  inodeno_t _size;
+
+public:
+  rangeset() { _size = 0; }
+  typedef rangeset_iterator<T> iterator;
+
+  iterator begin() {
+    map_iterator it = theset.ranges.begin();
+    return iterator(it, theset.ranges);
+  }
+
+  iterator end() {
+    map_iterator it = theset.ranges.end();
+    return iterator(it, theset.ranges);
+  }
+
+  map_iterator map_begin() {
+    return theset.ranges.begin();
+  }
+  map_iterator map_end() {
+    return theset.ranges.end();
+  }
+  int map_size() {
+    return theset.ranges.size();
+  }
+
+  void map_insert(T v1, T v2) {
+    theset.ranges.insert(pair<T,T>(v1,v2));
+    _size += v2 - v1+1;
+  }
+
+
+  // ...
+  bool contains(T val) {
+    if (theset.get_range_for(val) == theset.ranges.end()) return false;
+    assert(!empty());
+    return true;
+  }
+  
+  void insert(T val) {
+    assert(!contains(val));
+
+    map_iterator left = theset.get_range_for(val-1);
+    map_iterator right = theset.get_range_for(val+1);
+
+    if (left != theset.ranges.end() &&
+        right != theset.ranges.end()) {
+      // join!
+      left->second = right->second;
+      theset.ranges.erase(right);
+      _size++;
+      return;
+    }
+
+    if (left != theset.ranges.end()) {
+      // add to left range
+      left->second = val;
+      _size++;
+      return;
+    }
+
+    if (right != theset.ranges.end()) {
+      // add to right range
+      theset.ranges.insert(pair<T,T>(val, right->second));
+      theset.ranges.erase(val+1);
+      _size++;
+      return;
+    }
+
+    // new range
+    theset.ranges.insert(pair<T,T>(val,val));
+    _size++;
+    return;
+  }
+
+  unsigned size() {
+    return _size;   // count of individual values, kept up to date by insert/erase/map_insert
+  }
+
+  bool empty() {
+    if (theset.ranges.empty()) {
+      assert(_size == 0);
+      return true;
+    }
+    assert(_size>0);
+    return false;
+  }
+
+  
+  T first() {
+    assert(!empty());
+    map_iterator it = theset.ranges.begin();
+    return it->first;
+  }
+  
+  void erase(T val) {
+    assert(contains(val));
+    map_iterator it = theset.get_range_for(val);
+    assert(it != theset.ranges.end());
+    
+    // entire range
+    if (val == it->first && val == it->second) {
+      theset.ranges.erase(it);
+      _size--;
+      return;
+    }
+
+    // beginning
+    if (val == it->first) {
+      theset.ranges.insert(pair<T,T>(val+1, it->second));   // re-key the remainder at val+1
+      theset.ranges.erase(it);
+      _size--;
+      return;      
+    }
+
+    // end
+    if (val == it->second) {
+      it->second = val-1;
+      _size--;
+      return;
+    }
+    // middle split: shrink this range to [first,val-1] in place (its key is
+    // unchanged, and map::insert won't replace an existing key), then add [val+1,last].
+    T last = it->second;
+    it->second = val-1;
+    theset.ranges.insert(pair<T,T>(val+1, last));
+    _size--;
+    return;
+  }
+
+  void dump() {
+    for (typename map<T,T>::iterator it = theset.ranges.begin();
+         it != theset.ranges.end();
+         it++) {
+      cout << " " << it->first << "-" << it->second << endl;
+    }
+  }
+
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/include/statlite.h b/branches/sage/cephmds2/include/statlite.h
new file mode 100644
index 0000000000000..60a977e49a499
--- /dev/null
+++ b/branches/sage/cephmds2/include/statlite.h
@@ -0,0 +1,70 @@
+#ifndef _STATLITE_H
+#define _STATLITE_H
+
+extern "C" {
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+
+struct statlite {            /* stat-like record whose trailing fields are optional */
+  dev_t         st_dev;      /* device */
+  ino_t         st_ino;      /* inode */
+  mode_t        st_mode;     /* protection */
+  nlink_t       st_nlink;    /* number of hard links */
+  uid_t         st_uid;      /* user ID of owner */
+  gid_t         st_gid;      /* group ID of owner */
+  dev_t         st_rdev;     /* device type (if inode device) */
+  unsigned long st_litemask; /* bit mask for optional fields (presumably S_STATLITE_* bits) */
+  /***************************************************************/
+  /**** Remaining fields are optional according to st_litemask ***/
+  off_t         st_size;     /* total size, in bytes         */
+  blksize_t     st_blksize;  /* blocksize for filesystem I/O */
+  blkcnt_t      st_blocks;   /* number of blocks allocated   */
+  struct timespec st_atim;            /* Time of last access.  */
+  struct timespec st_mtim;            /* Time of last modification.  */
+  struct timespec st_ctim;            /* Time of last status change.  */
+  //time_t        st_atime;    /* disabled: time of last access (see st_atim)        */
+  //time_t        st_mtime;    /* disabled: time of last modification (see st_mtim)  */
+  //time_t        st_ctime;    /* disabled: time of last change (see st_ctim)        */
+}; 
+
+#define S_STATLITE_SIZE     1
+#define S_STATLITE_BLKSIZE  2
+#define S_STATLITE_BLOCKS   4
+#define S_STATLITE_ATIME    8
+#define S_STATLITE_MTIME    16
+#define S_STATLITE_CTIME    32
+/* S_REQUIRE*(m): mask m with the field's bit added; m is parenthesized so any expression is safe */
+#define S_REQUIRESIZE(m)      ((m) | S_STATLITE_SIZE)
+#define S_REQUIREBLKSIZE(m)   ((m) | S_STATLITE_BLKSIZE)
+#define S_REQUIREBLOCKS(m)    ((m) | S_STATLITE_BLOCKS)
+#define S_REQUIREATIME(m)     ((m) | S_STATLITE_ATIME)
+#define S_REQUIREMTIME(m)     ((m) | S_STATLITE_MTIME)
+#define S_REQUIRECTIME(m)     ((m) | S_STATLITE_CTIME)
+/* S_ISVALID*(m): nonzero iff the field's bit is set in mask m (e.g. m = a|b now tests the whole mask) */
+#define S_ISVALIDSIZE(m)      ((m) & S_STATLITE_SIZE)
+#define S_ISVALIDBLKSIZE(m)   ((m) & S_STATLITE_BLKSIZE)
+#define S_ISVALIDBLOCKS(m)    ((m) & S_STATLITE_BLOCKS)
+#define S_ISVALIDATIME(m)     ((m) & S_STATLITE_ATIME)
+#define S_ISVALIDMTIME(m)     ((m) & S_STATLITE_MTIME)
+#define S_ISVALIDCTIME(m)     ((m) & S_STATLITE_CTIME)
+
+
+// readdirplus etc.
+
+struct dirent_plus {          /* directory entry bundled with a full struct stat */
+ struct dirent     d_dirent;  /* dirent struct for this entry */
+ struct stat       d_stat;    /* attributes for this entry */
+ int               d_stat_err;/* errno for d_stat, or 0 */
+};
+struct dirent_lite {          /* directory entry bundled with a struct statlite */
+ struct dirent     d_dirent;  /* dirent struct for this entry */
+ struct statlite   d_stat;    /* attributes for this entry */
+ int               d_stat_err;/* errno for d_stat, or 0 */
+};
+
+}
+#endif
diff --git a/branches/sage/cephmds2/include/types.h b/branches/sage/cephmds2/include/types.h
new file mode 100644
index 0000000000000..d93d9c2f7c636
--- /dev/null
+++ b/branches/sage/cephmds2/include/types.h
@@ -0,0 +1,537 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MDS_TYPES_H
+#define __MDS_TYPES_H
+
+extern "C" {
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <assert.h>
+#include "statlite.h"
+}
+
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+using namespace std;
+
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+
+#include "object.h"
+
+
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a):(b))   // smaller of a and b; the selected argument is evaluated twice
+#endif
+#ifndef MAX
+# define MAX(a,b) ((a) > (b) ? (a):(b))   // larger of a and b; the selected argument is evaluated twice
+#endif
+
+
+// md ops
+#define MDS_OP_STATFS   1
+
+#define MDS_OP_STAT     100
+#define MDS_OP_LSTAT    101
+#define MDS_OP_UTIME    102
+#define MDS_OP_CHMOD    103
+#define MDS_OP_CHOWN    104  
+
+
+#define MDS_OP_READDIR  200
+#define MDS_OP_MKNOD    201
+#define MDS_OP_LINK     202
+#define MDS_OP_UNLINK   203
+#define MDS_OP_RENAME   204
+
+#define MDS_OP_MKDIR    220
+#define MDS_OP_RMDIR    221
+#define MDS_OP_SYMLINK  222
+
+#define MDS_OP_OPEN     301
+#define MDS_OP_TRUNCATE 306
+#define MDS_OP_FSYNC    307
+//#define MDS_OP_CLOSE    310
+#define MDS_OP_RELEASE  308
+
+
+
+// -- stl crap --
+
+/*
+- this is to make some of the STL types work with 64 bit values, string hash keys, etc.
+- added when i was using an old STL.. maybe try taking these out and see if things 
+  compile now?
+*/
+
+namespace __gnu_cxx {
+  // std::string keys: delegate to the C-string hash of x.c_str()
+  template<> struct hash< std::string >
+  {
+    size_t operator()( const std::string& x ) const
+    {
+      return hash<const char*>()(x.c_str());
+    }
+  };
+}
+
+
+/*
+ * comparators for stl containers
+ */
+// for hash_map:
+//   hash_map<const char*, long, hash<const char*>, eqstr> vals;
+struct eqstr
+{
+  // equality predicate: true when both C strings have identical contents
+  bool operator()(const char* s1, const char* s2) const {
+    return !strcmp(s1, s2);
+  }
+};
+
+// for set, map
+struct ltstr
+{
+  // ordering predicate: true when s1 sorts before s2 in strcmp() order
+  bool operator()(const char* s1, const char* s2) const {
+    return 0 > strcmp(s1, s2);
+  }
+};
+
+
+
+/** object layout
+ * how objects are mapped into PGs
+ */
+#define OBJECT_LAYOUT_DEFAULT  0  // see g_conf
+#define OBJECT_LAYOUT_HASH     1
+#define OBJECT_LAYOUT_LINEAR   2
+#define OBJECT_LAYOUT_HASHINO  3
+#define OBJECT_LAYOUT_STARTOSD 4
+
+/** pg layout
+ * how PGs are mapped into (sets of) OSDs
+ */
+#define PG_LAYOUT_CRUSH  0   
+#define PG_LAYOUT_HASH   1
+#define PG_LAYOUT_LINEAR 2
+#define PG_LAYOUT_HYBRID 3
+
+/** FileLayout 
+ * specifies a striping and replication strategy
+ */
+
+//#define FILE_LAYOUT_CRUSH    0    // stripe via crush
+//#define FILE_LAYOUT_LINEAR   1    // stripe linearly across cluster
+
+struct FileLayout {
+  // layout
+  int object_layout;   // set to OBJECT_LAYOUT_DEFAULT or OBJECT_LAYOUT_STARTOSD by the 5-arg constructor
+
+  // FIXME: make this a union?
+  // rushstripe
+  int stripe_size;     // stripe unit, in bytes
+  int stripe_count;    // over this many objects
+  int object_size;     // until objects are this big, then use a new set of objects.
+
+  // period = bytes before i start on a new set of objects.  (const: usable on const layouts)
+  int period() const { return object_size * stripe_count; }
+
+  int osd;    // osdlocal
+
+  int num_rep;  // replication
+
+  FileLayout() { }   // NOTE: leaves every field uninitialized
+  FileLayout(int ss, int sc, int os, int nr=2, int o=-1) :
+    object_layout(o < 0 ? OBJECT_LAYOUT_DEFAULT:OBJECT_LAYOUT_STARTOSD),
+    stripe_size(ss), stripe_count(sc), object_size(os), 
+    osd(o),
+    num_rep(nr) { }
+
+};
+
+
+
+// -- inode --
+
+//typedef __uint64_t inodeno_t;   
+
+struct inodeno_t {   // 64-bit inode number wrapper; converts implicitly to and from __uint64_t
+  __uint64_t val;
+  inodeno_t() : val() {}
+  inodeno_t(__uint64_t v) : val(v) {}
+  inodeno_t& operator+=(inodeno_t o) { val += o.val; return *this; }  // returns *this by reference, not a copy
+  operator __uint64_t() const { return val; }
+};
+
+inline ostream& operator<<(ostream& out, inodeno_t ino) {
+  return (out << hex << ino.val) << dec;   // value in hex; the stream is left in decimal mode
+}
+
+namespace __gnu_cxx {
+  // inodeno_t keys hash exactly like their raw 64-bit value
+  template<> struct hash< inodeno_t >
+  {
+    size_t operator()( const inodeno_t& x ) const
+    {
+      return hash<__uint64_t>()(x.val);
+    }
+  };
+}
+
+typedef __uint64_t version_t;
+
+
+
+#define INODE_MODE_FILE     0100000 // S_IFREG
+#define INODE_MODE_SYMLINK  0120000 // S_IFLNK
+#define INODE_MODE_DIR      0040000 // S_IFDIR
+#define INODE_TYPE_MASK     0170000
+
+#define FILE_MODE_R          1
+#define FILE_MODE_W          2
+#define FILE_MODE_RW         (1|2)
+#define FILE_MODE_LAZY       4
+
+#define INODE_MASK_BASE       1  // ino, ctime, nlink
+#define INODE_MASK_PERM       2  // uid, gid, mode
+#define INODE_MASK_SIZE       4  // size, blksize, blocks
+#define INODE_MASK_MTIME      8  // mtime
+#define INODE_MASK_ATIME      16 // atime
+
+#define INODE_MASK_ALL_STAT  (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME)
+//#define INODE_MASK_ALL_STAT  (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME|INODE_MASK_ATIME)
+
+struct inode_t {
+  // base (immutable)
+  inodeno_t ino;   // NOTE: ino _must_ come first for MDStore.cc to behave!!
+  time_t    ctime;
+
+  // other
+  FileLayout layout;  // ?immutable?
+  int        nlink;   // base, 
+
+  // hard/perm (namespace permissions)
+  mode_t     mode;
+  uid_t      uid;
+  gid_t      gid;
+
+  // file (data access)
+  off_t      size;
+  time_t     atime, mtime;      // maybe atime different?  "lazy"?
+  
+  int        mask;
+
+  // special stuff
+  version_t     version;           // auth only
+  unsigned char hash_seed;         // only defined for dir; 0 if not hashed.
+  bool          anchored;          // auth only
+  version_t     file_data_version; // auth only
+  // type predicates: compare the type bits of mode (INODE_TYPE_MASK); const so they work on const inodes
+  bool is_symlink() const { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; }
+  bool is_dir() const { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; }
+  bool is_file() const { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; }
+};
+
+
+
+// lame 128-bit value class.
+class lame128_t {
+public:
+  __uint64_t hi, lo;   // the two 64-bit halves of the value
+  lame128_t(__uint64_t h=0, __uint64_t l=0) : hi(h), lo(l) {}   // both halves default to 0
+};
+
+inline ostream& operator<<(ostream& out, const lame128_t& oid) {   // const ref: const objects and temporaries can be printed too
+  return out << oid.hi << "." << oid.lo;                           // written as hi.lo
+}
+
+
+// osd types
+//typedef __uint32_t ps_t;          // placement seed
+//typedef __uint32_t pg_t;          // placement group
+typedef __uint64_t coll_t;        // collection id
+typedef __uint64_t tid_t;         // transaction id
+
+typedef __uint32_t epoch_t;       // map epoch  (32bits -> 13 epochs/second for 10 years)
+
+// pg stuff
+typedef __uint16_t ps_t;
+typedef __uint8_t pruleset_t;
+
+// placement group id
+struct pg_t {
+  union {               // the individual fields and the single 64-bit value share storage
+    struct {
+      int         preferred;   // left out of the printed form when 0
+      ps_t        ps;          // printed in hex; presumably the placement seed -- TODO confirm
+      __uint8_t   nrep;        // presumably the replica count -- TODO confirm
+      pruleset_t  ruleset;     // left out of the printed form when 0
+    } fields;
+    __uint64_t val;            // whole id as one integer (used for copying and conversion)
+  } u;
+  pg_t() { u.val = 0; }
+  pg_t(const pg_t& o) { u.val = o.u.val; }
+  pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) {   // assigns each field; u.val is not cleared first
+    u.fields.ps = s;
+    u.fields.preferred = p;
+    u.fields.nrep = n;
+    u.fields.ruleset = r;
+  }
+  pg_t(__uint64_t v) { u.val = v; }   // implicit conversion from the raw 64-bit value
+  /* disabled arithmetic/assignment operators, kept for reference:
+  pg_t operator=(__uint64_t v) { u.val = v; return *this; }
+  pg_t operator&=(__uint64_t v) { u.val &= v; return *this; }
+  pg_t operator+=(pg_t o) { u.val += o.val; return *this; }
+  pg_t operator-=(pg_t o) { u.val -= o.val; return *this; }
+  pg_t operator++() { ++u.val; return *this; }
+  */
+  operator __uint64_t() const { return u.val; }   // implicit conversion to the raw 64-bit value
+};
+
+inline ostream& operator<<(ostream& out, pg_t pg) {
+  // format: [ruleset.]nrep.[preferred.]ps -- ps in hex; a zero ruleset or preferred is omitted
+  if (pg.u.fields.ruleset != 0)
+    out << static_cast<int>(pg.u.fields.ruleset) << '.';
+  out << static_cast<int>(pg.u.fields.nrep) << '.';
+  if (pg.u.fields.preferred != 0)
+    out << pg.u.fields.preferred << '.';
+  out << hex << pg.u.fields.ps << dec;
+  return out;
+}
+
+namespace __gnu_cxx {
+  // pg_t keys hash exactly like their raw 64-bit value (what operator __uint64_t returns)
+  template<> struct hash< pg_t >
+  {
+    size_t operator()( const pg_t& x ) const
+    {
+      return hash<__uint64_t>()(x.u.val);
+    }
+  };
+}
+
+
+
+// compound rados version type
+class eversion_t {
+public:
+  epoch_t epoch;       // compared first by the relational operators
+  version_t version;   // breaks ties when epochs are equal
+  eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {}   // defaults to 0'0
+};
+
+inline bool operator==(const eversion_t& l, const eversion_t& r) {
+  return !(l.epoch != r.epoch || l.version != r.version);   // equal only when both parts match
+}
+inline bool operator!=(const eversion_t& l, const eversion_t& r) {
+  return !(l.epoch == r.epoch && l.version == r.version);   // differs if either part differs
+}
+inline bool operator<(const eversion_t& l, const eversion_t& r) {
+  return (l.epoch != r.epoch) ? (l.epoch < r.epoch) : (l.version < r.version);   // epoch first, version breaks ties
+}
+inline bool operator<=(const eversion_t& l, const eversion_t& r) {
+  return (l.epoch != r.epoch) ? (l.epoch <= r.epoch) : (l.version <= r.version);   // epoch first, version breaks ties
+}
+inline bool operator>(const eversion_t& l, const eversion_t& r) {
+  return (l.epoch != r.epoch) ? (l.epoch > r.epoch) : (l.version > r.version);   // epoch first, version breaks ties
+}
+inline bool operator>=(const eversion_t& l, const eversion_t& r) {
+  return (l.epoch != r.epoch) ? (l.epoch >= r.epoch) : (l.version >= r.version);   // epoch first, version breaks ties
+}
+inline ostream& operator<<(ostream& out, const eversion_t e) {
+  out << e.epoch << "'" << e.version; return out;   // written as epoch'version
+}
+
+
+
+#define PG_NONE    0xffffffffL
+
+
+typedef __uint16_t snapv_t;       // snapshot version
+
+
+class OSDSuperblock {
+public:
+  const static __uint64_t MAGIC = 0xeb0f505dULL;   // value the constructor stores in `magic`
+  __uint64_t magic;     // set to MAGIC on construction
+  __uint64_t fsid;      // unique fs id (random number)
+  int        whoami;    // my role in this fs.
+  epoch_t    current_epoch;             // most recent epoch
+  epoch_t    oldest_map, newest_map;    // oldest/newest maps we have.
+  OSDSuperblock(__uint64_t f=0, int w=0) :    // all three epoch fields start at 0
+    magic(MAGIC), fsid(f), whoami(w), 
+    current_epoch(0), oldest_map(0), newest_map(0) {}
+};
+
+inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
+{  // sb(fsid F osdN eE [oldest,newest]); const reference so const superblocks can be printed
+  return out << "sb(fsid " << sb.fsid
+             << " osd" << sb.whoami
+             << " e" << sb.current_epoch
+             << " [" << sb.oldest_map << "," << sb.newest_map
+             << "])";
+}
+
+class MonSuperblock {
+public:
+  const static __uint64_t MAGIC = 0x00eb0f5000ULL;   // value the constructor stores in `magic`
+  __uint64_t magic;   // set to MAGIC on construction
+  __uint64_t fsid;    // filled from the constructor's first argument
+  int        whoami;  // mon #
+  epoch_t    current_epoch;   // starts at 0
+  MonSuperblock(__uint64_t f=0, int w=0) :
+    magic(MAGIC), fsid(f), whoami(w), current_epoch(0) {}
+};
+
+
+// new types
+
+class ObjectExtent {
+ public:
+  object_t    oid;       // object id
+  off_t       start;     // in object
+  size_t      length;    // in object
+
+  objectrev_t rev;       // which revision?
+  pg_t        pgid;      // where to find the object
+
+  map<size_t, size_t>  buffer_extents;  // off -> len.  extents in buffer being mapped (may be fragmented bc of striping!)
+  
+  ObjectExtent() : start(0), length(0), rev(0), pgid(0) {}   // oid is default-constructed
+  ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { }   // rev and pgid always start at 0
+};
+
+inline ostream& operator<<(ostream& out, ObjectExtent &ex)
+{
+  out << "extent(" << ex.oid;                          // extent(<oid> in <pgid> <start>~<length>)
+  out << " in " << hex << ex.pgid << dec;
+  out << " " << ex.start << "~" << ex.length << ")";
+  return out;
+}
+
+
+
+// client types
+typedef int        fh_t;          // file handle 
+
+
+// dentries
+#define MAX_DENTRY_LEN 255
+
+
+
+
+
+
+// -- io helpers --
+
+template<class A>
+inline ostream& operator<<(ostream& out, vector<A>& v) {
+  // writes "[e0,e1,...]"
+  out << "[";
+  for (typename vector<A>::iterator p = v.begin(); p != v.end(); ++p) {
+    if (p != v.begin()) out << ",";
+    out << *p;
+  }
+  return out << "]";
+}
+
+template<class A>
+inline ostream& operator<<(ostream& out, const set<A>& iset) {
+  // comma-separated elements in iteration order, with no surrounding brackets;
+  // the first element is written bare, every later one is preceded by ","
+  typename set<A>::const_iterator p = iset.begin();
+  if (p != iset.end()) out << *p++;
+  while (p != iset.end())
+    out << "," << *p++;
+  return out;
+}
+
+template<class A>
+inline ostream& operator<<(ostream& out, const multiset<A>& iset) {
+  // comma-separated elements in iteration order, with no surrounding brackets;
+  // the first element is written bare, every later one is preceded by ","
+  typename multiset<A>::const_iterator p = iset.begin();
+  if (p != iset.end()) out << *p++;
+  while (p != iset.end())
+    out << "," << *p++;
+  return out;
+}
+
+template<class A,class B>
+inline ostream& operator<<(ostream& out, const map<A,B>& m) 
+{
+  typename map<A,B>::const_iterator p = m.begin();
+  out << "{";                                   // writes "{k1=v1,k2=v2,...}" in iteration order
+  if (p != m.end()) {
+    out << p->first << "=" << p->second;
+    ++p;
+  }
+  for (; p != m.end(); ++p)
+    out << "," << p->first << "=" << p->second;
+  return out << "}";
+}
+
+
+
+
+// -- rope helpers --
+
+// string
+inline void _rope(string& s, crope& r) 
+{
+  r.append(s.c_str(), 1 + s.size());   // the string's bytes plus its terminating NUL
+}
+inline void _unrope(string& s, crope& r, int& off)
+{
+  s.assign(r.c_str() + off);   // take the NUL-terminated string that starts at byte off
+  off += 1 + s.size();         // step past the string and its NUL
+}
+
+// set<int>
+inline void _rope(set<int>& s, crope& r)
+{
+  // appended layout: element count (int), then each element (int), as raw in-memory bytes
+  int n = s.size();
+  r.append((char*)&n, sizeof(n));
+  set<int>::iterator cur = s.begin();
+  while (cur != s.end()) {
+    int elem = *cur++;
+    r.append((char*)&elem, sizeof(elem));
+    --n;
+  }
+  assert(n==0);
+}
+inline void _unrope(set<int>& s, crope& r, int& off) 
+{
+  int n;                               // element count stored ahead of the elements
+  r.copy(off, sizeof(n), (char*)&n);
+  off += sizeof(n);
+  s.clear();
+  for (int left = n; left > 0; --left) {
+    int v;
+    r.copy(off, sizeof(v), (char*)&v);
+    off += sizeof(v);
+    s.insert(v);
+  }
+  assert(s.size() == (unsigned)n);
+}
+
+#endif
diff --git a/branches/sage/cephmds2/include/uofs.h b/branches/sage/cephmds2/include/uofs.h
new file mode 100644
index 0000000000000..f93ef10bf2186
--- /dev/null
+++ b/branches/sage/cephmds2/include/uofs.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+/*
+ * uofs.h
+ * 
+ * user-level object-based file system
+ */
+ 
+ #ifndef _UOFS_H_
+ #define _UOFS_H_
+
+ #include <sys/types.h>
+ #include <unistd.h>
+ #include <stdio.h>
+
+
+ int device_open(char *path, int xflags);
+ void device_findsizes(int fd, long long *sz, int *bsz);
+
+ int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size,
+             int nr_hash_table_buckets, int delay_allocation, int flush_interval);
+
+ int uofs_mount(int bdev_id);
+ void uofs_shutdown(void);
+
+ int uofs_read(long long oid, void *buf, off_t offset, size_t count);
+ int uofs_write(long long oid, void *buf, off_t offset, size_t count);
+ int uofs_del(long long oid);
+ int uofs_sync(long long oid);
+ int uofs_exist(long long oid);
+
+ int uofs_get_size(long long oid);
+
+ void uofs_superblock_printout(void);
+ int  get_large_object_pages(void);
+
+ int uofs_buffer_size(void);
+ #endif
diff --git a/branches/sage/cephmds2/jobs/alc.tp b/branches/sage/cephmds2/jobs/alc.tp
new file mode 100644
index 0000000000000..c600850c54be0
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alc.tp
@@ -0,0 +1,38 @@
+#PSUB -s /bin/bash  # Sets your shell in batch
+#PSUB -c alc       # Where to run the job
+
+#PSUB -eo          # Send std error & std out to the same file
+
+#PSUB -ln  $NUM       # Number of nodes to use
+#PSUB -g   $NUM       # Total Number of tasks to use
+#PSUB -cpn 1       # cpus per node
+
+####PSUB -c 1024Mb     # memory limit
+#PSUB -lc 1500        # Core file size per process
+#PSUB -nr          # Do not automatically resubmit job
+#PSUB -tM 20m      # Select time limit. The default time limit
+                   # is only 30 minutes! Time can be HH:MM:SS or HH:MM
+
+#PSUB -o $CWD/$OUT   # filename for output
+
+# Put your commands here. Remember to 'cd' to the appropriate
+# directory, because the job will initially be in your home directory.
+# To run a parallel job, you need to use the srun.
+
+
+
+echo job $PSUB_JOBID nodes $NUM name $NAME
+
+# environment
+cd $CWD
+export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib
+
+# create fakestore dirs
+srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata  ; uptime"
+
+# go
+srun -l -N $NUM -ppbatch $CMD && touch $DONE
+
+# clean up fakestore
+srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*'
+
diff --git a/branches/sage/cephmds2/jobs/alcdat/makedirs b/branches/sage/cephmds2/jobs/alcdat/makedirs
new file mode 100644
index 0000000000000..af5a098a254c9
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/makedirs
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	#'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192],
+	'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208],
+
+	'cper' => [15,20],
+	'_dep' => [ 'cnode' => '$nummds',# / 4 + 1',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds > 1 ? $nummds:2',
+				'n' => '1 + $cnode + $nummds + $numosd' ],
+	
+	# parameters
+	'fs' => 'ebofs',
+	#'fs' => 'fakestore',
+
+	'mds_bal_rep' => 10000,  # none of that!
+	'mds_decay_halflife' => 30,
+
+	'mds_bal_interval' => 45,
+	'mds_bal_max' => [2],
+
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	'start' => 100,
+	'end' => 300,
+	
+	'makedirs' => 1,
+	'makedirs_dirs' => 10,
+	'makedirs_files' => 10,
+	'makedirs_depth' => 4,
+		
+	# --meta_log_layout_scount 32 --meta_log_layout_ssize 256  
+	# --osd_pg_layout linear 
+	'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60',
+	#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+	
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/makedirs.big b/branches/sage/cephmds2/jobs/alcdat/makedirs.big
new file mode 100644
index 0000000000000..c67b2b93dd742
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/makedirs.big
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	#'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192],
+	'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208],
+
+	'cper' => [15,20],
+	'_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds * .8',
+				'n' => '415'],#1 + $cnode + $nummds + $numosd' ],
+	
+	# parameters
+	'fs' => 'ebofs',
+	#'fs' => 'fakestore',
+
+	'mds_bal_rep' => 10000,  # none of that!
+	'mds_decay_halflife' => 30,
+
+	'mds_bal_interval' => 45,
+	'mds_bal_max' => 2,
+
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	'start' => 100,
+	'end' => 300,
+	
+	'makedirs' => 1,
+	'makedirs_dirs' => 10,
+	'makedirs_files' => 10,
+	'makedirs_depth' => 4,
+		
+	# --meta_log_layout_scount 32 --meta_log_layout_ssize 256  
+	# --osd_pg_layout linear 
+	'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60',
+	#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+	
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/makedirs.tput b/branches/sage/cephmds2/jobs/alcdat/makedirs.tput
new file mode 100644
index 0000000000000..8dd5ae4c47d8c
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/makedirs.tput
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	#'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192],
+	'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208],
+
+	#'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150],
+	'cper' => [13, 30, 40],  # just for final run...
+	'_dep' => [ 'cnode' => '$nummds',# / 4 + 1',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds',
+				'n' => '1 + $cnode + $nummds + $numosd' ],
+	
+	# parameters
+	'fs' => 'ebofs',
+	#'fs' => 'fakestore',
+
+	'mds_bal_rep' => 10000,  # none of that!
+	'mds_decay_halflife' => 30,
+
+	'mds_bal_interval' => 45,
+	'mds_bal_max' => 2,
+
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	'start' => 100,
+	'end' => 300,
+	
+	'makedirs' => 1,
+	'makedirs_dirs' => 10,
+	'makedirs_files' => 10,
+	'makedirs_depth' => 4,
+		
+	# --meta_log_layout_scount 32 --meta_log_layout_ssize 256  
+	# --osd_pg_layout linear 
+	'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60',
+	#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+	
+	'comb' => {
+		'x' => 'cper',#nummds',
+		'vars' => [ 'mds.req', 'cl.lat' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/makefiles.shared b/branches/sage/cephmds2/jobs/alcdat/makefiles.shared
new file mode 100644
index 0000000000000..ab96702c73289
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/makefiles.shared
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96],
+
+	'cper' => [25, 50, 100, 150],# 100, 150, 200],
+
+	'_dep' => [ 'cnode' => '$nummds',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds',
+				'n' => '1 + $cnode + $nummds + $numosd' ],
+	
+	# parameters
+	'fs' => 'ebofs',
+
+	'mds_bal_hash_wr' => 1000,
+
+	'until' => 180,     # --syn until $n    ... when to stop clients
+	'kill_after' => 250,
+	'start' => 30,
+	'end' => 180,
+	
+	'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0',
+	
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req', 'cl.lat' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/openshared b/branches/sage/cephmds2/jobs/alcdat/openshared
new file mode 100644
index 0000000000000..5ed7ba95894b3
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/openshared
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'nummds' => [1, 4, 16, 64, 128, 192 ],
+
+	'cper' => [10, 50, 100, 150],
+	'_dep' => [ 'cnode' => '$nummds',# > 30 ? 30:$nummds',
+				'numclient' => '$nummds*$cper',
+				'numosd' => '$nummds > 30 ? 30:$nummds',
+				'n' => '1 + $cnode + $nummds + $numosd' ],
+	
+	# parameters
+	'fs' => 'ebofs',
+
+	'mds_bal_interval' => 10000,
+	'mds_bal_hash_wr' => 1000,
+
+	'until' => 120,     # --syn until $n    ... when to stop clients
+	'kill_after' => 180,
+	'start' => 10,
+	'end' => 120,
+	
+	'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000',
+	
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req', 'cl.lat' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/ossh.include b/branches/sage/cephmds2/jobs/alcdat/ossh.include
new file mode 100644
index 0000000000000..c9a368ba5c60f
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/ossh.include
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 10,
+
+	#'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+	#'nummds' => [1, 2, 4, 6, 7], # googoo
+	'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ],
+
+	#'trace' => ['make.lib', 'make.include'],
+
+	'mds_bal_interval' => 45,
+	'mds_bal_max' => 2,#6, #[ 2,4,6 ],
+	'mds_decay_halflife' => 30,
+	'mds_bal_rep' => 1500,
+	'mds_bal_hash_rd' => 100000,
+
+	'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ],
+	#'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ],
+
+	'_dep' => [ 'cnode' => '$nummds',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds',
+				'n' => '1 + $cnode + $nummds + $numosd' ],
+	
+	'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000',
+
+	# parameters
+	'fs' => 'ebofs',
+
+	#'until' => 500,   
+	#'kill_after' => 600,
+	#'start' => 200,
+	#'end' => 500,
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	'start' => 200,
+	'end' => 300,
+
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/ossh.include.big b/branches/sage/cephmds2/jobs/alcdat/ossh.include.big
new file mode 100644
index 0000000000000..b92895a53a763
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/ossh.include.big
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 10,
+
+	#'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+	#'nummds' => [1, 2, 4, 6, 7], # googoo
+	#'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ],
+	'nummds' => [160,200],
+
+	#'trace' => ['make.lib', 'make.include'],
+
+	'mds_bal_interval' => 45,
+	'mds_bal_max' => 2,#6, #[ 2,4,6 ],
+	'mds_decay_halflife' => 30,
+	'mds_bal_rep' => 1500,
+	'mds_bal_hash_rd' => 100000,
+
+	'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ],
+	#'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ],
+
+	'_dep' => [ 'cnode' => '$nummds',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds * .6',
+				'n' => '415'],#1 + $cnode + $nummds + $numosd' ],
+	
+	'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000',
+
+	# parameters
+	'fs' => 'ebofs',
+
+	#'until' => 500,   
+	#'kill_after' => 600,
+	#'start' => 200,
+	#'end' => 500,
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	'start' => 200,
+	'end' => 300,
+
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/ossh.lib b/branches/sage/cephmds2/jobs/alcdat/ossh.lib
new file mode 100644
index 0000000000000..73372866f051f
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/ossh.lib
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 10,
+
+	#'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+	'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+	
+	#'nummds' => [1, 2, 4, 6, 7], # googoo
+	#'trace' => ['make.lib', 'make.include'],
+	
+	'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90],
+	#'mds_bal_max' => [4, 10],#6,#[2,4,6,8],
+	
+	'mds_decay_halflife' => 30,
+	'mds_bal_rep' => 1500,
+	'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ],
+
+	'_dep' => [ 'cnode' => '$nummds',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds',
+				'n' => '1 + $cnode + $nummds + $numosd' ],
+
+
+	'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000',
+	
+	# parameters
+	#'fs' => ['fakestore'],
+	'fs' => 'ebofs',
+
+	#'until' => 500,   
+	#'kill_after' => 600,
+	#'start' => 200,
+	#'end' => 500,
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	'start' => 150,
+	'end' => 300,
+
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/ossh.lib.big b/branches/sage/cephmds2/jobs/alcdat/ossh.lib.big
new file mode 100644
index 0000000000000..b9e0dd1ff68cd
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/ossh.lib.big
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 10,
+
+	#'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+	#'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+	'nummds' => [160,200],
+	
+	#'nummds' => [1, 2, 4, 6, 7], # googoo
+	#'trace' => ['make.lib', 'make.include'],
+	
+	'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90],
+	#'mds_bal_max' => [4, 10],#6,#[2,4,6,8],
+	
+	'mds_decay_halflife' => 30,
+	'mds_bal_rep' => 1500,
+	'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ],
+
+	'_dep' => [ 'cnode' => 0,#'30',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds * .6',
+				'n' => '415'],#'1 + $cnode + $nummds + $numosd' ],
+
+
+	'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000',
+	
+	# parameters
+	#'fs' => ['fakestore'],
+	'fs' => 'ebofs',
+
+	#'until' => 500,   
+	#'kill_after' => 600,
+	#'start' => 200,
+	#'end' => 500,
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	'start' => 150,
+	'end' => 300,
+
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/alcdat/striping b/branches/sage/cephmds2/jobs/alcdat/striping
new file mode 100644
index 0000000000000..de71828d12bde
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/alcdat/striping
@@ -0,0 +1,48 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'nummds' => 1,
+	'numosd' => 10,
+
+	'cnode' => 10, 
+	'cper' => [ 10, 25, 50, 100 ],
+
+	'_dep' => [ 'numclient' => '$cper * $cnode',
+				'n' => '1 + $cnode + $nummds + $numosd',
+				'file_layout_osize' => '$writefile_size' ],
+	
+	# parameters
+	'fs' => 'ebofs',
+	#'fs' => 'fakestore',
+	
+	'until' => 160,     # --syn until $n    ... when to stop clients
+	'kill_after' => 200,
+	'start' => 100,
+	'end' => 160,
+	
+	'writefile' => 1,
+	'writefile_size' => [ 
+#						  4*1024*1024,
+						  1024*1024 ],
+#						  256*1024,
+#						  64*1024 
+	'writefile_mb' => 100000,
+
+		'osd_pg_bits' => 10,#16,
+		#'osd_pg_bits' => [ 16, 20 ],
+	
+		#'osd_object_layout' => [ 'hash', 'hashino', 'linear' ],
+	'osd_pg_layout' => [ 'crush', 
+#						 'hash', 
+						 'linear' ],
+
+	'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60',
+	
+	'comb' => {
+		'x' => 'cper',#writefile_size',
+		'vars' => [ 'osd.c_wrb', 'osd.c_wr' ],
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/mds/log_striping b/branches/sage/cephmds2/jobs/mds/log_striping
new file mode 100644
index 0000000000000..46242cdda4f00
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/mds/log_striping
@@ -0,0 +1,36 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+	'kill_after' => 300,
+
+	'nummds' => 1,
+	'numosd' => 8,
+	'numclient' => 100,
+	'n' => 16,
+
+	# parameters
+	'fs' => ['ebofs','fakestore'],
+	'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ],
+	'meta_log_scount' => 4,#[ 1, 2, 4, 8 ],
+
+	'until' => 200,     # --syn until $n    ... when to stop clients
+	
+	'makedirs' => 1,
+	'makedirs_dirs' => 10,
+	'makedirs_files' => 10,
+	'makedirs_depth' => 4,
+		
+	'custom' => '--tcp_skip_rank0',
+	#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+	
+	# for final summation (script/sum.pl)
+	'start' => 100,
+	'end' => 550,
+
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/mds/makedir_lat b/branches/sage/cephmds2/jobs/mds/makedir_lat
new file mode 100644
index 0000000000000..63374f52a36c0
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/mds/makedir_lat
@@ -0,0 +1,33 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'nummds' => 1,
+	'numosd' => 8,
+	'numclient' => [1],#, 40, 80, 160 ],
+	'n' => 20,
+
+	'fs' => 'ebofs',
+
+	'start' => 20,
+	'end' => 40,
+	'until' => 40,  
+	'kill_after' => 60,
+	
+	'makedirs' => 1,
+	'makedirs_dirs' => 10,
+	'makedirs_files' => 10,
+	'makedirs_depth' => 5,
+
+	'mds_local_osd' => [ 0, 1 ],
+	'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4],
+		
+	'custom' => '--tcp_skip_rank0',
+
+	'comb' => {
+		'x' => 'meta_log_layout_num_rep',
+		'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/mds/makedirs b/branches/sage/cephmds2/jobs/mds/makedirs
new file mode 100644
index 0000000000000..4ca42d72fa37e
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/mds/makedirs
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'_psub' => 'jobs/alc.tp',
+
+	'sleep' => 3,
+
+	'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64],
+
+	'cper' => 50,
+	'_dep' => [ 'cnode' => '$nummds',
+				'numclient' => '$cnode * $cper',
+				'numosd' => '$nummds * 2',
+				'n' => '1 + $cnode + $nummds + $numosd' ],
+	
+	# parameters
+	#'fs' => 'ebofs',
+	'fs' => 'fakestore',
+
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	
+	'makedirs' => 1,
+	'makedirs_dirs' => 10,
+	'makedirs_files' => 10,
+	'makedirs_depth' => 3,
+		
+	'custom' => '--tcp_skip_rank0',
+	#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+	
+	# for final summation (script/sum.pl)
+	'start' => 100,
+	'end' => 550,
+
+	'comb' => {
+		'x' => 'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/mds/opensshlib b/branches/sage/cephmds2/jobs/mds/opensshlib
new file mode 100644
index 0000000000000..d8b61ae52c655
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/mds/opensshlib
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'nummds' => [1, 2, 4, 7], # googoo
+	#'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc
+
+	
+	# parameters
+	'fs' => 'ebofs',
+	#'fs' => 'fakestore',
+
+	'until' => 300,     # --syn until $n    ... when to stop clients
+	'kill_after' => 400,
+	'start' => 150,
+	'end' => 300,
+	
+	'mds_bal_interval' => 90,#[60, 90],
+	#'mds_bal_max' => [3,4,5],
+	'mds_bal_max' => 4,
+	'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60],
+	'mds_bal_rep' => 1500,#[1000, 1500, 2000],
+
+	'decay_hl' => 100,#[ 25, 50, 100, 150 ],
+
+	'cper' => 100, #[50, 75, 100, 125, 150, 200],
+	'_dep' => [ 'cnode' => '$nummds',
+				'numclient' => '$nummds * $cper',
+				'numosd' => '$nummds * 2',
+				'n' => '1 + $cnode + $nummds + $numosd',
+				'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'],
+		
+	'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60',
+	#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+	
+	# for final summation (script/sum.pl)
+
+	'comb' => {
+		'x' => 'nummds',#decay_hl',#'nummds',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/meta1 b/branches/sage/cephmds2/jobs/meta1
new file mode 100644
index 0000000000000..743212f1c3009
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/meta1
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+# makedirs for 300 seconds
+#  first bit in memory
+#  second bit is committing from journal too
+# then walk fs for 300 seconds
+#  this should all be in memory.
+
+JOB="meta1"
+ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000"
+
+#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1
+#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2
+#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4
+#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8
+#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12
+rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16
+
+
diff --git a/branches/sage/cephmds2/jobs/meta1.proc.sh b/branches/sage/cephmds2/jobs/meta1.proc.sh
new file mode 100755
index 0000000000000..616acbefff619
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/meta1.proc.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+for d in 1 2 4 8 12   # run directories to summarize, one per listed value
+do
+ echo "$d"
+ cd "$d" || continue   # skip a missing run dir; otherwise the "cd .." below would climb out of the job dir
+ ../../../script/sum.pl mds? mds?? > mds.sum
+ ../../../script/sum.pl -avg mds? mds?? > mds.avg
+ # same logs again, restricted to two time windows (the output names say which phase each covers)
+ ../../../script/sum.pl -start  90 -end 180 mds? mds?? > mds.sum.makedirs
+ ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk
+
+ cd ..
+done
diff --git a/branches/sage/cephmds2/jobs/osd/ebofs b/branches/sage/cephmds2/jobs/osd/ebofs
new file mode 100644
index 0000000000000..5d11523f6f832
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/osd/ebofs
@@ -0,0 +1,51 @@
+# hi there
+{
+	# startup
+	'n' => 30,          # mpi nodes
+	'sleep' => 3,      # seconds between runs
+	'nummds' => 1,
+	'numosd' => 8,
+	'numclient' => 100,#[10, 50, 100, 200, 400],
+	
+'kill_after' => 200,
+
+	# parameters
+	'fs' => 'ebofs',#[ 
+#			  'obfs',
+#			  'fakestore',
+#			  'ebofs' 
+#			  ],
+	'until' => 100,     # --syn until $n    ... when to stop clients
+	'writefile' => 1,
+	'writefile_size' => [ 
+#						  2560000,
+						  1024000,
+						  262144,
+#						  131072,
+#						  98304,
+						  65536,
+#						  16384,
+#						  4096,
+						  256,
+#						  16,
+#						  1
+						  ],
+	'writefile_mb' => 1000,
+	
+	'ebofs_idle_commit_ms' => [ 100, 500 ],
+	'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ],
+
+#	'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0',
+			'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+	# for final summation (script/sum.pl)
+	'start' => 30,
+	'end' => 90,
+
+'comb' => {
+			'x' => 'writefile_size',
+			'vars' => [ 'osd.c_wrb' ],
+#			'maptitle' => { 'osd_object_layout=' => '',
+#							',osd_pg_layout=' => ' + '}
+			}
+};
diff --git a/branches/sage/cephmds2/jobs/osd/mds_log b/branches/sage/cephmds2/jobs/osd/mds_log
new file mode 100644
index 0000000000000..0f99f6998dcfc
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/osd/mds_log
@@ -0,0 +1,43 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	#'_psub' => 'jobs/alc.tp',
+	'sleep' => 3,
+
+	'nummds' => 1,
+	'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400],
+	#'numclient' => [ 50, 100, 200 ],
+	'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ], 
+	'n' => 12,
+
+	# parameters
+	'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'],
+	#'fs' => 'ebofs',
+	#'ebofs_commit_ms' => [ 1000, 5000 ],
+	#'osd_maxthreads' => [ 0, 1, 2, 4, 8 ],
+
+	'until' => 100,     # --syn until $n    ... when to stop clients
+	'kill_after' => 300,
+	'start' => 20,
+	'end' => 90,
+	
+	'makedirs' => 1,
+	'makedirs_dirs' => 10,
+	'makedirs_files' => 10,
+	'makedirs_depth' => 3,
+		
+
+	#'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400],
+	#'meta_log_layout_scount' => [2, 4, 8],
+	#'meta_log_layout_num_rep' => [1, 2],
+	#'meta_log_layout_num_rep' => 1,
+
+	'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60',
+	#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+	
+	'comb' => {
+		'x' => 'numclient',#'meta_log_layout_ssize',
+		'vars' => [ 'mds.req' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/osd/osd_threads b/branches/sage/cephmds2/jobs/osd/osd_threads
new file mode 100644
index 0000000000000..ef271f9e88710
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/osd/osd_threads
@@ -0,0 +1,33 @@
+# hi there
+{
+	# startup
+	'n' => 30,          # mpi nodes
+	'sleep' => 10,      # seconds between runs
+	'nummds' => 1,
+	'numosd' => 8,
+	'numclient' => 50,
+	
+	# parameters
+	'fs' => [ 
+#			  'obfs',
+			  'fakestore',
+			  'ebofs' 
+			  ],
+	'until' => 100,     # --syn until $n    ... when to stop clients
+	'writefile' => 1,
+	'writefile_size' => [ 
+						  1024000,
+						  131072,
+						  65536,
+						  16
+						  ],
+	'writefile_mb' => 1000,
+
+	'osd_maxthreads' => [0, 1, 2, 4, 8],
+	
+	'custom' => '--tcp_skip_rank0',
+	
+	# for final summation (script/sum.pl)
+	'start' => 30,
+	'end' => 90
+};
diff --git a/branches/sage/cephmds2/jobs/osd/striping b/branches/sage/cephmds2/jobs/osd/striping
new file mode 100644
index 0000000000000..ea8cabe643274
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/osd/striping
@@ -0,0 +1,78 @@
+#!/usr/bin/perl
+# hi there
+{
+	# startup
+	#'n' => 28,          # mpi nodes
+
+	'sleep' => 3,      # seconds between runs
+	'nummds' => 1,
+
+	#'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16],   # full sweep; as a live key it was silently overridden by the duplicate below
+	'numosd' => [14],
+	#'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64], 
+	#'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64], 
+	'cper' => [30],
+	
+	'_dep' => [ 'cnode' => '$numosd',
+				'numclient' => '$cnode * $cper',
+				'n' => 38],#'$nummds + $numosd + $cnode'],
+	#'numclient' => [5, 10, 20, 50, 75, 100, 150 ],
+	
+	'start' => 30,
+	'end' => 90,
+	'until' => 100,     # --syn until $n    ... when to stop clients
+	'kill_after' => 260,
+
+	# parameters
+	'fs' => 'ebofs',
+	'writefile' => 1,
+
+	'writefile_size' => [# 4096,
+						 # 16*1024,
+						 # 64*1024,
+						 # 256*1024,
+						  1024*1024 ],
+#	'writefile_size' => [
+#						 2048*1024,
+#		1048576,
+#						 512*1024,
+#						 262144,
+#						 65536,
+#						 16384
+#						 ],
+	'writefile_mb' => 1000,
+	
+	'file_layout_num_rep'=> [1,2,3],
+
+	'osd_pg_bits' => 12,#[6, 8, 10, 12, 14],
+	
+	'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ],
+	'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ], 
+
+	#'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0',
+	#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+	
+	# for final summation (script/sum.pl)
+
+	'comb' => {
+		'x' => 'numosd',#'writefile_size',
+		'vars' => [ 'osd.c_wrb', 'cl.wrlat' ],
+#			'maptitle' => { 'osd_object_layout=' => '',
+#							',osd_pg_layout=' => ' + '}
+	}
+};
+
+
+=item some googoo notes
+
+for 1mb 1x writes, 
+
+ with numosd=6, min cper=6 to saturate (cper_saturate)
+ googoo saturates at numosd=8.  (osd_saturate)
+
+    -> so, numosd=6 or 7 is a safe size!
+
+
+
+
+=cut
diff --git a/branches/sage/cephmds2/jobs/osd/wr_lat2 b/branches/sage/cephmds2/jobs/osd/wr_lat2
new file mode 100644
index 0000000000000..47053dd61f3ab
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/osd/wr_lat2
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'nummds' => 1,
+	'numosd' => [12],
+	'numclient' => [1],#, 40, 80, 160 ],
+	'n' => 16,
+
+	'fs' => 'ebofs',
+
+	'start' => 10,
+	'end' => 40,
+	'until' => 40,  
+	'kill_after' => 90,
+	
+	'writefile' => 1,
+	'writefile_size' => [4096, 
+						 8*1024,
+						 16*1024, 
+						 32*1024,
+						 64*1024,
+						 128*1024,
+						 256*1024,
+						 512*1024,
+						 1024*1024],
+	'writefile_mb' => 10000,
+
+	#'tcp_multi_out' => [0,1],
+
+#		'mds_local_osd' => [ 0, 1 ],
+	'file_layout_num_rep' => [1,2,3],#, 2, 3, 4],
+
+	'client_oc' => [0,1],
+		
+	'custom' => '--tcp_skip_rank0',
+
+	'comb' => {
+		'x' => 'writefile_size',#'file_layout_num_rep',
+		'vars' => [ 'osd.c_wrb','cl.wrlat' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/osd/write_sizes b/branches/sage/cephmds2/jobs/osd/write_sizes
new file mode 100644
index 0000000000000..57369f3a97c50
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/osd/write_sizes
@@ -0,0 +1,60 @@
+#!/usr/bin/perl
+# hi there
+{
+	# startup
+	'n' => 30,          # mpi nodes
+	'sleep' => 3,      # seconds between runs
+	'nummds' => 1,
+	'numosd' => 6,
+	'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400],
+	
+	'until' => 100,     # --syn until $n    ... when to stop clients
+	'kill_after' => 300,
+
+	# parameters
+	'fs' => [ 
+#			  'obfs',
+			  'fakestore',
+#			  'ebofs' 
+			  ],
+	'writefile' => 1,
+	'writefile_size' => [ 
+#						  2048*1024,
+						  1024*1024,
+						  512*1024,
+						  256*1024,
+						  128*1024,
+						  64*1024,
+						  48*1024,
+						  32*1024,
+						  28*1024,
+						  24*1024,
+						  16*1024,
+						  12*1024,
+						  8*1024,
+						  4096,
+#						  256,
+#						  16,
+#						  1
+						  ],
+	'writefile_mb' => 1000,
+	
+	'file_layout_num_rep'=> 1,#[1,2],
+
+
+#	'ebofs_idle_commit_ms' => [ 100, 500 ],
+#	'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ],
+
+	'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60',
+
+	# for final summation (script/sum.pl)
+	'start' => 30,
+	'end' => 90,
+
+	'comb' => {
+		'x' => 'writefile_size',
+		'vars' => [ 'osd.c_wrb' ],
+#			'maptitle' => { 'osd_object_layout=' => '',
+#							',osd_pg_layout=' => ' + '}
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/rados/map_dist b/branches/sage/cephmds2/jobs/rados/map_dist
new file mode 100644
index 0000000000000..39f16daa1cdc2
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/rados/map_dist
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'osdbits' => [6,7,8],#,9],10,11],
+	'pgperbits' => [3],#,4,5],#[4,6,8],
+
+	'nummds' => 1,
+
+	'_dep' => [ 'numosd' => '1 << $osdbits',
+				'osd_pg_bits' => '$pgperbits + $osdbits',
+				'n' => '3 + $numosd / 32'],
+	'numclient' => 0,
+
+	'fake_osdmap_updates' => [30],
+
+	'fs' => 'ebofs',
+
+	'start' => 30,
+	'end' => 300,
+	'kill_after' => 300,
+	
+	'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0',
+	#'custom' => '--tcp_skip_rank0',
+
+	'comb' => {
+		'x' => 'osdbits',
+		'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/rados/rep_lat b/branches/sage/cephmds2/jobs/rados/rep_lat
new file mode 100644
index 0000000000000..3f5ab0c8a7d87
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/rados/rep_lat
@@ -0,0 +1,43 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'nummds' => 1,
+	'numosd' => 8, #[6],
+	'numclient' => 1,#, 40, 80, 160 ],
+	'n' => 10,
+
+	'fs' => 'ebofs',
+
+	'start' => 10,
+	'end' => 40,
+	'until' => 40,  
+	'kill_after' => 45,
+	
+	'writefile' => 1,
+	'writefile_size' => [4096, 
+#						 8*1024,
+#						 16*1024, 
+#						 32*1024,
+						 64*1024,
+#						 128*1024,
+#						 256*1024,
+#						 512*1024,
+#						 1024*1024
+],
+	'writefile_mb' => 10000,
+
+	'osd_rep' => [0,1,2],
+
+	'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4],
+
+	'osd_pg_bits' => 4,
+	'custom' => '--osd_max_rep 8',
+
+	'comb' => {
+		'x' => 'file_layout_num_rep',
+		'vars' => [ 'cl.wrlat' ]
+	}
+};
diff --git a/branches/sage/cephmds2/jobs/rados/wr_sizes b/branches/sage/cephmds2/jobs/rados/wr_sizes
new file mode 100644
index 0000000000000..9b73477ea6142
--- /dev/null
+++ b/branches/sage/cephmds2/jobs/rados/wr_sizes
@@ -0,0 +1,50 @@
+#!/usr/bin/perl
+
+# hi there
+{
+	'sleep' => 3,
+
+	'nummds' => 1,
+	'numosd' => [8],#10,14,16],
+	'numclient' => [10*16],
+	'n' => 15,
+
+	'fs' => 'ebofs',
+
+	'start' => 60,
+	'end' => 90,
+	'until' => 90,  
+	'kill_after' => 190,
+	
+	'writefile' => 1,
+	'writefile_size' => [4096, 
+						 8*1024,
+						 16*1024, 
+						 32*1024,
+						 64*1024, 
+						 128*1024,
+						 256*1024,
+	#					 512*1024,
+#						 4*1024*1024,
+#						 2*1024*1024,
+#						 1024*1024
+],
+	'writefile_mb' => 10000,
+
+	'file_layout_num_rep' => 1,
+	'file_layout_ssize' => 4*1024*1024,
+	'file_layout_osize' => 4*1024*1024,
+
+	'osd_pg_bits' => 12,
+
+#	'ebofs_freelist' => [0, 1080, 65400],
+
+	'custom' => '--objecter_buffer_uncommitted 0',
+
+		#'custom' => '--tcp_skip_rank0',
+
+	'comb' => {
+		'x' => 'writefile_size',
+		'vars' => [ 'osd.c_wrb', 'cl.wrlat' ]
+	}
+};
diff --git a/branches/sage/cephmds2/mds/Anchor.h b/branches/sage/cephmds2/mds/Anchor.h
new file mode 100644
index 0000000000000..8da2bbdb52cd5
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Anchor.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __ANCHOR_H
+#define __ANCHOR_H
+
+#include <string>
+using std::string;
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+class Anchor {   // one anchor table record: an inode, its containing dir and dentry, and a ref count
+public:
+  inodeno_t ino;      // my ino
+  inodeno_t dirino;   // containing dir
+  string    ref_dn;   // referring dentry
+  int       nref;     // reference count
+
+  // Default records start zeroed so a half-filled Anchor never carries indeterminate ino/dirino/nref.
+  Anchor() : ino(0), dirino(0), nref(0) {}
+  Anchor(inodeno_t ino, inodeno_t dirino, string& ref_dn, int nref=0)
+    : ino(ino), dirino(dirino), ref_dn(ref_dn), nref(nref) {}
+
+  // Serialize: ino, dirino and nref are appended as raw in-memory bytes,
+  // then ref_dn goes through the global string _encode helper.
+  void _encode(bufferlist &bl) {
+    bl.append((char*)&ino, sizeof(ino));
+    bl.append((char*)&dirino, sizeof(dirino));
+    bl.append((char*)&nref, sizeof(nref));
+    ::_encode(ref_dn, bl);
+  }
+  // Deserialize in the same field order starting at byte 'off' of bl; 'off' is advanced past what was consumed.
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    bl.copy(off, sizeof(dirino), (char*)&dirino);
+    off += sizeof(dirino);
+    bl.copy(off, sizeof(nref), (char*)&nref);
+    off += sizeof(nref);
+    ::_decode(ref_dn, bl, off);
+  }
+} ;
+
+#endif
diff --git a/branches/sage/cephmds2/mds/AnchorClient.cc b/branches/sage/cephmds2/mds/AnchorClient.cc
new file mode 100644
index 0000000000000..b330a93cec6ca
--- /dev/null
+++ b/branches/sage/cephmds2/mds/AnchorClient.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <iostream>
+using std::cout;
+using std::cerr;
+using std::endl;
+
+#include "Anchor.h"
+#include "AnchorClient.h"
+#include "MDSMap.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+
+#include "MDS.h"
+
+#include "messages/MAnchorRequest.h"
+#include "messages/MAnchorReply.h"
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient "
+#define derr(x)  if (x <= g_conf.debug) cerr << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient "  // errors go to stderr, not stdout
+
+
+void AnchorClient::dispatch(Message *m)
+{
+  // The anchor client handles exactly one message type, the anchor
+  // reply; any other type trips the assert at the bottom.
+  if (m->get_type() == MSG_MDS_ANCHORREPLY) {
+    handle_anchor_reply((MAnchorReply*)m);
+    return;
+  }
+
+  assert(0);
+}
+
+void AnchorClient::handle_anchor_reply(class MAnchorReply *m)
+{
+  // Completion callback registered with the request being answered; null if the caller supplied none.
+  Context *waiter = 0;
+
+  switch (m->get_op()) {
+  case ANCHOR_OP_LOOKUP:
+    {
+      // a lookup for this ino must be outstanding
+      assert(pending_lookup_trace.count(m->get_ino()) == 1);
+      // copy the returned trace into the vector the caller handed to lookup()
+      vector<Anchor*> *out = pending_lookup_trace[ m->get_ino() ];
+      *out = m->get_trace();
+      waiter = pending_lookup_context[ m->get_ino() ];
+
+      // drop the bookkeeping before the callback runs
+      pending_lookup_trace.erase(m->get_ino());
+      pending_lookup_context.erase(m->get_ino());
+    }
+    break;
+
+  case ANCHOR_OP_UPDATE:
+  case ANCHOR_OP_CREATE:
+  case ANCHOR_OP_DESTROY:
+    {
+      // the three mutating ops share one pending map keyed by ino
+      assert(pending_op.count(m->get_ino()) == 1);
+
+      waiter = pending_op[m->get_ino()];
+      pending_op.erase(m->get_ino());
+    }
+    break;
+
+  default:
+    assert(0);
+  }
+  // NOTE(review): m itself is not freed anywhere in this function -- confirm who owns the reply.
+  if (waiter) {
+    waiter->finish(0);
+    delete waiter;
+  }
+}
+
+
+
+/*
+ * public async interface
+ */
+
+void AnchorClient::lookup(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  // Ask the anchor table MDS for ino's trace.  Asynchronous: handle_anchor_reply() fills 'trace' and then runs onfinish.
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_LOOKUP, ino);
+  // Only the address of 'trace' is stored, so the caller's vector must stay alive until the reply is handled.
+  pending_lookup_trace[ino] = &trace;
+  pending_lookup_context[ino] = onfinish;
+  // NOTE(review): a second lookup of the same ino before the reply would overwrite both entries above -- confirm callers avoid that.
+  messenger->send_message(req, 
+			  MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+			  MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
+}
+
+void AnchorClient::create(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  // Send an ANCHOR_OP_CREATE for ino carrying 'trace'; onfinish is run by handle_anchor_reply() when the reply arrives.
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_CREATE, ino);
+  req->set_trace(trace);
+  // create/update/destroy all park their callback in pending_op, keyed by ino alone
+  pending_op[ino] = onfinish;
+  // addressed to the MDS that mdsmap reports as holding the anchor table
+  messenger->send_message(req, 
+			  MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+			  MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
+}
+
+void AnchorClient::update(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  // Send an ANCHOR_OP_UPDATE for ino carrying the new 'trace'; onfinish is run by handle_anchor_reply() on the reply.
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_UPDATE, ino);
+  req->set_trace(trace);
+  // NOTE(review): an op already pending for this ino would have its callback overwritten here -- confirm callers serialize.
+  pending_op[ino] = onfinish;
+  // addressed to the MDS that mdsmap reports as holding the anchor table
+  messenger->send_message(req, 
+			  MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+			  MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
+}
+
+void AnchorClient::destroy(inodeno_t ino, Context *onfinish)
+{
+  // Send an ANCHOR_OP_DESTROY for ino (no trace is attached); onfinish is run by handle_anchor_reply() on the reply.
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_DESTROY, ino);
+  // parked in the same pending_op map that create() and update() use
+  pending_op[ino] = onfinish;
+  // addressed to the MDS that mdsmap reports as holding the anchor table
+  messenger->send_message(req, 
+			  MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+			  MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
+}
+
+
diff --git a/branches/sage/cephmds2/mds/AnchorClient.h b/branches/sage/cephmds2/mds/AnchorClient.h
new file mode 100644
index 0000000000000..80b736a4b65c7
--- /dev/null
+++ b/branches/sage/cephmds2/mds/AnchorClient.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __ANCHORCLIENT_H
+#define __ANCHORCLIENT_H
+
+#include <vector>
+using std::vector;
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+
+#include "include/types.h"
+#include "msg/Dispatcher.h"
+
+#include "Anchor.h"
+
+class Messenger;
+class MDSMap;
+class Context;
+
+class AnchorClient : public Dispatcher {
+  Messenger *messenger;   // used to send requests; not owned (only stored by the constructor)
+  MDSMap *mdsmap;         // consulted for which MDS holds the anchor table
+  // Requests in flight, keyed by ino.  Each map holds at most one entry per ino.
+  // remote state
+  hash_map<inodeno_t, Context*>  pending_op;                      // create/update/destroy callbacks
+  hash_map<inodeno_t, Context*>  pending_lookup_context;          // lookup callbacks
+  hash_map<inodeno_t, vector<Anchor*>*>  pending_lookup_trace;    // caller-owned vectors that receive lookup results
+
+  void handle_anchor_reply(class MAnchorReply *m);  
+
+
+public:
+  AnchorClient(Messenger *ms, MDSMap *mm) : messenger(ms), mdsmap(mm) {}
+  // All four calls return immediately; onfinish (may be null) is finished with 0 and deleted when the reply is handled.
+  // async user interface
+  void lookup(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+  void create(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+  void update(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+  void destroy(inodeno_t ino, Context *onfinish);
+  // Dispatcher entry point; accepts MSG_MDS_ANCHORREPLY only.
+  void dispatch(Message *m);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/AnchorTable.cc b/branches/sage/cephmds2/mds/AnchorTable.cc
new file mode 100644
index 0000000000000..7b881de0339da
--- /dev/null
+++ b/branches/sage/cephmds2/mds/AnchorTable.cc
@@ -0,0 +1,347 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "AnchorTable.h"
+#include "MDS.h"
+
+#include "osdc/Filer.h"
+
+#include "msg/Messenger.h"
+#include "messages/MAnchorRequest.h"
+#include "messages/MAnchorReply.h"
+
+#include "common/Clock.h"
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable "
+#define derr(x)  if (x <= g_conf.debug_mds) cerr << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable "
+
+AnchorTable::AnchorTable(MDS *mds)
+{
+  this->mds = mds;
+  opening = false;   // no load in progress
+  opened = false;    // table contents not available yet
+  // table_inode is what save()/load() hand to the filer; its ino is offset by this MDS's node id.
+  memset(&table_inode, 0, sizeof(table_inode));
+  table_inode.ino = MDS_INO_ANCHORTABLE+mds->get_nodeid();
+  table_inode.layout = g_OSD_FileLayout;
+}
+
+/*
+ * basic updates
+ */
+
+bool AnchorTable::add(inodeno_t ino, inodeno_t dirino, string& ref_dn) 
+{
+  // Insert a record for ino unless one already exists; returns true only when a new record was created.
+  dout(7) << "add " << std::hex << ino << " dirino " << dirino << std::dec << " ref_dn " << ref_dn << endl;
+
+  // the containing dir must be a system dirino (below 1000) or already have a record
+  assert(dirino < 1000 || anchor_map.count(dirino) != 0);
+
+  if (anchor_map.count(ino) != 0) {
+    // already present: the existing record is left untouched
+    dout(10) << "  add: had " << std::hex << ino << std::dec << endl;
+    return false;
+  }
+
+  anchor_map[ ino ] = new Anchor(ino, dirino, ref_dn);
+  dout(10) << "  add: added " << std::hex << ino << std::dec << endl;
+  return true;
+}
+
+void AnchorTable::inc(inodeno_t ino)
+{
+  dout(7) << "inc " << std::hex << ino << std::dec << endl;
+  // Add one reference to ino's record and to every ancestor record reachable through dirino.
+  assert(anchor_map.count(ino) != 0);
+  Anchor *anchor = anchor_map[ino];
+  assert(anchor);
+  // 'ino' is reused as the cursor while walking up the chain
+  while (1) {
+    anchor->nref++;
+      
+    dout(10) << "  inc: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl;
+    ino = anchor->dirino;
+    // stop at a zero dirino or at the first parent that has no record in the table
+    if (ino == 0) break;
+    if (anchor_map.count(ino) == 0) break;
+    anchor = anchor_map[ino];      
+    assert(anchor);
+  }
+}
+
+void AnchorTable::dec(inodeno_t ino) 
+{
+  dout(7) << "dec " << std::hex << ino << std::dec << endl;
+  // Drop one reference from ino's record and from every ancestor record; records reaching zero are erased and freed.
+  assert(anchor_map.count(ino) != 0);
+  Anchor *anchor = anchor_map[ino];
+  assert(anchor);
+  // 'ino' is reused as the cursor while walking up the chain
+  while (true) {
+    anchor->nref--;
+      
+    if (anchor->nref == 0) {
+      dout(10) << "  dec: record " << std::hex << ino << std::dec << " now 0, removing" << endl;
+      inodeno_t dirino = anchor->dirino;   // saved first: anchor is deleted just below
+      anchor_map.erase(ino);
+      delete anchor;
+      ino = dirino;
+    } else {
+      dout(10) << "  dec: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl;
+      ino = anchor->dirino;
+    }
+    // stop at a zero dirino or at the first parent that has no record in the table
+    if (ino == 0) break;
+    if (anchor_map.count(ino) == 0) break;
+    anchor = anchor_map[ino];      
+    assert(anchor);
+  }
+}
+
+
+/* 
+ * high level 
+ */
+
+void AnchorTable::lookup(inodeno_t ino, vector<Anchor*>& trace)
+{
+  // Prepend ino's chain of records to trace: topmost recorded ancestor first, ino's own record last.
+  dout(7) << "lookup " << std::hex << ino << std::dec << endl;
+  assert(anchor_map.count(ino) == 1);
+  Anchor *anchor = anchor_map[ino];
+  assert(anchor);
+
+  vector<Anchor*> chain;  // collected leaf-to-root, so no front-insert is needed per level
+  while (true) {
+    dout(10) << "  record " << std::hex << anchor->ino << " dirino " << anchor->dirino << std::dec << " ref_dn " << anchor->ref_dn << endl;
+    chain.push_back(anchor);
+    if (anchor->dirino < MDS_INO_BASE) break;   // dirino below MDS_INO_BASE: stop walking up
+    assert(anchor_map.count(anchor->dirino) == 1);
+    anchor = anchor_map[anchor->dirino];
+    assert(anchor);
+  }
+  trace.insert(trace.begin(), chain.rbegin(), chain.rend());  // one splice, reversed to root-first order
+}
+
+void AnchorTable::create(inodeno_t ino, vector<Anchor*>& trace)
+{
+  dout(7) << "create " << std::hex << ino << std::dec << endl;
+  // Ensure every record named in the trace exists, then take one reference on ino and its recorded ancestors.
+  // make sure trace is in table
+  for (unsigned i=0; i<trace.size(); i++) 
+    add(trace[i]->ino, trace[i]->dirino, trace[i]->ref_dn);
+  // inc() asserts that ino has a record, so the trace is presumably expected to include ino itself -- TODO confirm
+  inc(ino);  // ok!
+}
+
+void AnchorTable::destroy(inodeno_t ino)
+{
+  dec(ino);   // drop one reference along ino's chain; dec() erases records whose count reaches zero
+}
+
+
+
+/*
+ * messages 
+ */
+
+void AnchorTable::dispatch(Message *m)
+{
+  // The table serves exactly one message type, the anchor request;
+  // any other type trips the assert at the bottom.
+  if (m->get_type() == MSG_MDS_ANCHORREQUEST) {
+    handle_anchor_request((MAnchorRequest*)m);
+    return;
+  }
+
+  assert(0);
+}
+
+
+
+void AnchorTable::handle_anchor_request(class MAnchorRequest *m)
+{
+  // Serve one anchor request.  If the table is not loaded yet, park the message and (once) kick off the load.
+  if (!opened) {
+    dout(7) << "not open yet" << endl;
+    // queue a C_MDS_RetryMessage for m; presumably it re-dispatches m when waiting_for_open is finished -- TODO confirm
+    waiting_for_open.push_back(new C_MDS_RetryMessage(mds,m));
+    // only the first parked request starts the load; later ones just wait
+    if (!opening) {
+      opening = true;
+      load(0);   // no completion context of our own: the retry queued above is what resumes this request
+    }
+    return;
+  }
+
+  // table is open: build the reply and apply the requested op
+  MAnchorReply *reply = new MAnchorReply(m);
+  
+  switch (m->get_op()) {
+
+  case ANCHOR_OP_LOOKUP:
+    lookup( m->get_ino(), reply->get_trace() );   // result is written straight into the reply's trace
+    break;
+  // update = drop the references held via the old chain, then re-create from the new trace
+  case ANCHOR_OP_UPDATE:
+    destroy( m->get_ino() );
+    create( m->get_ino(), m->get_trace() );
+    break;
+
+  case ANCHOR_OP_CREATE:
+    create( m->get_ino(), m->get_trace() );
+    break;
+
+  case ANCHOR_OP_DESTROY:
+    destroy( m->get_ino() );
+    break;
+
+  default:
+    assert(0);
+  }
+
+  // reply to the sender's address and port, then free the request
+  mds->messenger->send_message(reply, m->get_source(), m->get_source_port());
+  delete m;
+}
+
+
+
+
+// primitive load/save for now!
+
+// load/save entire table for now!
+
+void AnchorTable::save(Context *onfinish)
+{
+  dout(7) << "save" << endl;
+  if (!opened) return;   // NOTE(review): onfinish is neither run nor freed on this path -- confirm callers expect that
+  // Layout written below: [size_t payload length][int record count][records via Anchor::_encode]
+  // Serialize the whole table into one buffer and write it through the filer, passing onfinish along.
+  bufferlist tabbl;
+
+  int num = anchor_map.size();
+  tabbl.append((char*)&num, sizeof(int));
+
+  for (hash_map<inodeno_t, Anchor*>::iterator it = anchor_map.begin();
+       it != anchor_map.end();
+       it++) {
+    dout(14) << " saving anchor for " << std::hex << it->first << std::dec << endl;
+    Anchor *a = it->second;
+    assert(a);
+    a->_encode(tabbl);
+  }
+  // prefix the payload with its byte length; the load path reads this header first to size its second read
+  bufferlist bl;
+  size_t size = tabbl.length();   // NOTE(review): size_t/int are written raw, so the stored format follows the host ABI
+  bl.append((char*)&size, sizeof(size));
+  bl.claim_append(tabbl);
+
+  dout(7) << " " << num << " anchors, " << size << " bytes" << endl;
+  
+  // write!
+  mds->filer->write(table_inode,
+                    0, bl.length(),
+                    bl, 0, 
+                    NULL, onfinish);
+}
+
+
+
+class C_AT_Load : public Context {   // second stage of the table load: receives the payload and hands it to load_2()
+  AnchorTable *at;
+public:
+  size_t size;     // payload length taken from the header read
+  bufferlist bl;   // read target handed to filer->read by C_AT_LoadSize::finish
+  C_AT_Load(size_t size, AnchorTable *at) {
+    this->size = size;
+    this->at = at;
+  }
+  void finish(int result) {
+    assert(result > 0);   // a non-positive result is treated as fatal here
+
+    at->load_2(size, bl);
+  }
+};
+
+class C_AT_LoadSize : public Context {
+  // First stage of AnchorTable::load(): receives the leading size_t header of the saved table.
+  AnchorTable *at;
+  MDS *mds;
+public:
+  bufferlist bl;   // read target handed to filer->read by AnchorTable::load()
+  C_AT_LoadSize(AnchorTable *at, MDS *mds) : at(at), mds(mds) {}
+  void finish(int r) {
+    // Decode the header only if the read succeeded and returned it, so a failed or short read reaches the fail branch.
+    size_t size = 0;
+    if (r > 0 && bl.length() >= sizeof(size))
+      bl.copy(0, sizeof(size), (char*)&size);
+    cout << "r is " << r << " size is " << size << endl;
+    if (r > 0 && size > 0) {
+      // second stage: read the 'size' payload bytes that follow the header
+      C_AT_Load *c = new C_AT_Load(size, at);
+      mds->filer->read(at->table_inode,
+                       sizeof(size), size,
+                       &c->bl,
+                       c);
+    } else {
+      // fail
+      bufferlist empty;
+      at->load_2(0, empty);
+    }
+  }
+};
+
+void AnchorTable::load(Context *onfinish)
+{
+  // Start the asynchronous two-stage read of the saved table; queued waiters are finished from load_2().
+  dout(7) << "load" << endl;
+  assert(!opened);
+  // onfinish may be null (handle_anchor_request calls load(0)); never queue a null waiter
+  if (onfinish) waiting_for_open.push_back(onfinish);
+
+  C_AT_LoadSize *c = new C_AT_LoadSize(this, mds);
+  mds->filer->read(table_inode,
+                   0, sizeof(size_t),
+                   &c->bl,
+                   c);
+}
+
+void AnchorTable::load_2(size_t size, bufferlist& bl)
+{
+  // Decode the saved table from bl (int record count, then the records), mark the table open, run the waiters.
+  // An empty or short bl (C_AT_LoadSize's fail branch passes one) gives an empty table; 'size' is not used.
+  int off = 0;
+  int num = 0;
+  if (bl.length() >= sizeof(num)) {
+    bl.copy(0, sizeof(num), (char*)&num);
+    off += sizeof(num);
+  }
+  for (int i=0; i<num; i++) {   // parse anchors
+    Anchor *a = new Anchor;
+    a->_decode(bl, off);
+    dout(10) << "load_2 decoded " << std::hex << a->ino << " dirino " << a->dirino << std::dec << " ref_dn " << a->ref_dn << endl;
+    anchor_map[a->ino] = a;
+  }
+
+  dout(7) << "load_2 got " << num << " anchors" << endl;
+  opened = true;
+  opening = false;
+
+  // finish
+  finish_contexts(waiting_for_open);
+}
+
diff --git a/branches/sage/cephmds2/mds/AnchorTable.h b/branches/sage/cephmds2/mds/AnchorTable.h
new file mode 100644
index 0000000000000..2e6c1d7b07788
--- /dev/null
+++ b/branches/sage/cephmds2/mds/AnchorTable.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __ANCHORTABLE_H
+#define __ANCHORTABLE_H
+
+#include "Anchor.h"
+#include "include/Context.h"
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+class MDS;
+
+
+class AnchorTable {
+  MDS *mds;
+  hash_map<inodeno_t, Anchor*>  anchor_map;   // ino -> anchor; entries are allocated in load_2()
+
+  bool opening, opened;                // opened is set by reset() and load_2(); load_2() clears opening
+  list<Context*> waiting_for_open;     // completions queued by load(), finished by load_2()
+
+ public:
+  inode_t table_inode;                 // inode handed to the filer reads in load()
+
+ public:
+  AnchorTable(MDS *mds); 
+
+ protected:
+  // residency checks: the table is loaded as a whole, so lookups never miss
+  bool have_ino(inodeno_t ino) { 
+    return true;                  // always in memory for now.
+  } 
+  void fetch_ino(inodeno_t ino, Context *onfinish) {
+    assert(!opened);
+    load(onfinish);
+  }
+
+  // adjust table
+  bool add(inodeno_t ino, inodeno_t dirino, string& ref_dn);
+  void inc(inodeno_t ino);
+  void dec(inodeno_t ino);
+
+  
+  // high level interface
+  void lookup(inodeno_t ino, vector<Anchor*>& trace);
+  void create(inodeno_t ino, vector<Anchor*>& trace);
+  void destroy(inodeno_t ino);
+
+  // messages
+ public:
+  void dispatch(class Message *m);
+ protected:
+  void handle_anchor_request(class MAnchorRequest *m);  
+
+
+ public:
+
+  // load/save entire table for now!
+  void reset() {
+    opened = true;                 // an emptied table counts as opened
+    anchor_map.clear();            // NOTE(review): drops the Anchor* entries without deleting them -- confirm ownership
+  }
+  void save(Context *onfinish);
+  void load(Context *onfinish);
+  void load_2(size_t size, bufferlist& bl);   // second stage of load(): decode bl and finish the waiters
+
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/CDentry.cc b/branches/sage/cephmds2/mds/CDentry.cc
new file mode 100644
index 0000000000000..2cfbbd80b58be
--- /dev/null
+++ b/branches/sage/cephmds2/mds/CDentry.cc
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "CDentry.h"
+#include "CInode.h"
+#include "CDir.h"
+
+#include <cassert>
+
+#undef dout
+#define dout(x) if ((x) <= g_conf.debug) cout << "mds.dentry "
+
+
+// CDentry
+
+ostream& operator<<(ostream& out, CDentry& dn)
+{
+  // header: name, then the pin count when pinned
+  out << "[dentry " << dn.get_name();
+  if (dn.is_pinned()) out << " " << dn.num_pins() << " pins";
+  // dentry type
+  if (dn.is_null())   out << " NULL";
+  if (dn.is_remote()) out << " REMOTE";
+  // lock and dirty state, always in this order
+  const int ls = dn.get_lockstate();
+  if (ls == DN_LOCK_UNPINNING) out << " unpinning";
+  if (dn.is_dirty())           out << " dirty";
+  if (ls == DN_LOCK_PREXLOCK)  out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set();
+  if (ls == DN_LOCK_XLOCK)     out << " xlock=" << dn.get_xlockedby();
+  // placement: dir version, inode pointer, own address, containing dir
+  out << " dirv=" << dn.get_parent_dir_version()
+      << " inode=" << dn.get_inode()
+      << " " << &dn
+      << " in " << *dn.get_dir();
+  return out << "]";
+}
+
+CDentry::CDentry(const CDentry& m) {
+  assert(0); // copy construction is not implemented; trap any attempt
+}
+
+
+void CDentry::mark_dirty() 
+{
+  dout(10) << " mark_dirty " << *this << endl;
+
+  // the containing dir becomes dirty too (if it wasn't already)
+  dir->mark_dirty();
+
+  // a primary dentry going clean -> dirty takes a DNDIRTY pin on its inode
+  const bool pin_inode = is_primary() && !dirty && inode;
+  if (pin_inode) inode->get(CINODE_PIN_DNDIRTY);
+
+  // we now belong to the dir's (possibly just bumped) version
+  parent_dir_version = dir->get_version();
+  dirty = true;
+}
+void CDentry::mark_clean() {
+  dout(10) << " mark_clean " << *this << endl;
+  assert(dirty);
+  assert(parent_dir_version <= dir->get_version());
+  // report the offending dentry on stderr before the assert below fires
+  if (parent_dir_version < dir->get_last_committed_version())
+    cerr << " bad mark_clean " << *this << endl;    
+
+  assert(parent_dir_version >= dir->get_last_committed_version());
+  // release the DNDIRTY pin that mark_dirty() took for a primary dentry
+  if (is_primary() && dirty && inode) inode->put(CINODE_PIN_DNDIRTY);
+  dirty = false;
+}    
+
+
+void CDentry::make_path(string& s)
+{
+  // ancestors first (recursively), then "/" followed by our own name
+  const bool has_parent_dn = dir->inode->get_parent_dn() != 0;
+  if (has_parent_dn) dir->inode->get_parent_dn()->make_path(s);
+  s += "/";
+  s += name;
+}
+
+
+void CDentry::link_remote(CInode *in)
+{
+  assert(is_remote());               // only a remote dentry is linked this way
+  assert(in->ino() == remote_ino);   // and only to the inode it names
+  // attach in both directions: our inode pointer and the inode's remote-parent registration
+  inode = in;
+  in->add_remote_parent(this);
+}
+
+void CDentry::unlink_remote()
+{
+  assert(is_remote());
+  assert(inode);                     // must currently be attached via link_remote()
+  // undo link_remote(); remote_ino is left untouched
+  inode->remove_remote_parent(this);
+  inode = 0;
+}
+
+
+
+
+
+// =
+const CDentry& CDentry::operator= (const CDentry& right) {
+  assert(0); // copy assignment is not implemented; trap any attempt
+  return *this;
+}
+
+  // comparisons: dentries are compared and ordered by name alone
+  bool CDentry::operator== (const CDentry& right) const {
+    return name == right.name;
+  }
+  bool CDentry::operator!= (const CDentry& right) const {
+    return !(*this == right);   // must be the exact negation of operator==
+  }
+  bool CDentry::operator< (const CDentry& right) const {
+    return name < right.name;
+  }
+  bool CDentry::operator> (const CDentry& right) const {
+    return name > right.name;
+  }
+  bool CDentry::operator>= (const CDentry& right) const {
+    return name >= right.name;
+  }
+  bool CDentry::operator<= (const CDentry& right) const {
+    return name <= right.name;
+  }
diff --git a/branches/sage/cephmds2/mds/CDentry.h b/branches/sage/cephmds2/mds/CDentry.h
new file mode 100644
index 0000000000000..a399ef7acfe5a
--- /dev/null
+++ b/branches/sage/cephmds2/mds/CDentry.h
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __CDENTRY_H
+#define __CDENTRY_H
+
+#include <assert.h>
+#include <string>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+
+class CInode;
+class CDir;
+
+#define DN_LOCK_SYNC      0
+#define DN_LOCK_PREXLOCK  1
+#define DN_LOCK_XLOCK     2
+#define DN_LOCK_UNPINNING 3  // waiting for pins to go away
+
+#define DN_XLOCK_FOREIGN  ((Message*)0x1)  // not 0, not a valid pointer.
+
+class Message;
+
+// dentry
+class CDentry {
+ protected:
+  string          name;            // key under which CDir stores this dentry
+  CInode         *inode;           // linked inode, 0 if none attached
+  CDir           *dir;             // containing dir; assigned by CDir::add_dentry()
+
+  inodeno_t       remote_ino;      // if remote dentry
+
+  // state
+  bool            dirty;
+  version_t       parent_dir_version;  // dir version when last touched.
+
+  // locking
+  int            lockstate;        // one of the DN_LOCK_* values
+  Message        *xlockedby;
+  set<int>       gather_set;
+  
+  int            npins;            // kept equal to pinset.size() (asserted in pin()/unpin())
+  multiset<Message*> pinset;       // one entry per outstanding pin() call
+
+  friend class Migrator;
+  friend class Locker;
+  friend class Renamer;
+  friend class Server;
+  friend class MDCache;
+  friend class MDS;
+  friend class CInode;
+  friend class C_MDC_XlockRequest;
+
+ public:
+  // cons: all three start clean, unpinned and in DN_LOCK_SYNC with dir == 0
+  CDentry() :
+    inode(0),
+    dir(0),
+    remote_ino(0),
+    dirty(0),
+    parent_dir_version(0),
+    lockstate(DN_LOCK_SYNC),
+    xlockedby(0),
+    npins(0) { }
+  CDentry(const string& n, inodeno_t ino, CInode *in=0) :
+    name(n),
+    inode(in),
+    dir(0),
+    remote_ino(ino),
+    dirty(0),
+    parent_dir_version(0),
+    lockstate(DN_LOCK_SYNC),
+    xlockedby(0),
+    npins(0) { }
+  CDentry(const string& n, CInode *in) :
+    name(n),
+    inode(in),
+    dir(0),
+    remote_ino(0),
+    dirty(0),
+    parent_dir_version(0),
+    lockstate(DN_LOCK_SYNC),
+    xlockedby(0),
+    npins(0) { }
+
+  CInode *get_inode() { return inode; }
+  CDir *get_dir() { return dir; }
+  const string& get_name() { return name; }
+  inodeno_t get_remote_ino() { return remote_ino; }
+
+  void set_remote_ino(inodeno_t ino) { remote_ino = ino; }
+
+  // dentry type is primary || remote || null
+  // inode ptr is required for primary, optional for remote, undefined for null
+  bool is_primary() { return remote_ino == 0 && inode != 0; }
+  bool is_remote() { return remote_ino > 0; }
+  bool is_null() { return (remote_ino == 0 && inode == 0) ? true:false; }
+
+  // remote links
+  void link_remote(CInode *in);
+  void unlink_remote();
+  
+
+  // copy cons (both are defined to assert(0) in CDentry.cc)
+  CDentry(const CDentry& m);
+  const CDentry& operator= (const CDentry& right);
+
+  // comparisons
+  bool operator== (const CDentry& right) const;
+  bool operator!= (const CDentry& right) const;
+  bool operator< (const CDentry& right) const;
+  bool operator> (const CDentry& right) const;
+  bool operator>= (const CDentry& right) const;
+  bool operator<= (const CDentry& right) const;
+
+  // misc
+  void make_path(string& p);
+
+  // -- state
+  __uint64_t get_parent_dir_version() { return parent_dir_version; }
+  void float_parent_dir_version(__uint64_t ge) {   // raise the version to at least ge; never lowers it
+    if (parent_dir_version < ge)
+      parent_dir_version = ge;
+  }
+  
+  bool is_dirty() { return dirty; }
+  bool is_clean() { return !dirty; }
+
+  void mark_dirty();
+  void mark_clean();
+
+
+  // -- locking
+  int get_lockstate() { return lockstate; }
+  set<int>& get_gather_set() { return gather_set; }
+
+  bool is_sync() { return lockstate == DN_LOCK_SYNC; }
+  bool can_read()  { return (lockstate == DN_LOCK_SYNC) || (lockstate == DN_LOCK_UNPINNING);  }
+  bool can_read(Message *m) { return is_xlockedbyme(m) || can_read(); }
+  bool is_xlocked() { return lockstate == DN_LOCK_XLOCK; }
+  Message* get_xlockedby() { return xlockedby; } 
+  bool is_xlockedbyother(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m != xlockedby; }
+  bool is_xlockedbyme(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m == xlockedby; }
+  bool is_prexlockbyother(Message *m) {
+    return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby;
+  }
+  
+  // pins
+  void pin(Message *m) { 
+    npins++; 
+    pinset.insert(m);
+    assert(pinset.size() == (unsigned)npins);
+  }
+  void unpin(Message *m) { 
+    npins--; 
+    assert(npins >= 0); 
+    assert(pinset.count(m) > 0);
+    pinset.erase(pinset.find(m));   // erase one instance only; erase(m) would drop every pin held by m
+    assert(pinset.size() == (unsigned)npins);
+  }
+  bool is_pinnable(Message *m) {    // while UNPINNING only messages that already hold a pin qualify
+    return (lockstate == DN_LOCK_SYNC) ||
+      (lockstate == DN_LOCK_UNPINNING && pinset.count(m)); 
+  }
+  bool is_pinned() { return npins>0; }
+  int num_pins() { return npins; }
+
+  friend class CDir;
+};
+
+ostream& operator<<(ostream& out, CDentry& dn);
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/CDir.cc b/branches/sage/cephmds2/mds/CDir.cc
new file mode 100644
index 0000000000000..a590e6821e1de
--- /dev/null
+++ b/branches/sage/cephmds2/mds/CDir.cc
@@ -0,0 +1,914 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "CDir.h"
+#include "CDentry.h"
+#include "CInode.h"
+
+#include "MDS.h"
+#include "MDSMap.h"
+
+#include "include/Context.h"
+#include "common/Clock.h"
+
+#include <cassert>
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") "
+
+
+// PINS
+int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
+
+static const char* cdir_pin_names[CDIR_NUM_PINS] = {   // log names, indexed by the pin id given to CDir::get()/put(); const because these are string literals
+  "child",
+  "opened",
+  "waiter",
+  "import",
+  "export",
+  "freeze",
+  "proxy",
+  "authpin",
+  "imping",
+  "impex",
+  "hashed",
+  "hashing",
+  "dirty",
+  "reqpins"
+};
+
+
+ostream& operator<<(ostream& out, CDir& dir)
+{
+  // identity: ino plus the full path of the dir's inode
+  string path;
+  dir.get_inode()->make_path(path);
+  out << "[dir " << dir.ino() << " " << path << "/";
+
+  // one word per state flag that is set
+  if (dir.is_dirty())  out << " dirty";
+  if (dir.is_import()) out << " import";
+  if (dir.is_export()) out << " export";
+  if (dir.is_rep())    out << " repl";
+  if (dir.is_hashed()) out << " hashed";
+
+  // authority: "auth[+openers]" when auth, else "rep@<auth>[.<nonce>]"
+  if (!dir.is_auth()) {
+    out << " rep@" << dir.authority();
+    if (dir.get_replica_nonce() > 1) out << "." << dir.get_replica_nonce();
+  } else {
+    out << " auth";
+    if (dir.is_open_by_anyone()) out << "+" << dir.get_open_by();
+  }
+
+  // pins: by name when the id has one, numerically otherwise
+  if (dir.is_pinned()) {
+    out << " |";
+    set<int>::iterator p = dir.get_ref_set().begin();
+    while (p != dir.get_ref_set().end()) {
+      if (*p >= CDIR_NUM_PINS) out << " " << *p;
+      else                     out << " " << cdir_pin_names[*p];
+      p++;
+    }
+  }
+
+  if (dir.get_dir_auth() != CDIR_AUTH_PARENT)
+    out << " dir_auth=" << dir.get_dir_auth();
+  // counters, versions, and finally the object's address
+  out << " state=" << dir.get_state()
+      << " sz=" << dir.get_nitems() << "+" << dir.get_nnull();
+  out << " v=" << dir.get_version()
+      << " cv=" << dir.get_committing_version()
+      << " lastcv=" << dir.get_last_committed_version();
+  return out << " " << &dir << "]";
+}
+
+
+// -------------------------------------------------------------------
+// CDir
+
+CDir::CDir(CInode *in, MDS *mds, bool auth)
+{
+  // a CDir only ever hangs off a directory inode
+  assert(in->is_dir());
+  inode = in;
+  this->mds = mds;
+  // contents and state: empty, INITIAL, plus AUTH when the caller says so
+  nitems = 0;
+  nnull = 0;
+  state = CDIR_STATE_INITIAL;
+  if (auth) state |= CDIR_STATE_AUTH;
+  /*
+  if (in->dir_is_hashed()) {
+    assert(0);                      // when does this happen?  
+    state |= CDIR_STATE_HASHED;
+  }
+  */
+
+  // versions: nothing modified or committed yet
+  version = 0;
+  committing_version = 0;
+  last_committed_version = 0;
+
+  // references and pins
+  ref = 0;
+  auth_pins = 0;
+  nested_auth_pins = 0;
+  request_pins = 0;
+
+  // authority follows the parent by default; no replication policy
+  dir_auth = CDIR_AUTH_PARENT;
+  dir_rep = CDIR_REP_NONE;
+}
+
+
+
+
+/***
+ * linking fun
+ */
+
+CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) 
+{
+  // remote ("foreign") link: the new dentry records only the target ino.
+  // the name must be new to this dir.
+  assert(lookup(dname) == 0);
+
+  CDentry* fresh = new CDentry(dname, ino);
+  fresh->dir = this;
+  fresh->parent_dir_version = version;
+
+  assert(items.count(dname) == 0);
+  assert(null_items.count(dname) == 0);
+
+  // it counts as a real (non-null) item
+  items[dname] = fresh;
+  ++nitems;
+
+  dout(12) << "add_dentry " << *fresh << endl;
+
+  // the first entry of any kind takes the CHILD pin
+  if (nnull + nitems == 1) get(CDIR_PIN_CHILD);
+
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());         
+  return fresh;
+}
+
+
+CDentry* CDir::add_dentry( const string& dname, CInode *in ) 
+{
+  // primary link when 'in' is given, null dentry when it is 0.
+  // either way the name must be new to this dir.
+  assert(lookup(dname) == 0);
+
+  CDentry* fresh = new CDentry(dname, in);
+  fresh->dir = this;
+  fresh->parent_dir_version = version;
+
+  assert(items.count(dname) == 0);
+  assert(null_items.count(dname) == 0);
+  items[dname] = fresh;
+
+  if (!in) {
+    // no inode: track it on the null list
+    assert(fresh->inode == 0);
+    null_items[dname] = fresh;
+    ++nnull;
+  } else {
+    // wire up the inode (this also bumps nitems)
+    link_inode_work( fresh, in );
+  }
+
+  dout(12) << "add_dentry " << *fresh << endl;
+  // the first entry of any kind takes the CHILD pin
+  if (nnull + nitems == 1)
+    get(CDIR_PIN_CHILD);
+
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());         
+  return fresh;
+}
+
+
+
+void CDir::remove_dentry(CDentry *dn) 
+{
+  dout(12) << "remove_dentry " << *dn << endl;
+  // Unlink dn from this dir, delete it, and drop the CHILD pin once the dir is empty.
+  if (!dn->is_null()) {
+    // primary, or remote with or without an inode attached: detach and drop nitems.
+    // (a remote dentry can have inode == 0 and still not be on the null list.)
+    unlink_inode_work(dn);
+  } else {
+    // remove from null list
+    assert(null_items.count(dn->name) == 1);
+    null_items.erase(dn->name);
+    nnull--;
+  }
+  
+  // remove from list
+  assert(items.count(dn->name) == 1);
+  items.erase(dn->name);
+  delete dn;
+
+  // unpin?
+  if (nnull + nitems == 0) put(CDIR_PIN_CHILD);
+
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());         
+}
+
+void CDir::link_inode( CDentry *dn, inodeno_t ino)
+{
+  //dout(12) << "link_inode " << *dn << " remote " << ino << endl;
+  // Turn the null dentry dn into a remote link to ino; it now counts as an item.
+  assert(dn->is_null());
+  dn->set_remote_ino(ino);
+  nitems++;
+  // ...and it leaves the null list
+  assert(null_items.count(dn->name) == 1);
+  null_items.erase(dn->name);
+  nnull--;
+}
+
+void CDir::link_inode( CDentry *dn, CInode *in )
+{
+  assert(!dn->is_remote());
+  // Attach 'in' as the primary inode of dn, which must currently be on the null list.
+  link_inode_work(dn,in);
+  //dout(12) << "link_inode " << *dn << " " << *in << endl;
+  
+  // remove from null list
+  assert(null_items.count(dn->name) == 1);
+  null_items.erase(dn->name);
+  nnull--;
+  // bookkeeping must still agree with both maps
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());         
+}
+
+void CDir::link_inode_work( CDentry *dn, CInode *in )
+{
+  dn->inode = in;
+  in->set_primary_parent(dn);
+  // Shared primary-link step: callers take care of the null list themselves.
+  nitems++;  // adjust dir size
+  
+  // set dir version
+  in->parent_dir_version = get_version();
+  
+  // clear dangling
+  in->state_clear(CINODE_STATE_DANGLING);
+
+  // dn dirty?  then the inode needs the DNDIRTY pin a dirty primary dentry holds
+  if (dn->is_dirty()) in->get(CINODE_PIN_DNDIRTY);
+
+  // adjust auth pin count: the inode's pins now count as nested pins of this dir and its ancestors
+  if (in->auth_pins + in->nested_auth_pins)
+    adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins );
+}
+
+void CDir::unlink_inode( CDentry *dn )
+{
+  // Detach dn's link and park dn on the null list.  A remote dentry may have no inode attached, so print it only if present.
+  if (dn->inode) { dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl; }
+  else           { dout(12) << "unlink_inode " << *dn << endl; }
+  unlink_inode_work(dn);
+
+  // add to null list
+  assert(null_items.count(dn->name) == 0);
+  null_items[dn->name] = dn;
+  nnull++;
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());         
+}
+
+void CDir::unlink_inode_work( CDentry *dn )
+{
+  CInode *in = dn->inode;
+  // Shared unlink step: detaches whatever dn links to and drops nitems; callers fix up the null list.
+  if (dn->is_remote()) {
+    // remote: the inode pointer is optional, so only unhook it when present
+    if (in) 
+      dn->unlink_remote();
+
+    dn->set_remote_ino(0);
+  } else {
+    // primary
+    assert(dn->is_primary());
+ 
+    // explicitly define auth (captured while the inode is still linked)
+    in->dangling_auth = in->authority();
+    //dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl;
+    
+    // unlink auth_pin count: take the inode's pins back out of this dir and its ancestors
+    if (in->auth_pins + in->nested_auth_pins)
+      adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) );
+    
+    // set dangling flag
+    in->state_set(CINODE_STATE_DANGLING);
+
+    // dn dirty?  then release the DNDIRTY pin held on the inode
+    if (dn->is_dirty()) in->put(CINODE_PIN_DNDIRTY);
+    
+    // detach inode
+    in->remove_primary_parent(dn);
+    dn->inode = 0;
+  }
+
+  nitems--;   // adjust dir size
+}
+
+void CDir::remove_null_dentries() {
+  dout(12) << "remove_null_dentries " << *this << endl;
+
+  // snapshot first: remove_dentry() edits null_items, so don't iterate it while removing
+  list<CDentry*> doomed;
+  CDir_map_t::iterator src = null_items.begin();
+  while (src != null_items.end()) {
+    doomed.push_back(src->second);
+    ++src;
+  }
+  // every null dentry must be in the SYNC lock state when it is removed
+  while (!doomed.empty()) {
+    CDentry *victim = doomed.front();
+    doomed.pop_front();
+    assert(victim->is_sync());
+    remove_dentry(victim);
+  }
+  assert(null_items.empty());         
+  assert(nnull == 0);
+  assert(nnull + nitems == items.size());
+}
+
+
+
+/****************************************
+ * WAITING
+ */
+
+bool CDir::waiting_for(int tag)
+{
+  return waiting.find(tag) != waiting.end();   // any dir-wide waiter registered under exactly this tag?
+}
+
+bool CDir::waiting_for(int tag, const string& dn)
+{
+  // test for the dentry first so operator[] below never creates an entry
+  const bool known = waiting_on_dentry.count(dn) != 0;
+  return known && waiting_on_dentry[dn].count(tag) > 0;
+}
+
+void CDir::add_waiter(int tag, const string& dentry, Context *c) {
+  // Queue c on one dentry of this dir.
+  // the first waiter of any kind (dir-wide or per-dentry) takes the WAITER pin
+  const bool first_waiter = waiting.empty() && waiting_on_dentry.size() == 0;
+  if (first_waiter) get(CDIR_PIN_WAITER);
+  multimap<int,Context*>& slot = waiting_on_dentry[ dentry ];
+  slot.insert(pair<int,Context*>(tag,c));
+  dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl;
+}
+
+void CDir::add_waiter(int tag, Context *c) {
+  // ATFREEZEROOT waiters go on the dir that is itself freezing/frozen.
+  // if that is not us, hand the waiter to our parent dir instead.
+  const bool hierarchical = (tag & CDIR_WAIT_ATFREEZEROOT) && (is_freezing() || is_frozen());
+  if (hierarchical) {
+    const bool its_us =
+      is_freezing_tree_root() || is_frozen_tree_root() ||
+      is_freezing_dir() || is_frozen_dir();
+    if (!its_us) {
+      dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << endl;
+      inode->parent->dir->add_waiter(tag, c);
+      return;
+    }
+  }
+  // this dir: the first waiter of any kind takes the WAITER pin
+  if (waiting.empty() && waiting_on_dentry.size() == 0)
+    get(CDIR_PIN_WAITER);
+  waiting.insert(pair<int,Context*>(tag,c));
+  dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl;
+}
+
+
+void CDir::take_waiting(int mask, 
+                        const string& dentry,
+                        list<Context*>& ls,
+                        int num)
+{
+  if (waiting_on_dentry.empty()) return;
+  // Move this dentry's waiters whose tag intersects mask onto ls; a nonzero num caps how many are taken.
+  multimap<int,Context*>::iterator it = waiting_on_dentry[dentry].begin();
+  while (it != waiting_on_dentry[dentry].end()) {
+    if (it->first & mask) {
+      ls.push_back(it->second);
+      dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl;
+      waiting_on_dentry[dentry].erase(it++);   // step past the element before it is erased
+
+      if (num) {
+        if (num == 1) break;
+        num--;
+      }
+    } else {
+      dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl;
+      it++;
+    }
+  }
+
+  // did we clear dentry?  (this also removes the empty entry operator[] creates for an unknown dentry)
+  if (waiting_on_dentry[dentry].empty())
+    waiting_on_dentry.erase(dentry);
+  
+  // ...whole map?  then nothing is waiting any more and the WAITER pin goes
+  if (waiting_on_dentry.size() == 0 && waiting.empty())
+    put(CDIR_PIN_WAITER);
+}
+
+/* NOTE: this checks dentry waiters too */
+void CDir::take_waiting(int mask,
+                        list<Context*>& ls)
+{
+  if (waiting_on_dentry.size()) {
+    // try each dentry.  copy the key and step the iterator before the call:
+    // the per-dentry take_waiting() may erase the entry the key lives in.
+    hash_map<string, multimap<int,Context*> >::iterator it = waiting_on_dentry.begin(); 
+    while (it != waiting_on_dentry.end()) {
+      string dname = (it++)->first;
+      take_waiting(mask, dname, ls);
+    }
+  }
+  // waiting (dir-wide): take every waiter whose tag intersects mask
+  if (!waiting.empty()) {
+    multimap<int,Context*>::iterator it = waiting.begin();
+    while (it != waiting.end()) {
+      if (it->first & mask) {
+        ls.push_back(it->second);
+        dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl;
+        waiting.erase(it++);
+      } else {
+        dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this<< endl;
+        it++;
+      }
+    }
+    // nothing left waiting at all?  release the WAITER pin
+    if (waiting_on_dentry.size() == 0 && waiting.empty())
+      put(CDIR_PIN_WAITER);
+  }
+}
+
+
+void CDir::finish_waiting(int mask, int result) 
+{
+  dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl;
+  // collect first, complete afterwards
+  list<Context*> ready;
+  take_waiting(mask, ready);
+  finish_contexts(ready, result);
+}
+
+void CDir::finish_waiting(int mask, const string& dn, int result) 
+{
+  dout(11) << "finish_waiting mask " << mask << " dn " << dn << " result " << result << " on " << *this << endl;
+  // collect this dentry's matching waiters first, complete them afterwards
+  list<Context*> ready;
+  take_waiting(mask, dn, ready);
+  finish_contexts(ready, result);
+}
+
+
+// dirty/clean
+
+void CDir::mark_dirty()
+{
+  const bool was_dirty = state_test(CDIR_STATE_DIRTY);
+  const bool mid_commit = was_dirty && state_test(CDIR_STATE_COMMITTING) && committing_version == version;
+  if (!was_dirty) {   // clean -> dirty: new version, flag, DIRTY pin
+    ++version;
+    state_set(CDIR_STATE_DIRTY);
+    dout(10) << "mark_dirty (was clean) " << *this << " new version " << version << endl;
+    get(CDIR_PIN_DIRTY);
+  } else if (mid_commit) {
+    ++version;  // now dirtier than committing version!
+    dout(10) << "mark_dirty (committing) " << *this << " new version " << version << "/" << committing_version <<  endl;
+  } else {
+    dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << endl;
+  }
+}
+
+void CDir::mark_clean()
+{
+  dout(10) << "mark_clean " << *this << " version " << version << endl;
+  if (!state_test(CDIR_STATE_DIRTY)) return;   // already clean: nothing to undo
+  // drop the flag and the pin that mark_dirty() took with it
+  state_clear(CDIR_STATE_DIRTY);
+  put(CDIR_PIN_DIRTY);
+}
+
+
+
+// ref counts
+
+void CDir::put(int by) {
+  cdir_pins[by]--;   // file-level tally per pin type
+  // NOTE(review): 'by' indexes cdir_pins/cdir_pin_names unchecked, while operator<< guards *it < CDIR_NUM_PINS -- confirm callers never pass larger ids.
+  // bad?  log the state before the asserts fire
+  if (ref == 0 || ref_set.count(by) != 1) {
+    dout(7) << *this << " bad put by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+    assert(ref_set.count(by) == 1);
+    assert(ref > 0);
+  }
+
+  ref--;
+  ref_set.erase(by);
+
+  // inode: releasing our last pin also releases the DIR pin on our inode
+  if (ref == 0)
+    inode->put(CINODE_PIN_DIR);
+
+  dout(7) << *this << " put by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+}
+
+void CDir::get(int by) {
+  cdir_pins[by]++;   // file-level tally per pin type
+  // NOTE(review): 'by' indexes cdir_pins/cdir_pin_names unchecked -- confirm callers only pass ids below CDIR_NUM_PINS.
+  // inode: our first pin also takes the DIR pin on our inode
+  if (ref == 0)
+    inode->get(CINODE_PIN_DIR);
+
+  // bad?  each pin type may be held at most once (ref_set is a set)
+  if (ref_set.count(by)) {
+    dout(7) << *this << " bad get by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+    assert(ref_set.count(by) == 0);
+  }
+  
+  ref++;
+  ref_set.insert(by);
+  
+  dout(7) << *this << " get by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+}
+
+
+
+/********************************
+ * AUTHORITY
+ */
+
+/*
+ * simple rule: if dir_auth isn't explicit, auth is the same as the inode.
+ */
+int CDir::authority() 
+{
+  // an explicit (non-negative) dir_auth wins; otherwise we follow our inode
+  const int explicit_auth = get_dir_auth();
+  if (explicit_auth >= 0) return explicit_auth;
+
+  /*
+  CDir *parent = inode->get_parent_dir();
+  if (parent)
+    return parent->authority();
+  
+  // root, or dangling
+  assert(inode->is_root());  // no dirs under danglers!?
+  //assert(inode->is_root() || inode->is_dangling());  
+  */
+  return inode->authority();
+}
+
+int CDir::dentry_authority(const string& dn )
+{
+  // 1. mid-hashing: if the name hashes to a node already in hashed_subset,
+  //    that node is the authority
+  if (is_hashing() && !hashed_subset.empty()) {
+    const int candidate = mds->hash_dentry( inode->ino(), dn );
+    if (hashed_subset.count(candidate) != 0)
+      return candidate;
+  }
+
+  // 2. fully hashed: the name's hash decides
+  if (is_hashed())
+    return mds->hash_dentry( inode->ino(), dn );
+
+  // 3. not hashed: an explicit dir_auth covers the whole dir,
+  //    otherwise the authority is the same as my inode's
+  //dout(15) << "dir_auth = parent at " << *this << endl;
+  //dout(15) << "dir_auth explicit " << dir_auth << " at " << *this << endl;
+  const int whole_dir = get_dir_auth();
+  if (whole_dir != CDIR_AUTH_PARENT)
+    return whole_dir;
+  return inode->authority();
+}
+
+void CDir::set_dir_auth(int d) 
+{ 
+  dout(10) << "setting dir_auth=" << d << " from " << dir_auth << " on " << *this << endl;   // logged before the change, so *this still shows the old value
+  dir_auth = d; 
+}
+
+
+/*****************************************
+ * AUTH PINS
+ */
+
+void CDir::auth_pin() {
+  if (auth_pins == 0)
+    get(CDIR_PIN_AUTHPIN);   // the first auth pin also takes the AUTHPIN reference
+  auth_pins++;
+
+  dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+  // propagate upward: our inode, then every ancestor dir, counts this as a nested pin
+  inode->nested_auth_pins++;
+  if (inode->parent)
+    inode->parent->dir->adjust_nested_auth_pins( 1 );
+}
+
+void CDir::auth_unpin() {
+  auth_pins--;
+  if (auth_pins == 0)
+    put(CDIR_PIN_AUTHPIN);   // the last auth pin releases the AUTHPIN reference
+
+  dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+  assert(auth_pins >= 0);
+  
+  // pending freeze?  NOTE(review): waiters run before the ancestors' nested counts are lowered below -- confirm the order is intended
+  if (auth_pins + nested_auth_pins == 0)
+    on_freezeable();
+  
+  inode->nested_auth_pins--;
+  if (inode->parent)
+    inode->parent->dir->adjust_nested_auth_pins( -1 );
+}
+
+void CDir::adjust_nested_auth_pins(int inc) 
+{
+  CDir *dir = this;
+  // Apply inc to the nested pin count of this dir, its inode, and every ancestor dir/inode up to the top.
+  while (1) {
+    // dir
+    dir->nested_auth_pins += inc;
+    
+    dout(10) << "adjust_nested_auth_pins on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl;
+    assert(dir->nested_auth_pins >= 0);
+    
+    // pending freeze?  a dir with no pins left wakes its FREEZEABLE waiters
+    if (dir->auth_pins + dir->nested_auth_pins == 0) 
+      dir->on_freezeable();
+    
+    // its inode
+    dir->inode->nested_auth_pins += inc;
+    
+    if (dir->inode->parent)
+      dir = dir->inode->parent->dir;
+    else
+      break;
+  }
+}
+
+
+
+/*****************************************************************************
+ * FREEZING
+ */
+
+void CDir::on_freezeable()
+{
+  // called when auth_pins + nested_auth_pins reaches 0: wake everything waiting on FREEZEABLE
+
+  /* NOTE: the first of these will likely freeze the dir, and unmark
+     FREEZING.  additional ones will re-flag FREEZING.  this isn't
+     particularly graceful, and might cause problems if the first one
+     needs to know about other waiters.... FIXME? */
+  
+  finish_waiting(CDIR_WAIT_FREEZEABLE);
+}
+
+// FREEZE TREE
+
+// FREEZEABLE waiter used by freeze_tree(): re-enters freeze_tree_finish()
+// with the caller's completion.
+class C_MDS_FreezeTree : public Context {
+  CDir *dir;      // dir being frozen
+  Context *con;   // caller's completion, passed through untouched
+public:
+  C_MDS_FreezeTree(CDir *dir, Context *c) : dir(dir), con(c) {}
+  virtual void finish(int r) {
+    // r is not used here
+    dir->freeze_tree_finish(con);
+  }
+};
+
+void CDir::freeze_tree(Context *c)
+{
+  // Freeze the tree rooted here: at once if freezeable, otherwise when the
+  // FREEZEABLE waiters fire.  c (may be 0) is finished and deleted once frozen.
+  assert(!is_frozen());
+  assert(!is_freezing());
+  
+  if (is_freezeable()) {
+    dout(10) << "freeze_tree " << *this << endl;
+    state_set(CDIR_STATE_FROZENTREE);
+    inode->auth_pin();  // auth_pin for duration of freeze
+    // easy, we're frozen.  c is optional, exactly as in freeze_tree_finish().
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+  } else {
+    state_set(CDIR_STATE_FREEZINGTREE);
+    dout(10) << "freeze_tree + wait " << *this << endl;
+    // need to wait for auth pins to expire
+    add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c));
+  } 
+}
+
+void CDir::freeze_tree_finish(Context *c)
+{
+  // freezeable now?  (run from a FREEZEABLE waiter, so pins may have come back since)
+  if (!is_freezeable()) {
+    // wait again!
+    dout(10) << "freeze_tree_finish still waiting " << *this << endl;
+    state_set(CDIR_STATE_FREEZINGTREE);
+    add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c));
+    return;
+  }
+
+  dout(10) << "freeze_tree_finish " << *this << endl;
+  state_set(CDIR_STATE_FROZENTREE);
+  state_clear(CDIR_STATE_FREEZINGTREE);   // actually, this may get set again by next context?
+
+  inode->auth_pin();  // auth_pin for duration of freeze
+  
+  // continue to frozen land (c is optional)
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+}
+
+void CDir::unfreeze_tree()
+{
+  dout(10) << "unfreeze_tree " << *this << endl;
+  state_clear(CDIR_STATE_FROZENTREE);
+  // release the inode auth_pin taken when the freeze completed
+  // unpin  (may => FREEZEABLE)   FIXME: is this order good?
+  inode->auth_unpin();
+
+  // waiters?  wake everyone blocked on UNFREEZE
+  finish_waiting(CDIR_WAIT_UNFREEZE);
+}
+
+bool CDir::is_freezing_tree()
+{
+  // climb toward the root looking for the root of a freezing tree;
+  // the walk gives up (false) at an import or hashed dir.
+  CDir *cur = this;
+  for (;;) {
+    if (cur->is_freezing_tree_root()) return true;
+    if (cur->is_import() || cur->is_hashed()) return false;
+    if (!cur->inode->parent)
+      return false; // root on replica
+    cur = cur->inode->parent->dir;
+  }
+}
+
+bool CDir::is_frozen_tree()
+{
+  // climb toward the root looking for the root of a frozen tree;
+  // the walk gives up (false) at an import or hashed dir.
+  CDir *cur = this;
+  for (;;) {
+    if (cur->is_frozen_tree_root()) return true;
+    if (cur->is_import() || cur->is_hashed()) return false;
+    if (!cur->inode->parent)
+      return false;  // root on replica
+    cur = cur->inode->parent->dir;
+  }
+}
+
+
+
+// FREEZE DIR
+
+// FREEZEABLE waiter used by freeze_dir(): re-enters freeze_dir_finish()
+// with the caller's completion.
+class C_MDS_FreezeDir : public Context {
+  CDir *dir;      // dir being frozen
+  Context *con;   // caller's completion, passed through untouched
+public:
+  C_MDS_FreezeDir(CDir *dir, Context *c) : dir(dir), con(c) {}
+  virtual void finish(int r) {
+    // r is not used here
+    dir->freeze_dir_finish(con);
+  }
+};
+
+void CDir::freeze_dir(Context *c)
+{
+  // Freeze just this dir: at once if freezeable, otherwise when the
+  // FREEZEABLE waiters fire.  c (may be 0) is finished and deleted once frozen.
+  assert(!is_frozen());
+  assert(!is_freezing());
+  
+  if (is_freezeable_dir()) {
+    dout(10) << "freeze_dir " << *this << endl;
+    state_set(CDIR_STATE_FROZENDIR);
+    inode->auth_pin();  // auth_pin for duration of freeze
+    // easy, we're frozen.  c is optional, exactly as in freeze_dir_finish().
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+  } else {
+    state_set(CDIR_STATE_FREEZINGDIR);
+    dout(10) << "freeze_dir + wait " << *this << endl;
+    // need to wait for auth pins to expire
+    add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));
+  } 
+}
+
+void CDir::freeze_dir_finish(Context *c)
+{
+  // freezeable now?  (run from a FREEZEABLE waiter, so pins may have come back since)
+  if (!is_freezeable_dir()) {
+    // wait again!
+    dout(10) << "freeze_dir_finish still waiting " << *this << endl;
+    state_set(CDIR_STATE_FREEZINGDIR);
+    add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));
+    return;
+  }
+
+  dout(10) << "freeze_dir_finish " << *this << endl;
+  state_set(CDIR_STATE_FROZENDIR);
+  state_clear(CDIR_STATE_FREEZINGDIR);   // actually, this may get set again by next context?
+  
+  inode->auth_pin();  // auth_pin for duration of freeze
+  
+  // continue to frozen land (c is optional)
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+}
+
+void CDir::unfreeze_dir()
+{
+  dout(10) << "unfreeze_dir " << *this << endl;
+  state_clear(CDIR_STATE_FROZENDIR);
+  // release the inode auth_pin taken when the freeze completed
+  // unpin  (may => FREEZEABLE)   FIXME: is this order good?
+  inode->auth_unpin();
+
+  // waiters?  wake everyone blocked on UNFREEZE
+  finish_waiting(CDIR_WAIT_UNFREEZE);
+}
+
+
+
+
+
+
+
+
+
+// -----------------------------------------------------------------
+// debug shite
+
+
+void CDir::dump(int depth) {
+  // Debug dump: log this dir, then every dentry (recursing one level deeper
+  // into attached inodes), indenting each level with one tab per depth.
+  string ind(depth, '\t');
+
+  dout(10) << "dump:" << ind << *this << endl;
+
+  map<string,CDentry*>::iterator iter = items.begin();
+  while (iter != items.end()) {
+    CDentry* d = iter->second;
+    if (d->inode) {
+      dout(10) << "dump: " << ind << *d << " = " << *d->inode << endl;
+      d->inode->dump(depth+1);
+    } else {
+      dout(10) << "dump: " << ind << *d << " = [null]" << endl;
+    }
+    iter++;
+  }
+
+  if (!(state_test(CDIR_STATE_COMPLETE)))
+    dout(10) << ind << "..." << endl;
+  if (state_test(CDIR_STATE_DIRTY))
+    dout(10) << ind << "[dirty]" << endl;
+
+}
+
diff --git a/branches/sage/cephmds2/mds/CDir.h b/branches/sage/cephmds2/mds/CDir.h
new file mode 100644
index 0000000000000..a1e857a72f9f9
--- /dev/null
+++ b/branches/sage/cephmds2/mds/CDir.h
@@ -0,0 +1,706 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __CDIR_H
+#define __CDIR_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include "config.h"
+#include "common/DecayCounter.h"
+
+#include <iostream>
+#include <cassert>
+
+#include <list>
+#include <set>
+#include <map>
+#include <string>
+using namespace std;
+
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+
+
+#include "CInode.h"
+
+class CDentry;
+class MDS;
+class MDCluster;
+class Context;
+
+
+// directory authority types
+//  >= 0 is the auth mds
+#define CDIR_AUTH_PARENT   -1   // default
+
+
+#define CDIR_NONCE_EXPORT   1
+
+
+// state bits
+#define CDIR_STATE_AUTH          (1<<0)   // auth for this dir (hashing doesn't count)
+#define CDIR_STATE_PROXY         (1<<1)   // proxy auth
+
+#define CDIR_STATE_COMPLETE      (1<<2)   // the complete contents are in cache
+#define CDIR_STATE_DIRTY         (1<<3)   // has been modified since last commit
+
+#define CDIR_STATE_FROZENTREE    (1<<4)   // root of tree (bounded by exports)
+#define CDIR_STATE_FREEZINGTREE  (1<<5)   // in process of freezing 
+#define CDIR_STATE_FROZENDIR     (1<<6)
+#define CDIR_STATE_FREEZINGDIR   (1<<7)
+
+#define CDIR_STATE_COMMITTING    (1<<8)   // mid-commit
+#define CDIR_STATE_FETCHING      (1<<9)   // currently fetching
+
+#define CDIR_STATE_DELETED       (1<<10)
+
+#define CDIR_STATE_IMPORT           (1<<11)   // flag set if this is an import.
+#define CDIR_STATE_EXPORT           (1<<12)
+#define CDIR_STATE_IMPORTINGEXPORT  (1<<13)
+
+#define CDIR_STATE_HASHED           (1<<14)   // if hashed
+#define CDIR_STATE_HASHING          (1<<15)
+#define CDIR_STATE_UNHASHING        (1<<16)
+
+
+
+
+
+// these state bits are preserved by an import/export
+// ...except if the directory is hashed, in which case none of them are!
+#define CDIR_MASK_STATE_EXPORTED    (CDIR_STATE_COMPLETE\
+                                    |CDIR_STATE_DIRTY)  
+#define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\
+                                    |CDIR_STATE_EXPORT\
+                                    |CDIR_STATE_IMPORTINGEXPORT)
+#define CDIR_MASK_STATE_EXPORT_KEPT (CDIR_STATE_HASHED\
+                                    |CDIR_STATE_FROZENTREE\
+                                    |CDIR_STATE_FROZENDIR\
+                                    |CDIR_STATE_EXPORT\
+                                     |CDIR_STATE_PROXY)
+
+// common states
+#define CDIR_STATE_CLEAN   0
+#define CDIR_STATE_INITIAL 0  
+
+// directory replication
+#define CDIR_REP_ALL       1
+#define CDIR_REP_NONE      0
+#define CDIR_REP_LIST      2
+
+
+
+// pins
+
+#define CDIR_PIN_CHILD     0
+#define CDIR_PIN_OPENED    1  // open by another node
+#define CDIR_PIN_WAITER    2  // waiter(s)
+
+#define CDIR_PIN_IMPORT    3
+#define CDIR_PIN_EXPORT    4
+#define CDIR_PIN_FREEZE    5
+#define CDIR_PIN_PROXY     6  // auth just changed.
+
+#define CDIR_PIN_AUTHPIN   7
+
+#define CDIR_PIN_IMPORTING 8
+#define CDIR_PIN_IMPORTINGEXPORT 9
+
+#define CDIR_PIN_HASHED    10
+#define CDIR_PIN_HASHING   11
+#define CDIR_PIN_DIRTY     12
+
+#define CDIR_PIN_REQUEST   13
+
+#define CDIR_NUM_PINS      14
+
+
+
+// wait reasons
+#define CDIR_WAIT_DENTRY         1  // wait for item to be in cache
+     // waiters: path_traverse
+     // trigger: handle_discover, fetch_dir_2
+#define CDIR_WAIT_COMPLETE       2  // wait for complete dir contents
+     // waiters: fetch_dir, commit_dir
+     // trigger: fetch_dir_2
+#define CDIR_WAIT_FREEZEABLE     4  // hard_pins removed
+     // waiters: freeze, freeze_finish
+     // trigger: auth_unpin, adjust_nested_auth_pins
+#define CDIR_WAIT_UNFREEZE       8  // unfreeze
+     // waiters: path_traverse, handle_discover, handle_inode_update,
+     //           export_dir_frozen                                   (mdcache)
+     //          handle_client_readdir                                (mds)
+     // trigger: unfreeze
+#define CDIR_WAIT_AUTHPINNABLE  CDIR_WAIT_UNFREEZE
+    // waiters: commit_dir                                           (mdstore)
+    // trigger: (see CDIR_WAIT_UNFREEZE)
+#define CDIR_WAIT_COMMITTED     32  // did commit (who uses this?**)
+    // waiters: commit_dir (if already committing)
+    // trigger: commit_dir_2
+#define CDIR_WAIT_IMPORTED      64  // import finish
+    // waiters: import_dir_block
+    // triggers: handle_export_dir_finish
+
+#define CDIR_WAIT_EXPORTWARNING 8192    // on bystander.
+    // waiters: handle_export_dir_notify
+    // triggers: handle_export_dir_warning
+#define CDIR_WAIT_EXPORTPREPACK 16384
+    // waiter   export_dir
+    // trigger  handle_export_dir_prep_ack
+
+#define CDIR_WAIT_HASHED        (1<<17)  // hash finish
+#define CDIR_WAIT_THISHASHEDREADDIR (1<<18)  // current readdir lock
+#define CDIR_WAIT_NEXTHASHEDREADDIR (1<<19)  // after current readdir lock finishes
+
+#define CDIR_WAIT_DNREAD        (1<<20)
+#define CDIR_WAIT_DNLOCK        (1<<21)
+#define CDIR_WAIT_DNUNPINNED    (1<<22)
+#define CDIR_WAIT_DNPINNABLE    (CDIR_WAIT_DNREAD|CDIR_WAIT_DNUNPINNED)
+
+#define CDIR_WAIT_DNREQXLOCK    (1<<23)
+
+#define CDIR_WAIT_ANY   (0xffffffff)
+
+#define CDIR_WAIT_ATFREEZEROOT  (CDIR_WAIT_AUTHPINNABLE|\
+                                 CDIR_WAIT_UNFREEZE)      // hmm, same same
+
+
+ostream& operator<<(ostream& out, class CDir& dir);
+
+
+// CDir
+typedef map<string, CDentry*> CDir_map_t;
+
+
+extern int cdir_pins[CDIR_NUM_PINS];
+
+
+class CDir {
+ public:
+  CInode          *inode;
+
+ protected:
+  // contents
+  CDir_map_t       items;              // non-null AND null
+  CDir_map_t       null_items;        // null and foreign
+  size_t           nitems;             // non-null
+  size_t           nnull;              // null
+  //size_t           nauthitems;
+  //size_t           namesize;
+
+  // state
+  unsigned         state;
+  version_t       version;
+  version_t       committing_version;
+  version_t       last_committed_version;
+
+  // authority, replicas
+  set<int>         open_by;        // nodes that have me open
+  map<int,int>     open_by_nonce;
+  int              replica_nonce;
+  int              dir_auth;       
+
+  // reference counting/pins
+  int              ref;       // reference count
+  set<int>         ref_set;
+
+  // lock nesting, freeze
+  int        auth_pins;
+  int        nested_auth_pins;
+  int        request_pins;
+
+  // hashed dirs
+  set<int>   hashed_subset;  // HASHING: subset of mds's that are hashed
+ public:
+  // for class MDS
+  map<int, pair< list<class InodeStat*>, list<string> > > hashed_readdir;
+ protected:
+
+  // context
+  MDS              *mds;
+
+
+  // waiters
+  multimap<int, Context*> waiting;  // tag -> context
+  hash_map< string, multimap<int, Context*> >
+                          waiting_on_dentry;
+
+  // cache control  (defined for authority; hints for replicas)
+  int              dir_rep;
+  set<int>         dir_rep_by;      // if dir_rep == CDIR_REP_LIST
+
+  // popularity
+  meta_load_t popularity[MDS_NPOP];
+
+  // friends
+  friend class Migrator;
+  friend class CInode;
+  friend class MDCache;
+  friend class MDiscover;
+  friend class MDBalancer;
+
+  friend class CDirDiscover;
+  friend class CDirExport;
+
+ public:
+  CDir(CInode *in, MDS *mds, bool auth);
+
+
+
+  // -- accessors --
+  inodeno_t ino()        { return inode->ino(); }
+  CInode *get_inode()    { return inode; }
+  CDir *get_parent_dir() { return inode->get_parent_dir(); }
+
+  CDir_map_t::iterator begin() { return items.begin(); }
+  CDir_map_t::iterator end() { return items.end(); }
+  size_t get_size() { 
+    
+    //if ( is_auth() && !is_hashed()) assert(nauthitems == nitems);
+    //if (!is_auth() && !is_hashed()) assert(nauthitems == 0);
+    
+    return nitems; 
+  }
+  size_t get_nitems() { return nitems; }
+  size_t get_nnull() { return nnull; }
+  /*
+  size_t get_auth_size() { 
+    assert(nauthitems <= nitems);
+    return nauthitems; 
+  }
+  */
+
+  /*
+  float get_popularity() {
+    return popularity[0].get();
+  }
+  */
+  
+
+  // -- dentries and inodes --
+ public:
+  CDentry* lookup(const string& n) {
+    map<string,CDentry*>::iterator iter = items.find(n);
+    if (iter == items.end()) 
+      return 0;
+    else
+      return iter->second;
+  }
+
+  CDentry* add_dentry( const string& dname, CInode *in=0 );
+  CDentry* add_dentry( const string& dname, inodeno_t ino );
+  void remove_dentry( CDentry *dn );         // delete dentry
+  void link_inode( CDentry *dn, inodeno_t ino );
+  void link_inode( CDentry *dn, CInode *in );
+  void unlink_inode( CDentry *dn );
+ private:
+  void link_inode_work( CDentry *dn, CInode *in );
+  void unlink_inode_work( CDentry *dn );
+
+  void remove_null_dentries();  // on empty, clean dir
+
+  // -- authority --
+ public:
+  int authority();
+  int dentry_authority(const string& d);
+  int get_dir_auth() { return dir_auth; }
+  void set_dir_auth(int d);
+
+  bool is_open_by_anyone() { return !open_by.empty(); }
+  bool is_open_by(int mds) { return open_by.count(mds); }
+  int get_open_by_nonce(int mds) {     // precondition: a nonce is recorded for mds
+    assert(open_by_nonce.count(mds));  // otherwise find() yields end() and ->second is undefined
+    return open_by_nonce.find(mds)->second;
+  }
+  set<int>::iterator open_by_begin() { return open_by.begin(); }
+  set<int>::iterator open_by_end() { return open_by.end(); }
+  set<int>& get_open_by() { return open_by; }
+
+  int get_replica_nonce() { assert(!is_auth()); return replica_nonce; }
+  
+  int open_by_add(int mds) {
+    int nonce = 1;            // nonce given to an mds that did not have us open yet
+    
+    if (is_open_by(mds)) {    // already had it? then re-issue with its old nonce + 1
+      nonce = get_open_by_nonce(mds) + 1; // new nonce (+1)
+      dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl;
+      open_by_nonce.erase(mds);           // map::insert below never overwrites, so drop the old entry first
+    } else {
+      if (open_by.empty()) 
+        get(CDIR_PIN_OPENED);             // first opener takes the OPENED pin
+      open_by.insert(mds);
+    }
+    open_by_nonce.insert(pair<int,int>(mds,nonce));   // records 1 for a new opener, old+1 for a repeat
+    return nonce;   // the nonce just recorded for mds
+  }
+  void open_by_remove(int mds) {
+    //if (!is_open_by(mds)) return;
+    assert(is_open_by(mds));
+
+    open_by.erase(mds);
+    open_by_nonce.erase(mds);
+    if (open_by.empty())
+      put(CDIR_PIN_OPENED);      
+  }
+  void open_by_clear() {
+    if (!open_by.empty())
+      put(CDIR_PIN_OPENED);
+    open_by.clear();
+    open_by_nonce.clear();
+  }
+
+  
+
+  // for giving to clients
+  void get_dist_spec(set<int>& ls, int auth) {
+    if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > g_conf.mds_bal_replicate_threshold)) {
+      //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
+      ls = open_by;
+      if (!ls.empty()) ls.insert(auth);
+    }
+  }
+
+
+  // -- state --
+  unsigned get_state() { return state; }
+  void reset_state(unsigned s) { 
+    state = s; 
+    dout(10) << " cdir:" << *this << " state reset" << endl;
+  }
+  void state_clear(unsigned mask) {    
+    state &= ~mask; 
+    dout(10) << " cdir:" << *this << " state -" << mask << " = " << state << endl;
+  }
+  void state_set(unsigned mask) { 
+    state |= mask; 
+    dout(10) << " cdir:" << *this << " state +" << mask << " = " << state << endl;
+  }
+  unsigned state_test(unsigned mask) { return state & mask; }
+
+  bool is_complete() { return state & CDIR_STATE_COMPLETE; }
+  bool is_dirty() { return state_test(CDIR_STATE_DIRTY); }
+
+  bool is_auth() { return state & CDIR_STATE_AUTH; }
+  bool is_proxy() { return state & CDIR_STATE_PROXY; }
+  bool is_import() { return state & CDIR_STATE_IMPORT; }
+  bool is_export() { return state & CDIR_STATE_EXPORT; }
+
+  bool is_hashed() { return state & CDIR_STATE_HASHED; }
+  bool is_hashing() { return state & CDIR_STATE_HASHING; }
+  bool is_unhashing() { return state & CDIR_STATE_UNHASHING; }
+
+  bool is_rep() { 
+    if (dir_rep == CDIR_REP_NONE) return false;
+    return true;
+  }
+ 
+
+
+  // -- dirtyness --
+  version_t get_version() { return version; }
+  void float_version(version_t ge) {
+    if (version < ge)
+      version = ge;
+  }
+  void set_version(version_t v) { version = v; }
+
+  version_t get_committing_version() { return committing_version; }
+  version_t get_last_committed_version() { return last_committed_version; }
+  // as in, we're committing the current version.
+  void set_committing_version() { committing_version = version; }
+  void set_last_committed_version(version_t v) { last_committed_version = v; }
+  void mark_dirty();
+  void mark_clean();
+  void mark_complete() { state_set(CDIR_STATE_COMPLETE); }
+  bool is_clean() { return !state_test(CDIR_STATE_DIRTY); }
+
+
+
+
+  // -- reference counting --
+  void put(int by);
+  void get(int by);
+  bool is_pinned_by(int by) {
+    return ref_set.count(by);
+  }
+  bool is_pinned() { return ref > 0; }
+  int get_ref() { return ref; }
+  set<int>& get_ref_set() { return ref_set; }
+  void request_pin_get() {
+    if (request_pins == 0) get(CDIR_PIN_REQUEST);
+    request_pins++;
+  }
+  void request_pin_put() {
+    request_pins--;
+    if (request_pins == 0) put(CDIR_PIN_REQUEST);
+  }
+
+    
+  // -- waiters --
+  bool waiting_for(int tag);
+  bool waiting_for(int tag, const string& dn);
+  void add_waiter(int tag, Context *c);
+  void add_waiter(int tag,
+                  const string& dentry,
+                  Context *c);
+  void take_waiting(int mask, list<Context*>& ls);  // includes dentry waiters
+  void take_waiting(int mask, 
+                    const string& dentry, 
+                    list<Context*>& ls,
+                    int num=0);  
+  void finish_waiting(int mask, int result = 0);    // ditto
+  void finish_waiting(int mask, const string& dn, int result = 0);    // ditto
+
+
+  // -- auth pins --
+  bool can_auth_pin() { return !(is_frozen() || is_freezing()); }
+  int is_auth_pinned() { return auth_pins; }
+  void auth_pin();
+  void auth_unpin();
+  void adjust_nested_auth_pins(int inc);
+  void on_freezeable();
+
+  // -- freezing --
+  void freeze_tree(Context *c);
+  void freeze_tree_finish(Context *c);
+  void unfreeze_tree();
+
+  void freeze_dir(Context *c);
+  void freeze_dir_finish(Context *c);
+  void unfreeze_dir();
+
+  bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); }
+  bool is_freezing_tree();
+  bool is_freezing_tree_root() { return state & CDIR_STATE_FREEZINGTREE; }
+  bool is_freezing_dir() { return state & CDIR_STATE_FREEZINGDIR; }
+
+  bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); }
+  bool is_frozen_tree();
+  bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; }
+  bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; }
+  
+  bool is_freezeable() {
+    if (auth_pins == 0 && nested_auth_pins == 0) return true;
+    return false;
+  }
+  bool is_freezeable_dir() {
+    if (auth_pins == 0) return true;
+    return false;
+  }
+
+
+
+  // debuggin bs
+  void dump(int d = 0);
+};
+
+
+
+// -- encoded state --
+
+// discover
+
+class CDirDiscover {
+  inodeno_t ino;
+  int       nonce;
+  int       dir_auth;
+  int       dir_rep;
+  set<int>  rep_by;
+
+ public:
+  CDirDiscover() {}
+  CDirDiscover(CDir *dir, int nonce) {
+    ino = dir->ino();
+    this->nonce = nonce;
+    dir_auth = dir->dir_auth;
+    dir_rep = dir->dir_rep;
+    rep_by = dir->dir_rep_by;
+  }
+
+  void update_dir(CDir *dir) {
+    assert(dir->ino() == ino);
+    assert(!dir->is_auth());
+
+    dir->replica_nonce = nonce;
+    dir->dir_auth = dir_auth;
+    dir->dir_rep = dir_rep;
+    dir->dir_rep_by = rep_by;
+  }
+
+  inodeno_t get_ino() { return ino; }
+
+  
+  void _encode(bufferlist& bl) {
+    bl.append((char*)&ino, sizeof(ino));
+    bl.append((char*)&nonce, sizeof(nonce));
+    bl.append((char*)&dir_auth, sizeof(dir_auth));
+    bl.append((char*)&dir_rep, sizeof(dir_rep));
+    ::_encode(rep_by, bl);
+  }
+
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    bl.copy(off, sizeof(nonce), (char*)&nonce);
+    off += sizeof(nonce);
+    bl.copy(off, sizeof(dir_auth), (char*)&dir_auth);
+    off += sizeof(dir_auth);
+    bl.copy(off, sizeof(dir_rep), (char*)&dir_rep);
+    off += sizeof(dir_rep);
+    ::_decode(rep_by, bl, off);
+  }
+
+};
+
+
+// export
+
+typedef struct {
+  inodeno_t      ino;
+  __uint64_t     nitems; // actual real entries
+  __uint64_t     nden;   // num dentries (including null ones)
+  version_t     version;
+  unsigned       state;
+  meta_load_t   popularity_justme;
+  meta_load_t   popularity_curdom;
+  int            dir_auth;
+  int            dir_rep;
+  int            nopen_by;
+  int            nrep_by;
+  // ints follow
+} CDirExport_st;
+
+class CDirExport {
+  CDirExport_st st;
+  set<int>     open_by;
+  map<int,int> open_by_nonce;
+  set<int>     rep_by;
+
+ public:
+  CDirExport() {}
+  CDirExport(CDir *dir) {
+    memset(&st, 0, sizeof(st));
+
+    st.ino = dir->ino();
+    st.nitems = dir->nitems;
+    st.nden = dir->items.size();
+    st.version = dir->version;
+    st.state = dir->state;
+    st.dir_auth = dir->dir_auth;
+    st.dir_rep = dir->dir_rep;
+
+    st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] );
+    st.popularity_curdom.take( dir->popularity[MDS_POP_CURDOM] );
+    dir->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom;
+    dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom;
+
+    rep_by = dir->dir_rep_by;
+    open_by = dir->open_by;
+    open_by_nonce = dir->open_by_nonce;
+  }
+
+  inodeno_t get_ino() { return st.ino; }
+  __uint64_t get_nden() { return st.nden; }
+
+  void update_dir(CDir *dir) {
+    assert(dir->ino() == st.ino);
+
+    //dir->nitems = st.nitems;
+    dir->version = st.version;
+    if (dir->state & CDIR_STATE_HASHED) 
+      dir->state |= CDIR_STATE_AUTH;         // just inherit auth flag when hashed
+    else
+      dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) |   // remember import flag, etc.
+        (st.state & CDIR_MASK_STATE_EXPORTED);
+    dir->dir_auth = st.dir_auth;
+    dir->dir_rep = st.dir_rep;
+
+    dir->popularity[MDS_POP_JUSTME] += st.popularity_justme;
+    dir->popularity[MDS_POP_CURDOM] += st.popularity_curdom;
+    dir->popularity[MDS_POP_ANYDOM] += st.popularity_curdom;
+    dir->popularity[MDS_POP_NESTED] += st.popularity_curdom;
+
+    dir->replica_nonce = 0;  // no longer defined
+
+    if (!dir->open_by.empty())
+      dout(0) << "open_by not empty non import, " << *dir << ", " << dir->open_by << endl;
+
+    dir->dir_rep_by = rep_by;
+    dir->open_by = open_by;
+    dout(12) << "open_by in export is " << open_by << ", dir now " << dir->open_by << endl;
+    dir->open_by_nonce = open_by_nonce;
+    if (!open_by.empty())
+      dir->get(CDIR_PIN_OPENED);
+    if (dir->is_dirty())
+      dir->get(CDIR_PIN_DIRTY);
+  }
+
+
+  void _encode(bufferlist& bl) {
+    st.nrep_by = rep_by.size();
+    st.nopen_by = open_by_nonce.size();
+    bl.append((char*)&st, sizeof(st));
+    
+    // open_by
+    for (map<int,int>::iterator it = open_by_nonce.begin();
+         it != open_by_nonce.end();
+         it++) {
+      int m = it->first;
+      bl.append((char*)&m, sizeof(int));
+      int n = it->second;
+      bl.append((char*)&n, sizeof(int));
+    }
+
+    // rep_by
+    for (set<int>::iterator it = rep_by.begin();
+         it != rep_by.end();
+         it++) {
+      int m = *it;
+      bl.append((char*)&m, sizeof(int));
+    }
+  }
+
+  int _decode(bufferlist& bl, int off = 0) {
+    bl.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+
+    // open_by
+    for (int i=0; i<st.nopen_by; i++) {
+      int m,n;
+      bl.copy(off, sizeof(int), (char*)&m);
+      off += sizeof(int);
+      bl.copy(off, sizeof(int), (char*)&n);
+      off += sizeof(int);
+      open_by.insert(m);
+      open_by_nonce.insert(pair<int,int>(m,n));
+    }
+    
+    // rep_by
+    for (int i=0; i<st.nrep_by; i++) {
+      int m;
+      bl.copy(off, sizeof(int), (char*)&m);
+      off += sizeof(int);
+      rep_by.insert(m);
+    }
+
+    return off;
+  }
+
+};
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/CInode.cc b/branches/sage/cephmds2/mds/CInode.cc
new file mode 100644
index 0000000000000..1c24434e6baac
--- /dev/null
+++ b/branches/sage/cephmds2/mds/CInode.cc
@@ -0,0 +1,495 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "AnchorTable.h"
+
+#include "common/Clock.h"
+
+#include <string>
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.inode(" << inode.ino << ") "
+
+
+int cinode_pins[CINODE_NUM_PINS];  // counts
+
+
+ostream& operator<<(ostream& out, CInode& in)
+{
+  string path;
+  in.make_path(path);
+  out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" ");
+  if (in.is_auth()) {
+    out << "auth";
+    if (in.is_cached_by_anyone()) {
+      //out << "+" << in.get_cached_by();
+      for (set<int>::iterator it = in.cached_by_begin();
+           it != in.cached_by_end();
+           it++) {
+        out << "+" << *it << "." << in.get_cached_by_nonce(*it);
+      }
+    }
+  } else {
+    out << "rep@" << in.authority();
+    //if (in.get_replica_nonce() > 1)
+      out << "." << in.get_replica_nonce();
+    assert(in.get_replica_nonce() >= 0);
+  }
+
+  if (in.is_symlink()) out << " symlink";
+
+  out << " v" << in.get_version();
+
+  out << " hard=" << in.hardlock;
+  out << " file=" << in.filelock;
+
+  if (in.is_pinned()) {
+    out << " |";
+    for(set<int>::iterator it = in.get_ref_set().begin();
+        it != in.get_ref_set().end();
+        it++)
+      if (*it < CINODE_NUM_PINS)
+        out << " " << cinode_pin_names[*it];
+      else
+        out << " " << *it;
+  }
+
+  // hack: spit out crap on which clients have caps
+  if (!in.get_client_caps().empty()) {
+    out << " caps={";
+    for (map<int,Capability>::iterator it = in.get_client_caps().begin();
+         it != in.get_client_caps().end();
+         it++) {
+      if (it != in.get_client_caps().begin()) out << ",";
+      out << it->first;
+    }
+    out << "}";
+  }
+  out << " " << &in;
+  out << "]";
+  return out;
+}
+
+
+// ====== CInode =======
+CInode::CInode(MDCache *c, bool auth) : LRUObject() {
+  mdcache = c;    // owning cache; the dout() macro in this file reads the mds id through it
+
+  ref = 0;        // no pins held yet
+  
+  parent = NULL;  // no parent set yet
+  
+  dir = NULL;     // CDir opened separately
+
+  auth_pins = 0;
+  nested_auth_pins = 0;
+  num_request_pins = 0;
+
+  state = 0;  
+
+  committing_version = committed_version = 0;
+
+  if (auth) state_set(CINODE_STATE_AUTH);   // caller decides whether this copy is authoritative
+}
+
+CInode::~CInode() {   // frees the CDir hanging off this inode, if one was opened
+  if (dir) { delete dir; dir = 0; }
+}
+
+CDir *CInode::get_parent_dir()
+{
+  // the dir our parent lives in, or NULL when we have no parent
+  if (!parent) return NULL;
+  return parent->dir;
+}
+CInode *CInode::get_parent_inode() 
+{
+  // no parent => NULL; otherwise hop parent -> its dir -> that dir's inode
+  if (!parent) return NULL;
+  return parent->dir->inode;
+}
+
+bool CInode::dir_is_auth() {
+  // With an open CDir, its auth flag decides; with none open, fall back
+  // to this inode's own auth flag.
+  if (!dir) return is_auth();
+  return dir->is_auth();
+}
+
+CDir *CInode::get_or_open_dir(MDS *mds)
+{
+  // Return the CDir for this (directory) inode, creating it as auth if needed.
+  assert(is_dir());
+  if (dir) return dir;   // already open
+
+  // can't open a dir if we're frozen_dir, bc of hashing stuff.
+  assert(!is_frozen_dir());
+
+  // only auth can open dir alone.
+  assert(is_auth());
+  set_dir( new CDir(this, mds, true) );
+  dir->dir_auth = CDIR_AUTH_PARENT;   // default authority: the named constant for -1
+  return dir;
+}
+
+CDir *CInode::set_dir(CDir *newdir)
+{
+  // attach newdir as our CDir; we must not already have one
+  assert(!dir);
+  return (dir = newdir);
+}
+
+void CInode::set_auth(bool a) 
+{
+  if (!is_dangling() && !is_root() && 
+      is_auth() != a) {
+    /* disabled nauthitems bookkeeping -- this branch is currently empty:
+    CDir *dir = get_parent_dir();
+    if (is_auth() && !a) 
+      dir->nauthitems--;
+    else
+      dir->nauthitems++;
+    */
+  }
+  // set or clear the AUTH state bit so that it matches a
+  if (a) state_set(CINODE_STATE_AUTH);
+  else state_clear(CINODE_STATE_AUTH);
+}
+
+
+
+void CInode::make_path(string& s)
+{
+  // Fill s with a printable path for this inode.
+  if (parent) {
+    parent->make_path(s);       // has a parent: delegate to it
+    return;
+  }
+  if (is_root())
+    s = "";                     // the root gets an empty path
+  else
+    s = "(dangling)";           // no parent and not the root
+}
+
+void CInode::make_anchor_trace(vector<Anchor*>& trace)
+{
+  if (parent) {
+    parent->dir->inode->make_anchor_trace(trace);
+    // the containing inode has already appended its part; now add our own heap-allocated Anchor
+    dout(7) << "make_anchor_trace adding " << ino() << " dirino " << parent->dir->inode->ino() << " dn " << parent->name << endl;
+    trace.push_back( new Anchor(ino(), 
+                                parent->dir->inode->ino(),
+                                parent->name) );
+  }
+  else if (state_test(CINODE_STATE_DANGLING)) {
+    dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << dangling_auth << endl;
+    string ref_dn;   // left empty: passed as the Anchor's third argument
+    trace.push_back( new Anchor(ino(),
+                                MDS_INO_INODEFILE_OFFSET+dangling_auth,
+                                ref_dn) );
+  }
+  else 
+    assert(is_root());   // no parent and not dangling: must be the root, which appends nothing
+}
+
+
+
+
+// Bump this inode's version, flag it dirty (pinning it once), and dirty the
+// parent directory, recording which dir version we now belong to.
+void CInode::mark_dirty() {
+  dout(10) << "mark_dirty " << *this << endl;
+
+  if (!parent) {
+    dout(10) << " dangling, not marking dirty!" << endl;
+    return;
+  }
+
+  /*
+    NOTE: I may already be dirty, but this fn _still_ needs to be called so that
+    the directory is (perhaps newly) dirtied, and so that parent_dir_version is 
+    updated below.
+  */
+  
+  // only auth can get dirty.  "dirty" async data in replicas is relative to (say) filelock state, not dirty flag.
+  assert(is_auth());
+
+  // touch my private version
+  inode.version++;
+  if (!(state & CINODE_STATE_DIRTY)) {
+    state |= CINODE_STATE_DIRTY;
+    get(CINODE_PIN_DIRTY);      // take the DIRTY pin only on the clean->dirty transition
+  }
+  
+  // relative to parent dir.  parent cannot be NULL here (we returned early
+  // above if it was), so no second check is needed.
+  // dir is now dirty (if it wasn't already)
+  parent->dir->mark_dirty();
+
+  // i now live in that (potentially newly dirty) version
+  parent_dir_version = parent->dir->get_version();
+}
+
+void CInode::mark_clean()
+{
+  dout(10) << " mark_clean " << *this << endl;
+  if (!(state & CINODE_STATE_DIRTY))
+    return;                        // already clean: no DIRTY pin to drop
+  state &= ~CINODE_STATE_DIRTY;
+  put(CINODE_PIN_DIRTY);
+}
+
+// state 
+
+
+
+
+
+// new state encoders
+
+void CInode::encode_file_state(bufferlist& bl) 
+{  // appends the raw bytes of inode.size, inode.mtime and inode.atime to bl, in that order
+  bl.append((char*)&inode.size, sizeof(inode.size));
+  bl.append((char*)&inode.mtime, sizeof(inode.mtime));
+  bl.append((char*)&inode.atime, sizeof(inode.atime));  // ?? (original author's open question, kept as written)
+}
+
+void CInode::decode_file_state(bufferlist& r, int& off)
+{  // reads size, mtime, atime from r starting at off (same order as encode_file_state), advancing off past each
+  r.copy(off, sizeof(inode.size), (char*)&inode.size);
+  off += sizeof(inode.size);
+  r.copy(off, sizeof(inode.mtime), (char*)&inode.mtime);
+  off += sizeof(inode.mtime);
+  r.copy(off, sizeof(inode.atime), (char*)&inode.atime);
+  off += sizeof(inode.atime);
+}
+
+/* not used currently
+void CInode::decode_merge_file_state(crope& r, int& off)
+{
+  __uint64_t size;
+  r.copy(off, sizeof(size), (char*)&size);
+  off += sizeof(size);
+  if (size > inode.size) inode.size = size;
+
+  time_t t;
+  r.copy(off, sizeof(t), (char*)&t);
+  off += sizeof(t);
+  if (t > inode.mtime) inode.mtime = t;
+
+  r.copy(off, sizeof(t), (char*)&t);
+  off += sizeof(t);
+  if (t > inode.atime) inode.atime = t;
+}
+*/
+
+void CInode::encode_hard_state(bufferlist& r)
+{  // appends the raw bytes of inode.mode, uid, gid and ctime to r, in that order
+  r.append((char*)&inode.mode, sizeof(inode.mode));
+  r.append((char*)&inode.uid, sizeof(inode.uid));
+  r.append((char*)&inode.gid, sizeof(inode.gid));
+  r.append((char*)&inode.ctime, sizeof(inode.ctime));
+}
+
+void CInode::decode_hard_state(bufferlist& r, int& off)
+{  // reads mode, uid, gid, ctime from r starting at off (same order as encode_hard_state), advancing off past each
+  r.copy(off, sizeof(inode.mode), (char*)&inode.mode);
+  off += sizeof(inode.mode);
+  r.copy(off, sizeof(inode.uid), (char*)&inode.uid);
+  off += sizeof(inode.uid);
+  r.copy(off, sizeof(inode.gid), (char*)&inode.gid);
+  off += sizeof(inode.gid);
+  r.copy(off, sizeof(inode.ctime), (char*)&inode.ctime);
+  off += sizeof(inode.ctime);
+}
+
+
+// old state encoders
+
+/*
+void CInode::encode_basic_state(bufferlist& r)
+{
+  // inode
+  r.append((char*)&inode, sizeof(inode));
+  ::_encode(cached_by, r);
+  ::_encode(cached_by_nonce, r);
+}
+ 
+void CInode::decode_basic_state(bufferlist& r, int& off)
+{
+  // inode
+  r.copy(0,sizeof(inode_t), (char*)&inode);
+  off += sizeof(inode_t);
+
+  bool empty = cached_by.empty();
+  ::_decode(cached_by, r, off);
+  ::_decode(cached_by_nonce, r, off);
+  if (!empty)
+    get(CINODE_PIN_CACHED);
+}
+*/
+
+
+// waiting
+
+bool CInode::is_frozen()
+{
+  // frozen exactly when we have a parent whose dir reports is_frozen();
+  // an inode without a parent is never frozen.
+  return parent && parent->dir->is_frozen();
+}
+
+bool CInode::is_frozen_dir()
+{
+  // true only when we have a parent whose dir reports is_frozen_dir();
+  // an inode without a parent never qualifies.
+  return parent && parent->dir->is_frozen_dir();
+}
+
+bool CInode::is_freezing()
+{
+  // freezing exactly when we have a parent whose dir reports is_freezing();
+  // an inode without a parent is never freezing.
+  return parent && parent->dir->is_freezing();
+}
+
+bool CInode::waiting_for(int tag) 
+{
+  return waiting.find(tag) != waiting.end();   // any waiter filed under exactly this tag?
+}
+
+void CInode::add_waiter(int tag, Context *c) {
+  // freeze-related tag while our parent dir is freezing/frozen: queue on that dir instead
+  if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) {  
+    parent->dir->add_waiter(tag, c);   // parent is non-NULL: is_freezing()/is_frozen() are false without one
+    return;
+  }
+  
+  // otherwise queue on this inode; the first waiter takes the WAITER pin
+  if (waiting.size() == 0)
+    get(CINODE_PIN_WAITER);
+  waiting.insert(pair<int,Context*>(tag,c));
+  dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl;
+  
+}
+
+void CInode::take_waiting(int mask, list<Context*>& ls)
+{
+  if (waiting.empty()) return;   // nothing queued; also keeps the put() at the end balanced
+  
+  multimap<int,Context*>::iterator it = waiting.begin();
+  while (it != waiting.end()) {
+    if (it->first & mask) {      // tag shares at least one bit with mask: hand it to the caller
+      ls.push_back(it->second);
+      dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl;
+
+      waiting.erase(it++);       // advance before erasing: only the erased iterator is invalidated
+    } else {
+      dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl;
+      it++;
+    }
+  }
+
+  if (waiting.empty())
+    put(CINODE_PIN_WAITER);      // last waiter gone: drop the pin add_waiter() took
+}
+
+void CInode::finish_waiting(int mask, int result) 
+{
+  dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl;
+  
+  list<Context*> finished;             // waiters are detached from this inode first ...
+  take_waiting(mask, finished);
+  finish_contexts(finished, result);   // ... then handed to finish_contexts() together with result
+}
+
+
+// auth_pins
+bool CInode::can_auth_pin() {
+  // an inode without a parent is always pinnable; otherwise its dir decides
+  if (!parent) return true;
+  return parent->dir->can_auth_pin();
+}
+
+void CInode::auth_pin() {
+  if (auth_pins == 0)
+    get(CINODE_PIN_AUTHPIN);     // first auth pin also takes the AUTHPIN reference pin
+  auth_pins++;
+
+  dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+
+  if (parent)
+    parent->dir->adjust_nested_auth_pins( 1 );   // tell the containing dir (auth_unpin() passes -1)
+}
+
+void CInode::auth_unpin() {
+  auth_pins--;
+  if (auth_pins == 0)
+    put(CINODE_PIN_AUTHPIN);     // last auth pin released: drop the AUTHPIN reference pin
+
+  dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+
+  assert(auth_pins >= 0);        // catches an unpin without a matching pin (checked after the log line above)
+
+  if (parent)
+    parent->dir->adjust_nested_auth_pins( -1 );   // undo the +1 that auth_pin() reported
+}
+
+
+
+// authority
+
+int CInode::authority() {
+  if (is_dangling()) 
+    return dangling_auth;   // explicit: a dangling inode reports its dangling_auth value
+  if (is_root())
+    return 0;  // i am root: always reported as 0
+  assert(parent);           // neither dangling nor root, so a parent is required
+  return parent->dir->dentry_authority( parent->name );   // the parent's dir answers for parent->name
+}
+
+
+CInodeDiscover* CInode::replicate_to( int rep )
+{
+  assert(is_auth());
+
+  // nobody caches us yet, so this is the first replica: relax locks first
+  if (!is_cached_by_anyone())
+    replicate_relax_locks();
+  
+  // record rep via cached_by_add() and return a heap-allocated discover carrying its nonce (caller presumably frees it -- TODO confirm)
+  int nonce = cached_by_add( rep );
+  return new CInodeDiscover( this, nonce );
+}
+
+
+// debug crap -----------------------------
+
+// Debug aid: an inode logs nothing of its own; it only forwards to its
+// open CDir, if there is one.
+void CInode::dump(int dep)
+{
+  if (!dir)
+    return;            // no CDir open under this inode: nothing to show
+  dir->dump(dep);      // CDir::dump() builds its own indent from dep
+}
+
diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h
new file mode 100644
index 0000000000000..3d754ad9c4fbc
--- /dev/null
+++ b/branches/sage/cephmds2/mds/CInode.h
@@ -0,0 +1,757 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __CINODE_H
+#define __CINODE_H
+
+#include "config.h"
+#include "include/types.h"
+#include "include/lru.h"
+
+#include "CDentry.h"
+#include "Lock.h"
+#include "Capability.h"
+
+#include "mdstypes.h"
+
+#include <cassert>
+#include <list>
+#include <vector>
+#include <set>
+#include <map>
+#include <iostream>
+using namespace std;
+
+
+
+
+
+// pins for keeping an item in cache (and debugging)
+#define CINODE_PIN_DIR       0
+#define CINODE_PIN_CACHED    1
+#define CINODE_PIN_DIRTY     2   // must flush
+#define CINODE_PIN_PROXY     3   // can't expire yet
+#define CINODE_PIN_WAITER    4   // waiter
+
+#define CINODE_PIN_CAPS      5  // local fh's
+
+#define CINODE_PIN_DNDIRTY   7  // dentry is dirty
+
+#define CINODE_PIN_AUTHPIN   8
+#define CINODE_PIN_IMPORTING  9   // multipurpose, for importing
+#define CINODE_PIN_REQUEST   10  // request is logging, finishing
+#define CINODE_PIN_RENAMESRC 11  // pinned on dest for foreign rename
+#define CINODE_PIN_ANCHORING 12
+
+#define CINODE_PIN_OPENINGDIR 13
+
+#define CINODE_PIN_DENTRYLOCK   14
+
+#define CINODE_NUM_PINS       15
+
+static const char *cinode_pin_names[CINODE_NUM_PINS] = {   // debug label for each CINODE_PIN_* index
+  "dir",
+  "cached",
+  "dirty",
+  "proxy",
+  "waiter",
+  "caps",
+  "--",          // index 6 has no CINODE_PIN_* constant
+  "dndirty",
+  "authpin",
+  "imping",
+  "request",
+  "rensrc",
+  "anching",
+  "opdir",
+  "dnlock"
+};
+
+
+
+
+
+
+// wait reasons
+#define CINODE_WAIT_AUTHPINNABLE  CDIR_WAIT_UNFREEZE
+    // waiters: write_hard_start, read_file_start, write_file_start  (mdcache)
+    //          handle_client_chmod, handle_client_touch             (mds)
+    // trigger: (see CDIR_WAIT_UNFREEZE)
+#define CINODE_WAIT_GETREPLICA    (1<<11)  // update/replicate individual inode
+    // waiters: import_dentry_inode
+    // trigger: handle_inode_replicate_ack
+
+#define CINODE_WAIT_DIR           (1<<13)
+    // waiters: traverse_path
+    // triggers: handle_discover_reply
+
+#define CINODE_WAIT_LINK         (1<<14)  // as in remotely nlink++
+#define CINODE_WAIT_ANCHORED     (1<<15)
+#define CINODE_WAIT_UNLINK       (1<<16)  // as in remotely nlink--
+
+#define CINODE_WAIT_HARDR        (1<<17)  // 131072
+#define CINODE_WAIT_HARDW        (1<<18)  // 262...
+#define CINODE_WAIT_HARDB        (1<<19)
+#define CINODE_WAIT_HARDRWB      (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB)
+#define CINODE_WAIT_HARDSTABLE   (1<<20)
+#define CINODE_WAIT_HARDNORD     (1<<21)
+#define CINODE_WAIT_FILER        (1<<22)  
+#define CINODE_WAIT_FILEW        (1<<23)
+#define CINODE_WAIT_FILEB        (1<<24)
+#define CINODE_WAIT_FILERWB      (CINODE_WAIT_FILER|CINODE_WAIT_FILEW|CINODE_WAIT_FILEB)
+#define CINODE_WAIT_FILESTABLE   (1<<25)
+#define CINODE_WAIT_FILENORD     (1<<26)
+#define CINODE_WAIT_FILENOWR     (1<<27)
+
+#define CINODE_WAIT_RENAMEACK       (1<<28)
+#define CINODE_WAIT_RENAMENOTIFYACK (1<<29)
+
+#define CINODE_WAIT_CAPS            (1<<30)
+
+
+
+
+#define CINODE_WAIT_ANY           0xffffffff
+
+
+// state
+#define CINODE_STATE_AUTH        (1<<0)
+#define CINODE_STATE_ROOT        (1<<1)
+
+#define CINODE_STATE_DIRTY       (1<<2)
+#define CINODE_STATE_UNSAFE      (1<<3)   // not logged yet
+#define CINODE_STATE_DANGLING    (1<<4)   // delete me when i expire; i have no dentry
+#define CINODE_STATE_UNLINKING   (1<<5)
+#define CINODE_STATE_PROXY       (1<<6)   // can't expire yet
+#define CINODE_STATE_EXPORTING   (1<<7)   // on nonauth bystander.
+
+#define CINODE_STATE_ANCHORING   (1<<8)
+
+#define CINODE_STATE_OPENINGDIR  (1<<9)
+
+//#define CINODE_STATE_RENAMING    (1<<8)  // moving me
+//#define CINODE_STATE_RENAMINGTO  (1<<9)  // rename target (will be unlinked)
+
+
+// misc
+#define CINODE_EXPORT_NONCE      1 // nonce given to replicas created by export
+#define CINODE_HASHREPLICA_NONCE 1 // hashed inodes that are duped ???FIXME???
+
+class Context;
+class CDentry;
+class CDir;
+class MDS;
+class Message;
+class CInode;
+class CInodeDiscover;
+class MDCache;
+
+//class MInodeSyncStart;
+
+ostream& operator<<(ostream& out, CInode& in);
+
+
+extern int cinode_pins[CINODE_NUM_PINS];  // counts
+
+
+// cached inode wrapper
+class CInode : public LRUObject {
+ public:
+  MDCache *mdcache;
+
+  inode_t          inode;     // the inode itself
+
+  CDir            *dir;       // directory, if we have it opened.
+  string           symlink;   // symlink dest, if symlink
+
+  // inode metadata locks
+  CLock        hardlock;
+  CLock        filelock;
+
+ protected:
+  int              ref;       // reference count
+  set<int>         ref_set;
+  version_t        parent_dir_version;  // parent dir version when i was last touched.
+  version_t        committing_version;
+  version_t        committed_version;
+
+  unsigned         state;
+
+  // parent dentries in cache
+  CDentry         *parent;             // primary link
+  set<CDentry*>    remote_parents;     // if hard linked
+
+  // -- distributed caching
+  set<int>         cached_by;        // [auth] mds's that cache me.  
+  /* NOTE: on replicas, this doubles as replicated_by, but the
+     cached_by_* access methods below should NOT be used in those
+     cases, as the semantics are different! */
+  map<int,int>     cached_by_nonce;  // [auth] nonce issued to each replica
+  int              replica_nonce;    // [replica] defined on replica
+
+  int              dangling_auth;    // explicit auth, when dangling.
+
+  int              num_request_pins;
+
+  // waiters
+  multimap<int, Context*>  waiting;
+
+  // file capabilities
+  map<int, Capability>  client_caps;         // client -> caps
+
+  map<int, int>         mds_caps_wanted;     // [auth] mds -> caps wanted
+  int                   replica_caps_wanted; // [replica] what i've requested from auth
+  utime_t               replica_caps_wanted_keep_until;
+
+
+ private:
+  // lock nesting
+  int auth_pins;
+  int nested_auth_pins;
+
+ public:
+  meta_load_t popularity[MDS_NPOP];
+
+  // friends
+  friend class Server;
+  friend class Locker;
+  friend class Migrator;
+  friend class MDCache;
+  friend class CDir;
+  friend class CInodeExport;
+  friend class CInodeDiscover;
+
+ public:
+  // ---------------------------
+  CInode(MDCache *c, bool auth=true);
+  ~CInode();
+  
+
+  // -- accessors --
+  bool is_file()    { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_FILE)    ? true:false; }
+  bool is_symlink() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) ? true:false; }
+  bool is_dir()     { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR)     ? true:false; }
+
+  bool is_anchored() { return inode.anchored; }
+
+  bool is_root() { return state & CINODE_STATE_ROOT; }
+  bool is_proxy() { return state & CINODE_STATE_PROXY; }
+
+  bool is_auth() { return state & CINODE_STATE_AUTH; }
+  void set_auth(bool auth);
+  bool is_replica() { return !is_auth(); }
+  int get_replica_nonce() { assert(!is_auth()); return replica_nonce; }
+
+  inodeno_t ino() { return inode.ino; }
+  inode_t& get_inode() { return inode; }
+  CDentry* get_parent_dn() { return parent; }
+  CDir *get_parent_dir();
+  CInode *get_parent_inode();
+  CInode *get_realm_root();   // import, hash, or root
+  
+  CDir *get_or_open_dir(MDS *mds);
+  CDir *set_dir(CDir *newdir);
+  
+  bool dir_is_auth();
+
+
+
+  // -- misc -- 
+  void make_path(string& s);
+  void make_anchor_trace(vector<class Anchor*>& trace);
+
+
+
+  // -- state --
+  unsigned get_state() { return state; }
+  void state_clear(unsigned mask) {    state &= ~mask; }
+  void state_set(unsigned mask) { state |= mask; }
+  unsigned state_test(unsigned mask) { return state & mask; }
+
+  bool is_unsafe() { return state & CINODE_STATE_UNSAFE; }
+  bool is_dangling() { return state & CINODE_STATE_DANGLING; }
+  bool is_unlinking() { return state & CINODE_STATE_UNLINKING; }
+
+  void mark_unsafe() { state |= CINODE_STATE_UNSAFE; }
+  void mark_safe() { state &= ~CINODE_STATE_UNSAFE; }
+
+  // -- state encoding --
+  //void encode_basic_state(bufferlist& r);
+  //void decode_basic_state(bufferlist& r, int& off);
+
+
+  void encode_file_state(bufferlist& r);
+  void decode_file_state(bufferlist& r, int& off);
+
+  void encode_hard_state(bufferlist& r);
+  void decode_hard_state(bufferlist& r, int& off);
+
+  
+  // -- dirtyness --
+  version_t get_version() { return inode.version; }
+  version_t get_parent_dir_version() { return parent_dir_version; }
+  void float_parent_dir_version(version_t ge) {
+    if (parent_dir_version < ge)
+      parent_dir_version = ge;
+  }
+  version_t get_committing_version() { return committing_version; }
+  version_t get_last_committed_version() { return committed_version; }
+  void set_committing_version(version_t v) { committing_version = v; }
+  void set_committed_version() { 
+    committed_version = committing_version;
+    committing_version = 0;
+  }
+
+  bool is_dirty() { return state & CINODE_STATE_DIRTY; }
+  bool is_clean() { return !is_dirty(); }
+  
+  void mark_dirty();
+  void mark_clean();
+
+
+
+  // -- cached_by -- to be used ONLY when we're authoritative or cacheproxy
+  bool is_cached_by_anyone() { return !cached_by.empty(); }
+  bool is_cached_by(int mds) { return cached_by.count(mds); }
+  int num_cached_by() { return cached_by.size(); }
+  // cached_by_add returns a nonce
+  int cached_by_add(int mds) {
+    int issued = 1;   // nonce handed to a brand-new replica
+    if (!is_cached_by(mds)) {
+      if (cached_by.empty()) get(CINODE_PIN_CACHED);   // first replica overall takes the pin
+      cached_by.insert(mds);
+    } else {
+      // known replica: issue the next nonce and drop the stale map entry
+      issued = get_cached_by_nonce(mds) + 1;
+      dout(10) << *this << " issuing new nonce " << issued << " to mds" << mds << endl;
+      cached_by_nonce.erase(mds);
+    }
+    cached_by_nonce.insert(pair<int,int>(mds,issued));
+    return issued;
+  }
+  void cached_by_add(int mds, int nonce) {
+    // NOTE(review): map::insert keeps an existing nonce for mds - confirm callers only add new replicas.
+    if (cached_by.empty()) get(CINODE_PIN_CACHED);   // first replica overall takes the pin
+    cached_by.insert(mds);
+    cached_by_nonce.insert(map<int,int>::value_type(mds, nonce));
+  }
+  int get_cached_by_nonce(int mds) {   // nonce currently recorded for replica mds
+    assert(cached_by_nonce.count(mds));   // unknown mds: the old code dereferenced end() here
+    return cached_by_nonce[mds];
+  }
+  void cached_by_remove(int mds) {
+    // Removing an mds that is not recorded as a replica is treated as a caller bug.
+    assert(is_cached_by(mds));
+
+    cached_by_nonce.erase(mds);
+    cached_by.erase(mds);
+    // last replica gone: drop the CINODE_PIN_CACHED reference
+    if (cached_by.empty()) put(CINODE_PIN_CACHED);
+  }
+  void cached_by_clear() {
+    // Forget every replica and nonce; release CINODE_PIN_CACHED if any replica was recorded.
+    if (!cached_by.empty()) put(CINODE_PIN_CACHED);
+    cached_by.clear();
+    cached_by_nonce.clear();
+  }
+  set<int>::iterator cached_by_begin() { return cached_by.begin(); }
+  set<int>::iterator cached_by_end() { return cached_by.end(); }
+  set<int>& get_cached_by() { return cached_by; }
+
+  CInodeDiscover* replicate_to(int rep);
+
+
+  // -- waiting --
+  bool waiting_for(int tag);
+  void add_waiter(int tag, Context *c);
+  void take_waiting(int tag, list<Context*>& ls);
+  void finish_waiting(int mask, int result = 0);
+
+
+  // -- caps -- (new)
+  // client caps
+  map<int,Capability>& get_client_caps() { return client_caps; }
+  void add_client_cap(int client, Capability& cap) {
+    if (client_caps.empty())
+      get(CINODE_PIN_CAPS);
+    assert(client_caps.count(client) == 0);
+    client_caps[client] = cap;
+  }
+  void remove_client_cap(int client) {
+    assert(client_caps.count(client) == 1);
+    client_caps.erase(client);
+    if (client_caps.empty())
+      put(CINODE_PIN_CAPS);
+  }
+  Capability* get_client_cap(int client) {
+    map<int,Capability>::iterator held = client_caps.find(client);
+    if (held == client_caps.end()) return 0;   // this client holds no cap here
+    return &held->second;
+  }
+  /*
+  void set_client_caps(map<int,Capability>& cl) {
+    if (client_caps.empty() && !cl.empty())
+      get(CINODE_PIN_CAPS);
+    client_caps.clear();
+    client_caps = cl;
+  }
+  */
+  void take_client_caps(map<int,Capability>& cl) {
+    // Copies every client cap into cl (replacing its contents); this inode keeps none.
+    if (!client_caps.empty()) put(CINODE_PIN_CAPS);
+    cl = client_caps;
+    client_caps.clear();
+  }
+  void merge_client_caps(map<int,Capability>& cl, set<int>& new_client_caps) {
+    // Fold the caps in cl into ours; every client seen in cl is recorded in new_client_caps.
+    if (client_caps.empty() && !cl.empty())
+      get(CINODE_PIN_CAPS);   // going from no caps to some
+
+    typedef map<int,Capability>::iterator cap_iter;
+    for (cap_iter in = cl.begin(); in != cl.end(); ++in) {
+      const int client = in->first;
+      new_client_caps.insert(client);
+      cap_iter mine = client_caps.find(client);
+      if (mine != client_caps.end())
+        mine->second.merge(in->second);       // already tracked: combine
+      else
+        client_caps[client] = in->second;     // not tracked yet: adopt as-is
+    }
+  }
+
+  // caps issued, wanted
+  int get_caps_issued() {
+    // Union of Capability::issued() over every client cap on this inode.
+    int mask = 0;
+    map<int,Capability>::iterator cur = client_caps.begin();
+    for ( ; cur != client_caps.end(); ++cur)
+      mask |= cur->second.issued();
+    return mask;
+  }
+  int get_caps_wanted() {
+    // Union of what every client cap wants; on the auth copy the per-mds
+    // wanted table (mds_caps_wanted) is folded in as well.
+    int want = 0;
+
+    map<int,Capability>::iterator cl = client_caps.begin();
+    for ( ; cl != client_caps.end(); ++cl)
+      want |= cl->second.wanted();
+
+    if (!is_auth())
+      return want;   // non-auth copies skip the per-mds table
+
+    map<int,int>::iterator m = mds_caps_wanted.begin();
+    for ( ; m != mds_caps_wanted.end(); ++m)
+      want |= m->second;
+    return want;
+  }
+
+
+  void replicate_relax_locks() {   // replicate_to() calls this when the inode has no replicas yet
+    assert(is_auth());
+    assert(!is_cached_by_anyone());
+    dout(10) << " relaxing locks on " << *this << endl;
+    // hardlock: LOCK -> SYNC, but only while nobody is using it
+    if (hardlock.get_state() == LOCK_LOCK &&
+        !hardlock.is_used()) {
+      dout(10) << " hard now sync " << *this << endl;
+      hardlock.set_state(LOCK_SYNC);
+    }
+    if (filelock.get_state() == LOCK_LOCK) {   // filelock: additionally requires that no client may hold CAP_FILE_WR
+      if (!filelock.is_used() &&
+          (get_caps_issued() & CAP_FILE_WR) == 0) {
+        filelock.set_state(LOCK_SYNC);
+        dout(10) << " file now sync " << *this << endl;
+      } else {
+        dout(10) << " can't relax filelock on " << *this << endl;   // left in LOCK_LOCK
+      }
+    }
+  }
+
+
+  // -- authority --
+  int authority();
+
+
+  // -- auth pins --
+  int is_auth_pinned() { 
+    return auth_pins;
+  }
+  int adjust_nested_auth_pins(int a);
+  bool can_auth_pin();
+  void auth_pin();
+  void auth_unpin();
+
+
+  // -- freeze --
+  bool is_frozen();
+  bool is_frozen_dir();
+  bool is_freezing();
+
+
+  // -- reference counting --
+  
+  /* these can be pinned any # of times, and are
+     linked to an active_request, so they're automatically cleaned
+     up when a request is finished.  pin at will! */
+  void request_pin_get() {
+    if (num_request_pins == 0) get(CINODE_PIN_REQUEST);
+    num_request_pins++;
+  }
+  void request_pin_put() {
+    num_request_pins--;
+    if (num_request_pins == 0) put(CINODE_PIN_REQUEST);
+    assert(num_request_pins >= 0);
+  }
+
+
+  bool is_pinned() { return ref > 0; }
+  set<int>& get_ref_set() { return ref_set; }
+  void put(int by) {   // drop the reference held under pin type by (a CINODE_PIN_* index)
+    cinode_pins[by]--;   // global per-pin-type counter
+    if (ref == 0 || ref_set.count(by) != 1) {   // unbalanced put: log it, then trip the asserts
+      dout(7) << " bad put " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+      assert(ref_set.count(by) == 1);
+      assert(ref > 0);
+    }
+    ref--;
+    ref_set.erase(by);
+    if (ref == 0)
+      lru_unpin();   // last reference gone
+    dout(7) << " put " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+  }
+  void get(int by) {   // take a reference under pin type by; each pin type may be held at most once
+    cinode_pins[by]++;   // global per-pin-type counter
+    if (ref == 0)
+      lru_pin();   // first reference of any type
+    if (ref_set.count(by)) {   // this pin type is already held: log it, then trip the assert
+      dout(7) << " bad get " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+      assert(ref_set.count(by) == 0);
+    }
+    ref++;
+    ref_set.insert(by);
+    dout(7) << " get " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+  }
+  bool is_pinned_by(int by) {
+    return ref_set.count(by);
+  }
+
+  // -- hierarchy stuff --
+  void set_primary_parent(CDentry *p) {
+    parent = p;
+  }
+  void remove_primary_parent(CDentry *dn) {
+    assert(dn == parent);
+    parent = 0;
+  }
+  void add_remote_parent(CDentry *p) {
+    remote_parents.insert(p);
+  }
+  void remove_remote_parent(CDentry *p) {
+    remote_parents.erase(p);
+  }
+  int num_remote_parents() {
+    return remote_parents.size(); 
+  }
+
+
+  /*
+  // for giving to clients
+  void get_dist_spec(set<int>& ls, int auth, timepair_t& now) {
+    if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) ||
+        (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) {
+      //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
+      ls = cached_by;
+    }
+  }
+  */
+
+  // dbg
+  void dump(int d = 0);
+};
+
+
+
+
+// -- encoded state
+
+// discover
+
+class CInodeDiscover {
+  // Inode state captured for a replica; CInode::replicate_to() builds one per call.
+  inode_t    inode;
+  int        replica_nonce;
+  // lock states as reported by get_replica_state() on the source inode
+  int        hardlock_state;
+  int        filelock_state;
+
+ public:
+  CInodeDiscover() {}
+  CInodeDiscover(CInode *in, int nonce) :
+    inode(in->inode),
+    replica_nonce(nonce),
+    hardlock_state(in->hardlock.get_replica_state()),
+    filelock_state(in->filelock.get_replica_state())
+  {
+  }
+
+  inodeno_t get_ino() { return inode.ino; }
+  int get_replica_nonce() { return replica_nonce; }
+
+  void update_inode(CInode *in) {
+    // Overwrite in's inode, replica nonce and both lock states with the captured values.
+    in->inode = inode;
+    in->replica_nonce = replica_nonce;
+    in->hardlock.set_state(hardlock_state);
+    in->filelock.set_state(filelock_state);
+  }
+
+  void _encode(bufferlist& bl) {
+    bl.append((char*)&inode, sizeof(inode));
+    bl.append((char*)&replica_nonce, sizeof(replica_nonce));
+    bl.append((char*)&hardlock_state, sizeof(hardlock_state));
+    bl.append((char*)&filelock_state, sizeof(filelock_state));
+  }
+  // Fields are read back in the order _encode() appended them; off is advanced past each one.
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(inode), (char*)&inode);
+    off += sizeof(inode);
+    bl.copy(off, sizeof(replica_nonce), (char*)&replica_nonce);
+    off += sizeof(replica_nonce);
+    bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state);
+    off += sizeof(hardlock_state);
+    bl.copy(off, sizeof(filelock_state), (char*)&filelock_state);
+    off += sizeof(filelock_state);
+  }
+
+};
+
+
+// export
+
+class CInodeExport {
+  // Full inode state captured from one CInode and applied to another via update_inode().
+  struct {
+    inode_t        inode;
+    meta_load_t    popularity_justme;
+    meta_load_t    popularity_curdom;
+    bool           is_dirty;       // dirty inode?
+    
+    int            num_caps;       // number of cap_map entries; filled in by _encode()
+  } st;                            // appended to the bufferlist as raw bytes (sizeof(st))
+
+  set<int>      cached_by;
+  map<int,int>  cached_by_nonce;
+  map<int,Capability>  cap_map;    // client -> capability taken from the source inode
+
+  CLock         hardlock,filelock;
+  //int           remaining_issued;
+
+public:
+  CInodeExport() {}
+  CInodeExport(CInode *in) {       // capture in's state; this also modifies in (popularity, client caps)
+    st.inode = in->inode;
+    st.is_dirty = in->is_dirty();
+    cached_by = in->cached_by;
+    cached_by_nonce = in->cached_by_nonce; 
+
+    hardlock = in->hardlock;
+    filelock = in->filelock;
+    // take() the JUSTME/CURDOM load from the source, then subtract CURDOM from its ANYDOM/NESTED totals
+    st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] );
+    st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] );
+    in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom;
+    in->popularity[MDS_POP_NESTED] -= st.popularity_curdom;
+    
+    // take all client caps from the inode (take_client_caps leaves it with none)
+    in->take_client_caps(cap_map);
+    //remaining_issued = in->get_caps_issued();
+  }
+  ~CInodeExport() {
+  }
+  
+  inodeno_t get_ino() { return st.inode.ino; }
+
+  void update_inode(CInode *in, set<int>& new_client_caps) {   // apply the captured state to in
+    in->inode = st.inode;
+
+    in->popularity[MDS_POP_JUSTME] += st.popularity_justme;
+    in->popularity[MDS_POP_CURDOM] += st.popularity_curdom;
+    in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom;
+    in->popularity[MDS_POP_NESTED] += st.popularity_curdom;
+
+    if (st.is_dirty) {
+      in->mark_dirty();
+    }
+    // NOTE(review): cached_by is cleared directly, bypassing cached_by_clear(); confirm in never already holds CINODE_PIN_CACHED here
+    in->cached_by.clear();
+    in->cached_by = cached_by;
+    in->cached_by_nonce = cached_by_nonce;
+    if (!cached_by.empty()) 
+      in->get(CINODE_PIN_CACHED);
+
+    in->hardlock = hardlock;
+    in->filelock = filelock;
+
+    // caps
+    in->merge_client_caps(cap_map, new_client_caps);
+  }
+
+  void _encode(bufferlist& bl) {
+    st.num_caps = cap_map.size();
+    bl.append((char*)&st, sizeof(st));
+    
+    // cached_by + nonce
+    ::_encode(cached_by, bl);
+    ::_encode(cached_by_nonce, bl);
+
+    hardlock.encode_state(bl);
+    filelock.encode_state(bl);
+
+    // caps: one (client id, Capability) pair per cap_map entry
+    for (map<int,Capability>::iterator it = cap_map.begin();
+         it != cap_map.end();
+         it++) {
+      bl.append((char*)&it->first, sizeof(it->first));
+      it->second._encode(bl);
+    }
+  }
+  // Reads the fields back in _encode() order starting at off; returns the offset just past the decoded data.
+  int _decode(bufferlist& bl, int off = 0) {
+    bl.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+    
+    ::_decode(cached_by, bl, off);
+    ::_decode(cached_by_nonce, bl, off);
+
+    hardlock.decode_state(bl, off);
+    filelock.decode_state(bl, off);
+
+    // caps: st.num_caps pairs of (client id, Capability)
+    for (int i=0; i<st.num_caps; i++) {
+      int c;
+      bl.copy(off, sizeof(c), (char*)&c);
+      off += sizeof(c);
+      cap_map[c]._decode(bl, off);
+    }
+
+    return off;
+  }
+};
+
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/Capability.h b/branches/sage/cephmds2/mds/Capability.h
new file mode 100644
index 0000000000000..e011dbe43e88f
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Capability.h
@@ -0,0 +1,214 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __CAPABILITY_H
+#define __CAPABILITY_H
+
+#include "include/buffer.h"
+
+#include <map>
+using namespace std;
+
+#include "config.h"
+
+
+// definite caps
+#define CAP_FILE_RDCACHE   1    // client can safely cache reads
+#define CAP_FILE_RD        2    // client can read
+#define CAP_FILE_WR        4    // client can write
+#define CAP_FILE_WREXTEND  8    // client can extend file
+#define CAP_FILE_WRBUFFER  16   // client can safely buffer writes
+#define CAP_FILE_LAZYIO    32   // client can perform lazy io
+
+
+// heuristics
+//#define CAP_FILE_DELAYFLUSH  32
+
+inline string cap_string(int cap)
+{
+  // Render a CAP_FILE_* bitmask as "[ name name ... ]" for debug output.
+  string s = "[";
+  if (cap & CAP_FILE_RDCACHE) s += " rdcache";
+  if (cap & CAP_FILE_RD) s += " rd";
+  if (cap & CAP_FILE_WR) s += " wr";
+  if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer";
+  if (cap & CAP_FILE_WREXTEND) s += " wrextend";   // previously tested WRBUFFER, so wrextend was never reported on its own
+  if (cap & CAP_FILE_LAZYIO) s += " lazyio";
+  s += " ]";
+  return s;
+}
+
+
+class Capability {
+  int wanted_caps;     // what the client wants (ideally)
+
+  map<long, int>  cap_history;  // seq -> cap
+  long last_sent, last_recv;
+    
+  bool suppress;
+
+public:
+  Capability(int want=0) :
+    wanted_caps(want),
+    last_sent(0),
+    last_recv(0),
+    suppress(false) { 
+    //cap_history[last_sent] = 0;
+  }
+
+  
+  bool is_suppress() { return suppress; }
+  void set_suppress(bool b) { suppress = b; }
+
+  bool is_null() { return cap_history.empty(); }
+
+  // most recently issued caps.
+  int pending() {
+    map<long,int>::iterator sent = cap_history.find(last_sent);
+    if (sent == cap_history.end()) return 0;   // nothing recorded under last_sent
+    return sent->second;
+  }
+  
+  // caps client has confirmed receipt of
+  int confirmed() {
+    map<long,int>::iterator recv = cap_history.find(last_recv);
+    if (recv == cap_history.end()) return 0;   // nothing recorded under last_recv
+    return recv->second;
+  }
+
+  // caps potentially issued
+  int issued() {   // OR of every cap set recorded for a seq in [last_recv, last_sent]
+    int mask = 0;
+    for (long s = last_recv; s <= last_sent; ++s) {
+      map<long,int>::iterator hit = cap_history.find(s);
+      if (hit == cap_history.end()) continue;   // no entry for this seq
+      mask |= hit->second;
+      dout(10) << " cap issued: seq " << s << " " << cap_string(hit->second) << " -> " << cap_string(mask) << endl;
+    }
+    return mask;
+  }
+
+  // caps this client wants to hold
+  int wanted() { return wanted_caps; }
+  void set_wanted(int w) {
+    wanted_caps = w;
+  }
+
+  // needed
+  static int needed(int from) {
+    // strip out wrbuffer, rdcache
+    return from & (CAP_FILE_WR|CAP_FILE_RD);
+  }
+  int needed() { return needed(wanted_caps); }
+
+  // conflicts
+  static int conflicts(int from) {
+    int c = 0;
+    if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD;
+    if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE;
+    if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER;
+    if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR;
+    return c;
+  }
+  int wanted_conflicts() { return conflicts(wanted()); }
+  int needed_conflicts() { return conflicts(needed()); }
+  int issued_conflicts() { return conflicts(issued()); }
+
+  // issue caps; return seq number.
+  long issue(int c) {
+    //int was = pending();
+    //no!  if (c == was && last_sent) return -1;  // repeat of previous?
+    
+    ++last_sent;
+    cap_history[last_sent] = c;
+
+    /* no!
+    // not recalling, just adding?
+    if (c & ~was &&
+        cap_history.count(last_sent-1)) { 
+      cap_history.erase(last_sent-1);
+    }
+    */
+    return last_sent;
+  }
+  long get_last_seq() { return last_sent; }
+
+  void merge(Capability& other) {
+    // Combined pending set of both sides.
+    const int both_pending = pending() | other.pending();
+    // If other may have issued bits beyond that, record them as an extra seq first.
+    if (other.issued() & ~both_pending)
+      issue(other.issued() | both_pending);
+    issue(both_pending);
+
+    wanted_caps |= other.wanted();   // wanted becomes the union of both
+  }
+
+  // confirm receipt of a previous sent/issued seq.
+  int confirm_receipt(long seq, int caps) {
+    int r = 0;
+    // r accumulates the return value: caps of every forgotten seq plus any bits released at seq
+    // old seqs
+    while (last_recv < seq) {
+      dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << endl;
+      r |= cap_history[last_recv];   // operator[] yields 0 for a seq with no entry; it is erased on the next line
+      cap_history.erase(last_recv);
+      ++last_recv;
+    }
+    
+    // release current?
+    if (cap_history.count(seq) &&
+        cap_history[seq] != caps) {
+      dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << endl;
+      // note what we're releasing..
+      assert(cap_history[seq] & ~caps);   // the confirmed set must drop at least one recorded bit
+      r |= cap_history[seq] & ~caps; 
+
+      cap_history[seq] = caps; // confirmed() now less than before..
+    }
+
+    // null?
+    if (caps == 0 && 
+        cap_history.size() == 1 &&
+        cap_history.count(seq)) {
+      cap_history.clear();  // voila, null!  (is_null() is now true)
+    }
+
+    return r;
+  }
+
+  // serializers
+  void _encode(bufferlist& bl) {
+    bl.append((char*)&wanted_caps, sizeof(wanted_caps));
+    bl.append((char*)&last_sent, sizeof(last_sent));
+    bl.append((char*)&last_recv, sizeof(last_recv));
+    ::_encode(cap_history, bl);
+  }
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps);
+    off += sizeof(wanted_caps);
+    bl.copy(off, sizeof(last_sent), (char*)&last_sent);
+    off += sizeof(last_sent);
+    bl.copy(off, sizeof(last_recv), (char*)&last_recv);
+    off += sizeof(last_recv);
+    ::_decode(cap_history, bl, off);
+  }
+  
+};
+
+
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/ClientMap.h b/branches/sage/cephmds2/mds/ClientMap.h
new file mode 100644
index 0000000000000..63f310358cae8
--- /dev/null
+++ b/branches/sage/cephmds2/mds/ClientMap.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __CLIENTMAP_H
+#define __CLIENTMAP_H
+
+#include "msg/Message.h"
+
+#include <set>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+class ClientMap {
+  hash_map<int,entity_inst_t> client_inst;   // client -> instance, kept while referenced
+  set<int>           client_mount;           // clients added via add_mount()
+  hash_map<int, int> client_ref;             // client -> reference count (mounts + opens)
+
+  void inc_ref(int client, const entity_inst_t& inst) {
+    if (!client_inst.count(client)) {
+      client_inst[client] = inst;   // first reference: remember the instance
+    } else {                        // already known: same instance expected, already counted
+      assert(client_inst[client] == inst);
+      assert(client_ref.count(client));
+    }
+    ++client_ref[client];
+  }
+  void dec_ref(int client) {
+    assert(client_ref.count(client));
+    int& refs = client_ref[client];
+    assert(refs > 0);
+    if (--refs == 0) {   // last reference: forget the client entirely
+      client_ref.erase(client);
+      client_inst.erase(client);
+    }
+  }
+
+public:
+  const entity_inst_t& get_inst(int client) {
+    assert(client_inst.count(client));
+    return client_inst[client];
+  }
+  const set<int>& get_mount_set() { return client_mount; }
+
+  void add_mount(int client, const entity_inst_t& inst) {
+    client_mount.insert(client);
+    inc_ref(client, inst);
+  }
+  void rem_mount(int client) {
+    client_mount.erase(client);
+    dec_ref(client);
+  }
+
+  // opens only hold a reference; they do not touch the mount set
+  void add_open(int client, const entity_inst_t& inst) {
+    inc_ref(client, inst);
+  }
+  void dec_open(int client) {
+    dec_ref(client);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/IdAllocator.cc b/branches/sage/cephmds2/mds/IdAllocator.cc
new file mode 100644
index 0000000000000..fba33d599de40
--- /dev/null
+++ b/branches/sage/cephmds2/mds/IdAllocator.cc
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#define DBLEVEL  20
+
+#include "IdAllocator.h"
+#include "MDS.h"
+#include "MDLog.h"
+#include "events/EAlloc.h"
+
+#include "osdc/Filer.h"
+
+#include "include/types.h"
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: "
+
+
+idno_t IdAllocator::alloc_id(bool replay)
+{
+  // Takes the id at free.start() out of the free set and bumps the table
+  // version; the EAlloc entry is submitted to the log unless replay is set.
+  assert(is_active());
+
+  idno_t picked = free.start();
+  free.erase(picked);
+  dout(10) << "idalloc " << this << ": alloc id " << picked << endl;
+
+  ++version;
+
+  if (replay)
+    return picked;
+  mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, picked, EALLOC_EV_ALLOC, version));
+  return picked;
+}
+
+void IdAllocator::reclaim_id(idno_t id, bool replay)
+{
+  // Returns id to the free set and bumps the table version; the EAlloc
+  // free event is submitted to the log unless replay is set.
+  assert(is_active());
+  dout(10) << "idalloc " << this << ": reclaim id " << id << endl;
+  free.insert(id);
+  ++version;
+
+  if (replay) return;
+  mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version));
+}
+
+
+
+class C_ID_Save : public Context {   // completion callback handed to filer->write() by IdAllocator::save()
+  IdAllocator *owner;
+  version_t saved_version;            // table version captured when the write was issued
+public:
+  C_ID_Save(IdAllocator *i, version_t v) : owner(i), saved_version(v) {}
+  void finish(int r) {                // r is not inspected
+    owner->save_2(saved_version);
+  }
+};
+
+void IdAllocator::save(Context *onfinish, version_t v)
+{
+  // Writes the table (raw version followed by the encoded free map) via the filer.
+  // onfinish may be null; when given it is completed from save_2().  If a write
+  // covering version v is already under way, only the waiter is queued.
+  if (v > 0 && v <= committing_version) {
+    dout(10) << "save v " << version << " - already saving "
+             << committing_version << " >= needed " << v << endl;
+    // Never queue a null waiter: save_2() hands the whole list to finish_contexts().
+    if (onfinish) waitfor_save[v].push_back(onfinish);
+    return;
+  }
+  dout(10) << "save v " << version << endl;
+  assert(is_active());
+
+  bufferlist bl;
+  bl.append((char*)&version, sizeof(version));
+  ::_encode(free.m, bl);
+
+  committing_version = version;
+  if (onfinish) waitfor_save[version].push_back(onfinish);
+
+  // write (async); C_ID_Save::finish() calls save_2(version)
+  mds->filer->write(inode,
+                    0, bl.length(), bl,
+                    0,
+                    0, new C_ID_Save(this, version));
+}
+
+void IdAllocator::save_2(version_t v)
+{
+  // Called from C_ID_Save::finish(): record v as committed and finish every
+  // waiter queued for version v or older.
+  dout(10) << "save_2 v " << v << endl;
+  committed_version = v;
+  list<Context*> done;
+  map<version_t, list<Context*> >::iterator oldest = waitfor_save.begin();
+  while (oldest != waitfor_save.end() && oldest->first <= v) {
+    done.splice(done.end(), oldest->second);
+    waitfor_save.erase(oldest++);
+  }
+  finish_contexts(done,0);
+}
+
+
+void IdAllocator::reset()
+{
+  // Start over with a fixed per-mds id range and mark the table active.
+  free.clear();
+
+  // use generic range FIXME THIS IS CRAP
+  // NOTE(review): the 2nd argument is computed as an inclusive end; confirm
+  // interval_set::insert(a, b) expects an end and not a length.
+  const long long span = 0x1000000;
+  const long long node = mds->get_nodeid();
+
+  free.insert(span * (node+1), span * (node+2) - 1LL);
+
+  state = STATE_ACTIVE;
+}
+
+
+
+// -----------------------
+
+class C_ID_Load : public Context {   // completion callback handed to filer->read() by IdAllocator::load()
+public:
+  IdAllocator *ida;
+  Context *onfinish;                  // forwarded to load_2(); may be null
+  bufferlist bl;                      // load() passes &bl to the read as its destination buffer
+  C_ID_Load(IdAllocator *i, Context *o) : ida(i), onfinish(o) {}
+  void finish(int r) {                // r is handed to load_2() together with the buffer
+    ida->load_2(r, bl, onfinish);
+  }
+};
+
+void IdAllocator::load(Context *onfinish)
+{
+  // Issue the read of the table object; load_2() runs from C_ID_Load::finish().
+  dout(10) << "load" << endl;
+
+  assert(is_undef());
+  state = STATE_OPENING;
+
+  // The callback object holds the buffer the read fills; only the first
+  // inode.layout.stripe_size bytes are requested.
+  C_ID_Load *reader = new C_ID_Load(this, onfinish);
+  mds->filer->read(inode, 0, inode.layout.stripe_size, &reader->bl, reader);
+}
+
+void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish)
+{
+  // Read completion: decode the table from bl in the layout save() writes
+  // (raw version, then the encoded free map), then complete onfinish.
+  assert(is_opening());
+  state = STATE_ACTIVE;
+
+  if (r <= 0) {
+    dout(10) << "load_2 found no alloc file" << endl;
+    assert(0); // this shouldn't happen if mkfs finished.
+    reset();   // only reached when asserts are compiled out
+  } else {
+    dout(10) << "load_2 got " << bl.length() << " bytes" << endl;
+    int pos = 0;
+    bl.copy(pos, sizeof(version), (char*)&version);
+    pos += sizeof(version);
+    ::_decode(free.m, bl, pos);
+    committed_version = version;
+  }
+
+  if (!onfinish) return;
+  onfinish->finish(0);
+  delete onfinish;
+}
diff --git a/branches/sage/cephmds2/mds/IdAllocator.h b/branches/sage/cephmds2/mds/IdAllocator.h
new file mode 100644
index 0000000000000..745d863be99d3
--- /dev/null
+++ b/branches/sage/cephmds2/mds/IdAllocator.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __IDALLOCATOR_H
+#define __IDALLOCATOR_H
+
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+
+class MDS;
+
+#define IDTYPE_INO 1
+typedef inodeno_t idno_t;
+
+class IdAllocator {   // hands out ids (idno_t) from a free set; state is loaded/saved via load()/save()
+  MDS *mds;           // owning mds (node id, filer access)
+  inode_t inode;      // inode passed to filer->read() by load()
+
+  static const int STATE_UNDEF   = 0;   // initial state (set by the constructor)
+  static const int STATE_OPENING = 1;   // set by load(), until load_2() runs
+  static const int STATE_ACTIVE  = 2;   // set by load_2() and reset()
+  //static const int STATE_COMMITTING = 3;
+  int state;
+
+  version_t version, committing_version, committed_version;
+
+  interval_set<idno_t> free;   // unused ids
+  
+  map<version_t, list<Context*> > waitfor_save;   // contexts keyed by version; presumably completed once that version is saved
+
+ public:
+  IdAllocator(MDS *m, inode_t i) :
+    mds(m),
+    inode(i),
+    state(STATE_UNDEF),
+    version(0), committing_version(0), committed_version(0)
+  {
+  }
+
+  // alloc or reclaim ids
+  idno_t alloc_id(bool replay=false);
+  void reclaim_id(idno_t id, bool replay=false);
+
+  version_t get_version() { return version; }
+  version_t get_committed_version() { return committed_version; }
+
+  // load/save from disk (hack)
+  bool is_undef() { return state == STATE_UNDEF; }
+  bool is_active() { return state == STATE_ACTIVE; }
+  bool is_opening() { return state == STATE_OPENING; }
+
+  void reset();                                       // clear and re-seed the free set, go ACTIVE
+  void save(Context *onfinish=0, version_t need=0);
+  void save_2(version_t v);
+
+  void shutdown() {
+    if (is_active()) save(0);   // nothing to save unless we ever became active
+  }
+
+  void load(Context *onfinish);                       // start the read; requires STATE_UNDEF
+  void load_2(int, bufferlist&, Context *onfinish);   // read completion; called via C_ID_Load
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/Lock.h b/branches/sage/cephmds2/mds/Lock.h
new file mode 100644
index 0000000000000..faf648ed3b07f
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Lock.h
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __LOCK_H
+#define __LOCK_H
+
+#include <assert.h>
+#include <set>
+using namespace std;
+
+#include "include/buffer.h"
+
+#include "Capability.h"
+
+// states and such.
+//  C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio
+
+// basic lock                    -----auth--------   ---replica-------
+#define LOCK_SYNC     0  // AR   R . / C R . . . L   R . / C R . . . L   stat()
+#define LOCK_LOCK     1  // AR   R W / C . . . . .   . . / C . . . . .   truncate()
+#define LOCK_GLOCKR   2  // AR   R . / C . . . . .   . . / C . . . . .
+
+// file lock states
+#define LOCK_GLOCKL   3  // A    . . / . . . . . .                       loner -> lock
+#define LOCK_GLOCKM   4  // A    . . / . . . . . .
+#define LOCK_MIXED    5  // AR   . . / . R W A . L   . . / . R . . . L
+#define LOCK_GMIXEDR  6  // AR   R . / . R . . . L   . . / . R . . . L 
+#define LOCK_GMIXEDL  7  // A    . . / . . . . . L                       loner -> mixed
+
+#define LOCK_LONER    8  // A    . . / C R W A B L        (lock)      
+#define LOCK_GLONERR  9  // A    . . / . R . . . L
+#define LOCK_GLONERM  10 // A    . . / . R W A . L
+
+#define LOCK_GSYNCL   11 // A    . . / C ? . . . L                       loner -> sync    (*) FIXME: let old loner keep R, somehow...
+#define LOCK_GSYNCM   12 // A    . . / . R . . . L
+
+//   4 stable
+//  +9 transition
+//  13 total
+
+/* no append scenarios:
+
+loner + truncate():
+  - loner needs to lose A (?unless it's the loner doing the truncate?)
+loner + statlite(size):
+  - loner needs to lose A
+
+any + statlite(size)
+  - all lose A
+
+any + statlite(mtime)
+  - all lose W
+
+-> we need to add lonerfixed and mixedfixed states (and associated transitions)
+ in order to efficiently support statlite(size) and truncate().  until then,
+ we have to LOCK.
+
+ */
+
+// -- lock... hard or file
+
+class CLock {   // state + ref counts for one lock; states are the LOCK_* values defined above
+ protected:
+  // lock state
+  char     state;       // one of the LOCK_* constants
+  set<int> gather_set;  // auth: nodes we are still gathering from (see init_gather)
+  int      nread, nwrite;   // outstanding read / write references
+
+  
+ public:
+  CLock() : 
+    state(LOCK_LOCK), 
+    nread(0), 
+    nwrite(0) {
+  }
+  
+  // encode/decode
+  // wire order: state, nread, nwrite (raw bytes), then gather_set.
+  void encode_state(bufferlist& bl) {
+    bl.append((char*)&state, sizeof(state));
+    bl.append((char*)&nread, sizeof(nread));
+    bl.append((char*)&nwrite, sizeof(nwrite));
+
+    _encode(gather_set, bl);
+  }
+  void decode_state(bufferlist& bl, int& off) {   // off is advanced past each raw field as it is read
+    bl.copy(off, sizeof(state), (char*)&state);
+    off += sizeof(state);
+    bl.copy(off, sizeof(nread), (char*)&nread);
+    off += sizeof(nread);
+    bl.copy(off, sizeof(nwrite), (char*)&nwrite);
+    off += sizeof(nwrite);
+
+    _decode(gather_set, bl, off);
+  }
+
+  char get_state() { return state; }
+  char set_state(char s) { 
+    state = s; 
+    assert(!is_stable() || gather_set.size() == 0);  // gather should be empty in stable states.
+    return s;
+  };
+
+  // map our state to one of LOCK_LOCK / LOCK_MIXED / LOCK_SYNC (presumably the state a replica is given)
+  char get_replica_state() {
+    switch (state) {
+    case LOCK_LOCK:
+    case LOCK_GLOCKM:
+    case LOCK_GLOCKL:
+    case LOCK_GLOCKR: 
+    case LOCK_LONER:
+    case LOCK_GLONERR:
+    case LOCK_GLONERM:
+      return LOCK_LOCK;
+    case LOCK_MIXED:
+    case LOCK_GMIXEDR:
+      return LOCK_MIXED;
+    case LOCK_SYNC:
+      return LOCK_SYNC;
+
+      // after gather auth will be LOCK_AC_MIXED or whatever
+    case LOCK_GSYNCM:
+      return LOCK_MIXED;
+    case LOCK_GSYNCL:
+    case LOCK_GMIXEDL:     // ** LOCK isn't exactly the right state, but works.
+      return LOCK_LOCK;
+
+    default: 
+      assert(0);   // every LOCK_* value is handled above
+    }
+    return 0;
+  }
+
+  // gather set
+  set<int>& get_gather_set() { return gather_set; }
+  void init_gather(set<int>& i) {   // replaces the gather set with a copy of i
+    gather_set = i;
+  }
+  bool is_gathering(int i) {
+    return gather_set.count(i);
+  }
+  void clear_gather() {
+    gather_set.clear();
+  }
+
+  // ref counting
+  int get_read() { return ++nread; }   // returns the new count
+  int put_read() {
+    assert(nread>0);
+    return --nread;
+  }
+  int get_nread() { return nread; }
+
+  int get_write() { return ++nwrite; }   // returns the new count
+  int put_write() {
+    assert(nwrite>0);
+    return --nwrite;
+  }
+  int get_nwrite() { return nwrite; }
+  bool is_used() {   // any read or write reference outstanding?
+    return (nwrite+nread)>0 ? true:false;
+  }
+
+  
+  // stable
+  bool is_stable() {   // SYNC, LOCK, MIXED, LONER; all other states are transitional
+    return (state == LOCK_SYNC) || 
+      (state == LOCK_LOCK) || 
+      (state == LOCK_MIXED) || 
+      (state == LOCK_LONER);
+  }
+
+  // read/write access
+  // (auth selects the rule set: true for the authority, false for a replica)
+  bool can_read(bool auth) {
+    if (auth)
+      return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR) 
+        || (state == LOCK_GLOCKR) || (state == LOCK_LOCK);
+    else
+      return (state == LOCK_SYNC);
+  }
+  bool can_read_soon(bool auth) {
+    if (auth)
+      return (state == LOCK_GLOCKL);
+    else
+      return false;
+  }
+
+  bool can_write(bool auth) {   // only the auth, and only in LOCK_LOCK
+    if (auth) 
+      return (state == LOCK_LOCK);
+    else
+      return false;
+  }
+  bool can_write_soon(bool auth) {   // auth is in one of the gather-to-LOCK states
+    if (auth)
+      return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL)
+        || (state == LOCK_GLOCKM);
+    else
+      return false;
+  }
+
+  // client caps allowed
+  // caps_allowed_ever: the full CAP_FILE_* mask for auth vs replica, independent of state
+  int caps_allowed_ever(bool auth) {
+    if (auth)
+      return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO;
+    else
+      return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+  }
+  int caps_allowed(bool auth) {   // CAP_FILE_* mask for the current state; unhandled states hit the assert below
+    if (auth)
+      switch (state) {
+      case LOCK_SYNC:
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+      case LOCK_LOCK:
+      case LOCK_GLOCKR:
+        return CAP_FILE_RDCACHE;
+
+      case LOCK_GLOCKL:
+      case LOCK_GLOCKM:
+        return 0;
+
+      case LOCK_MIXED:
+        return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO;
+      case LOCK_GMIXEDR:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      case LOCK_GMIXEDL:
+        return 0;
+
+      case LOCK_LONER:  // single client writer, of course.
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO;
+      case LOCK_GLONERR:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      case LOCK_GLONERM:
+        return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO;
+
+      case LOCK_GSYNCL:
+        return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO;
+      case LOCK_GSYNCM:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      }
+    else
+      switch (state) {   // replica: only these five states are handled
+      case LOCK_SYNC:
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+      case LOCK_LOCK:
+      case LOCK_GLOCKR:
+        return CAP_FILE_RDCACHE;
+      case LOCK_GMIXEDR:
+      case LOCK_MIXED:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      }
+    assert(0);   // reached when no case above returned
+    return 0;
+  }
+
+  friend class MDCache;
+  friend class Locker;
+  friend class Migrator;
+};
+
+//ostream& operator<<(ostream& out, CLock& l);
+inline ostream& operator<<(ostream& out, CLock& l) 
+{
+  // Print a lock as "(<state>[ g=<gather set>][ <n>r][ <n>w])".
+  //
+  // Names are indexed by LOCK_* state value (LOCK_SYNC=0 .. LOCK_GSYNCM=12).
+  // The table holds const char*: binding a string literal to plain char*
+  // is ill-formed since C++11.
+  static const char* const lock_state_names[] = {
+    "sync",
+    "lock",
+    "glockr",
+    "glockl",
+    "glockm",
+    "mixed",
+    "gmixedr",
+    "gmixedl",
+    "loner",
+    "glonerr",
+    "glonerm",
+    "gsyncl",
+    "gsyncm"
+  }; 
+  const int num_names = sizeof(lock_state_names) / sizeof(lock_state_names[0]);
+
+  // an out-of-range state prints as "?" rather than reading past the table
+  const int st = (int)l.get_state();
+  out << "(" << ((st >= 0 && st < num_names) ? lock_state_names[st] : "?");
+
+  if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set();
+
+  if (l.get_nread()) 
+    out << " " << l.get_nread() << "r";
+  if (l.get_nwrite())
+    out << " " << l.get_nwrite() << "w";
+
+  // rw? (per-mode can_read/can_write detail is not printed)
+  out << ")";
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc
new file mode 100644
index 0000000000000..0b4418fe2262d
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Locker.cc
@@ -0,0 +1,2286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "Server.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+
+#include "include/filepath.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EUnlink.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+#include "messages/MDirUpdate.h"
+
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MInodeLink.h"
+#include "messages/MInodeLinkAck.h"
+#include "messages/MInodeUnlink.h"
+#include "messages/MInodeUnlinkAck.h"
+
+#include "messages/MLock.h"
+#include "messages/MDentryUnlink.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MClientFileCaps.h"
+
+#include <errno.h>
+#include <assert.h>
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".locker "
+
+
+
+void Locker::dispatch(Message *m)
+{
+  // Hand an incoming message to the matching handler, keyed on its type.
+  // Any other type trips the assert at the bottom.
+  const int type = m->get_type();
+
+  if (type == MSG_MDS_LOCK) {
+    // locking
+    handle_lock((MLock*)m);
+    return;
+  }
+
+  if (type == MSG_MDS_INODEFILECAPS) {
+    // cache fun
+    handle_inode_file_caps((MInodeFileCaps*)m);
+    return;
+  }
+
+  if (type == MSG_CLIENT_FILECAPS) {
+    handle_client_file_caps((MClientFileCaps*)m);
+    return;
+  }
+  assert(0);
+}
+
+
+
+
+// file i/o -----------------------------------------
+
+__uint64_t Locker::issue_file_data_version(CInode *in)
+{ // Log the call and return the inode's current file_data_version.
+  dout(7) << "issue_file_data_version on " << *in << endl;
+  return in->inode.file_data_version;   // read only; the counter is not changed here
+}
+
+
+Capability* Locker::issue_new_caps(CInode *in,
+                                    int mode,
+                                    MClientRequest *req)
+{
+  // Make sure req's client holds a Capability on 'in' that wants the caps
+  // implied by 'mode' (FILE_MODE_R / FILE_MODE_W), then issue and return it.
+  dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl;
+  // my needs
+  int my_client = req->get_client();
+  int my_want = 0;
+  if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE  | CAP_FILE_RD;
+  if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR;
+
+  // register a capability
+  Capability *cap = in->get_client_cap(my_client);
+  if (!cap) {
+    // new cap
+    Capability c(my_want);
+    in->add_client_cap(my_client, c);
+    cap = in->get_client_cap(my_client);
+    
+    // note client addr
+    mds->clientmap.add_open(my_client, req->get_client_inst());
+  } else {
+    // make sure it has sufficient caps: widen 'wanted' when this open needs
+    // bits the existing cap does not already want.  (the test used to be
+    // wanted & ~my_want, which is inverted and skipped exactly that case.)
+    if (my_want & ~cap->wanted()) 
+      cap->set_wanted( cap->wanted() | my_want );
+  }
+
+  // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply)
+  cap->set_suppress(true);
+  int before = cap->pending();
+
+  if (in->is_auth()) {
+    // [auth] twiddle mode?
+    inode_file_eval(in);
+  } else {
+    // [replica] tell auth about any new caps wanted
+    request_inode_file_caps(in);
+  }
+    
+  // issue caps (pot. incl new one)
+  issue_caps(in);  // note: _eval above may have done this already...
+
+  // re-issue whatever we can
+  cap->issue(cap->pending());
+  
+  // ok, stop suppressing.
+  cap->set_suppress(false);
+
+  int now = cap->pending();
+  if (before != now &&
+      (before & CAP_FILE_WR) == 0 &&
+      (now & CAP_FILE_WR)) {
+    // FIXME FIXME FIXME
+  }
+  
+  // twiddle file_data_version?  (only when WRBUFFER became pending during this call)
+  if ((before & CAP_FILE_WRBUFFER) == 0 &&
+      (now & CAP_FILE_WRBUFFER)) {
+    in->inode.file_data_version++;
+    dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl;
+  }
+
+  return cap;
+}
+
+
+
+bool Locker::issue_caps(CInode *in)
+{
+  // Bring every client's issued caps in line with what the current filelock
+  // state allows.  Returns true when nothing had to be (re)issued.
+  const int allowed = in->filelock.caps_allowed(in->is_auth());
+  dout(7) << "issue_caps filelock allows=" << cap_string(allowed) 
+          << " on " << *in << endl;
+
+  int nissued = 0;   // number of caps whose issued set had to change
+
+  for (map<int, Capability>::iterator p = in->client_caps.begin();
+       p != in->client_caps.end();
+       ++p) {
+    const int client = p->first;
+    Capability& cap = p->second;
+    // already holding exactly wanted-and-allowed?  nothing to do.
+    if (cap.issued() == (cap.wanted() & allowed))
+      continue;
+    nissued++;
+
+    int before = cap.pending();
+    long seq = cap.issue(cap.wanted() & allowed);
+    int after = cap.pending();
+
+    // twiddle file_data_version when WRBUFFER is newly pending
+    if ((after & CAP_FILE_WRBUFFER) && !(before & CAP_FILE_WRBUFFER)) {
+      dout(7) << "   incrementing file_data_version for " << *in << endl;
+      in->inode.file_data_version++;
+    }
+
+    // tell the client, unless there is no new seq or messages are suppressed
+    if (seq <= 0 || cap.is_suppress())
+      continue;
+    dout(7) << "   sending MClientFileCaps to client" << client << " seq " << cap.get_last_seq() << " new pending " << cap_string(cap.pending()) << " was " << cap_string(before) << endl;
+    mds->messenger->send_message(new MClientFileCaps(in->inode,
+                                                     cap.get_last_seq(),
+                                                     cap.pending(),
+                                                     cap.wanted()),
+                                 MSG_ADDR_CLIENT(client), mds->clientmap.get_inst(client),
+                                 0, MDS_PORT_LOCKER);
+  }
+
+  return (nissued == 0);  // true if no re-issued, no callbacks
+}
+
+
+
+void Locker::request_inode_file_caps(CInode *in)
+{ // Replica side: send the auth mds the caps wanted on 'in' when they differ from replica_caps_wanted.
+  int wanted = in->get_caps_wanted();
+  if (wanted != in->replica_caps_wanted) {
+    // dropping to "nothing wanted" is delayed via replica_caps_wanted_keep_until
+    if (wanted == 0) {
+      if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) {
+        // ok, release them finally!  NOTE(review): taken while keep_until is still in the future -- confirm the comparison direction.
+        in->replica_caps_wanted_keep_until.sec_ref() = 0;
+        dout(7) << "request_inode_file_caps " << cap_string(wanted)
+                 << " was " << cap_string(in->replica_caps_wanted) 
+                 << " no keeping anymore " 
+                 << " on " << *in 
+                 << endl;
+      }
+      else if (in->replica_caps_wanted_keep_until.sec() == 0) {
+        in->replica_caps_wanted_keep_until = g_clock.recent_now();   // no timer yet: start one
+        in->replica_caps_wanted_keep_until.sec_ref() += 2;           // 2 seconds from now
+        
+        dout(7) << "request_inode_file_caps " << cap_string(wanted)
+                 << " was " << cap_string(in->replica_caps_wanted) 
+                 << " keeping until " << in->replica_caps_wanted_keep_until
+                 << " on " << *in 
+                 << endl;
+        return;
+      } else {
+        // wait longer
+        return;
+      }
+    } else {
+      in->replica_caps_wanted_keep_until.sec_ref() = 0;   // something is wanted: clear any pending timer
+    }
+    assert(!in->is_auth());
+
+    int auth = in->authority();
+    dout(7) << "request_inode_file_caps " << cap_string(wanted)
+            << " was " << cap_string(in->replica_caps_wanted) 
+            << " on " << *in << " to mds" << auth << endl;
+    assert(!in->is_auth());
+
+    in->replica_caps_wanted = wanted;   // remember what we told the auth
+    mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(),
+					     in->replica_caps_wanted),
+			  auth, MDS_PORT_LOCKER);
+  } else {
+    in->replica_caps_wanted_keep_until.sec_ref() = 0;   // unchanged: clear any pending timer
+  }
+}
+
+void Locker::handle_inode_file_caps(MInodeFileCaps *m)
+{
+  // Another mds reports which file caps it wants on an inode.
+  CInode *in = mdcache->get_inode(m->get_ino());
+  assert(in);
+  assert(in->is_auth() || in->is_proxy());
+  dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl;
+
+  if (in->is_proxy()) {
+    // we only proxy this inode: pass m on to the authority (m is not deleted here)
+    dout(7) << "proxy, fw" << endl;
+    mds->send_message_mds(m, in->authority(), MDS_PORT_LOCKER);
+    return;
+  }
+  // record what that mds wants, or forget it when it wants nothing
+  if (!m->get_caps())
+    in->mds_caps_wanted.erase(m->get_from());
+  else
+    in->mds_caps_wanted[m->get_from()] = m->get_caps();
+  inode_file_eval(in);
+  delete m;
+}
+
+
+/*
+ * note: we only get these from the client if
+ * - we are calling back previously issued caps (fewer than the client previously had)
+ * - or if the client releases (any of) its caps on its own
+ */
+void Locker::handle_client_file_caps(MClientFileCaps *m)
+{ // Process a client's caps message: update wanted, confirm receipt, merge times/size, re-evaluate.
+  int client = MSG_ADDR_NUM(m->get_source());
+  CInode *in = mdcache->get_inode(m->get_ino());
+  Capability *cap = 0;
+  if (in) 
+    cap = in->get_client_cap(client);
+
+  if (!in || !cap) {   // unknown inode, or this client holds no cap on it: log and drop
+    if (!in) {
+      dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl;
+    } else {
+      dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl;
+    }
+    delete m;
+    return;
+  } 
+  
+  assert(cap);
+
+  // filter wanted based on what we could ever give out (given auth/replica status)
+  int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth());
+  
+  dout(7) << "handle_client_file_caps seq " << m->get_seq() 
+          << " confirms caps " << cap_string(m->get_caps()) 
+          << " wants " << cap_string(wanted)
+          << " from client" << client
+          << " on " << *in 
+          << endl;  
+  
+  // update wanted
+  if (cap->wanted() != wanted)
+    cap->set_wanted(wanted);
+
+  // confirm caps ('had' and 'has' are captured before the cap may be removed below)
+  int had = cap->confirm_receipt(m->get_seq(), m->get_caps());
+  int has = cap->confirmed();
+  if (cap->is_null()) {
+    dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl;
+    in->remove_client_cap(client);   // NOTE(review): 'cap' presumably dangles from here on; it is not used again below
+    if (!in->is_auth())
+      request_inode_file_caps(in);
+
+    // dec client addr counter
+    mds->clientmap.dec_open(client);
+
+    // tell client.
+    MClientFileCaps *r = new MClientFileCaps(in->inode, 
+                                             0, 0, 0,
+                                             MClientFileCaps::FILECAP_RELEASE);
+    mds->messenger->send_message(r, m->get_source(), m->get_source_inst(), 0, MDS_PORT_LOCKER);
+  }
+
+  // merge in atime?  (taken whenever the client's is newer, regardless of caps)
+  if (m->get_inode().atime > in->inode.atime) {
+      dout(7) << "  taking atime " << m->get_inode().atime << " > " 
+              << in->inode.atime << " for " << *in << endl;
+    in->inode.atime = m->get_inode().atime;
+  }
+  
+  if ((has|had) & CAP_FILE_WR) {   // mtime/size are only taken if the client has or had CAP_FILE_WR
+    bool dirty = false;
+
+    // mtime
+    if (m->get_inode().mtime > in->inode.mtime) {
+      dout(7) << "  taking mtime " << m->get_inode().mtime << " > " 
+              << in->inode.mtime << " for " << *in << endl;
+      in->inode.mtime = m->get_inode().mtime;
+      dirty = true;
+    }
+    // size (only ever grows here)
+    if (m->get_inode().size > in->inode.size) {
+      dout(7) << "  taking size " << m->get_inode().size << " > " 
+              << in->inode.size << " for " << *in << endl;
+      in->inode.size = m->get_inode().size;
+      dirty = true;
+    }
+
+    if (dirty) 
+      mds->mdlog->submit_entry(new EInodeUpdate(in));   // log the inode change
+  }  
+
+  // reevaluate, waiters
+  inode_file_eval(in);
+  in->finish_waiting(CINODE_WAIT_CAPS, 0);
+
+  delete m;
+}
+
+
+
+
+
+
+
+
+
+
+// locks ----------------------------------------------------------------
+
+/*
+
+
+INODES:
+
+= two types of inode metadata:
+   hard  - uid/gid, mode
+   file  - mtime, size
+ ? atime - atime  (*)       <-- we want a lazy update strategy?
+
+= correspondingly, two types of inode locks:
+   hardlock - hard metadata
+   filelock - file metadata
+
+   -> These locks are completely orthogonal! 
+
+= metadata ops and how they affect inode metadata:
+        sma=size mtime atime
+   HARD FILE OP
+  files:
+    R   RRR stat
+    RW      chmod/chown
+    R    W  touch   ?ctime
+    R       openr
+          W read    atime
+    R       openw
+    Wc      openwc  ?ctime
+        WW  write   size mtime
+            close 
+
+  dirs:
+    R     W readdir atime 
+        RRR  ( + implied stats on files)
+    Rc  WW  mkdir         (ctime on new dir, size+mtime on parent dir)
+    R   WW  link/unlink/rename/rmdir  (size+mtime on dir)
+
+  
+
+= relationship to client (writers):
+
+  - ops in question are
+    - stat ... need reasonable value for mtime (+ atime?)
+      - maybe we want a "quicksync" type operation instead of full lock
+    - truncate ... need to stop writers for the atomic truncate operation
+      - need a full lock
+
+
+
+
+= modes
+  - SYNC
+              Rauth  Rreplica  Wauth  Wreplica
+        sync
+        
+
+
+
+
+ALSO:
+
+  dirlock  - no dir changes (prior to unhashing)
+  denlock  - dentry lock    (prior to unlink, rename)
+
+     
+*/
+
+
+void Locker::handle_lock(MLock *m)
+{
+  // Route an MLock to the handler for the object type it names.
+  switch (m->get_otype()) {
+  case LOCK_OTYPE_IHARD:   // inode, hard lock
+    handle_lock_inode_hard(m);
+    return;
+
+  case LOCK_OTYPE_IFILE:   // inode, file lock
+    handle_lock_inode_file(m);
+    return;
+
+  case LOCK_OTYPE_DIR:
+    handle_lock_dir(m);
+    return;
+
+  case LOCK_OTYPE_DN:
+    handle_lock_dn(m);
+    return;
+  }
+
+  // unrecognized object type
+  dout(7) << "handle_lock got otype " << m->get_otype() << endl;
+  assert(0);
+}
+ 
+
+
+// ===============================
+// hard inode metadata
+
+bool Locker::inode_hard_read_try(CInode *in, Context *con)
+{
+  dout(7) << "inode_hard_read_try on " << *in << endl;  
+
+  // Report whether the hard lock is readable; no read ref is taken here.
+  const bool readable = in->hardlock.can_read(in->is_auth());
+  if (!readable) {
+    assert(!in->is_auth());   // asserted: only a replica gets here
+
+    // queue con as a CINODE_WAIT_HARDR waiter
+    dout(7) << "inode_hard_read_try waiting on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_HARDR, con);
+  }
+  return readable;
+}
+
+bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m)
+{
+  // Take a read ref on the hard lock if it is readable; otherwise queue a
+  // retry of request m and return false.
+  dout(7) << "inode_hard_read_start  on " << *in << endl;  
+
+  if (!in->hardlock.can_read(in->is_auth())) {
+    // can't read; asserted to happen only on a replica.
+    assert(!in->is_auth());
+
+    dout(7) << "inode_hard_read_start waiting on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryRequest(mds, m, in));
+    return false;
+  }
+
+  in->hardlock.get_read();   // grab ref
+  return true;
+}
+
+
+void Locker::inode_hard_read_finish(CInode *in)
+{
+  // drop ref (the counterpart of the get_read() in inode_hard_read_start)
+  assert(in->hardlock.can_read(in->is_auth()));
+  in->hardlock.put_read();
+
+  dout(7) << "inode_hard_read_finish on " << *in << endl;
+  
+  //if (in->hardlock.get_nread() == 0) in->finish_waiting(CINODE_WAIT_HARDNORD);
+}
+
+
+bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m)
+{
+  // Try to take a write ref (plus an auth pin) on the hard lock for request m.
+  // Returns true on success.  On false, m has either been queued as a waiter
+  // for retry or forwarded to the auth mds.
+  dout(7) << "inode_hard_write_start  on " << *in << endl;
+
+  // if not replicated, i can twiddle lock at will
+  const bool alone = in->is_auth() && !in->is_cached_by_anyone();
+  if (alone && in->hardlock.get_state() != LOCK_LOCK) 
+    in->hardlock.set_state(LOCK_LOCK);
+  
+  // can't write?
+  if (!in->hardlock.can_write(in->is_auth())) {
+    if (!in->is_auth()) {
+      // replica
+      // fw to auth
+      int auth = in->authority();
+      dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl;
+      assert(auth != mds->get_nodeid());
+      mdcache->request_forward(m, auth);
+      return false;
+    }
+
+    // auth, replicated: initiate the lock unless we will be able to write
+    // soon anyway, then wait.
+    if (!in->hardlock.can_write_soon(in->is_auth()))
+      inode_hard_lock(in);
+
+    dout(7) << "inode_hard_write_start waiting on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryRequest(mds, m, in));
+    return false;
+  }
+
+  // can write.  need an auth pin first.
+  assert(in->is_auth());
+  if (!in->can_auth_pin()) {
+    dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in));
+    return false;
+  }
+
+  // grab pin and ref.
+  in->auth_pin();  // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_*
+  in->hardlock.get_write();
+
+  return true;
+}
+
+
+void Locker::inode_hard_write_finish(CInode *in)
+{
+  // Release the write ref and the auth pin held for a hard-lock write.
+  assert(in->hardlock.can_write(in->is_auth()));
+  in->hardlock.put_write();
+  in->auth_unpin();
+  dout(7) << "inode_hard_write_finish on " << *in << endl;
+  
+  // other write refs remain?  then leave the lock as it is.
+  if (in->hardlock.get_nwrite() != 0)
+    return;
+
+  // last write ref gone.  auto-sync if alone (auth, cached by nobody) ...
+  const bool alone = in->is_auth() && !in->is_cached_by_anyone();
+  if (alone && in->hardlock.get_state() != LOCK_SYNC) 
+    in->hardlock.set_state(LOCK_SYNC);
+
+  // ... and let eval handle the rest.
+  inode_hard_eval(in);
+}
+
+
+void Locker::inode_hard_eval(CInode *in)
+{
+  // Re-examine the hard lock: complete a finished gather, then (on the auth)
+  // sync a stable lock that has no write refs and is cached elsewhere.
+  CLock& lock = in->hardlock;
+
+  // finished gather?
+  if (in->is_auth() && !lock.is_stable() && lock.gather_set.empty()) {
+    dout(7) << "inode_hard_eval finished gather on " << *in << endl;
+
+    // GLOCKR is the only unstable state handled; anything else asserts
+    if (lock.get_state() != LOCK_GLOCKR) {
+      assert(0);
+    } else {
+      lock.set_state(LOCK_LOCK);
+
+      // waiters: a write ref is held across the wakeups
+      lock.get_write();
+      in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE);
+      lock.put_write();
+    }
+  }
+
+  // nothing further while unstable, or on a replica
+  if (!lock.is_stable() || !in->is_auth())
+    return;
+
+  // sync?
+  const bool replicated = in->is_cached_by_anyone();
+  if (replicated &&
+      lock.get_nwrite() == 0 &&
+      lock.get_state() != LOCK_SYNC) {
+    dout(7) << "inode_hard_eval stable, syncing " << *in << endl;
+    inode_hard_sync(in);
+  }
+
+}
+
+
+// mid
+
+void Locker::inode_hard_sync(CInode *in)
+{
+  // Auth-side LOCK -> SYNC: send the encoded hard state to every mds
+  // caching the inode, then flip our own lock.
+  dout(7) << "inode_hard_sync on " << *in << endl;
+  assert(in->is_auth());
+  
+  // check state
+  const char cur = in->hardlock.get_state();
+  if (cur == LOCK_SYNC)
+    return; // already sync
+  if (cur == LOCK_GLOCKR) 
+    assert(0); // um... hmm!
+  assert(cur == LOCK_LOCK);
+  
+  // hard data, encoded once and attached to each message
+  bufferlist harddata;
+  in->encode_hard_state(harddata);
+  // bcast to replicas
+  set<int>::iterator rep = in->cached_by_begin();
+  while (rep != in->cached_by_end()) {
+    MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+    m->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+    m->set_data(harddata);
+    mds->send_message_mds(m, *rep, MDS_PORT_LOCKER);
+    ++rep;
+  }
+  
+  // change lock, then finish CINODE_WAIT_HARDSTABLE waiters
+  in->hardlock.set_state(LOCK_SYNC);
+  in->finish_waiting(CINODE_WAIT_HARDSTABLE);
+}
+
+void Locker::inode_hard_lock(CInode *in)
+{
+  // Auth-side SYNC -> GLOCKR: send LOCK_AC_LOCK to every mds caching the inode.
+  dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl;  
+  assert(in->is_auth());
+  switch (in->hardlock.get_state()) {
+  case LOCK_LOCK:
+  case LOCK_GLOCKR:
+    return;  // already lock or locking
+  default:
+    assert(in->hardlock.get_state() == LOCK_SYNC);
+  }
+  // bcast to replicas
+  set<int>::iterator rep = in->cached_by_begin(); 
+  for (; rep != in->cached_by_end(); ++rep) {
+    MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+    m->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+    mds->send_message_mds(m, *rep, MDS_PORT_LOCKER);
+  }
+  
+  // change lock; the gather set starts as the full cached_by set
+  in->hardlock.set_state(LOCK_GLOCKR);
+  in->hardlock.init_gather(in->get_cached_by());
+}
+
+
+
+
+
+// messenger
+
+void Locker::handle_lock_inode_hard(MLock *m)
+{ // Handle an MLock aimed at an inode's hard lock: SYNC/LOCK on a replica, LOCKACK on the auth.
+  assert(m->get_otype() == LOCK_OTYPE_IHARD);
+  
+  mds->logger->inc("lih");
+
+  int from = m->get_asker();
+  CInode *in = mdcache->get_inode(m->get_ino());
+  
+  if (LOCK_AC_FOR_AUTH(m->get_action())) {
+    // auth
+    assert(in);
+    assert(in->is_auth() || in->is_proxy());
+    dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl;  
+
+    if (in->is_proxy()) {
+      // fw
+      int newauth = in->authority();
+      assert(newauth >= 0);
+      if (from == newauth) {
+        dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
+        delete m;
+      } else {
+        dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl;
+        mds->send_message_mds(m, newauth, MDS_PORT_LOCKER);   // m is passed on, not deleted
+      }
+      return;
+    }
+  } else {
+    // replica
+    if (!in) {
+      dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl;
+      /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness
+         to keep gather_set a proper/correct subset of cached_by.  better to use the existing
+         cacheexpire mechanism instead!
+      */
+      delete m;
+      return;
+    }
+    
+    assert(!in->is_auth());
+  }
+
+  dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl;  
+ 
+  CLock *lock = &in->hardlock;
+  
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_SYNC:   // LOCK -> SYNC, taking the hard state carried in the message
+    assert(lock->get_state() == LOCK_LOCK);
+    
+    { // assim data
+      int off = 0;
+      in->decode_hard_state(m->get_data(), off);
+    }
+    
+    // update lock
+    lock->set_state(LOCK_SYNC);
+    
+    // no need to reply
+    
+    // waiters
+    in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE);
+    break;
+    
+  case LOCK_AC_LOCK:   // SYNC -> LOCK, acknowledged with LOCK_AC_LOCKACK
+    assert(lock->get_state() == LOCK_SYNC);
+    //||           lock->get_state() == LOCK_GLOCKR);
+    
+    // wait for readers to finish?
+    if (lock->get_nread() > 0) {
+      dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl;
+      lock->set_state(LOCK_GLOCKR);
+      in->add_waiter(CINODE_WAIT_HARDNORD,
+                     new C_MDS_RetryMessage(mds,m));
+      assert(0);  // does this ever happen?  (if so, fix hard_read_finish, and CInodeExport.update_inode!)
+      return;   // m now belongs to the retry context, so it is not deleted here
+     } else {
+
+      // update lock and reply
+      lock->set_state(LOCK_LOCK);
+      
+      {
+        MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+        reply->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+        mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+      }
+    }
+    break;
+    
+    
+    // -- auth --
+  case LOCK_AC_LOCKACK:   // one mds acked our LOCK; drop it from the gather set
+    assert(lock->state == LOCK_GLOCKR);
+    assert(lock->gather_set.count(from));
+    lock->gather_set.erase(from);
+
+    if (lock->gather_set.size()) {
+      dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+    } else {
+      dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl;
+      inode_hard_eval(in);   // gather complete: eval finishes the GLOCKR -> LOCK transition
+    }
+  }  
+  delete m;
+}
+
+
+
+
+// =====================
+// soft inode metadata
+
+
+/*
+ * try to take a read ref on in->filelock for client request m.
+ *
+ * returns true if a read ref was taken.
+ * returns false otherwise; in that case m has either been queued on the
+ * inode via a C_MDS_RetryRequest waiter or handed to request_forward().
+ */
+bool Locker::inode_file_read_start(CInode *in, MClientRequest *m)
+{
+  dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl;
+
+  // readable right now?  take the ref and go.
+  if (in->filelock.can_read(in->is_auth())) {
+    in->filelock.get_read();
+    return true;
+  }
+
+  if (in->filelock.can_read_soon(in->is_auth())) {
+    // nothing to initiate; fall through and wait for FILER below.
+    dout(7) << "inode_file_read_start can_read_soon " << *in << endl;
+  }
+  else if (!in->filelock.is_stable()) {
+    // lock is mid-transition (same handling on auth and replica):
+    // come back once it is stable.
+    dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl;
+    in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+    return false;
+  }
+  else if (!in->is_auth()) {
+    // stable, unreadable replica: fw to auth
+    int auth = in->authority();
+    dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl;
+    assert(auth != mds->get_nodeid());
+    mdcache->request_forward(m, auth);
+    return false;
+  }
+  else {
+    // stable, unreadable auth.
+    // FIXME or qsync?
+    inode_file_lock(in);     // lock, bc easiest to back off
+
+    if (in->filelock.can_read(in->is_auth())) {
+      in->filelock.get_read();
+
+      // hold a temporary write ref while the waiters run
+      in->filelock.get_write();
+      in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+      in->filelock.put_write();
+      return true;
+    }
+    // lock did not complete immediately; wait below.
+  }
+
+  // wait for readability
+  dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl;
+  in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in));
+  return false;
+}
+
+
+/*
+ * drop a read ref on in->filelock.  when the last reader goes away,
+ * wake FILENORD waiters and re-evaluate the lock state.
+ */
+void Locker::inode_file_read_finish(CInode *in)
+{
+  // we must still be in a readable state when the ref is released
+  assert(in->filelock.can_read(in->is_auth()));
+  in->filelock.put_read();
+
+  dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl;
+
+  // other readers remain?  then nothing more to do.
+  if (in->filelock.get_nread() != 0) 
+    return;
+
+  in->finish_waiting(CINODE_WAIT_FILENORD);
+  inode_file_eval(in);
+}
+
+
+/*
+ * try to take a write ref on in->filelock for client request m.
+ *
+ * returns true if a write ref was taken.
+ * returns false otherwise; in that case m has either been queued on the
+ * inode via a C_MDS_RetryRequest waiter or handed to request_forward().
+ */
+bool Locker::inode_file_write_start(CInode *in, MClientRequest *m)
+{
+  // writeable right now?  take the ref and go.
+  if (in->filelock.can_write(in->is_auth())) {
+    in->filelock.get_write();
+    return true;
+  }
+
+  // replica that can't write: fw to auth
+  if (!in->is_auth()) {
+    int auth = in->authority();
+    dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl;
+    assert(auth != mds->get_nodeid());
+    mdcache->request_forward(m, auth);
+    return false;
+  }
+
+  // auth.  if we can't write soon, we have to start a lock ourselves.
+  if (!in->filelock.can_write_soon(in->is_auth())) {
+    if (!in->filelock.is_stable()) {
+      dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl;
+      in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+      return false;
+    }
+
+    // initiate lock
+    inode_file_lock(in);
+
+    if (in->filelock.can_write(in->is_auth())) {
+      in->filelock.get_write();
+
+      // hold a temporary read ref while the waiters run
+      in->filelock.get_read();
+      in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+      in->filelock.put_read();
+      return true;
+    }
+  }
+
+  // just wait
+  dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl;
+  in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in));
+  return false;
+}
+
+
+/*
+ * drop a write ref on in->filelock.  when the last writer goes away,
+ * wake FILENOWR waiters and re-evaluate the lock state.
+ */
+void Locker::inode_file_write_finish(CInode *in)
+{
+  // we must still be in a writeable state when the ref is released
+  assert(in->filelock.can_write(in->is_auth()));
+  in->filelock.put_write();
+
+  dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl;
+
+  // other writers remain?  then nothing more to do.
+  if (in->filelock.get_nwrite() != 0) 
+    return;
+
+  // drop lock?
+  in->finish_waiting(CINODE_WAIT_FILENOWR);
+  inode_file_eval(in);
+}
+
+
+/*
+ * ...
+ *
+ * also called after client caps are acked to us
+ * - checks if we're in unstable soft state and can now move on to next state
+ * - checks if soft state should change (eg bc last writer closed)
+ */
+
+void Locker::inode_file_eval(CInode *in)
+{
+  // Three phases, in order:
+  //  1. [auth]    unstable filelock with an empty gather_set: try to finish the transition.
+  //  2. [replica] unstable filelock: try to finish the transition and ack to the auth.
+  //  3. [auth]    stable filelock: pick a new target state from wanted caps / replication.
+
+  // value of in->get_caps_issued() at entry; tested against CAP_FILE_* bits below.
+  int issued = in->get_caps_issued();
+
+  // [auth] finished gather?
+  if (in->is_auth() &&
+      !in->filelock.is_stable() &&
+      in->filelock.gather_set.size() == 0) {
+    dout(7) << "inode_file_eval finished mds gather on " << *in << endl;
+
+    switch (in->filelock.get_state()) {
+      // to lock
+    case LOCK_GLOCKR:
+    case LOCK_GLOCKM:
+    case LOCK_GLOCKL:
+      // only complete once no caps remain issued at all
+      if (issued == 0) {
+        in->filelock.set_state(LOCK_LOCK);
+        
+        // waiters
+        // (temporary read+write refs are held while the waiters run)
+        in->filelock.get_read();
+        in->filelock.get_write();
+        in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+        in->filelock.put_read();
+        in->filelock.put_write();
+      }
+      break;
+      
+      // to mixed
+    case LOCK_GMIXEDR:
+      // only complete once nothing but CAP_FILE_RD remains issued
+      if ((issued & ~(CAP_FILE_RD)) == 0) {
+        in->filelock.set_state(LOCK_MIXED);
+        in->finish_waiting(CINODE_WAIT_FILESTABLE);
+      }
+      break;
+
+    case LOCK_GMIXEDL:
+      // only complete once nothing but CAP_FILE_WR remains issued
+      if ((issued & ~(CAP_FILE_WR)) == 0) {
+        in->filelock.set_state(LOCK_MIXED);
+
+        if (in->is_cached_by_anyone()) {
+          // data
+          bufferlist softdata;
+          in->encode_file_state(softdata);
+          
+          // bcast to replicas
+          for (set<int>::iterator it = in->cached_by_begin(); 
+               it != in->cached_by_end(); 
+               it++) {
+            MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+            m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+            m->set_data(softdata);
+            mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+          }
+        }
+
+        in->finish_waiting(CINODE_WAIT_FILESTABLE);
+      }
+      break;
+
+      // to loner
+    case LOCK_GLONERR:
+      // only complete once no caps remain issued at all
+      if (issued == 0) {
+        in->filelock.set_state(LOCK_LONER);
+        in->finish_waiting(CINODE_WAIT_FILESTABLE);
+      }
+      break;
+
+    case LOCK_GLONERM:
+      // only complete once nothing but CAP_FILE_WR remains issued
+      if ((issued & ~CAP_FILE_WR) == 0) {
+        in->filelock.set_state(LOCK_LONER);
+        in->finish_waiting(CINODE_WAIT_FILESTABLE);
+      }
+      break;
+      
+      // to sync
+    case LOCK_GSYNCL:
+    case LOCK_GSYNCM:
+      // only complete once nothing but CAP_FILE_RD remains issued
+      if ((issued & ~(CAP_FILE_RD)) == 0) {
+        in->filelock.set_state(LOCK_SYNC);
+        
+        { // bcast data to replicas
+          bufferlist softdata;
+          in->encode_file_state(softdata);
+          
+          for (set<int>::iterator it = in->cached_by_begin(); 
+               it != in->cached_by_end(); 
+               it++) {
+            MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+            reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+            reply->set_data(softdata);
+            mds->send_message_mds(reply, *it, MDS_PORT_LOCKER);
+          }
+        }
+        
+        // waiters
+        // (a temporary read ref is held while the waiters run)
+        in->filelock.get_read();
+        in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE);
+        in->filelock.put_read();
+      }
+      break;
+      
+    default: 
+      assert(0);
+    }
+
+    // runs whether or not the transition above completed
+    issue_caps(in);
+  }
+  
+  // [replica] finished caps gather?
+  if (!in->is_auth() &&
+      !in->filelock.is_stable()) {
+    switch (in->filelock.get_state()) {
+    case LOCK_GMIXEDR:
+      // only complete once nothing but CAP_FILE_RD remains issued
+      if ((issued & ~(CAP_FILE_RD)) == 0) {
+        in->filelock.set_state(LOCK_MIXED);
+        
+        // ack
+        MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
+        reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER);
+      }
+      break;
+
+    case LOCK_GLOCKR:
+      // only complete once no caps remain issued at all
+      if (issued == 0) {
+        in->filelock.set_state(LOCK_LOCK);
+        
+        // ack
+        MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+        reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER);
+      }
+      break;
+
+    default:
+      // any other unstable state on a replica is treated as a bug
+      assert(0);
+    }
+  }
+
+  // !stable -> do nothing.
+  if (!in->filelock.is_stable()) return; 
+
+
+  // stable.
+  assert(in->filelock.is_stable());
+
+  if (in->is_auth()) {
+    // [auth]
+    int wanted = in->get_caps_wanted();
+    // "loner": exactly one entry in client_caps and no entries in mds_caps_wanted
+    bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty();
+    dout(7) << "inode_file_eval wanted=" << cap_string(wanted)
+            << "  filelock=" << in->filelock 
+            << "  loner=" << loner
+            << endl;
+
+    // the branches below are mutually exclusive; the first match wins.
+
+    // * -> loner?
+    if (in->filelock.get_nread() == 0 &&
+        in->filelock.get_nwrite() == 0 &&
+        (wanted & CAP_FILE_WR) &&
+        loner &&
+        in->filelock.get_state() != LOCK_LONER) {
+      dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl;
+      inode_file_loner(in);
+    }
+
+    // * -> mixed?
+    else if (in->filelock.get_nread() == 0 &&
+             in->filelock.get_nwrite() == 0 &&
+             (wanted & CAP_FILE_RD) &&
+             (wanted & CAP_FILE_WR) &&
+             !(loner && in->filelock.get_state() == LOCK_LONER) &&
+             in->filelock.get_state() != LOCK_MIXED) {
+      dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl;
+      inode_file_mixed(in);
+    }
+
+    // * -> sync?
+    else if (in->filelock.get_nwrite() == 0 &&
+             !(wanted & CAP_FILE_WR) &&
+             ((wanted & CAP_FILE_RD) || 
+              in->is_cached_by_anyone() || 
+              (!loner && in->filelock.get_state() == LOCK_LONER)) &&
+             in->filelock.get_state() != LOCK_SYNC) {
+      dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl;
+      inode_file_sync(in);
+    }
+
+    // * -> lock?  (if not replicated or open)
+    else if (!in->is_cached_by_anyone() &&
+             wanted == 0 &&
+             in->filelock.get_state() != LOCK_LOCK) {
+      inode_file_lock(in);
+    }
+    
+  } else {
+    // replica
+    // recall? check waiters?  XXX
+  }
+}
+
+
+// mid
+
+/*
+ * [auth] move in->filelock toward LOCK_SYNC.
+ *
+ * returns true if the lock is already SYNC, is already on its way to SYNC
+ * (GSYNCL/GSYNCM), or went LOCK -> SYNC here.
+ * returns false on the MIXED and LONER paths, including when those paths
+ * reach SYNC immediately because no client write caps are issued.
+ *
+ * every LOCK_AC_SYNC sent to a replica carries the encoded file state:
+ * the replica-side LOCK_AC_SYNC handler decodes the message data
+ * unconditionally, so a SYNC without data would hand it an empty buffer.
+ */
+bool Locker::inode_file_sync(CInode *in)
+{
+  dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl;  
+
+  assert(in->is_auth());
+
+  // check state
+  if (in->filelock.get_state() == LOCK_SYNC ||
+      in->filelock.get_state() == LOCK_GSYNCL ||
+      in->filelock.get_state() == LOCK_GSYNCM)
+    return true;   // sync or syncing
+
+  assert(in->filelock.is_stable());
+
+  int issued = in->get_caps_issued();
+
+  // nobody may want to write if we are going to sync
+  assert((in->get_caps_wanted() & CAP_FILE_WR) == 0);
+
+  if (in->filelock.get_state() == LOCK_LOCK) {
+    if (in->is_cached_by_anyone()) {
+      // soft data
+      bufferlist softdata;
+      in->encode_file_state(softdata);
+      
+      // bcast to replicas
+      for (set<int>::iterator it = in->cached_by_begin(); 
+           it != in->cached_by_end(); 
+           it++) {
+        MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        m->set_data(softdata);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+    }
+
+    // change lock
+    in->filelock.set_state(LOCK_SYNC);
+
+    // reissue caps
+    issue_caps(in);
+    return true;
+  }
+
+  else if (in->filelock.get_state() == LOCK_MIXED) {
+    // writers?
+    if (issued & CAP_FILE_WR) {
+      // gather client write caps
+      in->filelock.set_state(LOCK_GSYNCM);
+      issue_caps(in);
+    } else {
+      // no writers, go straight to sync
+
+      if (in->is_cached_by_anyone()) {
+        // soft data (replicas decode it when they see LOCK_AC_SYNC)
+        bufferlist softdata;
+        in->encode_file_state(softdata);
+
+        // bcast to replicas
+        for (set<int>::iterator it = in->cached_by_begin(); 
+             it != in->cached_by_end(); 
+             it++) {
+          MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+          m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+          m->set_data(softdata);
+          mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+        }
+      }
+    
+      // change lock
+      in->filelock.set_state(LOCK_SYNC);
+    }
+    return false;
+  }
+
+  else if (in->filelock.get_state() == LOCK_LONER) {
+    // writers?
+    if (issued & CAP_FILE_WR) {
+      // gather client write caps
+      in->filelock.set_state(LOCK_GSYNCL);
+      issue_caps(in);
+    } else {
+      // no writers, go straight to sync
+      if (in->is_cached_by_anyone()) {
+        // soft data (replicas decode it when they see LOCK_AC_SYNC)
+        bufferlist softdata;
+        in->encode_file_state(softdata);
+
+        // bcast to replicas
+        for (set<int>::iterator it = in->cached_by_begin(); 
+             it != in->cached_by_end(); 
+             it++) {
+          MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+          m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+          m->set_data(softdata);
+          mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+        }
+      }
+
+      // change lock
+      in->filelock.set_state(LOCK_SYNC);
+    }
+    return false;
+  }
+  else 
+    assert(0); // wtf.
+
+  return false;
+}
+
+
+/*
+ * [auth] move in->filelock toward LOCK_LOCK.
+ *
+ * no-op if the lock is already LOCK or on its way there (GLOCKR/GLOCKM/GLOCKL).
+ * otherwise the lock must be stable; depending on replicas and issued caps
+ * we either go straight to LOCK or enter one of the gathering states.
+ */
+void Locker::inode_file_lock(CInode *in)
+{
+  dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl;  
+
+  assert(in->is_auth());
+
+  const int state = in->filelock.get_state();
+
+  // lock or locking?
+  switch (state) {
+  case LOCK_LOCK:
+  case LOCK_GLOCKR:
+  case LOCK_GLOCKM:
+  case LOCK_GLOCKL:
+    return;
+  }
+
+  assert(in->filelock.is_stable());
+
+  int issued = in->get_caps_issued();
+
+  switch (state) {
+  case LOCK_SYNC:
+    if (in->is_cached_by_anyone()) {
+      // tell every replica to lock, and gather their acks
+      for (set<int>::iterator p = in->cached_by_begin(); 
+           p != in->cached_by_end(); 
+           p++) {
+        MLock *lockmsg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+        lockmsg->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(lockmsg, *p, MDS_PORT_LOCKER);
+      }
+      in->filelock.init_gather(in->get_cached_by());
+
+      in->filelock.set_state(LOCK_GLOCKR);
+
+      // call back caps, if there are any out
+      if (issued) 
+        issue_caps(in);
+    } 
+    else if (issued) {
+      // no replicas, but caps to call back
+      in->filelock.set_state(LOCK_GLOCKR);
+      issue_caps(in);
+    } 
+    else {
+      // nothing to wait for
+      in->filelock.set_state(LOCK_LOCK);
+    }
+    break;
+
+  case LOCK_MIXED:
+    if (in->is_cached_by_anyone()) {
+      // tell every replica to lock, and gather their acks
+      for (set<int>::iterator p = in->cached_by_begin(); 
+           p != in->cached_by_end(); 
+           p++) {
+        MLock *lockmsg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+        lockmsg->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(lockmsg, *p, MDS_PORT_LOCKER);
+      }
+      in->filelock.init_gather(in->get_cached_by());
+
+      in->filelock.set_state(LOCK_GLOCKM);
+
+      // call back caps (unconditionally on this path)
+      issue_caps(in);
+    } 
+    else if (issued) {
+      //assert(issued);  // ??? -sage 2/19/06
+      // no replicas, but caps to call back
+      in->filelock.set_state(LOCK_GLOCKM);
+      issue_caps(in);
+    } 
+    else {
+      // nothing to wait for
+      in->filelock.set_state(LOCK_LOCK);
+    }
+    break;
+
+  case LOCK_LONER:
+    // no replica messages are sent on this path; only client write caps matter.
+    if (issued & CAP_FILE_WR) {
+      in->filelock.set_state(LOCK_GLOCKL);
+
+      // call back caps
+      issue_caps(in);
+    } else {
+      in->filelock.set_state(LOCK_LOCK);
+    }
+    break;
+
+  default:
+    assert(0); // wtf.
+  }
+}
+
+
+/*
+ * [auth] move in->filelock toward LOCK_MIXED.
+ *
+ * no-op if the lock is already MIXED or on its way there (GMIXEDR/GMIXEDL),
+ * matching the early-outs in inode_file_sync/lock/loner.  otherwise the lock
+ * must be stable; depending on replicas and issued caps we either go
+ * straight to MIXED or enter a gathering state.
+ */
+void Locker::inode_file_mixed(CInode *in)
+{
+  dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl;  
+
+  assert(in->is_auth());
+  
+  // check state
+  if (in->filelock.get_state() == LOCK_MIXED ||
+      in->filelock.get_state() == LOCK_GMIXEDR ||
+      in->filelock.get_state() == LOCK_GMIXEDL)
+    return;     // mixed or mixing
+
+  assert(in->filelock.is_stable());
+
+  int issued = in->get_caps_issued();
+
+  if (in->filelock.get_state() == LOCK_SYNC) {
+    if (in->is_cached_by_anyone()) {
+      // bcast to replicas, and gather their acks
+      for (set<int>::iterator it = in->cached_by_begin(); 
+           it != in->cached_by_end(); 
+           it++) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+      in->filelock.init_gather(in->get_cached_by());
+    
+      in->filelock.set_state(LOCK_GMIXEDR);
+      issue_caps(in);
+    } else {
+      if (issued) {
+        // no replicas, but caps are out: gather them
+        in->filelock.set_state(LOCK_GMIXEDR);
+        issue_caps(in);
+      } else {
+        // nothing to wait for
+        in->filelock.set_state(LOCK_MIXED);
+      }
+    }
+  }
+
+  else if (in->filelock.get_state() == LOCK_LOCK) {
+    if (in->is_cached_by_anyone()) {
+      // data
+      bufferlist softdata;
+      in->encode_file_state(softdata);
+      
+      // bcast to replicas (no gather is started on this path)
+      for (set<int>::iterator it = in->cached_by_begin(); 
+           it != in->cached_by_end(); 
+           it++) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        m->set_data(softdata);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+    }
+
+    // change lock
+    in->filelock.set_state(LOCK_MIXED);
+    issue_caps(in);
+  }
+
+  else if (in->filelock.get_state() == LOCK_LONER) {
+    if (issued & CAP_FILE_WRBUFFER) {
+      // gather up WRBUFFER caps
+      in->filelock.set_state(LOCK_GMIXEDL);
+      issue_caps(in);
+    }
+    else if (in->is_cached_by_anyone()) {
+      // bcast to replicas (no gather is started on this path)
+      for (set<int>::iterator it = in->cached_by_begin(); 
+           it != in->cached_by_end(); 
+           it++) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+      in->filelock.set_state(LOCK_MIXED);
+      issue_caps(in);
+    } else {
+      in->filelock.set_state(LOCK_MIXED);
+      issue_caps(in);
+    }
+  }
+
+  else 
+    assert(0); // wtf.
+}
+
+
+/*
+ * [auth] move in->filelock toward LOCK_LONER.
+ *
+ * no-op if the lock is already LONER or on its way there (GLONERR/GLONERM).
+ * otherwise the lock must be stable, and there must be exactly one entry in
+ * client_caps and none in mds_caps_wanted.
+ */
+void Locker::inode_file_loner(CInode *in)
+{
+  dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl;  
+
+  assert(in->is_auth());
+
+  const int state = in->filelock.get_state();
+
+  // loner or becoming one?
+  switch (state) {
+  case LOCK_LONER:
+  case LOCK_GLONERR:
+  case LOCK_GLONERM:
+    return;
+  }
+
+  assert(in->filelock.is_stable());
+  assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty());
+
+  switch (state) {
+  case LOCK_SYNC:
+  case LOCK_MIXED:
+    // same procedure from either state; only the gathering state differs.
+    if (in->is_cached_by_anyone()) {
+      // tell every replica to lock, and gather their acks
+      for (set<int>::iterator p = in->cached_by_begin(); 
+           p != in->cached_by_end(); 
+           p++) {
+        MLock *lockmsg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+        lockmsg->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(lockmsg, *p, MDS_PORT_LOCKER);
+      }
+      in->filelock.init_gather(in->get_cached_by());
+
+      // change lock
+      if (state == LOCK_SYNC)
+        in->filelock.set_state(LOCK_GLONERR);
+      else
+        in->filelock.set_state(LOCK_GLONERM);
+    } else {
+      // only one guy with file open, who gets it all, so
+      in->filelock.set_state(LOCK_LONER);
+      issue_caps(in);
+    }
+    break;
+
+  case LOCK_LOCK:
+    // change lock.  ignore replicas; they don't know about LONER.
+    in->filelock.set_state(LOCK_LONER);
+    issue_caps(in);
+    break;
+
+  default:
+    assert(0);
+  }
+}
+
+// messenger
+
+/*
+ * handle an MLock message of type LOCK_OTYPE_IFILE.
+ *
+ * actions for which LOCK_AC_FOR_AUTH() is true are processed on the auth
+ * (or forwarded/dropped if we are only a proxy); the remaining actions are
+ * processed on a replica.  m is deleted on every path except where it is
+ * forwarded or queued on a waiter.
+ */
+void Locker::handle_lock_inode_file(MLock *m)
+{
+  assert(m->get_otype() == LOCK_OTYPE_IFILE);
+  
+  mds->logger->inc("lif");
+
+  CInode *in = mdcache->get_inode(m->get_ino());
+  int from = m->get_asker();
+
+  if (LOCK_AC_FOR_AUTH(m->get_action())) {
+    // auth
+    assert(in);
+    assert(in->is_auth() || in->is_proxy());
+    dout(7) << "handle_lock_inode_file " << *in << " filelock=" << in->filelock << endl;  
+        
+    if (in->is_proxy()) {
+      // fw
+      int newauth = in->authority();
+      assert(newauth >= 0);
+      if (from == newauth) {
+        dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
+        delete m;
+      } else {
+        dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl;
+        mds->send_message_mds(m, newauth, MDS_PORT_LOCKER);
+      }
+      return;
+    }
+  } else {
+    // replica
+    if (!in) {
+      // drop it.  don't nak.
+      dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl;
+      delete m;
+      return;
+    }
+    
+    assert(!in->is_auth());
+  }
+
+  dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl;  
+  
+  CLock *lock = &in->filelock;
+  int issued = in->get_caps_issued();
+
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_SYNC:
+    assert(lock->get_state() == LOCK_LOCK ||
+           lock->get_state() == LOCK_MIXED);
+    
+    { // assim data
+      // decoded unconditionally: the sender must attach the file state
+      int off = 0;
+      in->decode_file_state(m->get_data(), off);
+    }
+    
+    // update lock
+    lock->set_state(LOCK_SYNC);
+    
+    // no need to reply.
+    
+    // waiters
+    // (a temporary read ref is held while the waiters run)
+    in->filelock.get_read();
+    in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE);
+    in->filelock.put_read();
+    inode_file_eval(in);
+    break;
+    
+  case LOCK_AC_LOCK:
+    assert(lock->get_state() == LOCK_SYNC ||
+           lock->get_state() == LOCK_MIXED);
+    
+    // call back caps?
+    if (issued & CAP_FILE_RD) {
+      dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl;
+      issue_caps(in);
+    }
+    if (lock->get_nread() > 0) {
+      dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl;
+      in->add_waiter(CINODE_WAIT_FILENORD,
+                     new C_MDS_RetryMessage(mds,m));
+      lock->set_state(LOCK_GLOCKR);
+      assert(0);// i am broken.. why retry message when state captures all the info i need?
+      return;
+    } 
+    if (issued & CAP_FILE_RD) {
+      // no ack is sent from here while read caps are still out
+      lock->set_state(LOCK_GLOCKR);
+      break;
+    }
+
+    // nothing to wait for, lock and ack.
+    {
+      lock->set_state(LOCK_LOCK);
+
+      MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+      reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+      mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+    }
+    break;
+    
+  case LOCK_AC_MIXED:
+    assert(lock->get_state() == LOCK_SYNC ||
+           lock->get_state() == LOCK_LOCK);
+    
+    if (lock->get_state() == LOCK_SYNC) {
+      // MIXED
+      if (issued & CAP_FILE_RD) {
+        // call back client caps
+        // (no ack is sent from here on this path)
+        lock->set_state(LOCK_GMIXEDR);
+        issue_caps(in);
+        break;
+      } else {
+        // no clients, go straight to mixed
+        lock->set_state(LOCK_MIXED);
+
+        // ack
+        MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
+        reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+      }
+    } else {
+      // LOCK
+      lock->set_state(LOCK_MIXED);
+      
+      // no ack needed.
+    }
+
+    issue_caps(in);
+    
+    // waiters
+    // (a temporary write ref is held while the waiters run)
+    in->filelock.get_write();
+    in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE);
+    in->filelock.put_write();
+    inode_file_eval(in);
+    break;
+
+ 
+    
+
+    // -- auth --
+    // each ack removes the sender from gather_set; once the set is empty,
+    // inode_file_eval() decides whether the transition can complete.
+  case LOCK_AC_LOCKACK:
+    assert(lock->state == LOCK_GLOCKR ||
+           lock->state == LOCK_GLOCKM ||
+           lock->state == LOCK_GLONERM ||
+           lock->state == LOCK_GLONERR);
+    assert(lock->gather_set.count(from));
+    lock->gather_set.erase(from);
+
+    if (lock->gather_set.size()) {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+    } else {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+      inode_file_eval(in);
+    }
+    break;
+    
+  case LOCK_AC_SYNCACK:
+    assert(lock->state == LOCK_GSYNCM);
+    assert(lock->gather_set.count(from));
+    lock->gather_set.erase(from);
+    
+    /* not used currently
+    {
+      // merge data  (keep largest size, mtime, etc.)
+      int off = 0;
+      in->decode_merge_file_state(m->get_data(), off);
+    }
+    */
+
+    if (lock->gather_set.size()) {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+    } else {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+      inode_file_eval(in);
+    }
+    break;
+
+  case LOCK_AC_MIXEDACK:
+    assert(lock->state == LOCK_GMIXEDR);
+    assert(lock->gather_set.count(from));
+    lock->gather_set.erase(from);
+    
+    if (lock->gather_set.size()) {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+    } else {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+      inode_file_eval(in);
+    }
+    break;
+
+
+  default:
+    assert(0);
+  }  
+  
+  delete m;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/*
+ * handle an MLock message for a directory lock.
+ *
+ * not implemented: the message is simply dropped.  it is deleted here, as
+ * the other handle_lock_* handlers do with messages they are finished
+ * with, so that it does not leak.
+ * NOTE(review): assumes the dispatcher hands over ownership of m, as it
+ * appears to for the inode handlers -- confirm against the caller.
+ */
+void Locker::handle_lock_dir(MLock *m) 
+{
+  delete m;
+}
+
+
+
+// DENTRY
+
+/*
+ * try to xlock dentry dn on behalf of request m.
+ *
+ * returns true if dn is xlocked by m on return.
+ * returns false if m was queued on a waiter (via C_MDS_RetryRequest, or via
+ * the context handed to path_pin) and the xlock is not (yet) held.
+ */
+bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref)
+{
+  dout(7) << "dentry_xlock_start on " << *dn << endl;
+
+  // already xlocked?
+  if (dn->lockstate == DN_LOCK_XLOCK) {
+    if (dn->xlockedby == m) 
+      return true;  // locked by me!
+
+    // somebody else holds it; wait until it is readable again
+    dout(7) << "dentry " << *dn << " xlock by someone else" << endl;
+    dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name,
+                        new C_MDS_RetryRequest(mds,m,ref));
+    return false;
+  }
+
+  // xlock in progress?
+  if (dn->lockstate == DN_LOCK_PREXLOCK) {
+    if (dn->xlockedby != m) {
+      dout(7) << "dentry " << *dn << " prexlock by someone else" << endl;
+      dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name,
+                          new C_MDS_RetryRequest(mds,m,ref));
+    } else {
+      dout(7) << "dentry " << *dn << " prexlock by me" << endl;
+      dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name,
+                          new C_MDS_RetryRequest(mds,m,ref));
+    }
+    return false;
+  }
+
+  // lockable!
+  assert(dn->lockstate == DN_LOCK_SYNC ||
+         dn->lockstate == DN_LOCK_UNPINNING);
+
+  // need to be able to auth_pin the dir
+  if (!dn->dir->can_auth_pin()) {
+    dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl;
+    dn->dir->add_waiter(CDIR_WAIT_AUTHPINNABLE,
+                        new C_MDS_RetryRequest(mds,m,ref));
+    return false;
+  }
+
+  // dentry itself still path pinned?  mark it and wait for the unpin.
+  if (dn->is_pinned()) {
+    dout(7) << "dentry " << *dn << " pinned, waiting" << endl;
+    dn->lockstate = DN_LOCK_UNPINNING;
+    dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED,
+                        dn->name,
+                        new C_MDS_RetryRequest(mds,m,ref));
+    return false;
+  }
+
+  // pin path up to dentry!            (if success, point of no return)
+  CDentry *pdn = dn->dir->inode->get_parent_dn();
+  if (pdn) {
+    if (mdcache->active_requests[m].traces.count(pdn)) {
+      dout(7) << "already path pinned parent dentry " << *pdn << endl;
+    } else {
+      dout(7) << "pinning parent dentry " << *pdn << endl;
+      vector<CDentry*> trace;
+      mdcache->make_trace(trace, pdn->inode);
+      assert(trace.size());
+
+      if (!mdcache->path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref))) 
+        return false;
+
+      // remember the trace, keyed by its last dentry
+      mdcache->active_requests[m].traces[trace.back()] = trace;
+    }
+  }
+
+  // pin dir!
+  dn->dir->auth_pin();
+
+  // mine!
+  dn->xlockedby = m;
+
+  // nobody else has the dir open?  then the xlock is immediate.
+  if (!dn->dir->is_open_by_anyone()) {
+    dn->lockstate = DN_LOCK_XLOCK;
+    mdcache->active_requests[dn->xlockedby].xlocks.insert(dn);
+    return true;
+  }
+
+  // otherwise ask everyone who has the dir open, and gather their replies.
+  dn->lockstate = DN_LOCK_PREXLOCK;
+
+  // xlock with whom?
+  set<int> who = dn->dir->get_open_by();
+  dn->gather_set = who;
+
+  // make path
+  string path;
+  dn->make_path(path);
+  dout(10) << "path is " << path << " for " << *dn << endl;
+
+  for (set<int>::iterator p = who.begin();
+       p != who.end();
+       ++p) {
+    MLock *lockmsg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+    lockmsg->set_dn(dn->dir->ino(), dn->name);
+    lockmsg->set_path(path);
+    mds->send_message_mds(lockmsg, *p, MDS_PORT_LOCKER);
+  }
+
+  // wait
+  dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl;
+  dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name,
+                      new C_MDS_RetryRequest(mds, m, ref));
+  return false;
+}
+
+/*
+ * release the xlock on dn and return it to DN_LOCK_SYNC.
+ *
+ * unless quiet is set, a LOCK_AC_SYNC is sent to every mds that has the
+ * containing dir open.  the dir's auth pin is dropped at the end.
+ */
+void Locker::dentry_xlock_finish(CDentry *dn, bool quiet)
+{
+  dout(7) << "dentry_xlock_finish on " << *dn << endl;
+
+  assert(dn->xlockedby);
+  if (dn->xlockedby != DN_XLOCK_FOREIGN) {
+    // remove from request record
+    assert(mdcache->active_requests[dn->xlockedby].xlocks.count(dn) == 1);
+    mdcache->active_requests[dn->xlockedby].xlocks.erase(dn);
+  } else {
+    dout(7) << "this was a foreign xlock" << endl;
+  }
+
+  // back to sync, owned by nobody
+  dn->xlockedby = 0;
+  dn->lockstate = DN_LOCK_SYNC;
+
+  // unpin parent dir?
+  // -> no?  because we might have xlocked 2 things in this dir.
+  //         instead, we let request_finish clean up the mess.
+
+  // tell replicas?  (tell even if dn is null.)
+  if (!quiet && dn->dir->is_open_by_anyone()) {
+    for (set<int>::iterator p = dn->dir->open_by_begin();
+         p != dn->dir->open_by_end();
+         ++p) {
+      MLock *syncmsg = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+      syncmsg->set_dn(dn->dir->ino(), dn->name);
+      mds->send_message_mds(syncmsg, *p, MDS_PORT_LOCKER);
+    }
+  }
+
+  // unpin dir
+  dn->dir->auth_unpin();
+}
+
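+/*
+ * waiter context used by dentry_xlock_request().
+ *
+ * intended contract: onfinish->finish() will be called with 
+ * 0 on successful xlock,
+ * -1 on failure.
+ * NOTE: dentry_xlock_request_finish() currently always calls
+ * finish(0), whether or not the xlock was obtained.
+ */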
+
+// Context that relays its result to Locker::dentry_xlock_request_finish(),
+// together with the (dir, dname, req, finisher) captured at construction.
+class C_MDC_XlockRequest : public Context {
+  Locker *mdc;        // locker to call back into
+  CDir *dir;          // dir containing the dentry
+  string dname;       // dentry name (stored by value)
+  Message *req;       // request the xlock is for
+  Context *finisher;  // passed through to dentry_xlock_request_finish
+
+public:
+  C_MDC_XlockRequest(Locker *mdc, 
+                     CDir *dir, string& dname, 
+                     Message *req,
+                     Context *finisher) 
+    : mdc(mdc),
+      dir(dir),
+      dname(dname),
+      req(req),
+      finisher(finisher) 
+  { }
+
+  void finish(int r) {
+    mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher);
+  }
+};
+
+// Completion of a remote xlock request: r is 1 if the auth acked, -1 if it
+// nak'd.  On an ack, a dentry that is still present and not xlocked is marked
+// xlocked by req and remembered in the request's foreign_xlocks.  The
+// finisher always runs with 0 and is then deleted.
+void Locker::dentry_xlock_request_finish(int r, CDir *dir, string& dname,
+					  Message *req, Context *finisher)
+{
+  dout(10) << "dentry_xlock_request_finish r = " << r << endl;
+  // nothing to look up unless the request succeeded
+  CDentry *dn = (r == 1) ? dir->lookup(dname) : 0;
+  if (dn && dn->xlockedby == 0) {
+    // our request was the winner
+    dn->xlockedby = req;
+    dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *dn << endl;
+    // remember!
+    mdcache->active_requests[req].foreign_xlocks.insert(dn);
+  }
+
+  // retry request (or whatever)
+  finisher->finish(0);
+  delete finisher;
+}
+
+// Ask the dentry's authority to xlock 'dname' on behalf of req
+// (LOCK_AC_REQXLOCKC if 'create' is set, LOCK_AC_REQXLOCK otherwise), and
+// queue a waiter that passes onfinish to dentry_xlock_request_finish().
+void Locker::dentry_xlock_request(CDir *dir, string& dname, bool create,
+                                   Message *req, Context *onfinish)
+{
+  dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl;
+  const int auth = dir->dentry_authority(dname);
+  MLock *lockreq;
+  if (create) lockreq = new MLock(LOCK_AC_REQXLOCKC, mds->get_nodeid());
+  else        lockreq = new MLock(LOCK_AC_REQXLOCK, mds->get_nodeid());
+  lockreq->set_dn(dir->ino(), dname);
+  mds->send_message_mds(lockreq, auth, MDS_PORT_LOCKER);
+  Context *waiter = new C_MDC_XlockRequest(this, dir, dname, req, onfinish);
+  dir->add_waiter(CDIR_WAIT_DNREQXLOCK, dname, waiter);
+}
+
+
+
+
+/*
+ * Handle a dentry lock message (LOCK_OTYPE_DN).
+ *
+ * The target dentry is looked up from the message's dir ino and name; when
+ * this node only proxies the dir the message is forwarded or dropped.  The
+ * action is then dispatched in the switch below.  Unless m has been handed
+ * to a request or retry context (those paths return early), it is freed
+ * before returning.
+ */
+void Locker::handle_lock_dn(MLock *m)
+{
+  assert(m->get_otype() == LOCK_OTYPE_DN);
+
+  CInode *diri = mdcache->get_inode(m->get_ino());  // may be null
+  CDir *dir = 0;
+  if (diri) dir = diri->dir;           // may be null
+  string dname = m->get_dn();
+  int from = m->get_asker();
+  CDentry *dn = 0;
+
+  if (LOCK_AC_FOR_AUTH(m->get_action())) {
+    // auth
+
+    // normally we have it always
+    if (diri && dir) {
+      int dauth = dir->dentry_authority(dname);
+      assert(dauth == mds->get_nodeid() || dir->is_proxy() ||  // mine or proxy,
+             m->get_action() == LOCK_AC_REQXLOCKACK ||         // or we did a REQXLOCK and this is our ack/nak
+             m->get_action() == LOCK_AC_REQXLOCKNAK);
+
+      if (dir->is_proxy()) {
+        // we merely proxy this dir: drop the message or pass it on to the auth
+        assert(dauth >= 0);
+
+        if (dauth == m->get_asker() &&
+            (m->get_action() == LOCK_AC_REQXLOCK ||
+             m->get_action() == LOCK_AC_REQXLOCKC)) {
+          dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. dropping on floor (their import will have woken them up)" << endl;
+          if (mdcache->active_requests.count(m))
+            mdcache->request_finish(m);
+          else
+            delete m;
+          return;
+        }
+
+        dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl;
+
+        // forward
+        if (mdcache->active_requests.count(m)) {
+          // xlock requests are requests, use request_* functions!
+          assert(m->get_action() == LOCK_AC_REQXLOCK ||
+                 m->get_action() == LOCK_AC_REQXLOCKC);
+          // forward as a request
+          mdcache->request_forward(m, dauth, MDS_PORT_LOCKER);
+        } else {
+          // not an xlock req, or it is and we just didn't register the request yet
+          // forward normally
+          mds->send_message_mds(m, dauth, MDS_PORT_LOCKER);
+        }
+        return;
+      }
+
+      dn = dir->lookup(dname);
+    }
+
+    // except with.. an xlock request?
+    if (!dn) {
+      assert(dir);  // we should still have the dir, though!  the requester has the dir open.
+      switch (m->get_action()) {
+
+      case LOCK_AC_LOCK:
+        dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl;
+        dn = dir->add_dentry(dname);
+        break;
+
+      case LOCK_AC_REQXLOCK:
+        // send nak
+        if (dir->state_test(CDIR_STATE_DELETED)) {
+          dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl;
+        } else {
+          dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl;
+        }
+        {
+          MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid());
+          reply->set_dn(dir->ino(), dname);
+          reply->set_path(m->get_path());
+          mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+        }
+        // finish request (if we got that far), else free the message ourselves.
+        // request_finish() is expected to dispose of m, as at the other call sites.
+        if (mdcache->active_requests.count(m))
+          mdcache->request_finish(m);
+        else
+          delete m;
+        return;
+
+      case LOCK_AC_REQXLOCKC:
+        dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl;
+        break;
+
+      default:
+        assert(0);
+      }
+    }
+  } else {
+    // replica
+    if (dir) dn = dir->lookup(dname);
+    if (!dn) {
+      dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl;
+
+      if (m->get_action() == LOCK_AC_REQXLOCKACK ||
+          m->get_action() == LOCK_AC_REQXLOCKNAK) {
+        dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl;
+        //assert(0);  // how can this happen?  tell me now!
+        // m is passed on to path_traverse / the retry context, so it is not freed here
+        vector<CDentry*> trace;
+        filepath path = m->get_path();
+        int r = mdcache->path_traverse(path, trace, true,
+				       m, new C_MDS_RetryMessage(mds,m), 
+				       MDS_TRAVERSE_DISCOVER);
+        assert(r>0);
+        return;
+      }
+
+      if (m->get_action() == LOCK_AC_LOCK) {
+        // we don't have the dentry, so there is nothing to lock here: NAK.
+        // (discovering the path and retrying was disabled in favor of this)
+        MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid());
+        reply->set_dn(m->get_ino(), dname);
+        mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+      } else {
+        dout(7) << "safely ignoring." << endl;
+      }
+
+      // either way we are done with the message (it used to leak on the NAK path)
+      delete m;
+      return;
+    }
+
+    assert(dn);
+  }
+
+  if (dn) {
+    dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl;
+  } else {
+    dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl;
+  }
+
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_LOCK:
+    assert(dn->lockstate == DN_LOCK_SYNC ||
+           dn->lockstate == DN_LOCK_UNPINNING ||
+           dn->lockstate == DN_LOCK_XLOCK);   // <-- bc the handle_lock_dn did the discover!
+
+    if (dn->is_pinned()) {
+      dn->lockstate = DN_LOCK_UNPINNING;
+
+      // wait; m is kept and retried by the waiter, so do not free it
+      dout(7) << "dn pinned, waiting " << *dn << endl;
+      dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED,
+                          dn->name,
+                          new C_MDS_RetryMessage(mds, m));
+      return;
+    } else {
+      dn->lockstate = DN_LOCK_XLOCK;
+      dn->xlockedby = 0;
+
+      // ack now
+      MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+      reply->set_dn(diri->ino(), dname);
+      mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+    }
+
+    // wake up waiters
+    dir->finish_waiting(CDIR_WAIT_DNLOCK, dname);   // ? will this happen on replica ?
+    break;
+
+  case LOCK_AC_SYNC:
+    assert(dn->lockstate == DN_LOCK_XLOCK);
+    dn->lockstate = DN_LOCK_SYNC;
+    dn->xlockedby = 0;
+
+    // null?  hose it.
+    if (dn->is_null()) {
+      dout(7) << "hosing null (and now sync) dentry " << *dn << endl;
+      dir->remove_dentry(dn);
+    }
+
+    // wake up waiters
+    dir->finish_waiting(CDIR_WAIT_DNREAD, dname);   // will this happen either?  YES: if a rename lock backs out
+    break;
+
+  case LOCK_AC_REQXLOCKACK:
+  case LOCK_AC_REQXLOCKNAK:
+    {
+      dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl;
+      list<Context*> finished;
+      dir->take_waiting(CDIR_WAIT_DNREQXLOCK, m->get_dn(), finished, 1);  // TAKE ONE ONLY!
+      finish_contexts(finished,
+                      (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1);
+    }
+    break;
+
+
+    // -- auth --
+  case LOCK_AC_LOCKACK:
+  case LOCK_AC_LOCKNAK:
+    assert(dn->gather_set.count(from) == 1);
+    dn->gather_set.erase(from);
+    if (dn->gather_set.size() == 0) {
+      dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl;
+      dn->lockstate = DN_LOCK_XLOCK;
+      mdcache->active_requests[dn->xlockedby].xlocks.insert(dn);
+      dir->finish_waiting(CDIR_WAIT_DNLOCK, dname);
+    }
+    break;
+
+
+  case LOCK_AC_REQXLOCKC:
+    // make sure it's a _file_, if it exists.
+    if (dn && dn->inode && dn->inode->is_dir()) {
+      dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl;
+
+      // nak
+      string path;
+      dn->make_path(path);
+
+      MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid());
+      reply->set_dn(dir->ino(), dname);
+      reply->set_path(path);
+      mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+
+      // done
+      if (mdcache->active_requests.count(m))
+        mdcache->request_finish(m);
+      else
+        delete m;
+      return;
+    }
+    // otherwise fall through and treat it like a plain REQXLOCK
+  case LOCK_AC_REQXLOCK:
+    if (dn) {
+      dout(7) << "handle_lock_dn reqxlock on " << *dn << endl;
+    } else {
+      dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl;
+    }
+
+    // start request?
+    if (!mdcache->active_requests.count(m)) {
+      vector<CDentry*> trace;
+      if (!mdcache->request_start(m, dir->inode, trace))
+        return;  // waiting for pin
+    }
+
+    // try to xlock!
+    if (!dn) {
+      assert(m->get_action() == LOCK_AC_REQXLOCKC);
+      dn = dir->add_dentry(dname);
+    }
+
+    // NOTE(review): if dentry_xlock_start() succeeds immediately we 'break' and
+    // free m without sending the ACK below -- confirm that this is intended.
+    if (dn->xlockedby != m) {
+      if (!dentry_xlock_start(dn, m, dir->inode)) {
+        // hose null dn if we're waiting on something
+        if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn);
+        return;    // waiting for xlock
+      }
+    } else {
+      // successfully xlocked!  on behalf of requestor.
+      string path;
+      dn->make_path(path);
+
+      dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl;
+
+      // ACK xlock request
+      MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid());
+      reply->set_dn(dir->ino(), dname);
+      reply->set_path(path);
+      mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+      // note: keep request around in memory (to hold the xlock/pins on behalf of requester)
+      return;
+    }
+    break;
+
+  case LOCK_AC_UNXLOCK:
+    dout(7) << "handle_lock_dn unxlock on " << *dn << endl;
+    {
+      // the request holding the xlock; named distinctly so it no longer
+      // shadows the incoming MLock, which was leaked by an early return here
+      Message *xlocker = dn->xlockedby;
+
+      // finish request
+      mdcache->request_finish(xlocker);  // this will drop the locks (and unpin paths!)
+    }
+    break;   // falls out to 'delete m' below
+
+  default:
+    assert(0);
+  }
+
+  delete m;
+}
+
+
+
+
+
+
+
diff --git a/branches/sage/cephmds2/mds/Locker.h b/branches/sage/cephmds2/mds/Locker.h
new file mode 100644
index 0000000000000..20b5a17896610
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Locker.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MDS_LOCKER_H
+#define __MDS_LOCKER_H
+
+#include "include/types.h"
+
+#include <map>
+#include <list>
+#include <set>
+using std::map;
+using std::list;
+using std::set;
+
+class MDS;
+class CDir;
+class CInode;
+class CDentry;
+
+class Message;
+
+class MDiscover;
+class MDiscoverReply;
+class MCacheExpire;
+class MDirUpdate;
+class MDentryUnlink;
+class MLock;
+
+class MClientRequest;
+
+
+class Anchor;
+class Capability;
+
+
+// Used below by pointer only; declared here so this header does not depend
+// on MDCache.h / Context.h having been included first.
+class MDCache;
+class Context;
+
+// Lock and capability handling for the MDS (inode, dir and dentry locks).
+class Locker {
+private:
+  MDS *mds;
+  MDCache *mdcache;
+
+ public:
+  Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {}
+
+  void dispatch(Message *m);
+
+  // -- locks --
+  // high level interface
+ public:
+  bool inode_hard_read_try(CInode *in, Context *con);
+  bool inode_hard_read_start(CInode *in, MClientRequest *m);
+  void inode_hard_read_finish(CInode *in);
+  bool inode_hard_write_start(CInode *in, MClientRequest *m);
+  void inode_hard_write_finish(CInode *in);
+  bool inode_file_read_start(CInode *in, MClientRequest *m);
+  void inode_file_read_finish(CInode *in);
+  bool inode_file_write_start(CInode *in, MClientRequest *m);
+  void inode_file_write_finish(CInode *in);
+
+  void inode_hard_eval(CInode *in);
+  void inode_file_eval(CInode *in);
+
+ protected:
+  void inode_hard_mode(CInode *in, int mode);
+  void inode_file_mode(CInode *in, int mode);
+
+  // low level triggers
+  void inode_hard_sync(CInode *in);
+  void inode_hard_lock(CInode *in);
+  bool inode_file_sync(CInode *in);
+  void inode_file_lock(CInode *in);
+  void inode_file_mixed(CInode *in);
+  void inode_file_loner(CInode *in);
+
+  // messengers
+  void handle_lock(MLock *m);
+  void handle_lock_inode_hard(MLock *m);
+  void handle_lock_inode_file(MLock *m);
+
+  // -- file i/o --
+ public:
+  version_t issue_file_data_version(CInode *in);
+  Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req);
+  bool issue_caps(CInode *in);
+
+ protected:
+  void handle_client_file_caps(class MClientFileCaps *m);
+
+  void request_inode_file_caps(CInode *in);
+  void handle_inode_file_caps(class MInodeFileCaps *m);
+
+  // dirs
+  void handle_lock_dir(MLock *m);
+
+  // dentry locks
+ public:
+  bool dentry_xlock_start(CDentry *dn, Message *m, CInode *ref);
+  void dentry_xlock_finish(CDentry *dn, bool quiet=false);
+  void handle_lock_dn(MLock *m);
+  void dentry_xlock_request(CDir *dir, string& dname, bool create,
+                            Message *req, Context *onfinish);
+  void dentry_xlock_request_finish(int r, CDir *dir, string& dname,
+				   Message *req, Context *finisher);
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/LogEvent.cc b/branches/sage/cephmds2/mds/LogEvent.cc
new file mode 100644
index 0000000000000..5b15f487d77ab
--- /dev/null
+++ b/branches/sage/cephmds2/mds/LogEvent.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "LogEvent.h"
+
+#include "MDS.h"
+
+// events i know of
+#include "events/EString.h"
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EUnlink.h"
+#include "events/EAlloc.h"
+#include "events/EMknod.h"
+#include "events/EMkdir.h"
+#include "events/EPurgeFinish.h"
+
+/*
+ * Decode one journal entry from bl: an int event type (copied out raw)
+ * followed by the event payload.  An event object of the matching class is
+ * allocated, asked to decode the payload, and returned.
+ *
+ * An unknown type trips an assert; if asserts are compiled out, NULL is
+ * returned instead of decoding through an uninitialized pointer.
+ */
+LogEvent *LogEvent::decode(bufferlist& bl)
+{
+  // parse type, length
+  int off = 0;
+  int type;
+  bl.copy(off, sizeof(type), (char*)&type);
+  off += sizeof(type);
+
+  int length = bl.length() - off;
+  dout(15) << "decode_log_event type " << type << ", size " << length << endl;
+
+  assert(type > 0);
+
+  // create event
+  LogEvent *le = 0;
+  switch (type) {
+  case EVENT_STRING:  // string
+    le = new EString();
+    break;
+  case EVENT_INODEUPDATE:
+    le = new EInodeUpdate();
+    break;
+  case EVENT_DIRUPDATE:
+    le = new EDirUpdate();
+    break;
+  case EVENT_UNLINK:
+    le = new EUnlink();
+    break;
+  case EVENT_PURGEFINISH:
+    le = new EPurgeFinish();
+    break;
+  case EVENT_ALLOC:
+    le = new EAlloc();
+    break;
+  case EVENT_MKNOD:
+    le = new EMknod();
+    break;
+  case EVENT_MKDIR:
+    le = new EMkdir();
+    break;
+  default:
+    dout(1) << "uh oh, unknown event type " << type << endl;
+    assert(0);
+    return 0;   // only reached when asserts are compiled out
+  }
+
+  // decode
+  le->decode_payload(bl, off);
+  return le;
+}
+
diff --git a/branches/sage/cephmds2/mds/LogEvent.h b/branches/sage/cephmds2/mds/LogEvent.h
new file mode 100644
index 0000000000000..0de268252036a
--- /dev/null
+++ b/branches/sage/cephmds2/mds/LogEvent.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __LOGEVENT_H
+#define __LOGEVENT_H
+
+#define EVENT_STRING       1
+
+#define EVENT_INODEUPDATE  2
+#define EVENT_DIRUPDATE    3
+
+#define EVENT_ALLOC        10
+#define EVENT_MKNOD        11
+#define EVENT_MKDIR        12
+#define EVENT_LINK         13
+
+#define EVENT_UNLINK       20
+#define EVENT_RMDIR        21
+#define EVENT_PURGEFINISH  22
+
+
+#include <string>
+using namespace std;
+
+#include "include/buffer.h"
+#include "include/Context.h"
+
+class MDS;
+
+// generic log event
+// Base class for journal (metadata log) entries.  Subclasses implement
+// payload encoding/decoding and may override the journal hooks below.
+class LogEvent {
+ private:
+  int _type;          // type code given at construction
+  off_t _end_off;     // starts at 0; presumably maintained by MDLog (friend) -- confirm
+  friend class MDLog;
+
+ public:
+  LogEvent(int t) : _type(t), _end_off(0) { }
+  virtual ~LogEvent() { }
+
+  // -- encoding --
+  virtual void encode_payload(bufferlist& bl) = 0;
+  virtual void decode_payload(bufferlist& bl, int& off) = 0;
+  static LogEvent *decode(bufferlist &bl);
+
+  // Write a short description to 'out'; the default prints the type code.
+  virtual void print(ostream& out) { out << "event(" << _type << ")"; }
+
+
+  /*** live journal ***/
+
+  /* can_expire() - is this entry committed to primary store, such that
+   *   we can expire it from the journal?  The default says yes.
+   */
+  virtual bool can_expire(MDS *mds) { return true; }
+
+  /* retire() - prod the MDS into committing the relevant state so that
+   *   this entry can be expired from the journal.  The default commits
+   *   nothing: it completes 'onfinish' with 0 right away and frees it.
+   */
+  virtual void retire(MDS *mds, Context *onfinish) {
+    onfinish->finish(0);
+    delete onfinish;
+  }
+
+
+  /*** recovery ***/
+
+  /* has_happened() - true if this event has already been applied.
+   *   The default says yes.
+   */
+  virtual bool has_happened(MDS *mds) { return true; }
+
+  /* replay() - replay given event.  The default asserts, i.e. the base
+   *   class itself does not support replay.
+   */
+  virtual void replay(MDS *mds) { assert(0); }
+};
+
+// Stream a LogEvent through its virtual print() hook.
+inline ostream& operator<<(ostream& os, LogEvent& ev) {
+  ev.print(os); return os;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/mds/MDBalancer.cc b/branches/sage/cephmds2/mds/MDBalancer.cc
new file mode 100644
index 0000000000000..0b497103183b2
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDBalancer.cc
@@ -0,0 +1,902 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "mdstypes.h"
+
+#include "MDBalancer.h"
+#include "MDS.h"
+#include "MDSMap.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "MDCache.h"
+#include "Migrator.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+#include "messages/MHeartbeat.h"
+
+#include <vector>
+#include <map>
+using namespace std;
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l<=g_conf.debug_mds_balancer) cout << "mds" << mds->get_nodeid() << ".bal " << (g_clock.recent_now() - mds->logger->get_start()) << " "
+
+#define MIN_LOAD    50   //  ??
+#define MIN_REEXPORT 5  // will automatically reexport
+#define MIN_OFFLOAD 10   // point at which i stop trying, close enough
+
+
+// Dispatch a message addressed to the balancer.  Only MSG_MDS_HEARTBEAT is
+// understood; anything else is logged and trips an assert.  Returns 0.
+int MDBalancer::proc_message(Message *m)
+{
+  const bool is_heartbeat = (m->get_type() == MSG_MDS_HEARTBEAT);
+
+  if (is_heartbeat) {
+    // the type tag says this is an MHeartbeat, so the downcast is taken on trust
+    handle_heartbeat((MHeartbeat*)m);
+  } else {
+    dout(1) << " balancer unknown message " << m->get_type() << endl;
+    assert(0);
+  }
+
+  return 0;
+}
+
+
+// Context that runs the balancer's send_heartbeat() when it completes;
+// the result code passed to finish() is ignored.
+class C_Bal_SendHeartbeat : public Context {
+public:
+  MDS *mds;
+
+  C_Bal_SendHeartbeat(MDS *m) : mds(m) {}
+
+  virtual void finish(int f) { mds->balancer->send_heartbeat(); }
+};
+
+// Build this MDS's load sample: the root inode's MDS_POP_ANYDOM popularity
+// (left as default-constructed when there is no root yet; MDS_POP_NESTED is
+// not added in), the request rate, and the messenger's dispatch queue length.
+mds_load_t MDBalancer::get_load()
+{
+  mds_load_t sample;
+  if (mds->mdcache->get_root()) {
+    sample.root = mds->mdcache->get_root()->popularity[MDS_POP_ANYDOM];
+  }
+  sample.req_rate = mds->get_req_rate();
+  sample.queue_len = mds->messenger->get_dispatch_queue_len();
+  return sample;
+}
+
+// Publish this MDS's load to the rest of the cluster.
+//
+// If there is no root inode yet, ask the cache to open it and retry from the
+// completion.  Otherwise reset the collected loads, bump the epoch if we are
+// mds0, record our own load and per-source import load, and send an
+// MHeartbeat carrying both to every other MDS in the map.
+void MDBalancer::send_heartbeat()
+{
+  if (!mds->mdcache->get_root()) {
+    dout(5) << "no root on send_heartbeat" << endl;
+    mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds));
+    return;
+  }
+
+  mds_load.clear();
+  if (mds->get_nodeid() == 0) ++beat_epoch;
+
+  // our own load
+  mds_load_t load = get_load();
+  mds_load[ mds->get_nodeid() ] = load;
+
+  // meta load of our (non-root) imports, summed per inode authority
+  map<int, float> import_map;
+  set<CDir*>::iterator dirp = mds->mdcache->imports.begin();
+  while (dirp != mds->mdcache->imports.end()) {
+    CDir *im = *dirp;
+    ++dirp;
+    if (im->inode->is_root()) continue;
+    int from = im->inode->authority();
+    import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load();
+  }
+  mds_import_map[ mds->get_nodeid() ] = import_map;
+
+  dout(5) << "mds" << mds->get_nodeid() << " sending heartbeat " << beat_epoch << " " << load << endl;
+  for (map<int, float>::iterator p = import_map.begin(); p != import_map.end(); ++p) {
+    dout(5) << "  import_map from " << p->first << " -> " << p->second << endl;
+  }
+
+  // one heartbeat per peer
+  const int num_mds = mds->get_mds_map()->get_num_mds();
+  for (int peer = 0; peer < num_mds; ++peer) {
+    if (peer == mds->get_nodeid()) continue;
+    MHeartbeat *hb = new MHeartbeat(load, beat_epoch);
+    hb->get_import_map() = import_map;
+    mds->messenger->send_message(hb, MSG_ADDR_MDS(peer),
+                                 MDS_PORT_BALANCER, MDS_PORT_BALANCER);
+  }
+}
+
+// Process a heartbeat from another MDS.
+//
+// A beat from mds0 starts a new epoch: adopt its beat number and send our own
+// heartbeat.  The sender's load and import map are recorded, and once loads
+// from the whole cluster are in, do_rebalance() runs.  The message is freed
+// here unless it was queued for retry because there is no root yet.
+void MDBalancer::handle_heartbeat(MHeartbeat *m)
+{
+  dout(25) << "=== got heartbeat " << m->get_beat() << " from " << MSG_ADDR_NICE(m->get_source()) << " " << m->get_load() << endl;
+
+  if (!mds->mdcache->get_root()) {
+    dout(10) << "no root on handle" << endl;
+    mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  const int sender = MSG_ADDR_NUM(m->get_source());
+  if (sender == 0) {
+    dout(20) << " from mds0, new epoch" << endl;
+    beat_epoch = m->get_beat();
+    send_heartbeat();
+    show_imports();
+  }
+
+  // note: send_heartbeat() above clears mds_load, so store the sender after it
+  mds_load[sender] = m->get_load();
+  mds_import_map[sender] = m->get_import_map();
+
+  // rebalance once every MDS in the map has reported
+  const unsigned expected = mds->get_mds_map()->get_num_mds();
+  if (mds_load.size() == expected) {
+    do_rebalance(m->get_beat());   // (export_empties() is intentionally not run)
+  }
+  delete m;
+}
+
+
+// Hand every empty, non-root import to the migrator's export_empty_import().
+void MDBalancer::export_empties()
+{
+  dout(5) << "export_empties checking for empty imports" << endl;
+
+  set<CDir*>::iterator p = mds->mdcache->imports.begin();
+  while (p != mds->mdcache->imports.end()) {
+    CDir *candidate = *p;
+    const bool skip = candidate->inode->is_root() || candidate->get_size() != 0;
+    if (!skip) mds->mdcache->migrator->export_empty_import(candidate);
+    ++p;
+  }
+}
+
+
+
+// Pair exporter 'ex' with importer 'im' for as much load as both can take,
+// i.e. the smaller of maxex and maxim.  The amount is added to the exported /
+// imported tallies (and to my_targets when we are the exporter) and is
+// subtracted from both limits.  Returns the amount, or 0 if nothing matched.
+double MDBalancer::try_match(int ex, double& maxex,
+                             int im, double& maxim)
+{
+  if (maxex <= 0 || maxim <= 0) return 0.0;
+
+  const double amount = MIN(maxex, maxim);
+  if (amount <= 0) return 0.0;
+
+  dout(5) << "   - mds" << ex << " exports " << amount << " to mds" << im << endl;
+  const bool i_am_exporter = (ex == mds->get_nodeid());
+  if (i_am_exporter) my_targets[im] += amount;
+  exported[ex] += amount;
+  imported[im] += amount;
+  maxex -= amount;
+  maxim -= amount;
+  return amount;
+}
+
+
+
+// Drain hash_queue: for each queued dir ino that is still in cache, has an
+// open dir and is auth here, ask the migrator to hash it.  The queue is
+// emptied afterwards whether or not anything was hashed.
+void MDBalancer::do_hashing()
+{
+  if (hash_queue.empty()) {
+    dout(20) << "do_hashing has nothing to do" << endl;
+    return;
+  }
+
+  dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl;
+
+  // entries whose inode or dir is gone, or that are not auth, are skipped
+  for (set<inodeno_t>::iterator q = hash_queue.begin(); q != hash_queue.end(); ++q) {
+    CInode *diri = mds->mdcache->get_inode(*q);
+    CDir *dir = diri ? diri->dir : 0;
+    if (dir && dir->is_auth()) {
+      dout(0) << "do_hashing hashing " << *dir << endl;
+      mds->mdcache->migrator->hash_dir(dir);
+    }
+  }
+
+  hash_queue.clear();
+}
+
+
+
+void MDBalancer::do_rebalance(int beat)
+{
+  int cluster_size = mds->get_mds_map()->get_num_mds();
+  int whoami = mds->get_nodeid();
+
+  // reset
+  my_targets.clear();
+  imported.clear();
+  exported.clear();
+
+  dout(5) << " do_rebalance: cluster loads are" << endl;
+
+  // rescale!  turn my mds_load back into meta_load units
+  double load_fac = 1.0;
+  if (mds_load[whoami].mds_load() > 0) {
+    load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load();
+    dout(7) << " load_fac is " << load_fac 
+             << " <- " << mds_load[whoami].root.meta_load() << " / " << mds_load[whoami].mds_load()
+             << endl;
+  }
+  
+  double total_load = 0;
+  multimap<double,int> load_map;
+  for (int i=0; i<cluster_size; i++) {
+    double l = mds_load[i].mds_load() * load_fac;
+    mds_meta_load[i] = l;
+
+    if (whoami == 0)
+      dout(-5) << "  mds" << i 
+               << " meta load " << mds_load[i] 
+               << " = " << mds_load[i].mds_load() 
+               << " --> " << l << endl;
+
+    if (whoami == i) my_load = l;
+    total_load += l;
+
+    load_map.insert(pair<double,int>( l, i ));
+  }
+
+  // target load
+  target_load = total_load / (double)cluster_size;
+  dout(5) << "do_rebalance:  my load " << my_load 
+          << "   target " << target_load 
+          << "   total " << total_load 
+          << endl;
+  
+  // under or over?
+  if (my_load < target_load) {
+    dout(5) << "  i am underloaded, doing nothing." << endl;
+    show_imports();
+    return;
+  }  
+
+  dout(5) << "  i am overloaded" << endl;
+
+
+  // first separate exporters and importers
+  multimap<double,int> importers;
+  multimap<double,int> exporters;
+  set<int>             importer_set;
+  set<int>             exporter_set;
+  
+  for (multimap<double,int>::iterator it = load_map.begin();
+       it != load_map.end();
+       it++) {
+    if (it->first < target_load) {
+      dout(15) << "   mds" << it->second << " is importer" << endl;
+      importers.insert(pair<double,int>(it->first,it->second));
+      importer_set.insert(it->second);
+    } else {
+      dout(15) << "   mds" << it->second << " is exporter" << endl;
+      exporters.insert(pair<double,int>(it->first,it->second));
+      exporter_set.insert(it->second);
+    }
+  }
+
+
+  // determine load transfer mapping
+
+  if (true) {
+    // analyze import_map; do any matches i can
+
+    dout(5) << "  matching exporters to import sources" << endl;
+
+    // big -> small exporters
+    for (multimap<double,int>::reverse_iterator ex = exporters.rbegin();
+         ex != exporters.rend();
+         ex++) {
+      double maxex = get_maxex(ex->second);
+      if (maxex <= .001) continue;
+      
+      // check importers. for now, just in arbitrary order (no intelligent matching).
+      for (map<int, float>::iterator im = mds_import_map[ex->second].begin();
+           im != mds_import_map[ex->second].end();
+           im++) {
+        double maxim = get_maxim(im->first);
+        if (maxim <= .001) continue;
+        try_match(ex->second, maxex,
+                  im->first, maxim);
+        if (maxex <= .001) break;;
+      }
+    }
+  }
+
+
+  if (1) {
+    if (beat % 2 == 1) {
+      // old way
+      dout(5) << "  matching big exporters to big importers" << endl;
+      // big exporters to big importers
+      multimap<double,int>::reverse_iterator ex = exporters.rbegin();
+      multimap<double,int>::iterator im = importers.begin();
+      while (ex != exporters.rend() &&
+             im != importers.end()) {
+        double maxex = get_maxex(ex->second);
+        double maxim = get_maxim(im->second);
+        if (maxex < .001 || maxim < .001) break;
+        try_match(ex->second, maxex,
+                  im->second, maxim);
+        if (maxex <= .001) ex++;
+        if (maxim <= .001) im++;
+      }
+    } else {
+      // new way
+      dout(5) << "  matching small exporters to big importers" << endl;
+      // small exporters to big importers
+      multimap<double,int>::iterator ex = exporters.begin();
+      multimap<double,int>::iterator im = importers.begin();
+      while (ex != exporters.end() &&
+             im != importers.end()) {
+        double maxex = get_maxex(ex->second);
+        double maxim = get_maxim(im->second);
+        if (maxex < .001 || maxim < .001) break;
+        try_match(ex->second, maxex,
+                  im->second, maxim);
+        if (maxex <= .001) ex++;
+        if (maxim <= .001) im++;
+      }
+    }
+  }
+
+
+
+  // make a sorted list of my imports
+  map<double,CDir*>    import_pop_map;
+  multimap<int,CDir*>  import_from_map;
+  for (set<CDir*>::iterator it = mds->mdcache->imports.begin();
+       it != mds->mdcache->imports.end();
+       it++) {
+    if ((*it)->is_hashed()) continue;
+    double pop = (*it)->popularity[MDS_POP_CURDOM].meta_load();
+    if (pop < g_conf.mds_bal_idle_threshold &&
+        (*it)->inode != mds->mdcache->get_root()) {
+      dout(-5) << " exporting idle import " << **it 
+               << " back to mds" << (*it)->inode->authority()
+               << endl;
+      mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority());
+      continue;
+    }
+    import_pop_map[ pop ] = *it;
+    int from = (*it)->inode->authority();
+    dout(15) << "  map: i imported " << **it << " from " << from << endl;
+    import_from_map.insert(pair<int,CDir*>(from, *it));
+  }
+  
+
+
+  // do my exports!
+  set<CDir*> already_exporting;
+  double total_sent = 0;
+  double total_goal = 0;
+
+  for (map<int,double>::iterator it = my_targets.begin();
+       it != my_targets.end();
+       it++) {
+
+    /*
+    double fac = 1.0;
+    if (false && total_goal > 0 && total_sent > 0) {
+      fac = total_goal / total_sent;
+      dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << endl;
+      if (fac > 1.0) fac = 1.0;
+    }
+    fac = .9 - .4 * ((float)g_conf.num_mds / 128.0);  // hack magic fixme
+    */
+    
+    int target = (*it).first;
+    double amount = (*it).second;// * load_fac;
+    total_goal += amount;
+
+    if (amount < MIN_OFFLOAD) continue;
+
+    dout(-5) << " sending " << amount << " to mds" << target 
+      //<< " .. " << (*it).second << " * " << load_fac 
+            << " -> " << amount
+            << endl;//" .. fudge is " << fudge << endl;
+    double have = 0;
+    
+    show_imports();
+
+    // search imports from target
+    if (import_from_map.count(target)) {
+      dout(5) << " aha, looking through imports from target mds" << target << endl;
+      pair<multimap<int,CDir*>::iterator, multimap<int,CDir*>::iterator> p =
+        import_from_map.equal_range(target);
+      while (p.first != p.second) {
+        CDir *dir = (*p.first).second;
+        dout(5) << "considering " << *dir << " from " << (*p.first).first << endl;
+        multimap<int,CDir*>::iterator plast = p.first++;
+        
+        if (dir->inode->is_root()) continue;
+        if (dir->is_hashed()) continue;
+        if (dir->is_freezing() || dir->is_frozen()) continue;  // export pbly already in progress
+        double pop = dir->popularity[MDS_POP_CURDOM].meta_load();
+        assert(dir->inode->authority() == target);  // cuz that's how i put it in the map, dummy
+        
+        if (pop <= amount-have) {
+          dout(-5) << "reexporting " << *dir 
+                   << " pop " << pop 
+                   << " back to mds" << target << endl;
+          mds->mdcache->migrator->export_dir(dir, target);
+          have += pop;
+          import_from_map.erase(plast);
+          import_pop_map.erase(pop);
+        } else {
+          dout(5) << "can't reexport " << *dir << ", too big " << pop << endl;
+        }
+        if (amount-have < MIN_OFFLOAD) break;
+      }
+    }
+    if (amount-have < MIN_OFFLOAD) {
+      total_sent += have;
+      continue;
+    }
+    
+    // any other imports
+    if (false)
+    for (map<double,CDir*>::iterator import = import_pop_map.begin();
+         import != import_pop_map.end();
+         import++) {
+      CDir *imp = (*import).second;
+      if (imp->inode->is_root()) continue;
+      
+      double pop = (*import).first;
+      if (pop < amount-have || pop < MIN_REEXPORT) {
+        dout(-5) << "reexporting " << *imp 
+                 << " pop " << pop 
+                 << " back to mds" << imp->inode->authority()
+                 << endl;
+        have += pop;
+        mds->mdcache->migrator->export_dir(imp, imp->inode->authority());
+      }
+      if (amount-have < MIN_OFFLOAD) break;
+    }
+    if (amount-have < MIN_OFFLOAD) {
+      //fudge = amount-have;
+      total_sent += have;
+      continue;
+    }
+
+    // okay, search for fragments of my workload
+    set<CDir*> candidates = mds->mdcache->imports;
+
+    list<CDir*> exports;
+    
+    for (set<CDir*>::iterator pot = candidates.begin();
+         pot != candidates.end();
+         pot++) {
+      find_exports(*pot, amount, exports, have, already_exporting);
+      if (have > amount-MIN_OFFLOAD) {
+        break;
+      }
+    }
+    //fudge = amount - have;
+    total_sent += have;
+    
+    for (list<CDir*>::iterator it = exports.begin(); it != exports.end(); it++) {
+      dout(-5) << " exporting to mds" << target 
+               << " fragment " << **it 
+               << " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load() 
+               << endl;
+      mds->mdcache->migrator->export_dir(*it, target);
+    }
+  }
+
+  dout(5) << "rebalance done" << endl;
+  show_imports();
+  
+}
+
+
+
+void MDBalancer::find_exports(CDir *dir, 
+                              double amount, 
+                              list<CDir*>& exports, 
+                              double& have,
+                              set<CDir*>& already_exporting)
+{
+  // Choose subdirs beneath 'dir' to export so that 'have' approaches 'amount'.
+  // Each pick is appended to 'exports', recorded in 'already_exporting', and
+  // its popularity added to 'have'.  Recurses into children that are too big.
+  double need = amount - have;
+  if (need < amount * g_conf.mds_bal_min_start)
+    return;   // good enough!
+  double needmax = need * g_conf.mds_bal_need_max;
+  double needmin = need * g_conf.mds_bal_need_min;
+  double midchunk = need * g_conf.mds_bal_midchunk;
+  double minchunk = need * g_conf.mds_bal_minchunk;
+
+  list<CDir*> bigger;               // children with pop > need
+  multimap<double, CDir*> smaller;  // children with minchunk <= pop <= need, by pop
+
+  double dir_pop = dir->popularity[MDS_POP_CURDOM].meta_load();
+  double dir_sum = 0;
+  dout(-7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl;
+
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CInode *in = it->second->get_inode();
+    if (!in) continue;
+    if (!in->is_dir()) continue;
+    if (!in->dir) continue;       // clearly not popular
+    
+    if (in->dir->is_export()) continue;
+    if (in->dir->is_hashed()) continue;
+    if (already_exporting.count(in->dir)) continue;   // picked earlier in this pass
+    if (in->dir->is_frozen()) continue;  // can't export this right now!
+    //if (in->dir->get_size() == 0) continue;  // don't export empty dirs, even if they're not complete.  for now!
+    
+    // how popular?
+    double pop = in->dir->popularity[MDS_POP_CURDOM].meta_load();
+    dir_sum += pop;
+    dout(20) << "   pop " << pop << " " << *in->dir << endl;
+
+    if (pop < minchunk) continue;
+
+    // lucky find?  a single child that covers the remaining need by itself.
+    if (pop > needmin && pop < needmax) {
+      exports.push_back(in->dir);
+      already_exporting.insert(in->dir);  // so a later call cannot pick it again
+      have += pop;
+      return;
+    }
+    
+    if (pop > need)
+      bigger.push_back(in->dir);
+    else
+      smaller.insert(pair<double,CDir*>(pop, in->dir));
+  }
+  dout(7) << " .. sum " << dir_sum << " / " << dir_pop << endl;
+
+  // grab some sufficiently big small items
+  multimap<double,CDir*>::reverse_iterator it;
+  for (it = smaller.rbegin();
+       it != smaller.rend();
+       it++) {
+    if ((*it).first < midchunk)
+      break;  // try later
+    
+    dout(7) << " taking smaller " << *(*it).second << endl;
+    exports.push_back((*it).second);
+    already_exporting.insert((*it).second);
+    have += (*it).first;
+    if (have > needmin)
+      return;
+  }
+  
+  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
+  for (list<CDir*>::iterator it = bigger.begin();
+       it != bigger.end();
+       it++) {
+    if ((*it)->is_rep()) continue;
+    dout(7) << " descending into " << **it << endl;
+    find_exports(*it, amount, exports, have, already_exporting);
+    if (have > needmin)
+      return;
+  }
+
+  // ok fine, use smaller bits ('it' resumes where the midchunk loop stopped)
+  for (;
+       it != smaller.rend();
+       it++) {
+    dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl;
+
+    exports.push_back((*it).second);
+    already_exporting.insert((*it).second);
+    have += (*it).first;
+    if (have > needmin)
+      return;
+  }
+
+  // ok fine, drill into replicated dirs
+  for (list<CDir*>::iterator it = bigger.begin();
+       it != bigger.end();
+       it++) {
+    if (!(*it)->is_rep()) continue;
+    dout(7) << " descending into replicated " << **it << endl;
+    find_exports(*it, amount, exports, have, already_exporting);
+    if (have > needmin)
+      return;
+  }
+
+}
+
+
+
+
+void MDBalancer::hit_inode(CInode *in, int type)
+{
+  // Bump the inode's own counters for access kind 'type'; the two domain
+  // counters are only touched when we are authoritative for the inode.
+  in->popularity[MDS_POP_JUSTME].pop[type].hit();
+  in->popularity[MDS_POP_NESTED].pop[type].hit();
+  if (in->is_auth()) {
+    in->popularity[MDS_POP_CURDOM].pop[type].hit();
+    in->popularity[MDS_POP_ANYDOM].pop[type].hit();
+  }
+  // Let the containing directory (if there is one) account for the hit too.
+  if (CDir *parent = in->get_parent_dir())
+    hit_dir(parent, type);
+}
+
+
+void MDBalancer::hit_dir(CDir *dir, int type) 
+{
+  // Count the access on the dir itself; the value hit() returns is what
+  // gets compared against the hashing thresholds below.
+  float justme = dir->popularity[MDS_POP_JUSTME].pop[type].hit();
+
+  // Hashing is only considered with >2 mds (FIXME), for non-root dirs we are auth for.
+  bool eligible = g_conf.num_mds > 2 &&
+                  !dir->inode->is_root() &&
+                  dir->is_auth();
+  if (eligible) {
+    bool hot = (type == META_POP_IRD && justme > g_conf.mds_bal_hash_rd) ||
+               (type == META_POP_DWR && justme > g_conf.mds_bal_hash_wr);
+    // queue the dir for hashing unless it is already hashed, hashing, or queued
+    if (hot &&
+        !dir->is_hashed() &&
+        !dir->is_hashing() &&
+        hash_queue.count(dir->ino()) == 0) {
+      dout(0) << "hit_dir " << type << " pop is " << justme << ", putting in hash_queue: " << *dir << endl;
+      hash_queue.insert(dir->ino());
+    }
+  }
+
+  hit_recursive(dir, type);
+}
+
+
+
+void MDBalancer::hit_recursive(CDir *dir, int type)
+{
+  // Possibly flip 'dir' between replicated and unreplicated based on its
+  // CURDOM popularity for 'type', then apply the hit to 'dir' and to every
+  // ancestor dir/inode above it.
+  bool anydom = dir->is_auth();
+  bool curdom = dir->is_auth();   // cleared once the walk passes an import
+  float rd_adj = 0.0;             // IRD correction made when replication starts
+
+  // replicate?
+  float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get();    // hmm??
+  if (dir->is_auth()) {
+    if (!dir->is_rep() &&
+        dir_pop >= g_conf.mds_bal_replicate_threshold) {
+      // replicate
+      float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].get();
+      rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; 
+      rd_adj /= 2.0;  // temper somewhat
+      dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl;
+          
+      dir->dir_rep = CDIR_REP_ALL;
+      mds->mdcache->send_dir_updates(dir, true);
+
+      dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj);
+      dir->popularity[MDS_POP_CURDOM].pop[META_POP_IRD].adjust(rd_adj);
+    }
+        
+    // leave the root dir (ino 1) alone: it is never unreplicated here.
+    if (dir->ino() != 1 &&
+        dir->is_rep() &&
+        dir_pop < g_conf.mds_bal_unreplicate_threshold) {
+      // unreplicate
+      dout(1) << "unreplicating dir " << *dir << " pop " << dir_pop << endl;
+      
+      dir->dir_rep = CDIR_REP_NONE;
+      mds->mdcache->send_dir_updates(dir);
+    }
+  }
+
+  while (dir) {
+    CInode *in = dir->inode;
+
+    dir->popularity[MDS_POP_NESTED].pop[type].hit();
+    in->popularity[MDS_POP_NESTED].pop[type].hit();
+    
+    if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_IRD].adjust(rd_adj);
+
+    if (anydom) {
+      dir->popularity[MDS_POP_ANYDOM].pop[type].hit();
+      in->popularity[MDS_POP_ANYDOM].pop[type].hit();
+    }
+    
+    if (curdom) {
+      dir->popularity[MDS_POP_CURDOM].pop[type].hit();
+      in->popularity[MDS_POP_CURDOM].pop[type].hit();
+    }
+    
+    if (dir->is_import()) 
+      curdom = false;   // end of auth domain, stop hitting auth counters.
+    dir = dir->inode->get_parent_dir();
+  }
+}
+
+
+/*
+ * subtract off an exported chunk
+ */
+void MDBalancer::subtract_export(CDir *dir)
+{
+  // Take the exported dir's CURDOM load and subtract it from its inode and
+  // everything above it: ANYDOM always, CURDOM only until an import is crossed.
+  meta_load_t delta = dir->popularity[MDS_POP_CURDOM];
+  bool same_domain = !dir->is_import();
+
+  for (;;) {
+    CInode *diri = dir->inode;
+    diri->popularity[MDS_POP_ANYDOM] -= delta;
+    if (same_domain)
+      diri->popularity[MDS_POP_CURDOM] -= delta;
+
+    dir = diri->get_parent_dir();
+    if (dir == 0) break;
+    if (dir->is_import()) same_domain = false;
+    dir->popularity[MDS_POP_ANYDOM] -= delta;
+    if (same_domain)
+      dir->popularity[MDS_POP_CURDOM] -= delta;
+  }
+}
+    
+
+void MDBalancer::add_import(CDir *dir)
+{
+  // Mirror image of subtract_export(): take the imported dir's CURDOM load
+  // and add it to its inode and everything above it -- ANYDOM always,
+  // CURDOM only until an import boundary is crossed.
+  meta_load_t delta = dir->popularity[MDS_POP_CURDOM];
+  bool same_domain = !dir->is_import();
+
+  for (;;) {
+    CInode *diri = dir->inode;
+    diri->popularity[MDS_POP_ANYDOM] += delta;
+    if (same_domain)
+      diri->popularity[MDS_POP_CURDOM] += delta;
+
+    dir = diri->get_parent_dir();
+    if (dir == 0) break;
+    if (dir->is_import()) same_domain = false;
+    dir->popularity[MDS_POP_ANYDOM] += delta;
+    if (same_domain)
+      dir->popularity[MDS_POP_CURDOM] += delta;
+  }
+}
+
+
+
+
+
+
+void MDBalancer::show_imports(bool external)
+{
+  int db = 20; // debug level used for all of the dump output below
+  return;      // NOTE: unconditional early return -- everything below is currently dead code
+  // ('external' is not referenced anywhere in this function.)
+  if (mds->mdcache->imports.empty() &&
+      mds->mdcache->hashdirs.empty()) {
+    dout(db) << "no imports/exports/hashdirs" << endl;
+    return;
+  }
+  dout(db) << "imports/exports/hashdirs:" << endl;
+  // working copy of the export set; entries are erased as they are matched below
+  set<CDir*> ecopy = mds->mdcache->exports;
+  // a single iterator walks hashdirs first and then switches over to imports
+  set<CDir*>::iterator it = mds->mdcache->hashdirs.begin();
+  while (1) {
+    if (it == mds->mdcache->hashdirs.end()) it = mds->mdcache->imports.begin();
+    if (it == mds->mdcache->imports.end() ) break;
+    
+    CDir *im = *it;
+    
+    if (im->is_import()) {
+      dout(db) << "  + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ")  " << *im << endl;
+      assert( im->is_auth() );
+    } 
+    else if (im->is_hashed()) {
+      if (im->is_import()) continue;  // if import AND hash, list as import.
+      dout(db) << "  + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ")  " << *im << endl;
+    }
+    
+    for (set<CDir*>::iterator p = mds->mdcache->nested_exports[im].begin();
+         p != mds->mdcache->nested_exports[im].end();
+         p++) {
+      CDir *exp = *p;
+      if (exp->is_hashed()) {
+        //assert(0);  // we don't do it this way actually
+        dout(db) << "      - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ")  " << *exp << " to " << exp->dir_auth << endl;
+        assert( !exp->is_auth() );
+      } else {
+        dout(db) << "      - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ")  " << *exp << " to " << exp->dir_auth << endl;
+        assert( exp->is_export() );
+        assert( !exp->is_auth() );
+      }
+      // each nested export must have 'im' as its auth container
+      if ( mds->mdcache->get_auth_container(exp) != im ) {
+        dout(1) << "uh oh, auth container is " << mds->mdcache->get_auth_container(exp) << endl;
+        dout(1) << "uh oh, auth container is " << *mds->mdcache->get_auth_container(exp) << endl;
+        assert( mds->mdcache->get_auth_container(exp) == im );
+      }
+      // ...and must appear exactly once in the (copied) exports set
+      if (ecopy.count(exp) != 1) {
+        dout(1) << "***** nested_export " << *exp << " not in exports" << endl;
+        assert(0);
+      }
+      ecopy.erase(exp);
+    }
+
+    it++;
+  }
+  // anything still in ecopy was not nested under any import/hashdir visited above
+  if (ecopy.size()) {
+    for (set<CDir*>::iterator it = ecopy.begin();
+         it != ecopy.end();
+         it++) 
+      dout(1) << "***** stray item in exports: " << **it << endl;
+    assert(ecopy.size() == 0);
+  }
+}
+
+
+
+/*  replicate?
+
+      float dir_pop = dir->get_popularity();
+      
+      if (dir->is_auth()) {
+        if (!dir->is_rep() &&
+            dir_pop >= g_conf.mds_bal_replicate_threshold) {
+          // replicate
+          dout(5) << "replicating dir " << *in << " pop " << dir_pop << endl;
+          
+          dir->dir_rep = CDIR_REP_ALL;
+          mds->mdcache->send_dir_updates(dir);
+        }
+        
+        if (dir->is_rep() &&
+            dir_pop < g_conf.mds_bal_unreplicate_threshold) {
+          // unreplicate
+          dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << endl;
+          
+          dir->dir_rep = CDIR_REP_NONE;
+          mds->mdcache->send_dir_updates(dir);
+        }
+      }
+
+*/
diff --git a/branches/sage/cephmds2/mds/MDBalancer.h b/branches/sage/cephmds2/mds/MDBalancer.h
new file mode 100644
index 0000000000000..a6129045ca3f7
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDBalancer.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MDBALANCER_H
+#define __MDBALANCER_H
+
+#include <ostream>
+#include <list>
+using namespace std;
+
+#include <map>
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "CInode.h"
+
+
+class MDS;
+class Message;
+class MHeartbeat;
+class CInode;
+class Context;
+class CDir;
+
+class MDBalancer {
+ protected:
+  MDS *mds;              // set from the constructor argument
+  int beat_epoch;        // starts at 0
+
+  // (todo) dirs, by ino, that hit_dir() has queued for hashing
+  set<inodeno_t>   hash_queue;
+
+  // per-epoch scatter/gathered info
+  hash_map<int, mds_load_t>  mds_load;
+  hash_map<int, float>       mds_meta_load;
+  map<int, map<int, float> > mds_import_map;
+
+  // per-epoch state
+  double          my_load;
+  double          target_load;
+  map<int,double> my_targets;
+  map<int,double> imported;
+  map<int,double> exported;
+
+  double try_match(int ex, double& maxex,
+                   int im, double& maxim);
+  // target_load minus what 'im' already carries and has been assigned
+  double get_maxim(int im) {
+    return target_load - mds_meta_load[im] - imported[im];
+  }
+  // load of 'ex' above target_load, minus what it has already given away
+  double get_maxex(int ex) {
+    return mds_meta_load[ex] - target_load - exported[ex];    
+  }
+
+ public:
+  MDBalancer(MDS *m) : mds(m), beat_epoch(0) {}
+  
+  mds_load_t get_load();
+
+  int proc_message(Message *m);
+  
+  void send_heartbeat();
+  void handle_heartbeat(MHeartbeat *m);
+
+  void do_hashing();
+
+  void export_empties();
+  void do_rebalance(int beat);
+  void find_exports(CDir *dir, 
+                    double amount, 
+                    list<CDir*>& exports, 
+                    double& have,
+                    set<CDir*>& already_exporting);
+
+  // popularity bookkeeping for an exported / imported dir
+  void subtract_export(CDir *ex);
+  void add_import(CDir *im);
+
+  // popularity accounting for one access of the given type
+  void hit_inode(CInode *in, int type=0);
+  void hit_dir(CDir *dir, int type=0);
+  void hit_recursive(CDir *dir, int type=0);
+
+  // debug dump of imports, exports and hashdirs
+  void show_imports(bool external=false);
+
+};
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc
new file mode 100644
index 0000000000000..02e2a9cd1417d
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDCache.cc
@@ -0,0 +1,2580 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "MDCache.h"
+#include "MDStore.h"
+#include "MDS.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDLog.h"
+#include "MDBalancer.h"
+#include "AnchorClient.h"
+#include "Migrator.h"
+#include "Renamer.h"
+
+#include "MDSMap.h"
+
+#include "CInode.h"
+#include "CDir.h"
+
+#include "include/filepath.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "common/Logger.h"
+
+#include "osdc/Filer.h"
+
+#include "events/EUnlink.h"
+#include "events/EPurgeFinish.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+//#include "messages/MInodeUpdate.h"
+#include "messages/MDirUpdate.h"
+#include "messages/MCacheExpire.h"
+
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MInodeLink.h"
+#include "messages/MInodeLinkAck.h"
+#include "messages/MInodeUnlink.h"
+#include "messages/MInodeUnlinkAck.h"
+
+#include "messages/MLock.h"
+#include "messages/MDentryUnlink.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MClientFileCaps.h"
+
+#include "IdAllocator.h"
+
+#include "common/Timer.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <iostream>
+#include <string>
+#include <map>
+using namespace std;
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache "
+
+
+
+
+MDCache::MDCache(MDS *m)
+{
+  mds = m;                                // assigned first: the helpers below are handed it
+  migrator = new Migrator(mds, this);     // deleted in ~MDCache
+  renamer = new Renamer(mds, this);       // deleted in ~MDCache
+  root = NULL;                            // no root inode until set_root() is called
+  lru.lru_set_max(g_conf.mds_cache_size);
+  lru.lru_set_midpoint(g_conf.mds_cache_mid);
+
+  did_shutdown_exports = false;           // becomes true in shutdown_pass() once imports are sent off
+  shutdown_commits = 0;                   // count of commit_dir() callbacks still pending at shutdown
+}
+
+MDCache::~MDCache() 
+{
+  delete migrator;   // both helpers were allocated with new in the constructor
+  delete renamer;
+}
+
+
+void MDCache::log_stat(Logger *logger)
+{
+  if (CInode *r = get_root()) {   // root popularity is only reported when a root exists
+    logger->set("popanyd", (int)r->popularity[MDS_POP_ANYDOM].meta_load());
+    logger->set("popnest", (int)r->popularity[MDS_POP_NESTED].meta_load());
+  }
+  logger->set("c", lru.lru_get_size());             // lru statistics follow
+  logger->set("cpin", lru.lru_get_num_pinned());
+  logger->set("ctop", lru.lru_get_top());
+  logger->set("cbot", lru.lru_get_bot());
+  logger->set("cptail", lru.lru_get_pintail());
+}
+
+
+// 
+
+bool MDCache::shutdown()
+{
+  // Always returns true; a non-empty cache only gets a warning and an imports dump.
+  const bool leftovers = lru.lru_get_size() > 0;
+  if (leftovers) {
+    dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl;
+    show_imports();
+  }
+  return true;
+}
+
+
+// MDCache
+
+CInode *MDCache::create_inode()
+{
+  // Allocate a new CInode, give it a freshly allocated ino and the
+  // g_OSD_FileLayout layout, add it to the cache, and return it.
+  CInode *fresh = new CInode(this);
+
+  // start from an all-zero inode_t, then fill in the fields we know
+  inode_t &raw = fresh->inode;
+  memset(&raw, 0, sizeof(inode_t));
+  raw.ino = mds->idalloc->alloc_id();
+  raw.nlink = 1;   // FIXME
+  raw.layout = g_OSD_FileLayout;
+
+  // register it via add_inode()
+  add_inode(fresh);
+  return fresh;
+}
+
+void MDCache::destroy_inode(CInode *in)
+{
+  inodeno_t ino = in->ino();
+  mds->idalloc->reclaim_id(ino);   // hand the ino back to the id allocator first
+  remove_inode(in);                // then take the inode out of the cache
+}
+
+void MDCache::add_inode(CInode *in) 
+{
+  // lru first, then the inode map; both must stay the same size throughout
+  assert(lru.lru_get_size() == inode_map.size());
+  lru.lru_insert_mid(in);
+  assert(inode_map.find(in->ino()) == inode_map.end());  // should be no dup inos!
+  inode_map[ in->ino() ] = in;
+  assert(lru.lru_get_size() == inode_map.size());
+}
+
+void MDCache::remove_inode(CInode *o) 
+{ 
+  dout(14) << "remove_inode " << *o << endl;
+  // detach from the parent dentry, if there is one (FIXME: multiple parents?)
+  CDentry *parent_dn = o->get_parent_dn();
+  if (parent_dn) {
+    assert(!parent_dn->is_dirty());
+    if (!parent_dn->is_sync())
+      parent_dn->dir->unlink_inode(parent_dn);   // leave dentry
+    else
+      parent_dn->dir->remove_dentry(parent_dn);  // unlink inode AND hose dentry
+  }
+  inode_map.erase(o->ino());   // forget it in the inode map...
+  lru.lru_remove(o);           // ...and in the lru
+}
+
+
+
+
+void MDCache::rename_file(CDentry *srcdn, 
+                          CDentry *destdn)
+{
+  // Move the inode currently linked at srcdn over to destdn, unlinking
+  // whatever inode destdn referred to before.
+  CInode *moving = srcdn->inode;   // grab it before srcdn is unlinked
+  srcdn->dir->unlink_inode(srcdn);
+
+  // destination already occupied?  detach the old inode first.
+  if (destdn->inode != 0) {
+    destdn->dir->unlink_inode(destdn);
+  }
+  destdn->dir->link_inode(destdn, moving);
+}
+
+
+
+void MDCache::set_root(CInode *in)
+{
+  assert(!root);                       // there must be no current root
+  root = in;
+  in->state_set(CINODE_STATE_ROOT);    // flag the inode itself as the root
+}
+
+void MDCache::add_import(CDir *dir)
+{
+  imports.insert(dir);                  // track it in the cache's import set
+  dir->state_set(CDIR_STATE_IMPORT);    // flag the dir as an import
+  dir->get(CDIR_PIN_IMPORT);            // take the import pin (shutdown_pass() put()s it for the root)
+}
+
+
+
+
+
+// **************
+// Inode purging -- reliably removing deleted file's objects
+
+class C_MDC_PurgeFinish : public Context {
+  // handed to filer->remove(); finish() forwards to MDCache::purge_inode_finish()
+public:
+  C_MDC_PurgeFinish(MDCache *cache, inodeno_t which) : mdc(cache), ino(which) {}
+  void finish(int) { mdc->purge_inode_finish(ino); }
+private:
+  MDCache *mdc;
+  inodeno_t ino;
+};
+class C_MDC_PurgeFinish2 : public Context {
+  // handed to mdlog->submit_entry(); finish() forwards to MDCache::purge_inode_finish_2()
+public:
+  C_MDC_PurgeFinish2(MDCache *cache, inodeno_t which) : mdc(cache), ino(which) {}
+  void finish(int) { mdc->purge_inode_finish_2(ino); }
+private:
+  MDCache *mdc;
+  inodeno_t ino;
+};
+
+/* purge_inode
+ * called on unlink or rmdir.
+ * the caller is responsible for journaling an appropriate EUnlink or ERmdir.
+ */
+void MDCache::purge_inode(inode_t &inode)
+{
+  dout(10) << "purge_inode " << inode.ino << " size " << inode.size << endl;
+
+  // keep a copy of the inode in 'purging' (erased again in purge_inode_finish_2)
+  assert(purging.find(inode.ino) == purging.end());
+  purging[inode.ino] = inode;
+
+  // kick off the filer remove; the context calls purge_inode_finish()
+  C_MDC_PurgeFinish *fin = new C_MDC_PurgeFinish(this, inode.ino);
+  mds->filer->remove(inode, 0, inode.size, 0, fin);
+}
+
+void MDCache::purge_inode_finish(inodeno_t ino)
+{
+  dout(10) << "purge_inode_finish " << ino << " - logging our completion" << endl;
+  // journal an EPurgeFinish; the context calls purge_inode_finish_2()
+  EPurgeFinish *entry = new EPurgeFinish(ino);
+  C_MDC_PurgeFinish2 *fin = new C_MDC_PurgeFinish2(this, ino);
+  mds->mdlog->submit_entry(entry, fin);
+}
+
+void MDCache::purge_inode_finish_2(inodeno_t ino)
+{
+  dout(10) << "purge_inode_finish_2 " << ino << endl;
+
+  // the purge is complete; drop it from the in-progress map
+  purging.erase(ino);
+
+  // take this ino's waiter list out of the map, then hand the waiters to
+  // finish_contexts() with result 0  (log flusher?)
+  list<Context*> waiters;
+  waiting_for_purge[ino].swap(waiters);
+  waiting_for_purge.erase(ino);
+  finish_contexts(waiters, 0);
+
+  // reclaim ino?
+}
+
+void MDCache::start_recovered_purges()
+{
+  map<inodeno_t,inode_t>::iterator p = purging.begin();
+  while (p != purging.end()) {   // re-issue the filer remove for every entry in 'purging'
+    inode_t &victim = p->second;
+    dout(10) << "start_recovered_purges " << p->first << " size " << victim.size << endl;
+    mds->filer->remove(victim, 0, victim.size, 0, new C_MDC_PurgeFinish(this, p->first));
+    ++p;
+  }
+}
+
+
+
+
+bool MDCache::trim(int max)   // expire lru entries until at most 'max' remain (max < 0: use the lru's own max)
+{
+  // empty?  short cut.
+  if (lru.lru_get_size() == 0) return true;
+
+  if (max < 0) {
+    max = lru.lru_get_max();
+    if (!max) return false;   // the only path that returns false
+  }
+  // one MCacheExpire per remote authority, filled in while inodes are expired
+  map<int, MCacheExpire*> expiremap;
+
+  dout(7) << "trim max=" << max << "  cur=" << lru.lru_get_size() << endl;
+  assert(expiremap.empty());
+  // expire from the lru until we are at or below max, or lru_expire() yields nothing
+  while (lru.lru_get_size() > (unsigned)max) {
+    CInode *in = (CInode*)lru.lru_expire();
+    if (!in) break; //return false;
+
+    if (in->dir) {
+      // notify dir authority?
+      int auth = in->dir->authority();
+      if (auth != mds->get_nodeid()) {
+        dout(17) << "sending expire to mds" << auth << " on   " << *in->dir << endl;
+        if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid());
+        expiremap[auth]->add_dir(in->ino(), in->dir->replica_nonce);
+      }
+    }
+
+    // notify inode authority?
+    {
+      int auth = in->authority();
+      if (auth != mds->get_nodeid()) {
+        assert(!in->is_auth());
+        dout(17) << "sending expire to mds" << auth << " on " << *in << endl;
+        if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid());
+        expiremap[auth]->add_inode(in->ino(), in->replica_nonce);
+      }    else {
+        assert(in->is_auth());
+      }
+    }
+    CInode *diri = NULL;   // inode of the containing dir, captured before 'in' is deleted
+    if (in->parent)
+      diri = in->parent->dir->inode;
+
+    if (in->is_root()) {
+      dout(7) << "just trimmed root, cache now empty." << endl;
+      root = NULL;
+    }
+
+
+    // last link?
+    if (in->inode.nlink == 0) {
+      dout(17) << "last link, removing file content " << *in << endl;             // FIXME THIS IS WRONG PLACE FOR THIS!
+      mds->filer->zero(in->inode, 
+                       0, in->inode.size, 
+                       NULL, NULL);   // FIXME
+    }
+
+    // remove it
+    dout(15) << "trim removing " << *in << " " << in << endl;
+    remove_inode(in);
+    delete in;
+
+    if (diri) {
+      // dir incomplete!
+      diri->dir->state_clear(CDIR_STATE_COMPLETE);
+
+      // reexport?
+      if (diri->dir->is_import() &&             // import
+          diri->dir->get_size() == 0 &&         // no children
+          !diri->is_root())                   // not root
+        migrator->export_empty_import(diri->dir);
+      
+    } 
+
+    mds->logger->inc("cex");
+  }
+
+
+  /* hack
+  if (lru.lru_get_size() == max) {
+    int i;
+    dout(1) << "lru_top " << lru.lru_ntop << "/" << lru.lru_num << endl;
+    CInode *cur = (CInode*)lru.lru_tophead;
+    i = 1;
+    while (cur) {
+      dout(1) << " top " << i++ << "/" << lru.lru_ntop << " " << cur->lru_is_expireable() << "  " << *cur << endl;
+      cur = (CInode*)cur->lru_next;
+    }
+
+    dout(1) << "lru_bot " << lru.lru_nbot << "/" << lru.lru_num << endl;
+    cur = (CInode*)lru.lru_bothead;
+    i = 1;
+    while (cur) {
+      dout(1) << " bot " << i++ << "/" << lru.lru_nbot << " " << cur->lru_is_expireable() << "  " << *cur << endl;
+      cur = (CInode*)cur->lru_next;
+    }
+
+  }
+  */
+
+  // send expires
+  for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
+       it != expiremap.end();
+       it++) {
+    dout(7) << "sending cache_expire to " << it->first << endl;
+    mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE);
+  }
+
+
+  return true;
+}
+
+class C_MDC_ShutdownCommit : public Context {
+  // Passed to commit_dir() by shutdown_pass(); finish() lowers the cache's
+  // count of outstanding shutdown commits by one.
+  MDCache *mdc;
+public:
+  C_MDC_ShutdownCommit(MDCache *cache) : mdc(cache) {}
+  void finish(int) {
+    --mdc->shutdown_commits;
+  }
+};
+
+class C_MDC_ShutdownCheck : public Context {
+  MDCache *mdc;   // cache whose shutdown_check() is run
+  Mutex *lock;    // held around the shutdown_check() call
+public:
+  C_MDC_ShutdownCheck(MDCache *cache, Mutex *mutex) : mdc(cache), lock(mutex) {}
+  void finish(int) {
+    lock->Lock();
+    mdc->shutdown_check();
+    lock->Unlock();
+  }
+};
+
+void MDCache::shutdown_check()
+{
+  dout(0) << "shutdown_check at " << g_clock.now() << endl;
+
+  // dump the cache with debug_mds temporarily forced to 10, then restore it
+  int saved_debug = g_conf.debug_mds;
+  g_conf.debug_mds = 10;
+  show_cache();
+  g_conf.debug_mds = saved_debug;
+  // schedule the next check
+  C_MDC_ShutdownCheck *again = new C_MDC_ShutdownCheck(this, &mds->mds_lock);
+  g_timer.add_event_after(g_conf.mds_shutdown_check, again);
+  // report the current lru, log, export and filer state
+  dout(0) << "lru size now " << lru.lru_get_size() << endl;
+  dout(0) << "log len " << mds->mdlog->get_num_events() << endl;
+  if (!exports.empty()) {
+    dout(0) << "still have " << exports.size() << " exports" << endl;
+  }
+  if (mds->filer->is_active()) {
+    dout(0) << "filer still active" << endl;
+  }
+}
+
+void MDCache::shutdown_start()
+{
+  dout(1) << "shutdown_start" << endl;
+
+  if (!g_conf.mds_shutdown_check) return;   // zero => no shutdown_check timer is armed
+  g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock));
+}
+
+
+
+bool MDCache::shutdown_pass()   // one step of shutdown; true once nothing is left to do, false if another pass is needed
+{
+  dout(7) << "shutdown_pass" << endl;
+  //assert(mds->is_shutting_down());
+  if (mds->is_stopped()) {
+    dout(7) << " already shut down" << endl;
+    show_cache();
+    show_imports();
+    return true;
+  }
+
+  // unhash dirs?
+  if (!hashdirs.empty()) {
+    // unhash any of my dirs?
+    for (set<CDir*>::iterator it = hashdirs.begin();
+         it != hashdirs.end();
+         it++) {
+      CDir *dir = *it;
+      if (!dir->is_auth()) continue;
+      if (dir->is_unhashing()) continue;
+      migrator->unhash_dir(dir);
+    }
+
+    dout(7) << "waiting for dirs to unhash" << endl;
+    return false;
+  }
+
+  // commit dirs?
+  if (g_conf.mds_commit_on_shutdown) {
+    // NOTE(review): MDCache() sets shutdown_commits to 0, so '< 0' looks unreachable -- confirm
+    if (shutdown_commits < 0) {
+      dout(1) << "shutdown_pass committing all dirty dirs" << endl;
+      shutdown_commits = 0;
+      
+      for (hash_map<inodeno_t, CInode*>::iterator it = inode_map.begin();
+           it != inode_map.end();
+           it++) {
+        CInode *in = it->second;
+        
+        // commit any dirty dir that's ours
+        if (in->is_dir() && in->dir && in->dir->is_auth() && in->dir->is_dirty()) {
+          mds->mdstore->commit_dir(in->dir, new C_MDC_ShutdownCommit(this));
+          shutdown_commits++;
+        }
+      }
+    }
+
+    // commits?
+    if (shutdown_commits > 0) {
+      dout(7) << "shutdown_commits still waiting for " << shutdown_commits << endl;
+      return false;
+    }
+  }
+
+  // flush anything we can from the cache
+  trim(0);
+  dout(5) << "cache size now " << lru.lru_get_size() << endl;
+
+
+  // (wait for) flush log?
+  if (g_conf.mds_log_flush_on_shutdown &&
+      mds->mdlog->get_num_events()) {
+    dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() << endl;
+    return false;
+  } 
+  
+  // send all imports back to 0.
+  if (mds->get_nodeid() != 0 && !did_shutdown_exports) {
+    // flush what i can from the cache first..
+    trim(0);
+
+    // export to root
+    for (set<CDir*>::iterator it = imports.begin();
+         it != imports.end();
+         ) {
+      CDir *im = *it;
+      it++;   // advance before export_dir() is called on 'im'
+      if (im->inode->is_root()) continue;
+      if (im->is_frozen() || im->is_freezing()) continue;
+      
+      dout(7) << "sending " << *im << " back to mds0" << endl;
+      migrator->export_dir(im,0);
+    }
+    did_shutdown_exports = true;   // this export sweep runs at most once
+  } 
+
+
+  // waiting for imports?  (e.g. root?)
+  if (exports.size()) {
+    dout(7) << "still have " << exports.size() << " exports" << endl;
+    //show_cache();
+    return false;
+  }
+
+  // filer active?
+  if (mds->filer->is_active()) {
+    dout(7) << "filer still active" << endl;
+    return false;
+  }
+  
+  // close root?
+  if (mds->get_nodeid() == 0 &&
+      lru.lru_get_size() == 1 &&
+      root && 
+      root->dir && 
+      root->dir->is_import() &&
+      root->dir->get_ref() == 1) {  // 1 is the import!
+    // un-import
+    dout(7) << "removing root import" << endl;
+    imports.erase(root->dir);
+    root->dir->state_clear(CDIR_STATE_IMPORT);
+    root->dir->put(CDIR_PIN_IMPORT);
+
+    if (root->is_pinned_by(CINODE_PIN_DIRTY)) {
+      dout(7) << "clearing root dirty flag" << endl;
+      root->put(CINODE_PIN_DIRTY);
+    }
+
+    trim(0);
+    assert(inode_map.size() == lru.lru_get_size());
+  }
+  
+  // imports?
+  if (!imports.empty()) {
+    dout(7) << "still have " << imports.size() << " imports" << endl;
+    show_cache();
+    return false;
+  }
+  
+  // done?
+  if (lru.lru_get_size() > 0) {
+    dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl;
+    show_cache();
+    //dump();
+    return false;
+  } 
+  
+  // done!
+  dout(1) << "shutdown done." << endl;
+  return true;
+}
+
+
+
+
+
+
+
+// Make the root inode available on this node.
+//
+// mds0 fabricates the root inode and its directory locally and completes c
+// immediately.  Any other node queues c on waiting_for_root, first sending a
+// discover to mds0 if nothing was queued there yet.  Always returns 0.
+int MDCache::open_root(Context *c)
+{
+  int whoami = mds->get_nodeid();
+
+  if (whoami != 0) {
+    // request inode from root mds, unless somebody is already waiting on it
+    if (!waiting_for_root.empty()) {
+      dout(7) << "waiting for root" << endl;
+    } else {
+      dout(7) << "discovering root" << endl;
+
+      filepath want;
+      // there _is_ no base dir for the root inode
+      MDiscover *req = new MDiscover(whoami, 0, want, false);
+      mds->send_message_mds(req, 0, MDS_PORT_CACHE);
+    }
+
+    // wait
+    waiting_for_root.push_back(c);
+    return 0;
+  }
+
+  // i am root inode
+  CInode *in = new CInode(this);
+  memset(&in->inode, 0, sizeof(inode_t));
+  in->inode.ino = 1;
+  in->inode.hash_seed = 0;   // not hashed!
+
+  // make it up (FIXME)
+  in->inode.mode = 0755 | INODE_MODE_DIR;
+  in->inode.size = 0;
+  in->inode.ctime = 0;
+  in->inode.mtime = g_clock.gettime();
+
+  in->inode.nlink = 1;
+  in->inode.layout = g_OSD_MDDirLayout;
+
+  set_root(in);
+  add_inode(in);
+
+  // root directory too
+  assert(in->dir == NULL);
+  in->set_dir(new CDir(in, mds, true));
+  CDir *rootdir = in->dir;
+  rootdir->set_dir_auth(0);  // me!
+  rootdir->dir_rep = CDIR_REP_ALL;   //NONE;
+
+  // root is sort of technically an import (from a vacuum)
+  imports.insert(rootdir);
+  rootdir->state_set(CDIR_STATE_IMPORT);
+  rootdir->get(CDIR_PIN_IMPORT);
+
+  // nothing to wait for; complete the caller now
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+
+  return 0;
+}
+
+
+
+
+
+
+
+
+// ========= messaging ==============
+
+
+// Hand an incoming cache message to the handler for its type.
+// A type with no handler is logged and trips an assert.
+void MDCache::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+  // replica discovery
+  case MSG_MDS_DISCOVER:
+    handle_discover((MDiscover*)m);
+    return;
+  case MSG_MDS_DISCOVERREPLY:
+    handle_discover_reply((MDiscoverReply*)m);
+    return;
+
+    /*
+  case MSG_MDS_INODEUPDATE:
+    handle_inode_update((MInodeUpdate*)m);
+    break;
+    */
+
+  // hard links
+  case MSG_MDS_INODELINK:
+    handle_inode_link((MInodeLink*)m);
+    return;
+  case MSG_MDS_INODELINKACK:
+    handle_inode_link_ack((MInodeLinkAck*)m);
+    return;
+
+  // directory / cache / dentry updates
+  case MSG_MDS_DIRUPDATE:
+    handle_dir_update((MDirUpdate*)m);
+    return;
+  case MSG_MDS_CACHEEXPIRE:
+    handle_cache_expire((MCacheExpire*)m);
+    return;
+  case MSG_MDS_DENTRYUNLINK:
+    handle_dentry_unlink((MDentryUnlink*)m);
+    return;
+
+  default:
+    dout(7) << "cache unknown message " << m->get_type() << endl;
+    assert(0);
+    return;
+  }
+}
+
+
+/* path_traverse
+ *
+ * return values:
+ *   <0 : traverse error (ENOTDIR, ENOENT)
+ *    0 : success
+ *   >0 : delayed or forwarded
+ *
+ * Notes:
+ *   onfinish context is only needed if you specify MDS_TRAVERSE_DISCOVER _and_
+ *   you aren't absolutely certain that the path actually exists.  If it doesn't,
+ *   the context is needed to pass a (failure) result code.
+ */
+
+// Waiter that path_traverse queues while a discover is outstanding.  It keeps
+// both of the caller's contexts until the discover result is known.
+class C_MDC_TraverseDiscover : public Context {
+  Context *onfinish, *ondelay;
+ public:
+  C_MDC_TraverseDiscover(Context *onfinish, Context *ondelay)
+    : onfinish(onfinish), ondelay(ondelay) {}
+
+  void finish(int r) {
+    //dout(10) << "TraverseDiscover r = " << r << endl;
+
+    // a failure (ENOENT on discover) is passed back through onfinish when the
+    // caller supplied one; every other outcome retries through ondelay.
+    Context *target = (r < 0 && onfinish) ? onfinish : ondelay;
+    target->finish(r);
+
+    // both contexts are released here, whichever one ran
+    delete onfinish;
+    delete ondelay;
+  }
+};
+
+/*
+ * Walk origpath down from the root inode, appending every dentry crossed to
+ * trace (trace is cleared first).
+ *
+ * Both contexts are consumed on every return path:
+ *   result <= 0 : ondelay is deleted; onfinish (if any) is finished with the
+ *                 result and deleted.
+ *   result == 1 : ondelay has been handed to whatever the traverse is waiting
+ *                 on and onfinish is deleted -- except when waiting on a
+ *                 discover, where both are handed to a C_MDC_TraverseDiscover.
+ *   result == 2 : req has been forwarded to the dentry authority and both
+ *                 contexts are deleted.
+ *
+ * onfail selects what happens on a dentry this node is not authoritative for:
+ * MDS_TRAVERSE_DISCOVER / MDS_TRAVERSE_DISCOVERXLOCK, MDS_TRAVERSE_FORWARD or
+ * MDS_TRAVERSE_FAIL.  The two discover modes also skip the hard-read and
+ * xlock checks.
+ */
+int MDCache::path_traverse(filepath& origpath, 
+                           vector<CDentry*>& trace, 
+                           bool follow_trailing_symlink,
+                           Message *req,
+                           Context *ondelay,
+                           int onfail,
+                           Context *onfinish,
+                           bool is_client_req)  // true if req is MClientRequest .. gross, FIXME
+{
+  int whoami = mds->get_nodeid();
+  set< pair<CInode*, string> > symlinks_resolved; // keep a list of symlinks we touch to avoid loops
+
+  bool noperm = false;
+  if (onfail == MDS_TRAVERSE_DISCOVER ||
+      onfail == MDS_TRAVERSE_DISCOVERXLOCK) noperm = true;
+
+  // root
+  CInode *cur = get_root();
+  if (cur == NULL) {
+    dout(7) << "traverse: i don't have root" << endl;
+    open_root(ondelay);
+    if (onfinish) delete onfinish;
+    return 1;
+  }
+
+  // start trace
+  trace.clear();
+
+  // make our own copy, since we'll modify when we hit symlinks
+  filepath path = origpath;  
+
+  unsigned depth = 0;
+  while (depth < path.depth()) {
+    dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << endl;
+    
+    // ENOTDIR?
+    if (!cur->is_dir()) {
+      dout(7) << "traverse: " << *cur << " not a dir " << endl;
+      delete ondelay;
+      if (onfinish) {
+        onfinish->finish(-ENOTDIR);
+        delete onfinish;
+      }
+      return -ENOTDIR;
+    }
+
+    // open dir
+    if (!cur->dir) {
+      if (cur->dir_is_auth()) {
+        // parent dir frozen_dir?
+        if (cur->is_frozen_dir()) {
+          dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl;
+          cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, ondelay);
+          if (onfinish) delete onfinish;
+          return 1;
+        }
+
+        cur->get_or_open_dir(mds);
+        assert(cur->dir);
+      } else {
+        // discover dir from/via inode auth
+        assert(!cur->is_auth());
+        if (cur->waiting_for(CINODE_WAIT_DIR)) {
+          dout(10) << "traverse: need dir for " << *cur << ", already doing discover" << endl;
+        } else {
+          filepath want = path.postfixpath(depth);
+          dout(10) << "traverse: need dir for " << *cur << ", doing discover, want " << want.get_path() << endl;
+          mds->send_message_mds(new MDiscover(mds->get_nodeid(),
+                                              cur->ino(),
+                                              want,
+                                              true),  // need this dir too
+                                cur->authority(), MDS_PORT_CACHE);
+        }
+        cur->add_waiter(CINODE_WAIT_DIR, ondelay);
+        if (onfinish) delete onfinish;
+        return 1;
+      }
+    }
+    
+    // frozen?
+    /*
+    if (cur->dir->is_frozen()) {
+      // doh!
+      // FIXME: traverse is allowed?
+      dout(7) << "traverse: " << *cur->dir << " is frozen, waiting" << endl;
+      cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, ondelay);
+      if (onfinish) delete onfinish;
+      return 1;
+    }
+    */
+
+    // must read directory hard data (permissions, x bit) to traverse
+    if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) {
+      if (onfinish) delete onfinish;
+      return 1;
+    }
+    
+    // check permissions?
+    // XXX
+    
+    // ..?
+    if (path[depth] == "..") {
+      // ".." with nothing above us (e.g. at the root) stays where it is:
+      // popping an empty trace or following a null parent would be invalid.
+      if (!trace.empty()) trace.pop_back();
+      depth++;
+      CInode *parent = cur->get_parent_inode();
+      if (parent) cur = parent;
+      dout(10) << "traverse: following .. back to " << *cur << endl;
+      continue;
+    }
+
+
+    // dentry
+    CDentry *dn = cur->dir->lookup(path[depth]);
+
+    // null and last_bit and xlocked by me?
+    if (dn && dn->is_null() && 
+        dn->is_xlockedbyme(req) &&
+        depth == path.depth()-1) {
+      dout(10) << "traverse: hit (my) xlocked dentry at tail of traverse, succeeding" << endl;
+      trace.push_back(dn);
+      break; // done!
+    }
+
+    if (dn && !dn->is_null()) {
+      // dentry exists.  xlocked?
+      if (!noperm && dn->is_xlockedbyother(req)) {
+        dout(10) << "traverse: xlocked dentry at " << *dn << endl;
+        cur->dir->add_waiter(CDIR_WAIT_DNREAD,
+                             path[depth],
+                             ondelay);
+        if (onfinish) delete onfinish;
+        return 1;
+      }
+
+      // do we have inode?
+      if (!dn->inode) {
+        assert(dn->is_remote());
+        // do i have it?
+        CInode *in = get_inode(dn->get_remote_ino());
+        if (in) {
+          dout(7) << "linking in remote in " << *in << endl;
+          dn->link_remote(in);
+        } else {
+          dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl;
+          open_remote_ino(dn->get_remote_ino(), req,
+                          ondelay);
+          // delayed like every other wait below: onfinish is not carried along
+          if (onfinish) delete onfinish;
+          return 1;
+        }        
+      }
+
+      // symlink?
+      if (dn->inode->is_symlink() &&
+          (follow_trailing_symlink || depth < path.depth()-1)) {
+        // symlink, resolve!
+        filepath sym = dn->inode->symlink;
+        dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << endl;
+
+        // break up path components
+        // /head/symlink/tail
+        filepath head = path.prefixpath(depth);
+        filepath tail = path.postfixpath(depth+1);
+        dout(10) << "traverse: path head = " << head << endl;
+        dout(10) << "traverse: path tail = " << tail << endl;
+        
+        if (symlinks_resolved.count(pair<CInode*,string>(dn->inode, tail.get_path()))) {
+          dout(10) << "already hit this symlink, bailing to avoid the loop" << endl;
+          // an error return like ENOTDIR/ENOENT: release/complete the contexts
+          delete ondelay;
+          if (onfinish) {
+            onfinish->finish(-ELOOP);
+            delete onfinish;
+          }
+          return -ELOOP;
+        }
+        symlinks_resolved.insert(pair<CInode*,string>(dn->inode, tail.get_path()));
+
+        // start at root?
+        if (dn->inode->symlink[0] == '/') {
+          // absolute
+          // NOTE(review): the symlink target (sym) is not added to the new
+          // path and cur is not moved back to the root here -- confirm this
+          // is intended.
+          trace.clear();
+          depth = 0;
+          path = tail;
+          dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << endl;
+        } else {
+          // relative
+          path = head;
+          path.append(sym);
+          path.append(tail);
+          dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << endl;
+        }
+        continue;        
+      } else {
+        // keep going.
+
+        // forwarder wants replicas?
+        if (is_client_req && ((MClientRequest*)req)->get_mds_wants_replica_in_dirino()) {
+          dout(30) << "traverse: REP is here, " << ((MClientRequest*)req)->get_mds_wants_replica_in_dirino() << " vs " << cur->dir->ino() << endl;
+          
+          if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() &&
+              cur->dir->is_auth() && 
+              cur->dir->is_rep() &&
+              cur->dir->is_open_by(req->get_source().num()) &&
+              dn->get_inode()->is_auth()
+              ) {
+            assert(req->get_source().is_mds());
+            int from = req->get_source().num();
+            
+            if (dn->get_inode()->is_cached_by(from)) {
+              dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " 
+                       << MSG_ADDR_NICE(req->get_source()) << " dn " << *dn << endl; 
+            } else {
+              dout(10) << "traverse: REP replicating to " << MSG_ADDR_NICE(req->get_source()) << " dn " << *dn << endl;
+              MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino());
+              reply->add_dentry( dn->get_name(), !dn->can_read());
+              reply->add_inode( dn->inode->replicate_to( from ) );
+              mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE);
+            }
+          }
+        }
+            
+        trace.push_back(dn);
+        cur = dn->inode;
+        touch_inode(cur);
+        depth++;
+        continue;
+      }
+    }
+    
+    // MISS.  don't have it.
+
+    int dauth = cur->dir->dentry_authority( path[depth] );
+    dout(12) << "traverse: miss on dentry " << path[depth] << " dauth " << dauth << " in " << *cur->dir << endl;
+    
+
+    if (dauth == whoami) {
+      // dentry is mine.
+      if (cur->dir->is_complete()) {
+        // file not found
+        delete ondelay;
+        if (onfinish) {
+          onfinish->finish(-ENOENT);
+          delete onfinish;
+        }
+        return -ENOENT;
+      } else {
+        
+        //wrong?
+        //if (onfail == MDS_TRAVERSE_DISCOVER) 
+        //  return -1;
+        
+        // directory isn't complete; reload
+        dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl;
+        touch_inode(cur);
+        mds->mdstore->fetch_dir(cur->dir, ondelay);
+        
+        mds->logger->inc("cmiss");
+
+        if (onfinish) delete onfinish;
+        return 1;
+      }
+    } else {
+      // dentry is not mine.
+      
+      /* no, let's let auth handle the discovery/replication ..
+      if (onfail == MDS_TRAVERSE_FORWARD && 
+          onfinish == 0 &&   // no funnyness
+          cur->dir->is_rep()) {
+        dout(5) << "trying to discover in popular dir " << *cur->dir << endl;
+        onfail = MDS_TRAVERSE_DISCOVER;
+      }
+      */
+
+      if ((onfail == MDS_TRAVERSE_DISCOVER ||
+           onfail == MDS_TRAVERSE_DISCOVERXLOCK)) {
+        // discover
+
+        filepath want = path.postfixpath(depth);
+        if (cur->dir->waiting_for(CDIR_WAIT_DENTRY, path[depth])) {
+          dout(7) << "traverse: already waiting for discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl;
+        } else {
+          dout(7) << "traverse: discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl;
+          
+          touch_inode(cur);
+        
+          mds->send_message_mds(new MDiscover(mds->get_nodeid(),
+                                              cur->ino(),
+                                              want,
+                                              false),
+                                dauth, MDS_PORT_CACHE);
+          mds->logger->inc("dis");
+        }
+        
+        // delay processing of current request.
+        //  delay finish vs ondelay until result of traverse, so that ENOENT can be 
+        //  passed to onfinish if necessary
+        cur->dir->add_waiter(CDIR_WAIT_DENTRY, 
+                             path[depth], 
+                             new C_MDC_TraverseDiscover(onfinish, ondelay));
+        
+        mds->logger->inc("cmiss");
+        return 1;
+      } 
+      if (onfail == MDS_TRAVERSE_FORWARD) {
+        // forward
+        dout(7) << "traverse: not auth for " << path << " at " << path[depth] << ", fwd to mds" << dauth << endl;
+
+        if (is_client_req && cur->dir->is_rep()) {
+          dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << " req " << *(MClientRequest*)req << endl;
+          ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(cur->dir->ino());
+          req->clear_payload();  // reencode!
+        }
+
+        mds->send_message_mds(req, dauth, req->get_dest_port());
+        //show_imports();
+        
+        mds->logger->inc("cfw");
+        if (onfinish) delete onfinish;
+        delete ondelay;
+        return 2;
+      }    
+      if (onfail == MDS_TRAVERSE_FAIL) {
+        delete ondelay;
+        if (onfinish) {
+          onfinish->finish(-ENOENT);  // -ENOENT, but only because i'm not the authority!
+          delete onfinish;
+        }
+        return -ENOENT;  // not necessarily exactly true....
+      }
+    }
+    
+    assert(0);  // i shouldn't get here
+  }
+  
+  // success.
+  delete ondelay;
+  if (onfinish) {
+    onfinish->finish(0);
+    delete onfinish;
+  }
+  return 0;
+}
+
+
+
+// Ask the authority of diri to send us its directory, and queue fin on the
+// inode until the dir shows up.  diri must be a directory inode that we are
+// not authoritative for and whose dir is not open here yet.
+void MDCache::open_remote_dir(CInode *diri,
+                              Context *fin) 
+{
+  dout(10) << "open_remote_dir on " << *diri << endl;
+  
+  // preconditions
+  assert(diri->is_dir());
+  assert(!diri->dir_is_auth());
+  assert(!diri->is_auth());
+  assert(diri->dir == 0);
+
+  // empty want: no dentries, i just want the dir open
+  filepath want;
+  MDiscover *dis = new MDiscover(mds->get_nodeid(),
+                                 diri->ino(),
+                                 want,
+                                 true);  // need the dir open
+  mds->send_message_mds(dis, diri->authority(), MDS_PORT_CACHE);
+
+  diri->add_waiter(CINODE_WAIT_DIR, fin);
+}
+
+
+
+// Completion for the anchor lookup started by open_remote_ino.  The lookup
+// fills in anchortrace; on completion the trace is passed on to
+// open_remote_ino_2.
+class C_MDC_OpenRemoteInoLookup : public Context {
+  MDCache *mdc;
+  inodeno_t ino;
+  Message *req;
+  Context *onfinish;
+public:
+  vector<Anchor*> anchortrace;   // public: handed to the anchor client to fill in
+
+  C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, Message *req, Context *onfinish)
+    : mdc(mdc), ino(ino), req(req), onfinish(onfinish) {}
+
+  void finish(int r) {
+    assert(r == 0);
+    if (r != 0) {
+      // only reachable with asserts compiled out: pass the error along
+      onfinish->finish(r);
+      delete onfinish;
+      return;
+    }
+    mdc->open_remote_ino_2(ino, req, anchortrace, onfinish);
+  }
+};
+
+// Start opening an inode we only know by number: look up its anchor trace
+// first; C_MDC_OpenRemoteInoLookup continues in open_remote_ino_2 once the
+// lookup completes.
+void MDCache::open_remote_ino(inodeno_t ino,
+                              Message *req,
+                              Context *onfinish)
+{
+  dout(7) << "open_remote_ino on " << ino << endl;
+  
+  // the lookup writes its result into the completion's own anchortrace
+  C_MDC_OpenRemoteInoLookup *lookup =
+    new C_MDC_OpenRemoteInoLookup(this, ino, req, onfinish);
+  mds->anchorclient->lookup(ino, lookup->anchortrace, lookup);
+}
+
+// Stand-in passed to path_traverse as its ondelay context.  path_traverse
+// deletes its ondelay on every immediate (<= 0) result, so the caller's real
+// context must not be handed over directly.  This shim only forwards to (and
+// releases) the real context when it is actually fired; deleting the shim
+// unfired leaves the real context untouched.
+class C_MDC_OpenRemoteInoDelayed : public Context {
+  Context *onfinish;
+public:
+  C_MDC_OpenRemoteInoDelayed(Context *onfinish) : onfinish(onfinish) {}
+  void finish(int r) {
+    onfinish->finish(r);
+    delete onfinish;
+  }
+};
+
+// Second half of open_remote_ino: turn the anchor trace into a path and
+// traverse it (discovering as needed).
+//
+// If the traverse is delayed, onfinish is fired later by whatever the
+// traverse is waiting on.  Otherwise onfinish is finished with the traverse
+// result and deleted here.
+void MDCache::open_remote_ino_2(inodeno_t ino,
+                                Message *req,
+                                vector<Anchor*>& anchortrace,
+                                Context *onfinish)
+{
+  dout(7) << "open_remote_ino_2 on " << ino << ", trace depth is " << anchortrace.size() << endl;
+  
+  // construct path
+  filepath path;
+  for (unsigned i=0; i<anchortrace.size(); i++) 
+    path.add_dentry(anchortrace[i]->ref_dn);
+
+  dout(7) << " path is " << path << endl;
+
+  vector<CDentry*> trace;
+  int r = path_traverse(path, trace, false,
+                        req,
+                        new C_MDC_OpenRemoteInoDelayed(onfinish),  // delay actually
+                        MDS_TRAVERSE_DISCOVER);
+  if (r > 0) return;   // delayed; the shim will fire onfinish later
+  
+  // immediate result: path_traverse has disposed of the shim only, so
+  // onfinish is still ours to complete.
+  onfinish->finish(r);
+  delete onfinish;
+}
+
+
+
+
+// path pins
+
+// Pin every dentry in trace on behalf of m -- all of them or none.
+//
+// Returns true once everything is pinned (c is then deleted).  If some dentry
+// is not pinnable nothing is pinned and false is returned; c, when given, is
+// queued on that dentry's dir to be run when it becomes pinnable.
+bool MDCache::path_pin(vector<CDentry*>& trace,
+                       Message *m,
+                       Context *c)
+{
+  typedef vector<CDentry*>::iterator trace_iter;
+
+  // verify everything is pinnable
+  for (trace_iter p = trace.begin(); p != trace.end(); p++) {
+    CDentry *dn = *p;
+    if (dn->is_pinnable(m)) 
+      continue;
+
+    // wait
+    if (!c) {
+      dout(10) << "path_pin can't pin, no waiter, failing." << endl;
+    } else {
+      dout(10) << "path_pin can't pin " << *dn << ", waiting" << endl;
+      dn->dir->add_waiter(CDIR_WAIT_DNPINNABLE, dn->name, c);
+    }
+    return false;
+  }
+
+  // pin!
+  for (trace_iter p = trace.begin(); p != trace.end(); p++) {
+    CDentry *dn = *p;
+    dn->pin(m);
+    dout(11) << "path_pinned " << *dn << endl;
+  }
+
+  // no wait was needed, so the waiter is not needed either
+  delete c;
+  return true;
+}
+
+
+// Drop m's pin on every dentry in trace.  A dentry that was in the UNPINNING
+// lock state and has just lost its last pin goes back to SYNC and has its
+// DNUNPINNED waiters run immediately.
+void MDCache::path_unpin(vector<CDentry*>& trace,
+                         Message *m)
+{
+  vector<CDentry*>::iterator p = trace.begin();
+  for ( ; p != trace.end(); p++) {
+    CDentry *dn = *p;
+    dn->unpin(m);
+    dout(11) << "path_unpinned " << *dn << endl;
+
+    // did we completely unpin a waiter?
+    if (dn->lockstate != DN_LOCK_UNPINNING || dn->is_pinned()) 
+      continue;
+
+    // return state to sync, in case the unpinner flails
+    dn->lockstate = DN_LOCK_SYNC;
+
+    // run finisher right now to give them a fair shot.
+    dn->dir->finish_waiting(CDIR_WAIT_DNUNPINNED, dn->name);
+  }
+}
+
+
+// Append to trace the dentries leading to in, topmost first.  An inode with
+// no parent inode contributes nothing.
+void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
+{
+  CInode *up = in->get_parent_inode();
+  if (!up) 
+    return;
+
+  // ancestors first, so the trace comes out top-down
+  make_trace(trace, up);
+
+  CDentry *link = in->get_parent_dn();
+  dout(15) << "make_trace adding " << *link << endl;
+  trace.push_back(link);
+}
+
+
+// Register req as an active request working on ref.
+//
+// A non-empty trace is path-pinned first; if that has to wait, a retry of the
+// message is queued and false is returned with nothing registered.  Otherwise
+// the request is recorded, ref gets a request pin, and true is returned.
+bool MDCache::request_start(Message *req,
+                            CInode *ref,
+                            vector<CDentry*>& trace)
+{
+  assert(active_requests.count(req) == 0);
+
+  const bool have_trace = !trace.empty();
+
+  // pin path
+  if (have_trace &&
+      !path_pin(trace, req, new C_MDS_RetryMessage(mds,req))) 
+    return false;
+
+  dout(7) << "request_start " << *req << endl;
+
+  // add to map; the trace is filed under its last dentry
+  active_requests[req].ref = ref;
+  if (have_trace) 
+    active_requests[req].traces[trace.back()] = trace;
+
+  // request pins
+  request_pin_inode(req, ref);
+  
+  mds->logger->inc("req");
+
+  return true;
+}
+
+
+// Take a request pin on in for req, at most once per (request, inode).
+void MDCache::request_pin_inode(Message *req, CInode *in) 
+{
+  set<CInode*>& pinned = active_requests[req].request_pins;
+  if (pinned.count(in)) 
+    return;   // this request already holds a pin on it
+
+  in->request_pin_get();
+  pinned.insert(in);
+}
+
+// Take a request pin on dir for req, at most once per (request, dir).
+void MDCache::request_pin_dir(Message *req, CDir *dir) 
+{
+  set<CDir*>& pinned = active_requests[req].request_dir_pins;
+  if (pinned.count(dir)) 
+    return;   // this request already holds a pin on it
+
+  dir->request_pin_get();
+  pinned.insert(dir);
+}
+
+
+// Undo everything an active request holds and forget about it: finish
+// leftover local xlocks, send unxlocks for foreign ones, unpin traced paths,
+// drop request pins on inodes and dirs, and erase the request from
+// active_requests.  Cache statistics are logged on the way out.  The message
+// itself is not freed here.
+void MDCache::request_cleanup(Message *req)
+{
+  assert(active_requests.count(req) == 1);
+
+  // leftover xlocks?
+  if (!active_requests[req].xlocks.empty()) {
+    // iterate over a copy; the live set is expected to be empty afterwards
+    set<CDentry*> leftover = active_requests[req].xlocks;
+
+    for (set<CDentry*>::iterator p = leftover.begin();
+         p != leftover.end();
+         ++p) {
+      CDentry *dn = *p;
+      
+      dout(7) << "request_cleanup leftover xlock " << *dn << endl;
+      
+      mds->locker->dentry_xlock_finish(dn);
+      
+      // queue finishers
+      dn->dir->take_waiting(CDIR_WAIT_ANY, dn->name, mds->finished_queue);
+
+      // remove clean, null dentry?  (from a failed rename or whatever)
+      const bool disposable = dn->is_null() && dn->is_sync() && !dn->is_dirty();
+      if (disposable) 
+        dn->dir->remove_dentry(dn);
+    }
+    
+    assert(active_requests[req].xlocks.empty());  // we just finished finished them
+  }
+
+  // foreign xlocks?
+  if (!active_requests[req].foreign_xlocks.empty()) {
+    // take the set over and clear the live one before notifying anyone
+    set<CDentry*> foreign = active_requests[req].foreign_xlocks;
+    active_requests[req].foreign_xlocks.clear();
+    
+    for (set<CDentry*>::iterator p = foreign.begin();
+         p != foreign.end();
+         ++p) {
+      CDentry *dn = *p;
+      
+      dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl;
+      assert(dn->is_xlocked());
+      int dauth = dn->dir->dentry_authority(dn->name);
+      MLock *unlock = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid());
+      unlock->set_dn(dn->dir->ino(), dn->name);
+      mds->send_message_mds(unlock, dauth, MDS_PORT_CACHE);
+    }
+  }
+
+  // unpin paths
+  for (map< CDentry*, vector<CDentry*> >::iterator t = active_requests[req].traces.begin();
+       t != active_requests[req].traces.end();
+       ++t) 
+    path_unpin(t->second, req);
+  
+  // request pins
+  for (set<CInode*>::iterator i = active_requests[req].request_pins.begin();
+       i != active_requests[req].request_pins.end();
+       ++i) 
+    (*i)->request_pin_put();
+  for (set<CDir*>::iterator d = active_requests[req].request_dir_pins.begin();
+       d != active_requests[req].request_dir_pins.end();
+       ++d) 
+    (*d)->request_pin_put();
+
+  // remove from map
+  active_requests.erase(req);
+
+
+  // log some stats *****
+  mds->logger->set("c", lru.lru_get_size());
+  mds->logger->set("cpin", lru.lru_get_num_pinned());
+  mds->logger->set("ctop", lru.lru_get_top());
+  mds->logger->set("cbot", lru.lru_get_bot());
+  mds->logger->set("cptail", lru.lru_get_pintail());
+  //mds->logger->set("buf",buffer_total_alloc);
+
+  if (!g_conf.log_pins) 
+    return;
+
+  // pin
+  for (int i=0; i<CINODE_NUM_PINS; i++) 
+    mds->logger2->set(cinode_pin_names[i], cinode_pins[i]);
+
+  /*
+    for (map<int,int>::iterator it = cdir_pins.begin();
+    it != cdir_pins.end();
+    it++) {
+    //string s = "D";
+    //s += cdir_pin_names[it->first];
+    mds->logger2->set(//s, 
+    cdir_pin_names[it->first],
+    it->second);
+    }
+  */
+}
+
+// A request is done: release what it holds, free the message, and count the
+// reply.
+void MDCache::request_finish(Message *req)
+{
+  dout(7) << "request_finish " << *req << endl;
+
+  request_cleanup(req);
+  delete req;  // the request message is ours to free
+
+  mds->logger->inc("reply");
+
+  //dump();
+}
+
+
+// Pass an active request on to mds `who`: release what it holds here, then
+// send the message itself.  A port of 0 means MDS_PORT_SERVER.
+void MDCache::request_forward(Message *req, int who, int port)
+{
+  if (port == 0) 
+    port = MDS_PORT_SERVER;
+
+  dout(7) << "request_forward to " << who << " req " << *req << endl;
+
+  // clean up first; the message stays alive and is handed to the messenger
+  request_cleanup(req);
+  mds->send_message_mds(req, who, port);
+
+  mds->logger->inc("fw");
+}
+
+
+
+// ANCHORS
+
+// Completion for the anchor-create request issued by MDCache::anchor_inode.
+//
+// Success marks the inode anchored and dirty.  In every case the ANCHORING
+// state and pin taken by anchor_inode are released, so a failed attempt does
+// not leave the inode permanently "anchoring" (which would make later
+// anchor_inode calls wait on a request that is no longer in flight).
+// Waiters on CINODE_WAIT_ANCHORED are then run with the result.
+class C_MDC_AnchorInode : public Context {
+  CInode *in;
+  
+public:
+  C_MDC_AnchorInode(CInode *in) {
+    this->in = in;
+  }
+  void finish(int r) {
+    if (r == 0) {
+      assert(in->inode.anchored == false);
+      in->inode.anchored = true;
+    }
+
+    // the attempt is over, successful or not
+    in->state_clear(CINODE_STATE_ANCHORING);
+    in->put(CINODE_PIN_ANCHORING);
+
+    if (r == 0) 
+      in->mark_dirty();
+
+    // trigger
+    in->finish_waiting(CINODE_WAIT_ANCHORED, r);
+  }
+};
+
+// Get an anchor created for in (which we must be authoritative for) and run
+// onfinish when CINODE_WAIT_ANCHORED waiters are triggered.  If an anchoring
+// attempt is already in progress, onfinish simply joins its waiters.
+void MDCache::anchor_inode(CInode *in, Context *onfinish)
+{
+  assert(in->is_auth());
+
+  // already anchoring?
+  if (in->state_test(CINODE_STATE_ANCHORING)) {
+    dout(7) << "anchor_inode already anchoring " << *in << endl;
+
+    // wait
+    in->add_waiter(CINODE_WAIT_ANCHORED, onfinish);
+    return;
+  }
+
+  dout(7) << "anchor_inode anchoring " << *in << endl;
+
+  // auth: do it -- mark the attempt in progress and pin the inode
+  in->state_set(CINODE_STATE_ANCHORING);
+  in->get(CINODE_PIN_ANCHORING);
+    
+  // wait
+  in->add_waiter(CINODE_WAIT_ANCHORED, onfinish);
+    
+  // make trace
+  vector<Anchor*> anchors;
+  in->make_anchor_trace(anchors);
+    
+  // do it
+  C_MDC_AnchorInode *done = new C_MDC_AnchorInode( in );
+  mds->anchorclient->create(in->ino(), anchors, done);
+}
+
+
+// Handle another MDS's request to add a link to an inode.
+//
+// If we are not the inode's authority the message is forwarded there.  If the
+// inode is not anchored yet, an anchor is created first and the message is
+// retried afterwards.  Otherwise nlink is bumped by exactly one, the inode is
+// marked dirty, and an MInodeLinkAck is sent back to the sender.
+void MDCache::handle_inode_link(MInodeLink *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  if (!in->is_auth()) {
+    assert(in->is_proxy());
+    dout(7) << "handle_inode_link not auth for " << *in << ", fw to auth" << endl;
+    mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
+    return;
+  }
+
+  dout(7) << "handle_inode_link on " << *in << endl;
+
+  if (!in->is_anchored()) {
+    assert(in->inode.nlink == 1);
+    dout(7) << "needs anchor, nlink=" << in->inode.nlink << ", creating anchor" << endl;
+    
+    anchor_inode(in,
+                 new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  in->inode.nlink++;
+  in->mark_dirty();
+
+  // reply
+  // (log the new count only; incrementing again here would add a second link)
+  dout(7) << " nlink++, now " << in->inode.nlink << endl;
+
+  mds->send_message_mds(new MInodeLinkAck(m->get_ino(), true), m->get_from(), MDS_PORT_CACHE);
+  delete m;
+}
+
+
+// Handle the reply to an inode link request: wake everything waiting on
+// CINODE_WAIT_LINK for that inode with 1 on success or -1 on failure, then
+// free the message (as the other handlers here do once they are done with
+// theirs).
+void MDCache::handle_inode_link_ack(MInodeLinkAck *m) 
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  dout(7) << "handle_inode_link_ack success = " << m->is_success() << " on " << *in << endl;
+  in->finish_waiting(CINODE_WAIT_LINK,
+                     m->is_success() ? 1:-1);
+
+  // done with the message
+  delete m;
+}
+
+
+
+// REPLICAS
+
+
+// Answer a discover request from another MDS.
+//
+// Starting at the root (base ino 0) or at the given base inode, walk the
+// wanted path and add to an MDiscoverReply each dir, dentry and inode this
+// node can supply, stopping at the first item owned by someone else, at a
+// frozen dir, or at an error.  A non-empty reply is sent to the asker.  An
+// empty one is discarded and the request is forwarded to the dir authority or
+// dropped.  If a directory has to be fetched first, the partial reply is
+// thrown away and the whole message is retried after the fetch.
+void MDCache::handle_discover(MDiscover *dis) 
+{
+  int whoami = mds->get_nodeid();
+
+  // from me to me?
+  if (dis->get_asker() == whoami) {
+    dout(7) << "discover for " << dis->get_want().get_path() << " bounced back to me, dropping." << endl;
+    delete dis;
+    return;
+  }
+
+  CInode *cur = 0;
+  MDiscoverReply *reply = 0;
+  //filepath fullpath;
+
+  // get started.
+  if (dis->get_base_ino() == 0) {
+    // wants root
+    dout(7) << "discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl;
+
+    assert(mds->get_nodeid() == 0);
+    assert(root->is_auth());
+
+    //fullpath = dis->get_want();
+
+
+    // add root
+    reply = new MDiscoverReply(0);
+    reply->add_inode( root->replicate_to( dis->get_asker() ) );
+    dout(10) << "added root " << *root << endl;
+
+    cur = root;
+    
+  } else {
+    // there's a base inode
+    cur = get_inode(dis->get_base_ino());
+    assert(cur);
+
+    if (dis->wants_base_dir()) {
+      dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants dir+" << dis->get_want().get_path() << endl;
+    } else if (cur->dir) {
+      dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur->dir << " wants " << dis->get_want().get_path() << endl;
+    } else {
+      // the dir may not be open yet (checked just below); log the inode
+      // rather than dereferencing a null dir.
+      dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants " << dis->get_want().get_path() << endl;
+    }
+    
+    assert(cur->is_dir());
+    
+    // crazyness?
+    if (!cur->dir && !cur->is_auth()) {
+      int iauth = cur->authority();
+      dout(7) << "no dir and not inode auth; fwd to auth " << iauth << endl;
+      mds->send_message_mds( dis, iauth, MDS_PORT_CACHE);
+      return;
+    }
+
+    // frozen_dir?
+    if (!cur->dir && cur->is_frozen_dir()) {
+      dout(7) << "is frozen_dir, waiting" << endl;
+      cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, 
+                                        new C_MDS_RetryMessage(mds, dis));
+      return;
+    }
+
+    if (!cur->dir) 
+      cur->get_or_open_dir(mds);
+    assert(cur->dir);
+
+    dout(10) << "dir is " << *cur->dir << endl;
+    
+    // create reply
+    reply = new MDiscoverReply(cur->ino());
+  }
+
+  assert(reply);
+  assert(cur);
+  
+  /*
+  // first traverse and make sure we won't have to do any waiting
+  dout(10) << "traversing full discover path = " << fullpath << endl;
+  vector<CInode*> trav;
+  int r = path_traverse(fullpath, trav, dis, MDS_TRAVERSE_FAIL);
+  if (r > 0) 
+    return;  // fw or delay
+  dout(10) << "traverse finish w/o blocking, continuing" << endl;
+  // ok, now we know we won't block on dentry locks or readdir.
+  */
+
+
+  // add content
+  // do some fidgeting to include a dir if they asked for the base dir, or just root.
+  for (unsigned i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) {
+    // add dir
+    if (reply->is_empty() && !dis->wants_base_dir()) {
+      dout(7) << "they don't want the base dir" << endl;
+    } else {
+      // is it actually a dir at all?
+      if (!cur->is_dir()) {
+        dout(7) << "not a dir " << *cur << endl;
+        reply->set_flag_error_dir();
+        break;
+      }
+
+      // add dir
+      if (!cur->dir_is_auth()) {
+        dout(7) << *cur << " dir auth is someone else, i'm done" << endl;
+        break;
+      }
+      
+      // did we hit a frozen_dir?
+      if (!cur->dir && cur->is_frozen_dir()) {
+        dout(7) << *cur << " is frozen_dir, stopping" << endl;
+        break;
+      }
+      
+      if (!cur->dir) cur->get_or_open_dir(mds);
+      
+      reply->add_dir( new CDirDiscover( cur->dir, 
+                                        cur->dir->open_by_add( dis->get_asker() ) ) );
+      dout(7) << "added dir " << *cur->dir << endl;
+    }
+    if (dis->get_want().depth() == 0) break;
+    
+    // lookup dentry
+    int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) );
+    if (dentry_auth != mds->get_nodeid()) {
+      dout(7) << *cur->dir << "dentry " << dis->get_dentry(i) << " auth " << dentry_auth << ", i'm done." << endl;
+      break;      // that's it for us!
+    }
+
+    // get inode
+    CDentry *dn = cur->dir->lookup( dis->get_dentry(i) );
+    
+    /*
+    if (dn && !dn->can_read()) { // xlocked?
+      dout(7) << "waiting on " << *dn << endl;
+      cur->dir->add_waiter(CDIR_WAIT_DNREAD,
+                           dn->name,
+                           new C_MDS_RetryMessage(mds, dis));
+      return;
+    }
+    */
+    
+    if (dn) {
+      if (!dn->inode && dn->is_sync()) {
+        dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " null in " << *cur->dir << ", returning error" << endl;
+        reply->set_flag_error_dn( dis->get_dentry(i) );
+        break;   // don't replicate null but non-locked dentries.
+      }
+      
+      reply->add_dentry( dis->get_dentry(i), !dn->can_read() );
+      dout(7) << "added dentry " << *dn << endl;
+      
+      if (!dn->inode) break;  // we're done.
+    }
+
+    if (dn && dn->inode) {
+        CInode *next = dn->inode;
+        assert(next->is_auth());
+
+        // add inode
+        //int nonce = next->cached_by_add(dis->get_asker());
+        reply->add_inode( next->replicate_to( dis->get_asker() ) );
+        dout(7) << "added inode " << *next << endl;// " nonce=" << nonce<< endl;
+
+        // descend
+        cur = next;
+    } else {
+      // don't have inode?
+      if (cur->dir->is_complete()) {
+        // set error flag in reply
+        dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " not found in " << *cur->dir << ", returning error" << endl;
+        reply->set_flag_error_dn( dis->get_dentry(i) );
+        break;
+      } else {
+        // readdir
+        dout(7) << "mds" << whoami << " incomplete dir contents for " << *cur->dir << ", fetching" << endl;
+
+        //mds->mdstore->fetch_dir(cur->dir, NULL); //new C_MDS_RetryMessage(mds, dis));
+        //break; // send what we have so far
+
+        mds->mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(mds, dis));
+
+        // the retry builds a fresh reply from scratch; don't leak this one
+        delete reply;
+        return;
+      }
+    }
+  }
+       
+  // how did we do.
+  if (reply->is_empty()) {
+
+    // discard empty reply
+    delete reply;
+
+    if ((cur->is_auth() || cur->is_proxy() || cur->dir->is_proxy()) &&
+        !cur->dir->is_auth()) {
+      // fwd to dir auth
+      int dirauth = cur->dir->authority();
+      if (dirauth == dis->get_asker()) {
+        dout(7) << "from (new?) dir auth, dropping (obsolete) discover on floor." << endl;  // XXX FIXME is this right?
+        //assert(dis->get_asker() == dis->get_source());  //might be a weird other loop.  either way, asker has it.
+        delete dis;
+      } else {
+        dout(7) << "fwd to dir auth " << dirauth << endl;
+        mds->send_message_mds( dis, dirauth, MDS_PORT_CACHE );
+      }
+      return;
+    }
+    
+    dout(7) << "i'm not auth or proxy, dropping (this empty reply).  i bet i just exported." << endl;
+    //assert(0);
+    
+  } else {
+    // send back to asker
+    dout(7) << "sending result back to asker mds" << dis->get_asker() << endl;
+    mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE);
+  }
+
+  // done.
+  delete dis;
+}
+
+
+void MDCache::handle_discover_reply(MDiscoverReply *m) 
+{
+  // Fold a discover reply into the local cache, descending from a starting inode (cur).
+  CInode *cur;
+  list<Context*> finished, error;   // waiters to requeue on success / waiters to fail with -ENOENT
+  
+  if (m->has_root()) {
+    // no base inode to start from: the reply carries the root itself
+    dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl;
+    assert(!root);
+    assert(m->get_base_ino() == 0);
+    assert(!m->has_base_dentry());
+    assert(!m->has_base_dir());
+    
+    // add in root
+    cur = new CInode(this, false);
+      
+    m->get_inode(0).update_inode(cur);
+    
+    // root
+    set_root( cur );
+    add_inode( cur );
+    dout(7) << " got root: " << *cur << endl;
+
+    // take waiters
+    finished.swap(waiting_for_root);
+  } else {
+    // grab inode
+    cur = get_inode(m->get_base_ino());
+    
+    if (!cur) {
+      dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl;
+      delete m;
+      return;
+    }
+    
+    dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl;
+  }
+
+  // fyi
+  if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl;
+  if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl;
+  dout(10) << "depth is " << m->get_depth() << ", has_root = " << m->has_root() << endl;
+  
+  // loop over discover results.
+  // indexes follow each ([[dir] dentry] inode) 
+  // can start, end with any type.
+  // i starts at has_root() (0 or 1, assuming a bool): inode slot 0 was consumed above when the reply held the root.
+  for (int i=m->has_root(); i<m->get_depth(); i++) {
+    dout(10) << "discover_reply i=" << i << " cur " << *cur << endl;
+
+    // dir
+    if ((i >  0) ||
+        (i == 0 && m->has_base_dir())) {
+      if (cur->dir) {
+        // had it
+        /* this is strange, but it happens when:
+           we discover multiple dentries under a dir.
+           bc, no flag to indicate a dir discover is underway, (as there is w/ a dentry one).
+           this is actually good, since (dir aside) they're asking for different information.
+        */
+        dout(7) << "had " << *cur->dir;
+        m->get_dir(i).update_dir(cur->dir);
+        dout2(7) << ", now " << *cur->dir << endl;
+      } else {
+        // add it (_replica_)
+        cur->set_dir( new CDir(cur, mds, false) );
+        m->get_dir(i).update_dir(cur->dir);
+        dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl;
+
+        // get waiters
+        cur->take_waiting(CINODE_WAIT_DIR, finished);
+      }
+    }    
+
+    // dentry error?  (only checked on the last level of the reply)
+    if (i == m->get_depth()-1 && 
+        m->is_flag_error_dn()) {
+      // error!
+      assert(cur->is_dir());
+      if (cur->dir) {
+        dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << endl;
+        cur->dir->take_waiting(CDIR_WAIT_DENTRY,
+                               m->get_error_dentry(),
+                               error);
+      } else {
+        dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << endl;
+        cur->take_waiting(CINODE_WAIT_DIR, error);
+      }
+      break;
+    }
+
+    if (i >= m->get_num_dentries()) break;
+    
+    // dentry
+    dout(7) << "i = " << i << " dentry is " << m->get_dentry(i) << endl;
+
+    CDentry *dn = 0;
+    if (i > 0 || 
+        m->has_base_dentry()) {
+      dn = cur->dir->lookup( m->get_dentry(i) );   // NOTE(review): cur->dir is dereferenced unchecked here -- confirm it is always open at this point
+      
+      if (dn) {
+        dout(7) << "had " << *dn << endl;
+      } else {
+        dn = cur->dir->add_dentry( m->get_dentry(i) );
+        if (m->get_dentry_xlock(i)) {
+          dout(7) << " new dentry is xlock " << *dn << endl;
+          dn->lockstate = DN_LOCK_XLOCK;
+          dn->xlockedby = 0;
+        }
+        dout(7) << "added " << *dn << endl;
+      }
+
+      cur->dir->take_waiting(CDIR_WAIT_DENTRY,
+                             m->get_dentry(i),
+                             finished);
+    }
+    
+    if (i >= m->get_num_inodes()) break;
+
+    // inode
+    dout(7) << "i = " << i << " ino is " << m->get_ino(i) << endl;
+    CInode *in = get_inode( m->get_inode(i).get_ino() );
+    assert(dn);   // an inode in the reply must come with a dentry to link it under
+    
+    if (in) {
+      dout(7) << "had " << *in << endl;
+      
+      // fix nonce
+      dout(7) << " my nonce is " << in->replica_nonce << ", taking from discover, which  has " << m->get_inode(i).get_replica_nonce() << endl;
+      in->replica_nonce = m->get_inode(i).get_replica_nonce();
+      
+      if (dn && in != dn->inode) {
+        dout(7) << " but it's not linked via dentry " << *dn << endl;
+        // link
+        if (dn->inode) {
+          dout(7) << "dentry WAS linked to " << *dn->inode << endl;
+          assert(0);  // dentry already points at a different inode; not expected
+        }
+        dn->dir->link_inode(dn, in);
+      }
+    }
+    else {
+      assert(dn->inode == 0);  // better not be something else linked to this dentry...
+
+      // didn't have it.
+      in = new CInode(this, false);
+      
+      m->get_inode(i).update_inode(in);
+        
+      // link in
+      add_inode( in );
+      dn->dir->link_inode(dn, in);
+      
+      dout(7) << "added " << *in << " nonce " << in->replica_nonce << endl;
+    }
+    
+    // onward!
+    cur = in;
+  }
+
+  // dir error at the end there?
+  if (m->is_flag_error_dir()) {
+    dout(7) << " flag_error on dir " << *cur << endl;
+    assert(!cur->is_dir());
+    cur->take_waiting(CINODE_WAIT_DIR, error);
+  }
+
+  // finish errors directly
+  finish_contexts(error, -ENOENT);
+  // everything else collected in 'finished' is handed to the mds via queue_finished()
+  mds->queue_finished(finished);
+
+  // done
+  delete m;
+}
+
+
+
+
+
+
+
+
+/*
+int MDCache::send_inode_updates(CInode *in)
+{
+  assert(in->is_auth());
+  for (set<int>::iterator it = in->cached_by_begin(); 
+       it != in->cached_by_end(); 
+       it++) {
+    dout(7) << "sending inode_update on " << *in << " to " << *it << endl;
+    assert(*it != mds->get_nodeid());
+    mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE);
+  }
+
+  return 0;
+}
+
+
+void MDCache::handle_inode_update(MInodeUpdate *m)
+{
+  inodeno_t ino = m->get_ino();
+  CInode *in = get_inode(m->get_ino());
+  if (!in) {
+    //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << endl;
+    dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << endl;
+    MCacheExpire *expire = new MCacheExpire(mds->get_nodeid());
+    expire->add_inode(m->get_ino(), m->get_nonce());
+    mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE);
+    goto out;
+  }
+
+  if (in->is_auth()) {
+    dout(7) << "inode_update on " << *in << ", but i'm the authority!" << endl;
+    assert(0); // this should never happen
+  }
+  
+  dout(7) << "inode_update on " << *in << endl;
+
+  // update! NOTE dir_auth is unaffected by this.
+  in->decode_basic_state(m->get_payload());
+
+ out:
+  // done
+  delete m;
+}
+*/
+
+
+
+void MDCache::handle_cache_expire(MCacheExpire *m)
+{
+  int from = m->get_from();                     // mds whose cached copies are expiring
+  int source = MSG_ADDR_NUM(m->get_source());   // immediate sender; differs from 'from' when the expire was forwarded
+  map<int, MCacheExpire*> proxymap;             // expires to forward, keyed by the authority we proxy for
+  // Handle an expire notice: drop 'from' from our replica sets, or forward the entry when we are only a proxy.
+  if (m->get_from() == source) {
+    dout(7) << "cache_expire from " << from << endl;
+  } else {
+    dout(7) << "cache_expire from " << from << " via " << source << endl;
+  }
+
+  // inodes
+  for (map<inodeno_t,int>::iterator it = m->get_inodes().begin();
+       it != m->get_inodes().end();
+       it++) {
+    CInode *in = get_inode(it->first);
+    int nonce = it->second;
+    
+    if (!in) {
+      dout(0) << "inode_expire on " << it->first << " from " << from << ", don't have it" << endl;
+      assert(in);  // i should be authority, or proxy .. and pinned
+    }  
+    if (!in->is_auth()) {
+      int newauth = in->authority();
+      dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl;
+      assert(newauth >= 0);
+      if (!in->state_test(CINODE_STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl;
+      assert(in->state_test(CINODE_STATE_PROXY));
+      if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
+      proxymap[newauth]->add_inode(it->first, it->second);
+      continue;
+    }
+    
+    // check nonce
+    if (from == mds->get_nodeid()) {
+      // my cache_expire, and the export_dir giving auth back to me crossed paths!  
+      // we can ignore this.  no danger of confusion since the two parties are both me.
+      dout(7) << "inode_expire on " << *in << " from mds" << from << " .. ME!  ignoring." << endl;
+    } 
+    else if (nonce == in->get_cached_by_nonce(from)) {
+      // remove from our cached_by
+      dout(7) << "inode_expire on " << *in << " from mds" << from << " cached_by was " << in->cached_by << endl;
+      in->cached_by_remove(from);
+      in->mds_caps_wanted.erase(from);
+      
+      // note: this code calls _eval more often than it needs to!
+      // fix lock
+      if (in->hardlock.is_gathering(from)) {
+        in->hardlock.gather_set.erase(from);
+        if (in->hardlock.gather_set.size() == 0)
+          mds->locker->inode_hard_eval(in);
+      }
+      if (in->filelock.is_gathering(from)) {
+        in->filelock.gather_set.erase(from);
+        if (in->filelock.gather_set.size() == 0)
+          mds->locker->inode_file_eval(in);
+      }
+      
+      // alone now?
+      if (!in->is_cached_by_anyone()) {
+        mds->locker->inode_hard_eval(in);
+        mds->locker->inode_file_eval(in);
+      }
+
+    } 
+    else {
+      // this is an old nonce, ignore expire.
+      dout(7) << "inode_expire on " << *in << " from mds" << from << " with old nonce " << nonce << " (current " << in->get_cached_by_nonce(from) << "), dropping" << endl;
+      assert(in->get_cached_by_nonce(from) > nonce);
+    }
+  }
+
+  // dirs
+  for (map<inodeno_t,int>::iterator it = m->get_dirs().begin();
+       it != m->get_dirs().end();
+       it++) {
+    CInode *diri = get_inode(it->first);
+    CDir *dir = diri ? diri->dir : 0;   // inode may be unknown: reach the log+assert below rather than dereference NULL
+    int nonce = it->second;
+    
+    if (!dir) {
+      dout(0) << "dir_expire on " << it->first << " from " << from << ", don't have it" << endl;
+      assert(dir);  // i should be authority, or proxy ... and pinned
+    }  
+    if (!dir->is_auth()) {
+      int newauth = dir->authority();
+      dout(7) << "proxy dir expire on " << *dir << " to " << newauth << endl;
+      if (!dir->is_proxy()) dout(0) << "nonproxy dir expire? " << *dir << " .. auth is " << newauth << " .. expire is from " << from << endl;
+      assert(dir->is_proxy());
+      assert(newauth >= 0);
+      assert(dir->state_test(CDIR_STATE_PROXY));
+      if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
+      proxymap[newauth]->add_dir(it->first, it->second);
+      continue;
+    }
+    
+    // check nonce
+    if (from == mds->get_nodeid()) {
+      dout(7) << "dir_expire on " << *dir << " from mds" << from << " .. ME!  ignoring" << endl;
+    } 
+    else if (nonce == dir->get_open_by_nonce(from)) {
+      // remove from our open_by
+      dout(7) << "dir_expire on " << *dir << " from mds" << from << " open_by was " << dir->open_by << endl;
+      dir->open_by_remove(from);
+    } 
+    else {
+      // this is an old nonce, ignore expire.
+      dout(7) << "dir_expire on " << *dir << " from mds" << from << " with old nonce " << nonce << " (current " << dir->get_open_by_nonce(from) << "), dropping" << endl;
+      assert(dir->get_open_by_nonce(from) > nonce);
+    }
+  }
+
+  // send proxy forwards (each forwarded message still names the original 'from')
+  for (map<int, MCacheExpire*>::iterator it = proxymap.begin();
+       it != proxymap.end();
+       it++) {
+    dout(7) << "sending proxy forward to " << it->first << endl;
+    mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE);
+  }
+
+  // done
+  delete m;
+}
+
+
+
+int MDCache::send_dir_updates(CDir *dir, bool bcast)
+{
+  // Advisory only: send this dir's replication settings (dir_rep, dir_rep_by)
+  // to every mds in the map when bcast is set, else to the set in dir->open_by.
+  set<int> recipients = dir->open_by;
+  if (bcast) {
+    recipients = mds->get_mds_map()->get_mds();
+  }
+
+  dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << recipients << endl;
+
+  // the dir's path is included in each update
+  string dirpath;
+  dir->inode->make_path(dirpath);
+
+  int whoami = mds->get_nodeid();
+  // one message per recipient
+  set<int>::iterator peer = recipients.begin();
+  while (peer != recipients.end()) {
+    if (*peer != whoami) {   // never message ourselves
+      dout(7) << "sending dir_update on " << *dir << " to " << *peer << endl;
+
+      MDirUpdate *update = new MDirUpdate(dir->ino(), dir->dir_rep, dir->dir_rep_by,
+                                          dirpath, bcast);
+      mds->send_message_mds(update, *peer, MDS_PORT_CACHE);
+    }
+    ++peer;
+  }
+
+  return 0;
+}
+
+
+void MDCache::handle_dir_update(MDirUpdate *m)
+{
+  // Apply a peer's dir replication settings to our copy of the dir, or try
+  // (at most once per message) to discover the dir when we do not have it.
+  CInode *in = get_inode(m->get_ino());
+  if (in && in->dir) {
+    // have it: take the new settings and we are done with the message
+    dout(5) << "dir_update on " << *in->dir << endl;
+    in->dir->dir_rep = m->get_dir_rep();
+    in->dir->dir_rep_by = m->get_dir_rep_by();
+    delete m;
+    return;
+  }
+
+  dout(5) << "dir_update on " << m->get_ino() << ", don't have it" << endl;
+  if (!m->should_discover()) {
+    // the message does not ask for a discover: drop it
+    delete m;
+    return;
+  }
+
+  m->tried_discover();  // only once!
+  vector<CDentry*> trace;
+  filepath path = m->get_path();
+
+  dout(5) << "trying discover on dir_update for " << path << endl;
+  int r = path_traverse(path, trace, true,
+                        m, new C_MDS_RetryMessage(mds, m),
+                        MDS_TRAVERSE_DISCOVER);
+  if (r > 0)
+    return;   // delayed; the retry context passed as ondelay re-dispatches m
+  if (r == 0) {
+    assert(in);
+    open_remote_dir(in, new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+  assert(0);
+  delete m;
+}
+
+
+
+
+
+class C_MDC_DentryUnlink : public Context {
+  // Context that resumes an unlink in MDCache::dentry_unlink_finish()
+  // once the step it was registered on completes with r == 0.
+public:
+  MDCache *mdc;   // cache to call back into
+  CDentry *dn;    // dentry being unlinked
+  CDir *dir;      // passed through to dentry_unlink_finish
+  Context *c;     // caller's completion, passed through
+
+  C_MDC_DentryUnlink(MDCache *mdc, CDentry *dn, CDir *dir, Context *c)
+    : mdc(mdc), dn(dn), dir(dir), c(c) {}
+
+  void finish(int r) {
+    assert(r == 0);   // a failed step is not handled on this path
+    mdc->dentry_unlink_finish(dn, dir, c);
+  }
+};
+
+
+// NAMESPACE FUN
+
+void MDCache::dentry_unlink(CDentry *dn, Context *c)
+{  // Unlink an xlocked, linked dentry: journal it, tell the dir's replicas, adjust nlink/anchor, then complete c.
+  CDir *dir = dn->dir;
+  string dname = dn->name;   // NOTE(review): not used below
+  // preconditions: dn is xlocked and still linked to an inode
+  assert(dn->lockstate == DN_LOCK_XLOCK);
+
+  // i need the inode to do any of this properly
+  assert(dn->inode);
+
+  // log it
+  if (dn->inode) dn->inode->mark_unsafe();   // XXX ??? FIXME
+  mds->mdlog->submit_entry(new EUnlink(dir, dn, dn->inode),
+                           NULL);    // FIXME FIXME FIXME
+
+  // tell replicas
+  if (dir->is_open_by_anyone()) {
+    for (set<int>::iterator it = dir->open_by_begin();
+         it != dir->open_by_end();
+         it++) {
+      dout(7) << "inode_unlink sending DentryUnlink to " << *it << endl;
+      
+      mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), *it, MDS_PORT_CACHE);
+    }
+
+    // don't need ack.
+  }
+
+  // three cases follow: primary link, remote link, or (invalid) null dentry
+  // inode deleted?
+  if (dn->is_primary()) {
+    assert(dn->inode->is_auth());
+    dn->inode->inode.nlink--;
+    
+    if (dn->inode->is_dir()) assert(dn->inode->inode.nlink == 0);  // no hard links on dirs
+
+    // last link?
+    if (dn->inode->inode.nlink == 0) {
+      // truly dangling      
+      if (dn->inode->dir) {
+        // mark dir clean too, since it now dne!
+        assert(dn->inode->dir->is_auth());
+        dn->inode->dir->state_set(CDIR_STATE_DELETED);
+        dn->inode->dir->remove_null_dentries();
+        dn->inode->dir->mark_clean();
+      }
+
+      // mark it clean, it's dead
+      if (dn->inode->is_dirty())
+        dn->inode->mark_clean();
+      
+    } else {
+      // migrate to inode file
+      dout(7) << "removed primary, but there are remote links, moving to inode file: " << *dn->inode << endl;
+
+      // dangling but still linked.  
+      assert(dn->inode->is_anchored());
+
+      // unlink locally
+      CInode *in = dn->inode;
+      dn->dir->unlink_inode( dn );
+      dn->mark_dirty();
+
+      // mark it dirty!
+      in->mark_dirty();
+
+      // update anchor to point to inode file+mds
+      vector<Anchor*> atrace;
+      in->make_anchor_trace(atrace);
+      assert(atrace.size() == 1);   // it's dangling
+      mds->anchorclient->update(in->ino(), atrace, 
+                             new C_MDC_DentryUnlink(this, dn, dir, c));
+      return;   // dentry_unlink_finish runs later, from C_MDC_DentryUnlink
+    }
+  }
+  else if (dn->is_remote()) {
+    // need to dec nlink on primary
+    if (dn->inode->is_auth()) {
+      // awesome, i can do it
+      dout(7) << "remote target is local, nlink--" << endl;
+      dn->inode->inode.nlink--;
+      dn->inode->mark_dirty();
+
+      if (( dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 0) ||
+          (!dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 1)) {
+        dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl;
+
+        // remove anchor (async)
+        mds->anchorclient->destroy(dn->inode->ino(), NULL);
+      }
+    } else {
+      int auth = dn->inode->authority();
+      dout(7) << "remote target is remote, sending unlink request to " << auth << endl;
+
+      mds->send_message_mds(new MInodeUnlink(dn->inode->ino(), mds->get_nodeid()),
+			    auth, MDS_PORT_CACHE);
+
+      // unlink locally
+      CInode *in = dn->inode;
+      dn->dir->unlink_inode( dn );
+      dn->mark_dirty();
+
+      // add waiter
+      in->add_waiter(CINODE_WAIT_UNLINK, c);
+      return;   // c is parked on the inode's CINODE_WAIT_UNLINK waiters; dentry_unlink_finish is not called on this path
+    }
+  }
+  else 
+    assert(0);   // unlink on null dentry??
+ 
+  // unlink locally
+  dn->dir->unlink_inode( dn );
+  dn->mark_dirty();
+
+  // finish!
+  dentry_unlink_finish(dn, dir, c);
+}
+
+
+void MDCache::dentry_unlink_finish(CDentry *dn, CDir *dir, Context *c)
+{  // Last stage of an unlink: release the dentry xlock, wake the dir's waiters on this name, complete c.
+  dout(7) << "dentry_unlink_finish on " << *dn << endl;
+  string dname = dn->name;   // copied up front; dn itself is not touched again after the xlock is released
+
+  // unpin dir / unxlock
+  mds->locker->dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking
+  
+  // did i empty out an imported dir?
+  if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0) 
+    migrator->export_empty_import(dir);
+
+  // wake up any waiters
+  dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue);
+  // NOTE(review): c is dereferenced unconditionally and is not deleted here -- confirm callers never pass NULL and who frees it
+  c->finish(0);
+}
+
+
+
+
+void MDCache::handle_dentry_unlink(MDentryUnlink *m)
+{
+  // A peer unlinked a dentry: remove our copy of it, if we have one.
+  // The message is freed on every path.
+  CInode *diri = get_inode(m->get_dirino());
+  CDir *dir = diri ? diri->dir : 0;
+
+  if (!dir) {
+    dout(7) << "handle_dentry_unlink don't have dir " << m->get_dirino() << endl;
+    delete m;
+    return;
+  }
+
+  CDentry *dn = dir->lookup(m->get_dn());
+  if (!dn) {
+    dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl;
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_dentry_unlink on " << *dn << endl;
+
+  // if the linked inode has a dir object, flag it deleted and drop its null dentries
+  if (dn->inode && dn->inode->dir) {
+    CDir *subdir = dn->inode->dir;
+    subdir->state_set(CDIR_STATE_DELETED);
+    subdir->remove_null_dentries();
+  }
+
+  // copy the name before the dentry is removed; it is needed for the wakeup
+  string dname = dn->name;
+  dn->dir->remove_dentry(dn);
+
+  // wake up anyone waiting to read this dentry
+  dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue);
+
+  delete m;
+}
+
+
+void MDCache::handle_inode_unlink(MInodeUnlink *m)
+{
+  // A peer removed a remote link to an inode: drop one link here (on the authority), then ack.
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  // proxy?  pass the request on to the current authority (m is handed to the send, not freed here)
+  if (in->is_proxy()) {
+    dout(7) << "handle_inode_unlink proxy on " << *in << endl;
+    mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
+    return;
+  }
+  assert(in->is_auth());
+
+  // do it.
+  dout(7) << "handle_inode_unlink nlink=" << in->inode.nlink << " on " << *in << endl;
+  assert(in->inode.nlink > 0);
+  in->inode.nlink--;
+
+  if (in->state_test(CINODE_STATE_DANGLING)) {
+    // already dangling.  last link?
+    if (in->inode.nlink == 0) {
+      dout(7) << "last link, marking clean and removing anchor" << endl;
+      in->mark_clean();       // mark it clean.
+      
+      // remove anchor (async)
+      mds->anchorclient->destroy(in->ino(), NULL);
+    }
+    else {
+      in->mark_dirty();
+    }
+  } else {
+    // has primary link still.
+    assert(in->inode.nlink >= 1);
+    in->mark_dirty();
+
+    if (in->inode.nlink == 1) {
+      dout(7) << "nlink=1, removing anchor" << endl;
+      
+      // remove anchor (async)
+      mds->anchorclient->destroy(in->ino(), NULL);
+    }
+  }
+
+  // ack the requester, then free the request (it used to be leaked on this path)
+  mds->send_message_mds(new MInodeUnlinkAck(m->get_ino()), m->get_from(), MDS_PORT_CACHE);
+  delete m;
+}
+
+void MDCache::handle_inode_unlink_ack(MInodeUnlinkAck *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+  dout(7) << "handle_inode_unlink_ack on " << *in << endl;
+  in->finish_waiting(CINODE_WAIT_UNLINK, 0);   // complete the waiters parked on CINODE_WAIT_UNLINK with result 0
+  delete m;                                    // done with the ack (it used to be leaked)
+}
+
+
+
+
+
+
+
+
+
+
+/*
+ * some import/export helpers
+ */
+
+/** con = get_auth_container(dir)
+ * Returns the directory in which authority is delegated for *dir.  
+ * This may be because a directory is an import, or because it is hashed
+ * and we are nested underneath an inode in that dir (that hashes to us).
+ * Thus do not assume con->is_auth()!  It is_auth() || is_hashed().
+ */
+CDir *MDCache::get_auth_container(CDir *dir)
+{
+  // Climb from dir (which may itself be the answer) until we reach an
+  // import, or an ancestor that is hashed.
+  CDir *con = dir;
+  while (!con->is_import()) {
+    con = con->get_parent_dir();
+    assert(con);               // must find a container before running out of parents
+    if (con->is_hashed())
+      return con;              // delegated by hashing
+  }
+
+  return con;                  // delegated by import
+}
+
+
+void MDCache::find_nested_exports(CDir *dir, set<CDir*>& s) 
+{
+  // look up dir's auth container, then collect into s the exports nested under dir within it
+  find_nested_exports_under(get_auth_container(dir), dir, s);
+}
+
+void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s)
+{
+  dout(10) << "find_nested_exports for " << *dir << endl;
+  dout(10) << "find_nested_exports_under import " << *import << endl;
+
+  // Add to s every export recorded under 'import' that lies beneath dir
+  // (or all of them, when dir is the import itself).
+  set<CDir*>& candidates = nested_exports[import];
+
+  if (import == dir) {
+    // easy case: take everything
+    for (set<CDir*>::iterator q = candidates.begin(); q != candidates.end(); ++q) {
+      CDir *nested = *q;
+      s.insert(nested);
+      dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
+    }
+    return;
+  }
+
+  // harder case: keep only the exports whose chain of parents passes through dir
+  for (set<CDir*>::iterator q = candidates.begin(); q != candidates.end(); ++q) {
+    CDir *nested = *q;
+
+    dout(12) << "find_nested_exports checking " << *nested << endl;
+
+    // climb from the export until we meet dir (keep it) or an import (skip it)
+    // NOTE(review): assumes one of the two is reached before get_parent_dir() yields NULL
+    for (CDir *up = nested->get_parent_dir(); ; up = up->get_parent_dir()) {
+      if (up == dir) {
+        s.insert(nested);
+        dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
+        break;
+      }
+      if (up->is_import())
+        break;
+    }
+  }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// ==============================================================
+// debug crap
+
+
+void MDCache::show_imports()
+{
+  mds->balancer->show_imports();   // debug helper: simply delegates to the balancer
+}
+
+
+void MDCache::show_cache()
+{
+  // debug dump of the inode map: each inode, followed by its parent
+  // dentry and its dir when it has them
+  dout(7) << "show_cache" << endl;
+  hash_map<inodeno_t,CInode*>::iterator p;
+  for (p = inode_map.begin(); p != inode_map.end(); ++p) {
+    CInode *in = p->second;
+    dout(7) << *in << endl;
+
+    CDentry *parent = in->get_parent_dn();
+    if (parent) { dout(7) << "       dn " << *parent << endl; }
+    if (in->dir) { dout(7) << "   subdir " << *in->dir << endl; }
+  }
+}
+
diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h
new file mode 100644
index 0000000000000..e62113312447f
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDCache.h
@@ -0,0 +1,282 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MDCACHE_H
+#define __MDCACHE_H
+
+#include <string>
+#include <vector>
+#include <map>
+#include <set>
+#include <ext/hash_map>
+
+#include "include/types.h"
+#include "include/filepath.h"
+
+#include "CInode.h"
+#include "CDentry.h"
+#include "CDir.h"
+#include "Lock.h"
+
+
+class MDS;
+class Migrator;
+class Renamer;
+
+class Logger;
+
+class Message;
+
+class MDiscover;
+class MDiscoverReply;
+class MCacheExpire;
+class MDirUpdate;
+class MDentryUnlink;
+class MLock;
+
+
+class MClientRequest;
+
+
+// MDCache
+
+//typedef const char* pchar;
+
+
+
+/** active_request_t
+ * state we track for requests we are currently processing.
+ * mostly information about locks held, so that we can drop them all
+ * when the request is finished or forwarded.  see request_*().
+ */
+typedef struct {
+  CInode *ref;                                // reference inode
+  set< CInode* >            request_pins;     // inodes pinned for this request -- presumably filled by request_pin_inode(); TODO confirm
+  set< CDir* >              request_dir_pins; // dirs pinned for this request -- presumably filled by request_pin_dir(); TODO confirm
+  map< CDentry*, vector<CDentry*> > traces;   // path pins held
+  set< CDentry* >           xlocks;           // xlocks (local)
+  set< CDentry* >           foreign_xlocks;   // xlocks on foreign hosts
+} active_request_t;
+
+namespace __gnu_cxx {
+  // Let Message* be used as a hash_map key: hash the pointer value itself.
+  template<> struct hash<Message*> {
+    size_t operator()(const Message *p) const { 
+      return hash<unsigned long>()((unsigned long)p); 
+    }
+  };
+}
+
+class MDCache {   // metadata cache of one MDS: inode map + LRU, import/export sets, active requests, cache-message handlers
+ protected:
+  // my master
+  MDS *mds;
+
+  // the cache
+  CInode                       *root;        // root inode
+  LRU                           lru;         // lru for expiring items
+  hash_map<inodeno_t,CInode*>   inode_map;   // map of inodes by ino            
+ 
+  // root
+  list<Context*>     waiting_for_root;
+
+  // imports, exports, and hashes.
+  set<CDir*>             imports;                // includes root (on mds0)
+  set<CDir*>             exports;
+  set<CDir*>             hashdirs;
+  map<CDir*,set<CDir*> > nested_exports;         // exports nested under imports _or_ hashdirs
+  
+  // active MDS requests
+  hash_map<Message*, active_request_t>   active_requests;
+  
+  // inode purging
+  map<inodeno_t, inode_t>         purging;
+  map<inodeno_t, list<Context*> > waiting_for_purge;
+
+  // shutdown crap
+  int shutdown_commits;
+  bool did_shutdown_exports;
+  friend class C_MDC_ShutdownCommit;
+
+  friend class CInode;
+  friend class Locker;
+  friend class Migrator;
+  friend class Renamer;
+  friend class MDBalancer;
+
+ public:
+  // subsystems
+  Migrator *migrator;
+  Renamer *renamer;
+
+ public:
+  MDCache(MDS *m);
+  ~MDCache();
+  
+  // debug
+  void log_stat(Logger *logger);
+
+  // root inode
+  CInode *get_root() { return root; }
+  void set_root(CInode *r);
+
+  void add_import(CDir *dir);
+  void remove_import(CDir *dir);
+
+  // cache
+  void set_cache_size(size_t max) { lru.lru_set_max(max); }
+  size_t get_cache_size() { return lru.lru_get_size(); }
+  bool trim(int max = -1);   // trim cache
+
+  // shutdown
+  void shutdown_start();
+  void shutdown_check();
+  bool shutdown_pass();
+  bool shutdown();                    // clear cache (ie at shutdown)
+
+  // inode_map
+  bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? true:false; }
+  CInode* get_inode( inodeno_t ino ) {
+    if (have_inode(ino))
+      return inode_map[ ino ];
+    return NULL;   // not in the inode map
+  }
+  
+ public:
+  CInode *create_inode();
+  void add_inode(CInode *in);
+
+ protected:
+  void remove_inode(CInode *in);
+  void destroy_inode(CInode *in);
+  void touch_inode(CInode *in) {
+    // touch parent(s) too
+    if (in->get_parent_dir()) touch_inode(in->get_parent_dir()->inode);
+    
+    // top or mid, depending on whether i'm auth
+    if (in->is_auth())
+      lru.lru_touch(in);
+    else
+      lru.lru_midtouch(in);
+  }
+  void rename_file(CDentry *srcdn, CDentry *destdn);
+
+ public:
+  // inode purging
+  void purge_inode(inode_t& inode);
+  void purge_inode_finish(inodeno_t ino);
+  void purge_inode_finish_2(inodeno_t ino);
+  void waitfor_purge(inodeno_t ino, Context *c);
+  void start_recovered_purges();
+
+
+ protected:
+  // private methods
+  CDir *get_auth_container(CDir *in);
+  void find_nested_exports(CDir *dir, set<CDir*>& s);
+  void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
+
+
+ public:
+  int open_root(Context *c);
+  int path_traverse(filepath& path, vector<CDentry*>& trace, bool follow_trailing_sym,
+                    Message *req, Context *ondelay,
+                    int onfail,
+                    Context *onfinish=0,
+                    bool is_client_req = false);
+  void open_remote_dir(CInode *diri, Context *fin);
+  void open_remote_ino(inodeno_t ino, Message *req, Context *fin);
+  void open_remote_ino_2(inodeno_t ino, Message *req,
+                         vector<Anchor*>& anchortrace,
+                         Context *onfinish);
+
+  bool path_pin(vector<CDentry*>& trace, Message *m, Context *c);
+  void path_unpin(vector<CDentry*>& trace, Message *m);
+  void make_trace(vector<CDentry*>& trace, CInode *in);
+  
+  bool request_start(Message *req,
+                     CInode *ref,
+                     vector<CDentry*>& trace);
+  void request_cleanup(Message *req);
+  void request_finish(Message *req);
+  void request_forward(Message *req, int mds, int port=0);
+  void request_pin_inode(Message *req, CInode *in);
+  void request_pin_dir(Message *req, CDir *dir);
+
+  // anchors
+  void anchor_inode(CInode *in, Context *onfinish);
+  //void unanchor_inode(CInode *in, Context *c);
+
+  void handle_inode_link(class MInodeLink *m);
+  void handle_inode_link_ack(class MInodeLinkAck *m);
+
+  // == messages ==
+ public:
+  void dispatch(Message *m);
+
+ protected:
+  // -- replicas --
+  void handle_discover(MDiscover *dis);
+  void handle_discover_reply(MDiscoverReply *m);
+
+
+  // -- namespace --
+  // these handle logging, cache sync themselves.
+  // UNLINK
+ public:
+  void dentry_unlink(CDentry *in, Context *c);   // note: 'in' here is the dentry to unlink
+ protected:
+  void dentry_unlink_finish(CDentry *in, CDir *dir, Context *c);
+  void handle_dentry_unlink(MDentryUnlink *m);
+  void handle_inode_unlink(class MInodeUnlink *m);
+  void handle_inode_unlink_ack(class MInodeUnlinkAck *m);
+  friend class C_MDC_DentryUnlink;
+
+
+
+  // -- misc auth --
+  int ino_proxy_auth(inodeno_t ino, 
+                     int frommds,
+                     map<CDir*, set<inodeno_t> >& inomap);
+  void do_ino_proxy(CInode *in, Message *m);
+  void do_dir_proxy(CDir *dir, Message *m);
+
+
+
+
+  // -- updates --
+  //int send_inode_updates(CInode *in);
+  //void handle_inode_update(MInodeUpdate *m);
+
+  int send_dir_updates(CDir *in, bool bcast=false);
+  void handle_dir_update(MDirUpdate *m);
+
+  void handle_cache_expire(MCacheExpire *m);
+
+
+
+  // == crap fns ==
+ public:
+  void dump() {
+    if (root) root->dump();
+  }
+
+  void show_imports();
+  void show_cache();
+
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/MDLog.cc b/branches/sage/cephmds2/mds/MDLog.cc
new file mode 100644
index 0000000000000..b272eb9a176d6
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDLog.cc
@@ -0,0 +1,371 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDLog.h"
+#include "MDS.h"
+#include "LogEvent.h"
+
+#include "osdc/Journaler.h"
+
+#include "common/LogType.h"
+#include "common/Logger.h"
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if ((l)<=g_conf.debug || (l) <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log "   // level-gated debug stream; `l` is parenthesized so expression arguments bind as intended.  Expands to a bare `if`, so brace it when used inside if/else.
+#define  derr(l)    if ((l)<=g_conf.debug || (l) <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log "   // same gate and prefix as dout.  NOTE(review): this also writes to cout rather than cerr -- confirm intended.
+
+// cons/des
+
+LogType mdlog_logtype;
+// Construct the log for MDS `m`: zero the counters, create the per-node Logger, and attach a Journaler to this node's log inode.
+MDLog::MDLog(MDS *m) 
+{
+  mds = m;
+  num_events = 0;
+  waiting_for_read = false;
+  trim_reading = false;   // member declared in MDLog.h; previously left uninitialized
+  max_events = g_conf.mds_log_max_len;
+  unflushed = 0;
+
+  // logger, named after this node's id
+  char name[80];
+  sprintf(name, "mds%d.log", mds->get_nodeid());
+  logger = new Logger(name, &mdlog_logtype);
+
+  // mdlog_logtype is a file-scope global shared by every MDLog, so register its fields only once.
+  static bool didit = false;
+  if (!didit) {
+    didit = true;   // the guard was never set before, so every constructor call repeated the registration
+    mdlog_logtype.add_inc("add");
+    mdlog_logtype.add_inc("retire");    
+    mdlog_logtype.add_inc("obs");    
+    mdlog_logtype.add_inc("trim");    
+    mdlog_logtype.add_set("size");
+    mdlog_logtype.add_set("read");
+    mdlog_logtype.add_set("append");
+    mdlog_logtype.add_inc("lsum");
+    mdlog_logtype.add_inc("lnum");
+  }
+
+  // inode: zeroed, then given this node's log ino and the MDS log layout
+  memset(&log_inode, 0, sizeof(log_inode));
+  log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+  log_inode.layout = g_OSD_MDLogLayout;
+  
+  if (g_conf.mds_local_osd) {
+    log_inode.layout.object_layout = OBJECT_LAYOUT_STARTOSD;
+    log_inode.layout.osd = mds->get_nodeid() + 10000;   // hack
+  }
+
+  // log streamer
+  journaler = new Journaler(log_inode, mds->objecter, logger);
+}
+
+// Release the Journaler and Logger allocated by the constructor.
+MDLog::~MDLog()
+{
+  delete journaler;  journaler = 0;   // deleting a null pointer is a no-op, so no guard is needed
+  delete logger;     logger = 0;
+}
+
+// Reset the underlying journaler (MDLog.h describes this as producing a "fresh, empty log").
+void MDLog::reset()
+{
+  journaler->reset();
+}
+// Start journal recovery; `c` is handed to Journaler::recover().  NOTE(review): presumably completed once the log bounds are known -- confirm in Journaler.
+void MDLog::open(Context *c)
+{
+  dout(5) << "open discovering log bounds" << endl;
+  journaler->recover(c);
+}
+// Ask the journaler to write its head; `c` is passed through to Journaler::write_head().
+void MDLog::write_head(Context *c) 
+{
+  journaler->write_head(c);
+}
+
+// Append event `le` to the journal.  Ownership of `le` passes to this function; `c`, if given, is handed to journaler->flush() (or completed at once when logging is off).
+void MDLog::submit_entry( LogEvent *le,
+			  Context *c ) 
+{
+  dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl;
+  
+  if (g_conf.mds_log) {
+    // encode it, with event type
+    bufferlist bl;
+    bl.append((char*)&le->_type, sizeof(le->_type));
+    le->encode_payload(bl);
+
+    // journal it.
+    journaler->append_entry(bl);
+
+    delete le;
+    num_events++;
+
+    logger->inc("add");
+    logger->set("size", num_events);
+    logger->set("append", journaler->get_write_pos());
+
+    if (c) {
+      unflushed = 0;
+      journaler->flush(c);
+    }
+    else
+      unflushed++;   // no waiter: counted so that a later MDLog::flush() knows there is work
+  } else {
+    // hack: log is disabled.
+    delete le;   // this path used to leak the event; the enabled path above already frees it
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+  }
+}
+// Hand `c` to the journaler's flush, or complete it immediately when journaling is disabled.
+void MDLog::wait_for_sync( Context *c )
+{
+  if (!g_conf.mds_log) {
+    // hack: bypass.
+    c->finish(0);
+    delete c;
+    return;
+  }
+  // wait
+  journaler->flush(c);
+}
+// Flush the journaler if entries were submitted without a waiter since the last flush, then run a trim pass.
+void MDLog::flush()
+{
+  if (unflushed != 0) {
+    journaler->flush();
+    unflushed = 0;   // only needs resetting when it was non-zero
+  }
+  // trim
+  trim(NULL);
+}
+
+
+
+
+// trim
+// Completion context passed to LogEvent::retire(); reports the retired event back to MDLog::_trimmed().
+class C_MDL_Trimmed : public Context {
+public:
+  MDLog *mdl;
+  LogEvent *le;
+
+  C_MDL_Trimmed(MDLog *log, LogEvent *ev)
+    : mdl(log),
+      le(ev)
+  { }
+  void finish(int res) {
+    mdl->_trimmed(le);
+  }
+};
+// Completion context passed to Journaler::wait_for_readable(); wakes MDLog::_did_read().
+class C_MDL_Reading : public Context {
+public:
+  MDLog *mdl;
+
+  C_MDL_Reading(MDLog *m) : mdl(m) { }
+
+  void finish(int res) {
+    mdl->_did_read();
+  }
+};
+
+// Called by C_MDL_Reading when the journaler's wait_for_readable() completes: clear the pending flag and resume trimming.
+void MDLog::_did_read() 
+{
+  dout(5) << "_did_read()" << endl;
+  waiting_for_read = false;
+  trim(0);
+}
+// Completion of LogEvent::retire() for `le` (invoked via C_MDL_Trimmed): drop it from `trimming`, free it, and keep trimming.
+void MDLog::_trimmed(LogEvent *le) 
+{
+  dout(7) << "  trimmed " << *le << endl;
+  
+  assert(le->can_expire(mds));
+  // only the lowest end offset still in `trimming` lets the journal's expire position move forward
+  if (trimming.begin()->first == le->_end_off) {
+    // front!  we can expire the log a bit
+    journaler->set_expire_pos(le->_end_off);
+  }
+
+  trimming.erase(le->_end_off);
+  delete le;
+ 
+  logger->set("trim", trimming.size());   // NOTE(review): "trim" is registered with add_inc() in the constructor but set() here -- confirm intended
+  logger->set("read", journaler->get_read_pos());
+ 
+  trim(0);
+}
+
+
+// Trim the log toward max_events.  `c` (optional) is queued and completed only once num_events <= max_events; the early returns below leave waiters queued.
+void MDLog::trim(Context *c)
+{
+  // add waiter
+  if (c) 
+    trim_waiters.push_back(c);
+
+  // trim!
+  while (num_events > max_events) {
+    
+    off_t gap = journaler->get_write_pos() - journaler->get_read_pos();
+    dout(5) << "trim num_events " << num_events << " > max " << max_events
+	    << ", trimming " << trimming.size()
+	    << ", byte gap " << gap
+	    << endl;
+    // cap the number of events being retired at once; _trimmed() calls trim() again as each one completes
+    if ((int)trimming.size() >= g_conf.mds_log_max_trimming) {
+      dout(7) << "trim  already trimming max, waiting" << endl;
+      return;
+    }
+    
+    bufferlist bl;
+    if (journaler->try_read_entry(bl)) {
+      // decode logevent
+      LogEvent *le = LogEvent::decode(bl);
+      le->_end_off = journaler->get_read_pos();   // read position after this entry; used as the key in `trimming`
+      num_events--;
+
+      // we just read an event.
+      if (le->can_expire(mds) == true) {
+        // obsolete
+        dout(7) << "trim  obsolete: " << *le << endl;
+        delete le;
+        logger->inc("obs");
+      } else {
+        assert ((int)trimming.size() < g_conf.mds_log_max_trimming);
+
+        // trim!
+        dout(7) << "trim  trimming: " << *le << endl;
+        trimming[le->_end_off] = le;
+        le->retire(mds, new C_MDL_Trimmed(this, le));   // completion lands in _trimmed(le)
+        logger->inc("retire");
+        logger->set("trim", trimming.size());
+      }
+      logger->set("read", journaler->get_read_pos());
+      logger->set("size", num_events);
+    } else {
+      // need to read!
+      if (!waiting_for_read) {
+        waiting_for_read = true;
+        dout(7) << "trim  waiting for read" << endl;
+        journaler->wait_for_readable(new C_MDL_Reading(this));
+      } else {
+        dout(7) << "trim  already waiting for read" << endl;
+      }
+      return;
+    }
+  }
+
+  dout(5) << "trim num_events " << num_events << " <= max " << max_events
+	  << ", trimming " << trimming.size()
+	  << ", done for now."
+	  << endl;
+  
+  // trimmed!
+  std::list<Context*> finished;
+  finished.swap(trim_waiters);
+  finish_contexts(finished, 0);
+}
+
+// Replay the journal from its expire position up to the write position; `c` (optional) is completed when replay finishes (at once if the journal is empty).
+void MDLog::replay(Context *c)
+{
+  assert(journaler->is_active());
+
+  // start reading at the last known expire point.
+  journaler->set_read_pos( journaler->get_expire_pos() );
+
+  // empty?
+  if (journaler->get_read_pos() == journaler->get_write_pos()) {
+    dout(10) << "replay - journal empty, done." << endl;
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+    return;
+  }
+
+  // add waiter
+  if (c)
+    waitfor_replay.push_back(c);
+
+  // go!
+  dout(10) << "replay start, from " << journaler->get_read_pos()
+	   << " to " << journaler->get_write_pos() << endl;
+  // num_events is rebuilt by _replay(), which increments it once per entry read
+  assert(num_events == 0);
+
+  _replay(); 
+}
+// Completion context passed to Journaler::wait_for_readable() during replay; resumes MDLog::_replay().
+class C_MDL_Replay : public Context {
+  MDLog *mdlog;
+public:
+  C_MDL_Replay(MDLog *l) : mdlog(l) {}
+  void finish(int r) { mdlog->_replay(); }
+};
+// One replay pass: apply every readable entry, re-arm via C_MDL_Replay if more remain, otherwise rewind the read position and complete the replay waiters.
+void MDLog::_replay()
+{
+  // read what's buffered
+  while (journaler->is_readable() &&
+	 journaler->get_read_pos() < journaler->get_write_pos()) {
+    // read it
+    off_t pos = journaler->get_read_pos();
+    bufferlist bl;
+    bool r = journaler->try_read_entry(bl);
+    assert(r);
+    
+    // unpack event
+    LogEvent *le = LogEvent::decode(bl);
+    num_events++;
+    // entries that report has_happened() are only logged, not re-applied
+    if (le->has_happened(mds)) {
+      dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() 
+	       << " : " << *le << " : already happened" << endl;
+    } else {
+      dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() 
+	       << " : " << *le << " : applying" << endl;
+      le->replay(mds);
+    }
+    delete le;
+  }
+
+  // wait for read?
+  if (journaler->get_read_pos() < journaler->get_write_pos()) {
+    journaler->wait_for_readable(new C_MDL_Replay(this));
+    return;    
+  }
+
+  // done!
+  assert(journaler->get_read_pos() == journaler->get_write_pos());
+  dout(10) << "_replay - complete" << endl;
+
+  // move read pointer _back_ to expire pos, for eventual trimming
+  journaler->set_read_pos(journaler->get_expire_pos());
+
+  // kick waiter(s)
+  list<Context*> ls;
+  ls.swap(waitfor_replay);
+  finish_contexts(ls,0);  
+}
+
+
diff --git a/branches/sage/cephmds2/mds/MDLog.h b/branches/sage/cephmds2/mds/MDLog.h
new file mode 100644
index 0000000000000..37329a164e781
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDLog.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MDLOG_H
+#define __MDLOG_H
+
+#include "include/types.h"
+#include "include/Context.h"
+
+#include <list>
+
+//#include <ext/hash_map>
+//using __gnu_cxx::hash_mapset;
+
+class Journaler;
+class LogEvent;
+class MDS;
+
+class Logger;
+
+/*
+namespace __gnu_cxx {
+  template<> struct hash<LogEvent*> {
+    size_t operator()(const LogEvent *p) const { 
+      static hash<unsigned long> H;
+      return H((unsigned long)p); 
+    }
+  };
+}
+*/
+// MDLog: per-MDS front-end to a Journaler.  Submits LogEvents, trims old ones down to max_events, and replays the journal on recovery.
+class MDLog {
+ protected:
+  MDS *mds;
+  size_t num_events; // in events
+  size_t max_events;
+
+  int unflushed;   // entries submitted without a waiter since the last flush
+
+  inode_t log_inode;
+  Journaler *journaler;
+
+  // keyed by LogEvent::_end_off, so begin() is the oldest event still being retired
+  //hash_map<LogEvent*>  trimming;       // events currently being trimmed
+  map<off_t, LogEvent*> trimming;   // NOTE(review): unqualified `map` with only <list> included here -- presumably supplied via include/types.h; confirm
+  std::list<Context*>  trim_waiters;   // contexts waiting for trim
+  bool                 trim_reading;   // never read in MDLog.cc
+
+  bool waiting_for_read;   // true while a C_MDL_Reading is outstanding on the journaler
+  friend class C_MDL_Reading;
+
+  Logger *logger;
+  
+  list<Context*> waitfor_replay;   // completed at the end of _replay()
+
+ public:
+  MDLog(MDS *m);
+  ~MDLog();
+  
+  void set_max_events(size_t max) { max_events = max; }
+  size_t get_max_events() { return max_events; }
+  size_t get_num_events() { return num_events + trimming.size(); }   // includes events currently being retired
+
+  void submit_entry( LogEvent *e, Context *c = 0 );
+  void wait_for_sync( Context *c );
+  void flush();
+
+  void trim(Context *c);
+  void _did_read();
+  void _trimmed(LogEvent *le);
+
+  void reset();  // fresh, empty log! 
+  void open(Context *onopen);
+  void write_head(Context *onfinish);
+
+  void replay(Context *onfinish);
+  void _replay();
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc
new file mode 100644
index 0000000000000..a487d6469eb7a
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDS.cc
@@ -0,0 +1,692 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "include/types.h"
+#include "common/Clock.h"
+
+#include "msg/Messenger.h"
+
+#include "osd/OSDMap.h"
+#include "osdc/Objecter.h"
+#include "osdc/Filer.h"
+
+#include "MDSMap.h"
+
+#include "MDS.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDStore.h"
+#include "MDLog.h"
+#include "MDBalancer.h"
+#include "IdAllocator.h"
+#include "Migrator.h"
+#include "Renamer.h"
+
+#include "AnchorTable.h"
+#include "AnchorClient.h"
+
+#include "common/Logger.h"
+#include "common/LogType.h"
+
+#include "common/Timer.h"
+
+#include "messages/MMDSMap.h"
+#include "messages/MMDSBoot.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MGenericMessage.h"
+
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+
+
+LogType mds_logtype, mds_cache_logtype;
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if ((l)<=g_conf.debug || (l) <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " "   // level-gated debug stream; `l` is parenthesized so expression arguments bind as intended.  Expands to a bare `if`, so brace it when used inside if/else.
+#define  derr(l)    if ((l)<=g_conf.debug || (l) <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " "   // same gate and prefix as dout.  NOTE(review): this also writes to cout rather than cerr -- confirm intended.
+
+
+
+
+// Construct MDS node `whoami`: store the messenger `m` and monitor map `mm`, allocate the maps and all subsystems, set up both Loggers, then register as the messenger's dispatcher.
+// cons/des
+MDS::MDS(int whoami, Messenger *m, MonMap *mm) {
+  this->whoami = whoami;
+
+  monmap = mm;
+  messenger = m;
+
+  mdsmap = new MDSMap;
+  osdmap = new OSDMap;
+
+  objecter = new Objecter(messenger, monmap, osdmap);
+  filer = new Filer(objecter);
+
+  mdcache = new MDCache(this);
+  mdstore = new MDStore(this);
+  mdlog = new MDLog(this);   // MDLog's constructor reads mds->objecter, so this must follow the Objecter above
+  balancer = new MDBalancer(this);
+
+  anchorclient = new AnchorClient(messenger, mdsmap);
+
+  // alloc
+  {
+    inode_t id_inode;
+    memset(&id_inode, 0, sizeof(id_inode));
+    id_inode.ino = MDS_INO_IDS_OFFSET + whoami;
+    id_inode.layout = g_OSD_FileLayout;
+    idalloc = new IdAllocator(this, id_inode);
+  }
+
+  // hack: anchortable on mds0.
+  if (whoami == 0) 
+    anchormgr = new AnchorTable(this);
+  else
+    anchormgr = 0;
+
+
+  server = new Server(this);
+  locker = new Locker(this, mdcache);
+
+
+  req_rate = 0;
+
+  state = STATE_BOOTING;
+
+  last_balancer_hash = last_balancer_heartbeat = g_clock.recent_now();
+
+  // log: the main Logger is named "mds<id>", formatted with sprintf like logger2's name below.
+  // The old digit-by-digit string building only emitted the low four digits,
+  // so ids >= 10000 got a truncated (and possibly colliding) name.
+  char name[80];
+  sprintf(name, "mds%d", whoami);
+  logger = new Logger(name, (LogType*)&mds_logtype);
+
+  // NOTE(review): mds_logtype is a file-scope global, yet the registrations below run on
+  // every construction (MDLog's constructor wraps its own in a static once-only flag) --
+  // confirm that LogType tolerates repeated add_inc()/add_set() of the same key.
+
+  mds_logtype.add_inc("req");
+  mds_logtype.add_inc("reply");
+  mds_logtype.add_inc("fw");
+  mds_logtype.add_inc("cfw");
+
+  mds_logtype.add_set("l");
+  mds_logtype.add_set("q");
+  mds_logtype.add_set("popanyd");
+  mds_logtype.add_set("popnest");
+
+  mds_logtype.add_inc("lih");
+  mds_logtype.add_inc("lif");
+
+  mds_logtype.add_set("c");
+  mds_logtype.add_set("ctop");
+  mds_logtype.add_set("cbot");
+  mds_logtype.add_set("cptail");  
+  mds_logtype.add_set("cpin");
+  mds_logtype.add_inc("cex");
+  mds_logtype.add_inc("dis");
+  mds_logtype.add_inc("cmiss");
+
+  mds_logtype.add_set("buf");
+  mds_logtype.add_inc("cdir");
+  mds_logtype.add_inc("fdir");
+
+  mds_logtype.add_inc("iex");
+  mds_logtype.add_inc("iim");
+  mds_logtype.add_inc("ex");
+  mds_logtype.add_inc("im");
+  mds_logtype.add_inc("imex");  
+  mds_logtype.add_set("nex");
+  mds_logtype.add_set("nim");
+
+  
+  char n[80];
+  sprintf(n, "mds%d.cache", whoami);
+  logger2 = new Logger(n, (LogType*)&mds_cache_logtype);
+  
+
+  // i'm ready!
+  messenger->set_dispatcher(this);
+}
+// Destroy everything the constructor allocated (plus the messenger it was handed); each pointer is nulled after deletion.
+MDS::~MDS() {
+  if (server) { delete server; server = NULL; }     // allocated in the constructor; previously never freed
+  if (locker) { delete locker; locker = NULL; }     // allocated in the constructor; previously never freed.  Freed before mdcache, which it was constructed with.
+  if (mdcache) { delete mdcache; mdcache = NULL; }
+  if (mdstore) { delete mdstore; mdstore = NULL; }
+  if (mdlog) { delete mdlog; mdlog = NULL; }
+  if (balancer) { delete balancer; balancer = NULL; }
+  if (idalloc) { delete idalloc; idalloc = NULL; }
+  if (anchormgr) { delete anchormgr; anchormgr = NULL; }
+  if (anchorclient) { delete anchorclient; anchorclient = NULL; }
+  if (mdsmap) { delete mdsmap; mdsmap = NULL; }     // previously never freed; deleted after anchorclient, which was constructed with this pointer
+  if (osdmap) { delete osdmap; osdmap = 0; }
+  if (filer) { delete filer; filer = 0; }
+  if (objecter) { delete objecter; objecter = 0; }
+  if (messenger) { delete messenger; messenger = NULL; }
+  if (logger) { delete logger; logger = 0; }
+  if (logger2) { delete logger2; logger2 = 0; }
+}
+
+// Send `m` to MDS `mds` on `port`, addressed via the instance recorded in mdsmap.
+void MDS::send_message_mds(Message *m, int mds, int port, int fromport)
+{
+  // an unspecified (zero) fromport defaults to the destination port
+  const int src_port = (fromport == 0) ? port : fromport;
+  messenger->send_message(m, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), port, src_port);
+}
+
+// Begin booting by sending an MMDSBoot to a monitor chosen by monmap->pick_mon().  Always returns 0.
+int MDS::init()
+{
+  // announce ourselves to a monitor; the maps arrive later via handle_mds_map() and handle_osd_map()
+  dout(5) << "requesting mds and osd maps from mon" << endl;
+  int mon = monmap->pick_mon();
+  messenger->send_message(new MMDSBoot, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+  return 0;
+}
+
+// Decode the highest-epoch map carried by `m` into mdsmap, free `m`, and, while still booting, request an OSD map from a monitor.
+void MDS::handle_mds_map(MMDSMap *m)
+{
+  map<epoch_t, bufferlist>::reverse_iterator p = m->maps.rbegin();   // NOTE(review): dereferenced below without an empty check -- assumes the message always carries at least one map
+
+  dout(1) << "handle_mds_map epoch " << p->first << endl;
+  mdsmap->decode(p->second);
+
+  delete m;
+  
+  if (is_booting()) {
+    // we need an osdmap too.
+    int mon = monmap->pick_mon();
+    messenger->send_message(new MOSDGetMap(0),
+			    MSG_ADDR_MON(mon), monmap->get_inst(mon));
+  }
+}
+// Feed an OSD map message to the objecter, continue booting (mkfs or recover) if this was the map we were waiting for, then relay the maps to every mounted client.
+void MDS::handle_osd_map(MOSDMap *m)
+{
+  // process locally
+  objecter->handle_osd_map(m);   // NOTE(review): `m` is read again in the loop below.  If this call frees the message that is a use-after-free; if it does not, `m` is never deleted here -- confirm ownership.
+  
+  if (is_booting()) {
+    // we got our maps.  mkfs for recovery?
+    if (g_conf.mkfs)
+      boot_mkfs();
+    else 
+      boot_recover();
+  }
+  
+  // pass on to clients
+  for (set<int>::iterator it = clientmap.get_mount_set().begin();
+       it != clientmap.get_mount_set().end();
+       it++) {
+    MOSDMap *n = new MOSDMap;
+    n->maps = m->maps;
+    n->incremental_maps = m->incremental_maps;
+    messenger->send_message(n, MSG_ADDR_CLIENT(*it), clientmap.get_inst(*it));
+  }
+}
+
+// Completion context for boot_mkfs(): calls MDS::boot_mkfs_finish() when finished (the result code is ignored).
+class C_MDS_MkfsFinish : public Context {
+  MDS *mds;
+public:
+  C_MDS_MkfsFinish(MDS *m) : mds(m) {}
+  void finish(int r) { mds->boot_mkfs_finish(); }
+};
+// Create a fresh file system: root dir (mds0 only), journal, id allocator table and anchor table.  Each save is a sub-context of one C_Gather that ends in C_MDS_MkfsFinish.
+void MDS::boot_mkfs()
+{
+  dout(3) << "boot_mkfs" << endl;
+
+  C_Gather *fin = new C_Gather(new C_MDS_MkfsFinish(this));
+  
+  if (whoami == 0) {
+    dout(3) << "boot_mkfs - creating root inode and dir" << endl;
+
+    // create root inode.
+    mdcache->open_root(0);
+    CInode *root = mdcache->get_root();
+    assert(root);
+    
+    // force empty root dir
+    CDir *dir = root->dir;
+    dir->mark_complete();
+    dir->mark_dirty();
+
+    // save it
+    mdstore->commit_dir(dir, fin->new_sub());
+  }
+  
+  // start with a fresh journal
+  dout(10) << "boot_mkfs creating fresh journal" << endl;
+  mdlog->reset();
+  mdlog->write_head(fin->new_sub());
+
+  // fixme: fake out idalloc (reset, pretend loaded)
+  dout(10) << "boot_mkfs creating fresh idalloc table" << endl;
+  idalloc->reset();
+  idalloc->save(fin->new_sub());
+  
+  // fixme: fake out anchortable
+  if (mdsmap->get_anchortable() == whoami) {   // NOTE(review): the constructor only allocates anchormgr when whoami == 0; this dereference assumes the map's anchortable is 0 -- confirm
+    dout(10) << "boot_mkfs creating fresh anchortable" << endl;
+    anchormgr->reset();
+    anchormgr->save(fin->new_sub());
+  }
+}
+// Invoked by C_MDS_MkfsFinish, the completion context that boot_mkfs() gives its C_Gather; marks this MDS active.
+void MDS::boot_mkfs_finish()
+{
+  dout(3) << "boot_mkfs_finish" << endl;
+  mark_active();
+}
+
+// Completion context that re-enters MDS::boot_recover() at step `nextstep` (the result code is ignored).
+class C_MDS_BootRecover : public Context {
+  MDS *mds;
+  int nextstep;
+public:
+  C_MDS_BootRecover(MDS *m, int n) : mds(m), nextstep(n) {}
+  void finish(int r) { mds->boot_recover(nextstep); }
+};
+// Step-wise recovery state machine.  Asynchronous steps `break` after arming a C_MDS_BootRecover for the next step; synchronous steps fall through.  Ends in mark_active().
+void MDS::boot_recover(int step)
+{
+  if (is_booting()) 
+    state = STATE_RECOVERING;
+
+  switch (step) {
+  case 0:
+    if (whoami == 0) {
+      dout(2) << "boot_recover " << step << ": creating root inode" << endl;
+      mdcache->open_root(0);
+      step = 1;
+      // fall-thru
+    } else {
+      // FIXME
+      assert(0);   // recovery is only implemented for mds0; if asserts are compiled out this falls through to case 1
+    }
+
+  case 1:
+    dout(2) << "boot_recover " << step << ": opening idalloc" << endl;
+    idalloc->load(new C_MDS_BootRecover(this, 2));
+    break;
+
+  case 2:
+    if (mdsmap->get_anchortable() == whoami) {
+      dout(2) << "boot_recover " << step << ": opening anchor table" << endl;
+      anchormgr->load(new C_MDS_BootRecover(this, 3));
+      break;
+    } else {
+      dout(2) << "boot_recover " << step << ": i have no anchor table" << endl;
+      step++;
+    }
+    // fall-thru
+
+  case 3:
+    dout(2) << "boot_recover " << step << ": opening mds log" << endl;
+    mdlog->open(new C_MDS_BootRecover(this, 4));
+    break;
+    
+  case 4:
+    dout(2) << "boot_recover " << step << ": replaying mds log" << endl;
+    mdlog->replay(new C_MDS_BootRecover(this, 5));
+    break;
+
+  case 5:
+    dout(2) << "boot_recover " << step << ": restarting any recovered purges" << endl;
+    mdcache->start_recovered_purges();
+    step++;
+    // fall-thru
+
+  case 6:
+    dout(2) << "boot_recover " << step << ": done." << endl;
+    mark_active();
+  }
+}
+
+
+// Enter STATE_ACTIVE and complete every context queued through queue_waitfor_active().
+void MDS::mark_active()
+{
+  dout(3) << "mark_active" << endl;
+  state = STATE_ACTIVE;
+  finish_contexts(waitfor_active);  // kick waiters
+}
+
+
+
+
+// Start a cluster-wide shutdown: message every MDS in the map, shut down idalloc, then run the local shutdown handler.  Always returns 0.
+int MDS::shutdown_start()
+{
+  dout(1) << "shutdown_start" << endl;
+  derr(0) << "mds shutdown start" << endl;
+
+  // one MSG_MDS_SHUTDOWNSTART per entry in the mds map
+  const set<int>& all_mds = mdsmap->get_mds();
+  for (set<int>::const_iterator who = all_mds.begin(); who != all_mds.end(); ++who) {
+    dout(1) << "sending MShutdownStart to mds" << *who << endl;
+    send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART),
+		     *who, MDS_PORT_MAIN);
+  }
+
+  if (idalloc) idalloc->shutdown();
+  
+  handle_shutdown_start(NULL);
+  return 0;
+}
+
+// Enter STATE_STOPPING and start draining: cache shutdown, anchor table save (mds0), and a full log trim.  `m` may be NULL (local call from shutdown_start) and is deleted otherwise.
+void MDS::handle_shutdown_start(Message *m)
+{
+  dout(1) << " handle_shutdown_start" << endl;
+
+  // set flag
+  state = STATE_STOPPING;
+
+  mdcache->shutdown_start();
+  
+  // save anchor table
+  if (whoami == 0)   // matches the constructor, which only allocates anchormgr on mds0
+    anchormgr->save(0);  // FIXME FIXME
+
+  // flush log
+  mdlog->set_max_events(0);   // with a zero limit, trim() keeps going until no events are left
+  mdlog->trim(NULL);
+
+  if (m) delete m;
+
+  //g_conf.debug_mds = 10;
+}
+
+
+// Final shutdown step: enter STATE_STOPPED, shut down the cache, notify the monitor, and shut down the messenger.  Always returns 0.
+int MDS::shutdown_final()
+{
+  dout(1) << "shutdown" << endl;
+  
+  state = STATE_STOPPED;
+  
+  // shut down cache
+  mdcache->shutdown();
+
+  // tell monitor (always monitor 0 here, unlike init() which uses monmap->pick_mon())
+  messenger->send_message(new MGenericMessage(MSG_SHUTDOWN),
+			  MSG_ADDR_MON(0), monmap->get_inst(0));
+
+  // shut down messenger
+  messenger->shutdown();
+
+  return 0;
+}
+
+
+
+// Dispatcher entry point: refresh the clock, then run my_dispatch(m) while holding mds_lock.
+void MDS::dispatch(Message *m)
+{
+  // make sure we advance the clock
+  g_clock.now();
+
+  // process
+  mds_lock.Lock();
+  my_dispatch(m);
+  mds_lock.Unlock();
+}
+
+
+// Route `m` to the subsystem selected by its destination port, then do per-message housekeeping: log flush and cache trim, queued contexts, once-a-second stats and balancing, and shutdown progress.
+void MDS::my_dispatch(Message *m)
+{
+
+  switch (m->get_dest_port()) {
+    
+  case MDS_PORT_ANCHORMGR:   // NOTE(review): anchormgr is only allocated when whoami == 0 -- assumes other nodes never receive this port
+    anchormgr->dispatch(m);
+    break;
+  case MDS_PORT_ANCHORCLIENT:
+    anchorclient->dispatch(m);
+    break;
+    
+  case MDS_PORT_CACHE:
+    mdcache->dispatch(m);
+    break;
+  case MDS_PORT_LOCKER:
+    locker->dispatch(m);
+    break;
+
+  case MDS_PORT_MIGRATOR:
+    mdcache->migrator->dispatch(m);
+    break;
+  case MDS_PORT_RENAMER:
+    mdcache->renamer->dispatch(m);
+    break;
+
+  case MDS_PORT_BALANCER:
+    balancer->proc_message(m);
+    break;
+    
+  case MDS_PORT_MAIN:
+    proc_message(m);
+    break;
+
+  case MDS_PORT_SERVER:
+    server->dispatch(m);
+    break;
+
+  default:
+    dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << endl;
+    assert(0);
+  }
+
+
+  // HACK FOR NOW
+  /*
+  static bool did_heartbeat_hack = false;
+  if (!shutting_down && !shut_down &&
+      false && 
+      !did_heartbeat_hack) {
+    osdmonitor->initiate_heartbeat();
+    did_heartbeat_hack = true;
+  }
+  */
+
+
+  if (is_active()) {
+    // flush log to disk after every op.  for now.
+    mdlog->flush();
+
+    // trim cache
+    mdcache->trim();
+  }
+  
+  // finish any triggered contexts
+  if (finished_queue.size()) {
+    dout(7) << "mds has " << finished_queue.size() << " queued contexts" << endl;
+    list<Context*> ls;
+    ls.splice(ls.begin(), finished_queue);   // move the queue aside first, so contexts queued while finishing wait for the next dispatch
+    assert(finished_queue.empty());
+    finish_contexts(ls);
+  }
+
+  
+
+  // hash root?
+  if (false &&
+      mdcache->get_root() &&
+      mdcache->get_root()->dir &&
+      !(mdcache->get_root()->dir->is_hashed() || 
+        mdcache->get_root()->dir->is_hashing())) {
+    dout(0) << "hashing root" << endl;
+    mdcache->migrator->hash_dir(mdcache->get_root()->dir);
+  }
+
+
+  // periodic crap (1-second resolution)
+  static utime_t last_log = g_clock.recent_now();   // function-local static: one value for the whole process, not per MDS object
+  utime_t now = g_clock.recent_now();
+  if (is_active() && 
+      last_log.sec() != now.sec()) {
+
+    // log
+    last_log = now;
+    mds_load_t load = balancer->get_load();
+
+    req_rate = logger->get("req");
+
+    logger->set("l", (int)load.mds_load());
+    logger->set("q", messenger->get_dispatch_queue_len());
+    logger->set("buf", buffer_total_alloc);
+
+    mdcache->log_stat(logger);
+
+
+    // balance?
+    static int num_bal_times = g_conf.mds_bal_max;
+    static utime_t first = g_clock.recent_now();
+    utime_t elapsed = now;
+    elapsed -= first;
+    if (true && 
+        whoami == 0 &&
+        (num_bal_times || (g_conf.mds_bal_max_until >= 0 && elapsed.sec() > g_conf.mds_bal_max_until)) && 
+        !is_stopping() && !is_stopped() &&
+        now.sec() - last_balancer_heartbeat.sec() >= g_conf.mds_bal_interval) {
+      last_balancer_heartbeat = now;
+      balancer->send_heartbeat();
+      num_bal_times--;
+    }
+
+    // hash?
+    if (true &&
+        g_conf.num_mds > 1 &&
+        now.sec() - last_balancer_hash.sec() > g_conf.mds_bal_hash_interval) {
+      last_balancer_hash = now;
+      balancer->do_hashing();
+    }
+    
+    
+
+    // HACK to test hashing stuff
+    if (false) {
+      static map<int,int> didhash;
+      if (elapsed.sec() > 15 && !didhash[whoami]) {
+        CInode *in = mdcache->get_inode(100000010);
+        if (in && in->dir) {
+          if (in->dir->is_auth()) 
+            mdcache->migrator->hash_dir(in->dir);
+          didhash[whoami] = 1;
+        }
+      }
+      if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) {
+        CInode *in = mdcache->get_inode(100000010);
+        if (in && in->dir) {
+          if (in->dir->is_auth() && in->dir->is_hashed())
+            mdcache->migrator->unhash_dir(in->dir);
+          didhash[whoami] = 2;
+        }
+      }
+    }
+
+
+
+  }
+
+  // HACK to force export to test foreign renames
+  if (false && whoami == 0) {
+    static bool didit = false;
+    
+    // 7 to 1
+    CInode *in = mdcache->get_inode(1001);
+    if (in && in->is_dir() && !didit) {
+      CDir *dir = in->get_or_open_dir(this);
+      if (dir->is_auth()) {
+        dout(1) << "FORCING EXPORT" << endl;
+        mdcache->migrator->export_dir(dir,1);
+        didit = true;
+      }
+    }
+  }
+
+
+
+  // shut down?
+  if (is_stopping()) {
+    if (mdcache->shutdown_pass()) {
+      dout(7) << "shutdown_pass=true, finished w/ shutdown" << endl;
+      shutdown_final();      
+    }
+  }
+
+}
+
+// Handle a message addressed to MDS_PORT_MAIN by type.  There is no default case: a type not listed below leaves the switch without being handled or deleted here.
+void MDS::proc_message(Message *m)
+{
+  switch (m->get_type()) {
+    // OSD ===============
+    /*
+  case MSG_OSD_MKFS_ACK:
+    handle_osd_mkfs_ack(m);
+    return;
+    */
+  case MSG_OSD_OPREPLY:
+    objecter->handle_osd_op_reply((class MOSDOpReply*)m);
+    return;
+  case MSG_OSD_MAP:
+    handle_osd_map((MOSDMap*)m);
+    return;
+
+
+    // MDS
+  case MSG_MDS_MAP:
+    handle_mds_map((MMDSMap*)m);
+    return;
+
+  case MSG_MDS_SHUTDOWNSTART:    // mds0 -> mds1+
+    handle_shutdown_start(m);
+    return;
+
+
+
+  case MSG_PING:
+    handle_ping((MPing*)m);
+    return;
+  }
+
+}
+
+
+
+
+
+// Answer a ping: send an MPingAck built from `m` back to its source, then free `m`.
+void MDS::handle_ping(MPing *m)
+{
+  dout(10) << " received ping from " << MSG_ADDR_NICE(m->get_source()) << " with seq " << m->seq << endl;
+
+  messenger->send_message(new MPingAck(m),
+                          m->get_source(), m->get_source_inst());
+  
+  delete m;
+}
+
diff --git a/branches/sage/cephmds2/mds/MDS.h b/branches/sage/cephmds2/mds/MDS.h
new file mode 100644
index 0000000000000..1581d9c4049ca
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDS.h
@@ -0,0 +1,252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MDS_H
+#define __MDS_H
+
+#include <list>
+#include <vector>
+#include <set>
+#include <map>
+#include <ostream>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include "msg/Dispatcher.h"
+#include "include/types.h"
+#include "include/Context.h"
+#include "common/DecayCounter.h"
+#include "common/Logger.h"
+#include "common/Mutex.h"
+
+#include "mon/MonMap.h"
+
+#include "ClientMap.h"
+
+
+#define MDS_PORT_MAIN     0
+#define MDS_PORT_SERVER   1
+#define MDS_PORT_CACHE    2
+#define MDS_PORT_LOCKER   3
+#define MDS_PORT_STORE    4
+#define MDS_PORT_BALANCER 5
+#define MDS_PORT_MIGRATOR 6
+#define MDS_PORT_RENAMER  7
+
+#define MDS_PORT_ANCHORCLIENT 10
+#define MDS_PORT_ANCHORMGR    11
+
+
+#define MDS_INO_ROOT              1
+#define MDS_INO_PGTABLE           2
+#define MDS_INO_LOG_OFFSET        0x100
+#define MDS_INO_IDS_OFFSET        0x200
+#define MDS_INO_INODEFILE_OFFSET  0x300
+#define MDS_INO_ANCHORTABLE       0x400
+#define MDS_INO_BASE              0x1000
+
+#define MDS_TRAVERSE_FORWARD       1
+#define MDS_TRAVERSE_DISCOVER      2    // skips permissions checks etc.
+#define MDS_TRAVERSE_DISCOVERXLOCK 3    // succeeds on (foreign?) null, xlocked dentries.
+#define MDS_TRAVERSE_FAIL          4
+
+
+class filepath;
+
+class MDSMap;
+class OSDMap;
+class Objecter;
+class Filer;
+
+class Server;
+class Locker;
+class AnchorTable;
+class AnchorClient;
+class MDCache;
+class MDStore;
+class MDLog;
+class MDBalancer;
+class IdAllocator;
+
+class CInode;
+class CDir;
+class CDentry;
+
+class Messenger;
+class Message;
+
+class MClientRequest;
+class MClientReply;
+class MHashReaddir;
+class MHashReaddirReply;
+
+
+
+// MDS: the metadata server node.  A Dispatcher that allocates the MDS subsystems in its constructor and routes incoming messages to them by destination port.
+class MDS : public Dispatcher {
+ public:
+  Mutex        mds_lock;   // held by dispatch() around my_dispatch()
+
+ protected:
+  int          whoami;
+
+ public:
+  Messenger    *messenger;
+  MDSMap       *mdsmap;
+  MonMap       *monmap;
+  OSDMap       *osdmap;
+  Objecter     *objecter;
+  Filer        *filer;       // for reading/writing to/from osds
+
+  ClientMap    clientmap;
+
+  // sub systems
+  Server       *server;
+  MDCache      *mdcache;
+  Locker       *locker;
+  MDStore      *mdstore;
+  MDLog        *mdlog;
+  MDBalancer   *balancer;
+
+  IdAllocator  *idalloc;
+
+  AnchorTable  *anchormgr;   // null unless whoami == 0 (see the constructor)
+  AnchorClient *anchorclient;
+
+  Logger       *logger, *logger2;
+
+
+
+ protected:
+  // -- MDS state --
+  static const int STATE_BOOTING       = 1;  // fetching mds and osd maps
+  static const int STATE_MKFS          = 2;  // creating a file system
+  static const int STATE_RECOVERING    = 3;  // recovering mds log
+  static const int STATE_ACTIVE        = 4;  // up and active!
+  static const int STATE_STOPPING      = 5;
+  static const int STATE_STOPPED       = 6;
+  // NOTE: STATE_MKFS is never assigned in MDS.cc; boot_mkfs() goes from BOOTING straight to ACTIVE.
+  int state;
+  list<Context*> waitfor_active;
+
+public:
+  void queue_waitfor_active(Context *c) { waitfor_active.push_back(c); }
+
+  bool is_booting() { return state == STATE_BOOTING; }
+  bool is_recovering() { return state == STATE_RECOVERING; }
+  bool is_active() { return state == STATE_ACTIVE; }
+  bool is_stopping() { return state == STATE_STOPPING; }
+  bool is_stopped() { return state == STATE_STOPPED; }
+
+  void mark_active();
+
+
+  // -- waiters --
+  list<Context*> finished_queue;   // drained by my_dispatch() after each message
+
+  void queue_finished(Context *c) {
+    finished_queue.push_back(c);
+  }
+  void queue_finished(list<Context*>& ls) {
+    finished_queue.splice( finished_queue.end(), ls );
+  }
+  
+
+
+  // shutdown crap
+  int req_rate;   // refreshed from logger->get("req") in my_dispatch()'s once-a-second block
+
+  // ino's and fh's
+ public:
+
+  int get_req_rate() { return req_rate; }
+
+ protected:
+
+  friend class MDStore;
+
+  
+ public:
+
+ protected:
+  utime_t   last_balancer_heartbeat, last_balancer_hash;
+  
+ public:
+  MDS(int whoami, Messenger *m, MonMap *mm);
+  ~MDS();
+
+  // who am i etc
+  int get_nodeid() { return whoami; }
+  MDSMap *get_mds_map() { return mdsmap; }
+  OSDMap *get_osd_map() { return osdmap; }
+
+  void send_message_mds(Message *m, int mds, int port=0, int fromport=0);
+
+  // start up, shutdown
+  int init();
+
+  void boot_mkfs();      
+  void boot_mkfs_finish();
+  void boot_recover(int step=0);   
+
+  int shutdown_start();
+  int shutdown_final();
+
+  int hash_dentry(inodeno_t ino, const string& s) {
+    return 0; // fixme
+  }
+  
+
+  // messages
+  void proc_message(Message *m);
+  virtual void dispatch(Message *m);
+  void my_dispatch(Message *m);
+
+  // special message types
+  void handle_ping(class MPing *m);
+
+  void handle_mds_map(class MMDSMap *m);
+
+  void handle_shutdown_start(Message *m);
+
+  // osds
+  void handle_osd_getmap(Message *m);   // declared only: MDS.cc contains no definition of this
+  void handle_osd_map(class MOSDMap *m);
+
+};
+
+
+// Context that, when finished, feeds a held (non-null) message back through MDS::my_dispatch().
+class C_MDS_RetryMessage : public Context {
+  Message *m;
+  MDS *mds;
+public:
+  C_MDS_RetryMessage(MDS *mds_, Message *msg)
+    : m(msg), mds(mds_)
+  {
+    assert(m);
+  }
+  virtual void finish(int r) {
+    mds->my_dispatch(m);
+  }
+};
+
+
+ostream& operator<<(ostream& out, MDS& mds);
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/MDSMap.h b/branches/sage/cephmds2/mds/MDSMap.h
new file mode 100644
index 0000000000000..6117e6943d3c7
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDSMap.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MDSMAP_H
+#define __MDSMAP_H
+
+#include "common/Clock.h"
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+#include <set>
+#include <map>
+#include <string>
+using namespace std;
+
+class MDSMap {
+ protected:
+  epoch_t epoch;     // map version: 0 at construction, bumped by inc_epoch()
+  utime_t ctime;     // timestamp returned by get_ctime()
+
+  int anchortable;   // returned by get_anchortable(); presumably the mds holding the anchor table -- TODO confirm
+
+  set<int> all_mds;                  // mds ids known to this map
+  set<int> down_mds;                 // mds ids currently marked down
+  map<int,entity_inst_t> mds_inst;   // mds id -> entity_inst_t (see get_inst)
+
+  friend class MDSMonitor;           // friend: may touch the fields above directly
+
+ public:
+  MDSMap() : epoch(0), anchortable(0) {}
+
+  epoch_t get_epoch() const { return epoch; }
+  void inc_epoch() { epoch++; }
+
+  const utime_t& get_ctime() const { return ctime; }
+
+  int get_anchortable() const { return anchortable; }
+
+  int get_num_mds() const { return all_mds.size(); }
+  int get_num_up_mds() const { return all_mds.size() - down_mds.size(); }  // only meaningful if down_mds is a subset of all_mds
+
+  const set<int>& get_mds() const { return all_mds; }
+  const set<int>& get_down_mds() const { return down_mds; }
+
+  bool is_down(int m) const { return down_mds.count(m); }
+  bool is_up(int m) const { return !is_down(m); }   // note: also true for an id that is not in all_mds
+
+  // Instance for mds m; asserts that one is recorded.
+  const entity_inst_t& get_inst(int m) {
+    assert(mds_inst.count(m));
+    return mds_inst[m];
+  }
+  // Non-asserting variant: fills inst and returns true only if m is recorded.
+  bool get_inst(int m, entity_inst_t& inst) { 
+    if (mds_inst.count(m)) {
+      inst = mds_inst[m];
+      return true;
+    } 
+    return false;
+  }
+
+  // serialize, unserialize
+  // Layout: raw bytes of epoch, ctime, anchortable, then the three containers via _encode().
+  void encode(bufferlist& blist) {
+    blist.append((char*)&epoch, sizeof(epoch));
+    blist.append((char*)&ctime, sizeof(ctime));
+    blist.append((char*)&anchortable, sizeof(anchortable));
+    
+    _encode(all_mds, blist);
+    _encode(down_mds, blist);
+    _encode(mds_inst, blist);
+  }
+  
+  // Inverse of encode(); fields must be read back in exactly the order they were written.
+  void decode(bufferlist& blist) {
+    int off = 0;   // running read offset into blist
+    blist.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    blist.copy(off, sizeof(ctime), (char*)&ctime);
+    off += sizeof(ctime);
+    blist.copy(off, sizeof(anchortable), (char*)&anchortable);
+    off += sizeof(anchortable);
+    
+    _decode(all_mds, blist, off);
+    _decode(down_mds, blist, off);
+    _decode(mds_inst, blist, off);
+  }
+
+
+  /*** mapping functions ***/
+
+  int hash_dentry( inodeno_t dirino, const string& dn );  // declared here; definition is not in this header
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/MDStore.cc b/branches/sage/cephmds2/mds/MDStore.cc
new file mode 100644
index 0000000000000..432d56751b643
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDStore.cc
@@ -0,0 +1,786 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "MDStore.h"
+#include "MDS.h"
+#include "MDCache.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "MDSMap.h"
+
+#include "osd/OSDMap.h"
+#include "osdc/Filer.h"
+
+#include "msg/Message.h"
+
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l<=g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".store "
+
+
+/*
+ * Hashed dir slices are kept in separate "regions" of the dir object:
+ * slice N starts at byte (N+1) * 2^30; an unhashed dir starts at 0.
+ */
+size_t get_hash_offset(int hashcode) {
+  // a negative hashcode means "not hashed"
+  const size_t region = (size_t)(1<<30);
+  return (hashcode < 0) ? 0 : region * (size_t)(1+hashcode);
+}
+
+
+
+
+// ==========================================================================
+// FETCH
+
+
+// Completion for MDStore::fetch_dir(): relays the result to fetch_dir_2().
+class C_MDS_Fetch : public Context {
+ protected:
+  MDStore *ms;      // store that issued the fetch
+  inodeno_t ino;    // the dir is looked up again by ino on completion
+
+ public:
+  C_MDS_Fetch(MDStore *ms, inodeno_t ino)
+    : Context(), ms(ms), ino(ino) {}
+
+  // Pass the fetch result back to the store.
+  void finish(int result) {
+    ms->fetch_dir_2( result, ino );
+  }
+};
+
+/** fetch_dir(dir, context)
+ * Public entry point for loading a directory's contents.
+ * dir must be auth or hashed.  c (may be null) is queued on the dir as a
+ * CDIR_WAIT_COMPLETE waiter.  If a fetch is already in flight that is all
+ * we do; otherwise the dir is flagged FETCHING and a read is started whose
+ * completion (C_MDS_Fetch) lands in fetch_dir_2().
+ */
+void MDStore::fetch_dir( CDir *dir, Context *c )
+{
+  dout(7) << "fetch_dir " << *dir << " context is " << c << endl;
+  assert(dir->is_auth() || dir->is_hashed());
+
+  // park the caller until the dir is complete
+  if (c)
+    dir->add_waiter(CDIR_WAIT_COMPLETE, c);
+
+  // someone else already started the read
+  if (dir->state_test(CDIR_STATE_FETCHING)) {
+    dout(7) << "already fetching " << *dir << "; waiting" << endl;
+    return;
+  }
+
+  // mark the fetch in progress and bump the "fdir" counter
+  dir->state_set(CDIR_STATE_FETCHING);
+  mds->logger->inc("fdir");
+
+  // hashed dirs fetch this node's slice; -1 (the declared default) means
+  // the whole, unhashed dir
+  Context *fin = new C_MDS_Fetch( this, dir->ino() );
+  const int slice = dir->is_hashed() ? mds->get_nodeid() : -1;
+  fetch_dir_hash( dir, fin, slice );
+}
+
+/*
+ * Completion of a dir fetch (called from C_MDS_Fetch::finish).
+ * On success: mark the dir complete, clear FETCHING, wake the waiters.
+ * NOTE(review): on failure we return with CDIR_STATE_FETCHING still set and
+ * the waiters still queued -- confirm that is intended.
+ */
+void MDStore::fetch_dir_2( int result, inodeno_t ino)
+{
+  CInode *idir = mds->mdcache->get_inode(ino);
+  // read failed, or the inode isn't in the cache: give up quietly
+  if (result < 0 || !idir)
+    return;
+
+  CDir *dir = idir->dir;
+  assert(dir);
+
+  // the dir is now complete and no longer being fetched
+  dir->state_set(CDIR_STATE_COMPLETE);
+  dir->state_clear(CDIR_STATE_FETCHING);
+
+  list<Context*> woken;
+  dir->take_waiting(CDIR_WAIT_COMPLETE|CDIR_WAIT_DENTRY, woken);
+  finish_contexts(woken, result);
+}
+
+
+/** low level methods **/
+
+// Completion for the low-level dir read.  Keeps chaining reads until the
+// byte count stored at the front of the slice is satisfied, then passes
+// the data to MDStore::fetch_dir_hash_2().
+class C_MDS_FetchHash : public Context {
+protected:
+  MDS *mds;
+  inode_t inode;      // copy of the dir's inode, used for follow-up reads
+  int hashcode;       // slice being fetched; negative = unhashed dir
+  Context *context;   // passed on to fetch_dir_hash_2()
+
+public:
+  bufferlist bl;      // data gathered so far
+  bufferlist bl2;     // target of a follow-up read; folded into bl in finish()
+
+  C_MDS_FetchHash(MDS *mds, inode_t inode, Context *c, int hashcode)
+    : Context(),
+      mds(mds),
+      inode(inode),
+      hashcode(hashcode),
+      context(c) {}
+
+  // A read finished; result must be positive.
+  void finish(int result) {
+    assert(result>0);
+
+    // fold any follow-up data into bl
+    bl.claim_append(bl2);
+
+    // the slice starts with a size_t count of the bytes that follow it
+    size_t size;
+    bl.copy(0, sizeof(size_t), (char*)&size);
+    const size_t got = bl.length() - sizeof(size);
+
+    if (got >= size) {
+      // everything is here; go parse it
+      mds->mdstore->fetch_dir_hash_2( bl, inode, context, hashcode );
+      return;
+    }
+
+    // short read: resume right after what we hold, inside our region
+    const size_t left = size - got;
+    const size_t from = bl.length() + get_hash_offset(hashcode);
+    dout(12) << "fetch_dir_hash_2 dir size is " << size << ", got " << got << ", reading remaniing " << left << " from off " << from << endl;
+
+    // the next completion inherits the partial data
+    C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, inode, context, hashcode );
+    fin->bl.claim( bl );
+    mds->filer->read(inode,
+                     from, left,
+                     &fin->bl2,
+                     fin );
+  }
+};
+
+/** fetch_dir_hash
+ * Low-level fetch: kick off the read of a dir slice.
+ * hashcode -1 reads the (unhashed) dir from offset 0; otherwise the read
+ * starts at that hash segment's region.  c is passed to fetch_dir_hash_2()
+ * once all of the data has arrived.
+ */
+void MDStore::fetch_dir_hash( CDir *dir, Context *c, int hashcode)
+{
+  dout(11) << "fetch_dir_hash hashcode " << hashcode << " " << *dir << endl;
+
+  inode_t& dirinode = dir->get_inode()->inode;
+
+  // completion context; it also owns the buffer we read into
+  C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, dirinode, c, hashcode );
+
+  // start with the first stripe's worth (which had better be more than 16 bytes!)
+  assert(dirinode.layout.stripe_size >= 16);
+  mds->filer->read(dirinode,
+                   get_hash_offset(hashcode), dirinode.layout.stripe_size,
+                   &fin->bl, fin );
+}
+
+void MDStore::fetch_dir_hash_2( bufferlist& bl,      // complete slice: size, count, version, hashcode, then entries
+                                inode_t& inode,      // inode of the dir that was read
+                                Context *c,          // finished with 0 after parsing, or -1 on the bail-out paths; then deleted
+                                int hashcode)        // slice we asked for; asserted against the stored value
+{
+  CInode *idir = mds->mdcache->get_inode(inode.ino);
+  if (!idir) {
+    dout(7) << "fetch_dir_hash_2 on ino " << inode.ino << " but no longer in our cache!" << endl;
+    c->finish(-1);   // NOTE(review): c is not null-checked here, unlike at the end of this function
+    delete c;
+    return;
+  } 
+
+  if (!idir->dir_is_auth() ||
+      !idir->dir) {
+    dout(7) << "fetch_dir_hash_2 on " << *idir << ", but i'm not auth, or dir not open" << endl;
+    c->finish(-1);   // NOTE(review): same unchecked use of c
+    delete c;
+    return;
+  } 
+
+  // make sure we have a CDir (idir->dir was checked non-null just above)
+  CDir *dir = idir->get_or_open_dir(mds);
+  
+  // do it
+  dout(7) << "fetch_dir_hash_2 hashcode " << hashcode << " dir " << *dir << endl;
+  
+  // parse buffer contents into cache
+  dout(15) << "bl is " << bl << endl;
+
+  int off = 0;             // read cursor into bl
+  size_t size;             // byte count stored at the front of the slice
+  __uint32_t num;          // number of entries that follow the header
+  version_t got_version;   // dir version recorded when the slice was written
+  int got_hashcode;        // hashcode recorded when the slice was written
+  bl.copy(off, sizeof(size), (char*)&size);
+  off += sizeof(size);
+  assert(bl.length() >= size + sizeof(size));  
+  bl.copy(off, sizeof(num), (char*)&num);
+  off += sizeof(num);
+  bl.copy(off, sizeof(got_version), (char*)&got_version);
+  off += sizeof(got_version);
+  bl.copy(off, sizeof(got_hashcode), (char*)&got_hashcode);
+  off += sizeof(got_hashcode);
+
+  assert(got_hashcode == hashcode);  
+  
+  int buflen = bl.length();
+  
+  dout(10) << "  " << num << " items in " << size << " bytes" << endl;
+
+  unsigned parsed = 0;
+  while (parsed < num) {
+    assert(off < buflen && num > 0);
+    parsed++;
+    
+    dout(24) << " " << parsed << "/" << num << " pos " << off << endl;
+
+    // dentry name
+    string dname;
+    ::_decode(dname, bl, off);
+    dout(24) << "parse filename '" << dname << "'" << endl;
+    
+    CDentry *dn = dir->lookup(dname);  // existing dentry?
+    
+    char type = bl[off];   // entry tag: 'L' = remote link, 'I' = inode
+    ++off;
+    if (type == 'L') {
+      // hard link: the entry carries only the target ino
+      inodeno_t ino;
+      bl.copy(off, sizeof(ino), (char*)&ino);
+      off += sizeof(ino);
+
+      // sanity: in a hashed slice every dentry must hash to this slice
+      if (hashcode >= 0) {
+        int dentryhashcode = mds->hash_dentry( dir->ino(), dname );
+        assert(dentryhashcode == hashcode);
+      }
+
+      if (dn) {
+        if (dn->get_inode() == 0) {
+          // negative dentry?
+          dout(12) << "readdir had NEG dentry " << dname << endl;
+        } else {
+          // had dentry
+          dout(12) << "readdir had dentry " << dname << endl;
+        }
+        continue;   // what is already in cache wins; skip the stored entry
+      }
+
+      // (remote) link; this dn shadows the outer (null) one within this branch
+      CDentry *dn = dir->add_dentry( dname, ino );
+
+      // link to inode?
+      CInode *in = mds->mdcache->get_inode(ino);   // we may or may not have it.
+      if (in) {
+        dn->link_remote(in);
+        dout(12) << "readdir got remote link " << ino << " which we have " << *in << endl;
+      } else {
+        dout(12) << "readdir got remote link " << ino << " (dont' have it)" << endl;
+      }
+    } 
+    else if (type == 'I') {
+      // inode
+      
+      // parse out inode (stored as the raw inode_t bytes)
+      inode_t inode;
+      bl.copy(off, sizeof(inode), (char*)&inode);
+      off += sizeof(inode);
+
+      string symlink;   // symlink target; only present in the buffer for symlinks
+      if (inode.is_symlink())
+        ::_decode(symlink, bl, off);
+      
+      // sanity: in a hashed slice every dentry must hash to this slice
+      if (hashcode >= 0) {
+        int dentryhashcode = mds->hash_dentry( dir->ino(), dname );
+        assert(dentryhashcode == hashcode);
+      }
+      
+      if (dn) {
+        if (dn->get_inode() == 0) {
+          // negative dentry?
+          dout(12) << "readdir had NEG dentry " << dname << endl;
+        } else {
+          // had dentry
+          dout(12) << "readdir had dentry " << dname << endl;
+
+	  // under water?  (cached copy is no newer than the version we just read)
+	  if (dn->get_inode()->get_parent_dir_version() <= got_version) {
+	    dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl;
+	    dn->get_inode()->mark_clean();
+	    dn->mark_clean();
+	  }
+        }
+        continue;   // what is already in cache wins; skip the stored entry
+      }
+      
+      // add inode
+      CInode *in = 0;
+      if (mds->mdcache->have_inode(inode.ino)) {
+        in = mds->mdcache->get_inode(inode.ino);
+        dout(12) << "readdir got (but i already had) " << *in 
+		 << " mode " << in->inode.mode 
+		 << " mtime " << in->inode.mtime << endl;
+      } else {
+        // not cached yet: build a CInode from the stored inode
+        in = new CInode(mds->mdcache);
+	in->inode = inode;
+        
+        // symlink?
+        if (in->is_symlink()) {
+          in->symlink = symlink;
+        }
+        
+        // add 
+        mds->mdcache->add_inode( in );
+      }
+
+      // link
+      dir->add_dentry( dname, in );
+      dout(12) << "readdir got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl;
+    }
+    else {
+      dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) 
+	      << " at pos " << off << endl;
+      assert(0);
+    }
+  }
+  dout(15) << "parsed " << parsed << endl;
+  
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+}
+
+
+
+
+// ==================================================================
+// COMMIT
+
+// Parked by MDStore::commit_dir() when a commit can't start right away.
+// When woken, checks whether `version` is on disk and re-commits if not.
+class C_MDS_CommitDirVerify : public Context {
+public:
+  MDS *mds;
+  inodeno_t ino;       // the dir is looked up again by ino when we run
+  version_t version;   // version the caller needs committed
+  Context *c;          // caller's completion; may be null
+  
+  C_MDS_CommitDirVerify( MDS *mds, 
+                        inodeno_t ino, 
+                        version_t version,
+                        Context *c) {
+    this->mds = mds;
+    this->c = c;
+    this->version = version;
+    this->ino = ino;
+  }
+  
+  // r < 0 means whatever we waited on failed.  c is finished with 0 if the
+  // version is already safe and with -1 if we can't retry; otherwise it is
+  // handed on to a fresh commit_dir().
+  virtual void finish(int r) {
+    if (r >= 0) {
+      CInode *in = mds->mdcache->get_inode(ino);
+      assert(in && in->dir);
+      if (in && in->dir && in->dir->is_auth()) {
+        dout(7) << "CommitDirVerify: current version = " << in->dir->get_version() << endl;
+        dout(7) << "CommitDirVerify:  last committed = " << in->dir->get_last_committed_version() << endl;
+        dout(7) << "CommitDirVerify:        required = " << version << endl;
+
+        if (in->dir->get_last_committed_version() >= version) {
+          // on disk already: report success, don't fall into the failure path
+          dout(7) << "my required version is safe, done." << endl;
+          if (c) { c->finish(0); delete c; }
+        } else {
+          // what was requested isn't committed yet.
+          dout(7) << "my required version is still not safe, committing again." << endl;
+          mds->mdstore->commit_dir(in->dir, version, c);
+        }
+        return;
+      }
+    }
+
+    // must have exported or something!
+    dout(7) << "can't retry commit dir on " << ino << ", must have exported?" << endl;
+    if (c) { c->finish(-1); delete c; }
+  }
+};
+
+// Completion for the write started by MDStore::commit_dir(); reports back
+// to commit_dir_2() with the dir version captured at construction.
+class C_MDS_CommitDirFinish : public Context {
+ protected:
+  MDStore *ms;
+  CDir *dir;
+  version_t version;   // dir version when the commit began; just for sanity check later
+
+ public:
+
+  C_MDS_CommitDirFinish(MDStore *ms, CDir *dir)
+    : Context(), ms(ms), dir(dir), version(dir->get_version()) {}
+
+  // Relay the write result together with the captured version.
+  void finish(int result) {
+    ms->commit_dir_2( result, dir, version );
+  }
+};
+
+
+// Commit the dir's current version to disk; the dir must be dirty.
+void MDStore::commit_dir( CDir *dir, Context *c )
+{
+  assert(dir->is_dirty());
+
+  const version_t current = dir->get_version();
+  commit_dir(dir, current, c);
+}
+
+// Commit dir to disk through at least `version`.  dir must be auth or
+// hashed.  c (may be null) becomes a CDIR_WAIT_COMMITTED waiter once the
+// commit actually starts.  If it can't start now (already committing, not
+// auth-pinnable, or dir incomplete), a C_MDS_CommitDirVerify carrying c is
+// parked to re-check and retry later.
+void MDStore::commit_dir( CDir *dir,
+                          version_t version,
+                          Context *c )
+{
+  assert(dir->is_auth() ||
+         dir->is_hashed());
+
+  // case 1: a commit is already in flight -- wait for it, then verify
+  if (dir->state_test(CDIR_STATE_COMMITTING)) {
+    dout(7) << "commit_dir " << *dir << " mid-commit of " << dir->get_committing_version() << endl;
+    dout(7) << "  current version = " << dir->get_version() << endl;
+    dout(7) << "requested version = " << version << endl;
+
+    // asking for something older than what's already committed makes no sense
+    assert(version >= dir->get_last_committed_version());
+
+    Context *retry = new C_MDS_CommitDirVerify(mds, dir->ino(), version, c);
+    dir->add_waiter(CDIR_WAIT_COMMITTED, retry);
+    return;
+  }
+
+  // case 2: can't auth_pin (something must be frozen up the hierarchy)
+  if (!dir->can_auth_pin()) {
+    dout(7) << "commit_dir " << *dir << " can't auth_pin, waiting" << endl;
+    Context *retry = new C_MDS_CommitDirVerify(mds, dir->ino(), version, c);
+    dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, retry);
+    return;
+  }
+
+  // case 3: dir isn't complete -- fetch first, then verify
+  if (!dir->is_complete()) {
+    dout(7) << "commit_dir " << *dir << " not complete, fetching first" << endl;
+    Context *retry = new C_MDS_CommitDirVerify(mds, dir->ino(), version, c);
+    fetch_dir(dir, retry);
+    return;
+  }
+
+  // ok go
+  dout(7) << "commit_dir " << *dir << " version " << dir->get_version() << endl;
+
+  // caller gets woken when the commit lands
+  if (c)
+    dir->add_waiter(CDIR_WAIT_COMMITTED, c);
+
+  // completion for the write (records the current dir version)
+  Context *fin = new C_MDS_CommitDirFinish(this, dir);
+
+  // flag the commit as in progress
+  dir->state_set(CDIR_STATE_COMMITTING);
+  dir->set_committing_version(); 
+
+  // bump the "cdir" counter
+  mds->logger->inc("cdir");
+
+  // hashed dirs write this node's slice; -1 (the declared default)
+  // writes the whole dir
+  const int slice = dir->is_hashed() ? mds->get_nodeid() : -1;
+  commit_dir_slice( dir, fin, slice );
+}
+
+// Completion of a dir commit (via C_MDS_CommitDirFinish): records the
+// committed version, cleans the dir if nothing changed meanwhile, and wakes
+// the CDIR_WAIT_COMMITTED waiters.  result is not examined.
+void MDStore::commit_dir_2( int result,
+                            CDir *dir,
+                            version_t committed_version)
+{
+  dout(5) << "commit_dir_2 " << *dir << " committed " << committed_version << ", current version " << dir->get_version() << endl;
+  assert(committed_version == dir->get_committing_version());
+
+  // this version is now safe on disk
+  dir->set_last_committed_version(committed_version);
+
+  // clean only if the dir wasn't modified while the write was in flight
+  const bool caught_up = (dir->get_version() == committed_version);
+  if (caught_up) dir->mark_clean();
+  dir->state_clear(CDIR_STATE_COMMITTING);
+  dir->finish_waiting(CDIR_WAIT_COMMITTED);
+}
+
+
+
+
+// low-level committer (hashed or normal)
+
+// Completion for the write issued by commit_dir_slice().  Snapshots the dir
+// version at construction and relays everything to commit_dir_slice_2()
+// when the write finishes.
+class C_MDS_CommitSlice : public Context {
+ protected:
+  MDStore *ms;
+  CDir *dir;
+  Context *c;          // passed through to commit_dir_slice_2()
+  int hashcode;        // slice written; negative = whole dir
+  version_t version;   // dir version at the time the write was built
+
+public:
+  bufferlist bl;       // encoded slice handed to the write
+
+  C_MDS_CommitSlice(MDStore *ms, CDir *dir, Context *c, int w)
+    : Context(), ms(ms), dir(dir), c(c), hashcode(w),
+      version(dir->get_version()) {}
+
+  // Relay the write result along with the captured state.
+  void finish(int result) {
+    ms->commit_dir_slice_2( result, dir, c, version, hashcode );
+  }
+};
+
+
+// Low-level committer: encode the dir (or, for hashcode >= 0, only the
+// dentries hashing to that slice) and write it to the dir's object.
+// Layout: size_t size | uint32 num | version | hashcode | num entries, each
+// name\0 + 'L' + ino  or  name\0 + 'I' + inode_t [+ symlink\0].
+// c is passed to commit_dir_slice_2() when the write completes.
+void MDStore::commit_dir_slice( CDir *dir,
+                               Context *c,
+                               int hashcode)
+{
+  if (hashcode >= 0) {
+    assert(dir->is_hashed());
+    dout(10) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl;
+  } else {
+    assert(dir->is_auth());
+    dout(10) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl;
+  }
+  
+  // get continuation ready
+  C_MDS_CommitSlice *fin = new C_MDS_CommitSlice(this, dir, c, hashcode);
+  
+  // entries are gathered in dirdata; the size/count header is added last
+  __uint32_t num = 0;
+  bufferlist dirdata;
+
+  version_t v = dir->get_version();
+  dirdata.append((char*)&v, sizeof(v));
+  dirdata.append((char*)&hashcode, sizeof(hashcode));
+  
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       ++it) {
+    CDentry *dn = it->second;
+
+    if (hashcode >= 0) {   // hashed: keep only dentries belonging to this slice
+      int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+      if (dentryhashcode != hashcode) continue;
+    }
+
+    // put dentry in this version
+    if (dn->is_dirty()) {
+      dn->float_parent_dir_version( dir->get_version() );
+      dout(12) << " dirty dn " << *dn << " now " << dn->get_parent_dir_version() << endl;
+    }
+    
+    if (dn->is_null()) continue;  // skipping negative entry
+
+    // primary or remote?
+    if (dn->is_remote()) {
+      inodeno_t ino = dn->get_remote_ino();
+      dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' remote ino " << ino << endl;
+
+      // name, marker, ino
+      dirdata.append( it->first.c_str(), it->first.length() + 1);
+      dirdata.append( "L", 1 );         // remote link
+      dirdata.append((char*)&ino, sizeof(ino));
+    } else {
+      // primary link
+      CInode *in = dn->get_inode();
+      assert(in);
+
+      dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' inode " << *in << endl;
+  
+      // name, marker, inode, [symlink string]
+      dirdata.append( it->first.c_str(), it->first.length() + 1);
+      dirdata.append( "I", 1 );         // inode
+      dirdata.append( (char*) &in->inode, sizeof(inode_t));
+      
+      if (in->is_symlink()) {
+        // include symlink destination!
+        dout(18) << "    inlcuding symlink ptr " << in->symlink << endl;
+        dirdata.append( (char*) in->symlink.c_str(), in->symlink.length() + 1);
+      }
+      
+      // put inode in this dir version
+      if (in->is_dirty()) {
+        in->float_parent_dir_version( dir->get_version() );
+        dout(12) << " dirty inode " << *in << " now " << in->get_parent_dir_version() << endl;
+
+	in->set_committing_version( in->get_version() );
+	assert(in->get_last_committed_version() < in->get_committing_version());
+      } else {
+	assert(in->get_committing_version() == in->get_version());
+      }
+    }
+    num++;
+  }
+  dout(14) << "num " << num << endl;
+  
+  // prepend the header: byte count of what follows it, then the entry count
+  size_t size = sizeof(num) + dirdata.length();
+  fin->bl.append((char*)&size, sizeof(size));
+  fin->bl.append((char*)&num, sizeof(num));
+  fin->bl.claim_append(dirdata);  //.c_str(), dirdata.length());
+  assert(fin->bl.length() == size + sizeof(size));
+  
+  // auth_pin the dir while the write is in flight (commit_dir_slice_2 unpins)
+  dir->auth_pin();
+  
+  // submit to osd, at the start of this slice's region: fetch_dir_hash()
+  // reads from get_hash_offset(hashcode), which is 0 for an unhashed dir
+  mds->filer->write( dir->get_inode()->inode,
+                     get_hash_offset(hashcode), fin->bl.length(), 
+                     fin->bl,
+                     0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write
+                     NULL, fin ); // on safe
+}
+
+
+void MDStore::commit_dir_slice_2( int result,                    // write result; not examined here
+                                 CDir *dir,                      // dir whose slice was written
+                                 Context *c,                     // finished with 0 at the end, then deleted; may be null
+                                 version_t committed_version,    // dir version captured when the write was built
+                                 int hashcode )                  // slice written; negative = whole dir
+{
+  dout(11) << "commit_dir_slice_2 hashcode " << hashcode << " " << *dir << " v " << committed_version << endl;
+  
+  // mark inodes and dentries clean too (if we committed them!)
+  list<CDentry*> null_clean;   // null dentries to drop once the scan is over
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end(); ) {
+    CDentry *dn = it->second;
+    it++;   // advance before the body so every 'continue' below is safe
+    
+    if (hashcode >= 0) {   // hashed: only look at dentries of the slice we wrote
+      int dentryhashcode = mds->hash_dentry( dir->ino(), dn->get_name() );
+      if (dentryhashcode != hashcode) continue;
+    }
+
+    // dentry: compare its parent-dir version with what was committed
+    if (committed_version > dn->get_parent_dir_version()) {
+      dout(15) << " dir " << committed_version << " > dn " << dn->get_parent_dir_version() << " still clean " << *dn << endl;
+      assert(!dn->is_dirty());
+    }
+    else if (dn->get_parent_dir_version() == committed_version) {
+      dout(15) << " dir " << committed_version << " == dn " << dn->get_parent_dir_version() << " now clean " << *dn << endl;
+      if (dn->is_dirty())
+        dn->mark_clean();     // might not but could be dirty
+      
+      // remove, if it's null and unlocked
+      if (dn->is_null() && dn->is_sync()) {
+        dout(15) << "   removing clean and null " << *dn << endl;
+        null_clean.push_back(dn);
+        continue;
+      }
+    } else {
+      dout(15) << " dir " << committed_version << " < dn " << dn->get_parent_dir_version() << " still dirty " << *dn << endl;
+      assert(committed_version < dn->get_parent_dir_version());
+      //assert(dn->is_dirty() || !dn->is_sync());  // -OR- we did a fetch_dir in order to do a newer commit...
+    }
+
+    // only do primary...
+    if (!dn->is_primary()) continue;
+    
+    CInode *in = dn->get_inode();
+    assert(in);
+    assert(in->is_auth());
+    
+    if (in->get_committing_version())   // nonzero: a committing version was recorded for this inode
+      in->set_committed_version();
+
+    // inode: same three-way comparison as for the dentry
+    if (committed_version > in->get_parent_dir_version()) {
+      dout(15) << " dir " << committed_version << " > inode " << in->get_parent_dir_version() << " still clean " << *(in) << endl;
+      assert(!in->is_dirty());
+    }
+    else if (in->get_parent_dir_version() == committed_version) {
+      dout(15) << " dir " << committed_version << " == inode " << in->get_parent_dir_version() << " now clean " << *(in) << endl;
+      in->mark_clean();     // might not but could be dirty
+    } else {
+      dout(15) << " dir " << committed_version << " < inode " << in->get_parent_dir_version() << " still dirty " << *(in) << endl;
+      assert(committed_version < in->get_parent_dir_version());
+      //assert(in->is_dirty());  // -OR- we did a fetch_dir in order to do a newer commit...
+    }
+  }
+
+  // remove null clean dentries (collected above rather than removed mid-scan)
+  for (list<CDentry*>::iterator it = null_clean.begin();
+       it != null_clean.end();
+       it++) 
+    dir->remove_dentry(*it);
+  
+  // unpin (pairs with the auth_pin() taken in commit_dir_slice)
+  dir->auth_unpin();
+
+  // finish
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+}
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/branches/sage/cephmds2/mds/MDStore.h b/branches/sage/cephmds2/mds/MDStore.h
new file mode 100644
index 0000000000000..fe7553608a975
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDStore.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MDSTORE_H
+#define __MDSTORE_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+class MDS;
+class CDir;
+class Context;
+
+class MDStore {
+ protected:
+  MDS *mds;   // owning mds; set in the constructor
+
+
+ public:
+  MDStore(MDS *m) {
+    mds = m;
+  }
+
+  
+  // fetch: fetch_dir() is the public entry point; the rest are its stages
+ public:
+  void fetch_dir( CDir *dir, Context *c );
+ protected:
+  void fetch_dir_2( int result, inodeno_t ino );
+  
+  void fetch_dir_hash( CDir *dir,
+                       Context *c,
+                       int hashcode = -1);      // -1 = whole (unhashed) dir
+  void fetch_dir_hash_2( bufferlist &bl,
+                         inode_t& inode,
+                         Context *c,
+                         int which);            // named hashcode in the definition
+  friend class C_MDS_Fetch;
+  friend class C_MDS_FetchHash;
+
+  // commit: the two commit_dir() overloads are public; the rest are their stages
+ public:
+  void commit_dir( CDir *dir, Context *c );                      // commit current dir version to disk.
+  void commit_dir( CDir *dir, __uint64_t version, Context *c );  // commit specified version to disk
+ protected:
+  void commit_dir_2( int result, CDir *dir, __uint64_t committed_version );
+  
+  // low level committers
+  void commit_dir_slice( CDir *dir,
+                         Context *c,
+                         int hashcode = -1);    // -1 = whole (unhashed) dir
+  void commit_dir_slice_2( int result,
+                           CDir *dir,
+                           Context *c,
+                           __uint64_t version,
+                           int hashcode );
+  
+  friend class C_MDS_CommitDirFinish;
+  friend class C_MDS_CommitSlice;
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc
new file mode 100644
index 0000000000000..bde26ae72dced
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Migrator.cc
@@ -0,0 +1,3192 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+#include "Locker.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+
+#include "include/filepath.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MClientFileCaps.h"
+
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MExportDirWarning.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirFinish.h"
+
+#include "messages/MHashDirDiscover.h"
+#include "messages/MHashDirDiscoverAck.h"
+#include "messages/MHashDirPrep.h"
+#include "messages/MHashDirPrepAck.h"
+#include "messages/MHashDir.h"
+#include "messages/MHashDirNotify.h"
+#include "messages/MHashDirAck.h"
+
+#include "messages/MUnhashDirPrep.h"
+#include "messages/MUnhashDirPrepAck.h"
+#include "messages/MUnhashDir.h"
+#include "messages/MUnhashDirAck.h"
+#include "messages/MUnhashDirNotify.h"
+#include "messages/MUnhashDirNotifyAck.h"
+
+
+// Entry point for every message addressed to the migrator: route it to the
+// handler that matches its type.  An unrecognized type trips assert(0).
+void Migrator::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+    // export protocol, importer side
+  case MSG_MDS_EXPORTDIRDISCOVER:
+    handle_export_dir_discover(static_cast<MExportDirDiscover*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRPREP:
+    handle_export_dir_prep(static_cast<MExportDirPrep*>(m));
+    break;
+  case MSG_MDS_EXPORTDIR:
+    handle_export_dir(static_cast<MExportDir*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRFINISH:
+    handle_export_dir_finish(static_cast<MExportDirFinish*>(m));
+    break;
+
+    // export protocol, exporter side
+  case MSG_MDS_EXPORTDIRDISCOVERACK:
+    handle_export_dir_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRPREPACK:
+    handle_export_dir_prep_ack(static_cast<MExportDirPrepAck*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRNOTIFYACK:
+    handle_export_dir_notify_ack(static_cast<MExportDirNotifyAck*>(m));
+    break;
+
+    // export protocol, 3rd party (inode authority)
+  case MSG_MDS_EXPORTDIRWARNING:
+    handle_export_dir_warning(static_cast<MExportDirWarning*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRNOTIFY:
+    handle_export_dir_notify(static_cast<MExportDirNotify*>(m));
+    break;
+
+    // hashing protocol
+  case MSG_MDS_HASHDIRDISCOVER:
+    handle_hash_dir_discover(static_cast<MHashDirDiscover*>(m));
+    break;
+  case MSG_MDS_HASHDIRDISCOVERACK:
+    handle_hash_dir_discover_ack(static_cast<MHashDirDiscoverAck*>(m));
+    break;
+  case MSG_MDS_HASHDIRPREP:
+    handle_hash_dir_prep(static_cast<MHashDirPrep*>(m));
+    break;
+  case MSG_MDS_HASHDIRPREPACK:
+    handle_hash_dir_prep_ack(static_cast<MHashDirPrepAck*>(m));
+    break;
+  case MSG_MDS_HASHDIR:
+    handle_hash_dir(static_cast<MHashDir*>(m));
+    break;
+  case MSG_MDS_HASHDIRACK:
+    handle_hash_dir_ack(static_cast<MHashDirAck*>(m));
+    break;
+  case MSG_MDS_HASHDIRNOTIFY:
+    handle_hash_dir_notify(static_cast<MHashDirNotify*>(m));
+    break;
+
+    // unhashing protocol
+  case MSG_MDS_UNHASHDIRPREP:
+    handle_unhash_dir_prep(static_cast<MUnhashDirPrep*>(m));
+    break;
+  case MSG_MDS_UNHASHDIRPREPACK:
+    handle_unhash_dir_prep_ack(static_cast<MUnhashDirPrepAck*>(m));
+    break;
+  case MSG_MDS_UNHASHDIR:
+    handle_unhash_dir(static_cast<MUnhashDir*>(m));
+    break;
+  case MSG_MDS_UNHASHDIRACK:
+    handle_unhash_dir_ack(static_cast<MUnhashDirAck*>(m));
+    break;
+  case MSG_MDS_UNHASHDIRNOTIFY:
+    handle_unhash_dir_notify(static_cast<MUnhashDirNotify*>(m));
+    break;
+  case MSG_MDS_UNHASHDIRNOTIFYACK:
+    handle_unhash_dir_notify_ack(static_cast<MUnhashDirNotifyAck*>(m));
+    break;
+
+  default:
+    assert(0);
+  }
+}
+
+// Completion context: re-runs export_empty_import(dir) when finished (result ignored).
+class C_MDC_EmptyImport : public Context {
+  Migrator *mig;   // migrator to call back
+  CDir *dir;       // dir to re-examine
+public:
+  C_MDC_EmptyImport(Migrator *migrator, CDir *target)
+    : mig(migrator), dir(target) {}
+  // the result code is not consulted
+  void finish(int) { mig->export_empty_import(dir); }
+};
+
+// Hand an empty imported dir back to whoever is authoritative for its inode (currently disabled, see below).
+void Migrator::export_empty_import(CDir *dir)
+{
+  dout(7) << "export_empty_import " << *dir << endl;
+  // NOTE: the unconditional return below disables this function; the rest is unreachable.
+  return;  // hack fixme
+
+  if (!dir->is_import()) {
+    dout(7) << "not import (anymore?)" << endl;
+    return;
+  }
+  if (dir->inode->is_root()) {
+    dout(7) << "root" << endl;
+    return;
+  }
+
+  if (dir->get_size() > 0) {
+    dout(7) << "not actually empty" << endl;
+    return;
+  }
+
+  // is it really empty?
+  if (!dir->is_complete()) {
+    dout(7) << "not complete, fetching." << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_EmptyImport(this,dir));   // its finish() calls this function again
+    return;
+  }
+
+  int dest = dir->inode->authority();
+
+  // comment this out to wreak havoc?
+  //if (mds->is_shutting_down()) dest = 0;  // this is more efficient.
+  
+  dout(7) << "really empty, exporting to " << dest << endl;
+  assert (dest != mds->get_nodeid());
+  
+  dout(-7) << "exporting to mds" << dest 
+           << " empty import " << *dir << endl;
+  export_dir( dir, dest );
+}
+
+
+// ==========================================================
+// IMPORT/EXPORT
+
+// Context handed to freeze_tree() by export_dir(); its finish() continues with export_dir_frozen().
+class C_MDC_ExportFreeze : public Context {
+  Migrator *mig;   // migrator driving the export
+  CDir *ex;        // dir being exported
+  int dest;        // destination mds
+
+public:
+  C_MDC_ExportFreeze(Migrator *migrator, CDir *exporting, int target)
+    : mig(migrator), ex(exporting), dest(target) {}
+
+  // the result code is not consulted
+  virtual void finish(int) { mig->export_dir_frozen(ex, dest); }
+};
+
+/** export_dir(dir, dest)
+ * public method to initiate an export of the subtree rooted at dir to mds dest.
+ * returns without exporting if the directory is freezing, frozen, or its path
+ * cannot be pinned; asserts if dir is root, dir is hashed, or dest is this mds.
+ * otherwise: sends MExportDirDiscover to dest, auth_pins dir, and starts freezing the tree.
+ */
+void Migrator::export_dir(CDir *dir,
+                         int dest)
+{
+  dout(7) << "export_dir " << *dir << " to " << dest << endl;
+  assert(dest != mds->get_nodeid());
+  assert(!dir->is_hashed());
+  // NOTE(review): the assert above makes the is_hashed() check further down unreachable unless asserts are compiled out.
+  if (dir->inode->is_root()) {
+    dout(7) << "i won't export root" << endl;
+    assert(0);
+    return;
+  }
+
+  if (dir->is_frozen() ||
+      dir->is_freezing()) {
+    dout(7) << " can't export, freezing|frozen.  wait for other exports to finish first." << endl;
+    return;
+  }
+  if (dir->is_hashed()) {
+    dout(7) << "can't export hashed dir right now.  implement me carefully later." << endl;
+    return;
+  }
+  
+
+  // pin path?  (released again in export_dir_finish)
+  vector<CDentry*> trace;
+  cache->make_trace(trace, dir->inode);
+  if (!cache->path_pin(trace, 0, 0)) {
+    dout(7) << "export_dir couldn't pin path, failing." << endl;
+    return;
+  }
+
+  // ok, let's go.
+
+  // send ExportDirDiscover (ask target)
+  export_gather[dir].insert(dest);
+  mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR);
+  dir->auth_pin();   // pin dir, to hang up our freeze  (unpinned in handle_export_dir_discover_ack)
+
+  // take away the popularity we're sending.   FIXME: do this later?
+  mds->balancer->subtract_export(dir);
+  
+  
+  // freeze the subtree; export_dir_frozen() runs from the context's finish()
+  dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest));
+}
+
+
+/*
+ * MExportDirDiscoverAck handler (exporter side).
+ * the importer now has the directory's _inode_ in memory, and pinned.
+ * when no acks remain outstanding, the auth_pin on the dir is released.
+ */
+void Migrator::handle_export_dir_discover_ack(MExportDirDiscoverAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  // cross the sender off the set of acks we are gathering
+  int sender = MSG_ADDR_NUM(m->get_source());
+  assert(export_gather[dir].count(sender));
+  export_gather[dir].erase(sender);
+  if (!export_gather[dir].empty()) {
+    dout(7) << "export_dir_discover_ack " << *dir << ", still waiting for " << export_gather[dir] << endl;
+  } else {
+    dout(7) << "export_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
+    dir->auth_unpin();   // unpin to allow freeze to complete
+  }
+  delete m;  // done
+}
+
+// Called via C_MDC_ExportFreeze: build and send the MExportDirPrep describing dir and the paths to its nested exports.
+void Migrator::export_dir_frozen(CDir *dir,
+                                int dest)
+{
+  // subtree is now frozen!
+  dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl;
+
+  show_imports();
+
+  MExportDirPrep *prep = new MExportDirPrep(dir->inode);
+
+  // include spanning tree for all nested exports.
+  // these need to be on the destination _before_ the final export so that
+  // dir_auth updates on any nested exports are properly absorbed.
+  
+  set<inodeno_t> inodes_added;   // dir inos already walked, shared across all nested exports
+  
+  // include base dir
+  prep->add_dir( new CDirDiscover(dir, dir->open_by_add(dest)) );
+  
+  // also include traces to all nested exports.
+  set<CDir*> my_nested;
+  cache->find_nested_exports(dir, my_nested);
+  for (set<CDir*>::iterator it = my_nested.begin();
+       it != my_nested.end();
+       it++) {
+    CDir *exp = *it;
+    
+    dout(7) << " including nested export " << *exp << " in prep" << endl;
+
+    prep->add_export( exp->ino() );
+
+    /* first assemble each trace, in trace order, and put in message */
+    list<CInode*> inode_trace;  
+
+    // trace to dir: walk upward from the nested export until we reach the base dir
+    CDir *cur = exp;
+    while (cur != dir) {
+      // don't repeat ourselves
+      if (inodes_added.count(cur->ino())) break;   // did already!
+      inodes_added.insert(cur->ino());
+      
+      CDir *parent_dir = cur->get_parent_dir();
+
+      // inode?
+      assert(cur->inode->is_auth());
+      inode_trace.push_front(cur->inode);
+      dout(7) << "  will add " << *cur->inode << endl;
+      
+      // include dir? note: this'll include everything except the nested exports themselves, 
+      // since someone else is obviously auth.
+      if (cur->is_auth()) {
+        prep->add_dir( new CDirDiscover(cur, cur->open_by_add(dest)) );  // yay!
+        dout(7) << "  added " << *cur << endl;
+      }
+      
+      cur = parent_dir;      
+    }
+    // push_front above left the trace ordered from nearest-to-dir down to the export; this 'it' shadows the outer one
+    for (list<CInode*>::iterator it = inode_trace.begin();
+         it != inode_trace.end();
+         it++) {
+      CInode *in = *it;
+      dout(7) << "  added " << *in << endl;
+      prep->add_inode( in->parent->dir->ino(),
+                       in->parent->name,
+                       in->replicate_to(dest) );
+    }
+
+  }
+  
+  // send it!
+  mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR);
+}
+
+// MExportDirPrepAck handler: the importer is prepped, so start the real export.
+void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl;
+
+  // whoever acked is the export destination
+  int dest = MSG_ADDR_NUM(m->get_source());
+  export_dir_go(dir, dest);
+  delete m;  // done
+}
+
+// Perform the export of dir to mds dest: fix import/export bookkeeping, hand off dir_auth, warn replicas, encode and send MExportDir.
+void Migrator::export_dir_go(CDir *dir,
+                            int dest)
+{  
+  dout(7) << "export_dir_go " << *dir << " to " << dest << endl;
+
+  show_imports();
+
+  // req is filled in below: handed-off nested exports first, then the encoded dirs (export_dir_walk)
+  // build export message
+  MExportDir *req = new MExportDir(dir->inode);  // include pop
+
+  // two cases follow: dir is itself an import root, or it sits somewhere beneath one
+  // update imports/exports
+  CDir *containing_import = cache->get_auth_container(dir);
+
+  if (containing_import == dir) {
+    dout(7) << " i'm rexporting a previous import" << endl;
+    assert(dir->is_import());
+    cache->imports.erase(dir);
+    dir->state_clear(CDIR_STATE_IMPORT);
+    dir->put(CDIR_PIN_IMPORT);                  // unpin, no longer an import
+    
+    // discard nested exports (that we're handing off)
+    for (set<CDir*>::iterator p = cache->nested_exports[dir].begin();
+         p != cache->nested_exports[dir].end(); ) {
+      CDir *nested = *p;
+      p++;   // advance before the erase below invalidates the current position
+
+      // add to export message
+      req->add_export(nested);
+      
+      // nested beneath our new export *in; remove!
+      dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl;
+      assert(cache->exports.count(nested) == 1);
+      cache->nested_exports[dir].erase(nested);
+    }
+    
+  } else {
+    dout(7) << " i'm a subdir nested under import " << *containing_import << endl;
+    cache->exports.insert(dir);
+    cache->nested_exports[containing_import].insert(dir);
+    
+    dir->state_set(CDIR_STATE_EXPORT);
+    dir->get(CDIR_PIN_EXPORT);                  // i must keep it pinned
+    
+    // discard nested exports (that we're handing off)
+    for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
+         p != cache->nested_exports[containing_import].end(); ) {
+      CDir *nested = *p;
+      p++;   // advance before a possible erase of the current element
+      if (nested == dir) continue;  // ignore myself
+      
+      // container of parent; otherwise we get ourselves.
+      CDir *containing_export = nested->get_parent_dir();
+      while (containing_export && !containing_export->is_export())
+        containing_export = containing_export->get_parent_dir();
+      if (!containing_export) continue;
+
+      if (containing_export == dir) {
+        // nested beneath our new export *in; remove!
+        dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl;
+        cache->nested_exports[containing_import].erase(nested);
+        // exports.erase(nested); _walk does this
+
+        // add to msg
+        req->add_export(nested);
+      } else {
+        dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl;
+        assert(cache->get_auth_container(containing_export) != containing_import);
+      }
+    }
+  }
+
+  // note new authority (locally)
+  if (dir->inode->authority() == dest)
+    dir->set_dir_auth( CDIR_AUTH_PARENT );
+  else
+    dir->set_dir_auth( dest );
+
+  // make list of nodes i expect an export_dir_notify_ack from
+  //  (everyone w/ this dir open, but me!)
+  assert(export_notify_ack_waiting[dir].empty());
+  for (set<int>::iterator it = dir->open_by.begin();
+       it != dir->open_by.end();
+       it++) {
+    if (*it == mds->get_nodeid()) continue;
+    export_notify_ack_waiting[dir].insert( *it );
+
+    // send warning to all but dest
+    if (*it != dest) {
+      dout(10) << " sending export_dir_warning to mds" << *it << endl;
+      mds->send_message_mds(new MExportDirWarning( dir->ino() ), *it, MDS_PORT_MIGRATOR);
+    }
+  }
+  assert(export_notify_ack_waiting[dir].count( dest ));
+
+  // fill export message with cache data
+  C_Contexts *fin = new C_Contexts;
+  int num_exported_inodes = export_dir_walk( req, 
+                                             fin, 
+                                             dir,   // base
+                                             dir,   // recur start point
+                                             dest );
+  
+  // send the export data!
+  mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR);
+
+  // queue up the finisher
+  dir->add_waiter( CDIR_WAIT_UNFREEZE, fin );
+  // (fin holds the waiters export_dir_walk took from the exported dirs and inodes)
+
+  // stats
+  mds->logger->inc("ex");
+  mds->logger->inc("iex", num_exported_inodes);
+
+  show_imports();
+}
+
+/** encode_export_inode
+ * update our local state for this inode to export.
+ * encode relevant state to be sent over the wire.
+ * used by: export_dir_walk, file_rename (if foreign)
+ * in: inode being exported (must be auth); enc_state: buffer that receives the encoded state;
+ * new_auth: mds the inode is going to (not referenced in the body).
+ */
+void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth)
+{
+  in->inode.version++;  // so local log entries are ignored, etc.  (FIXME ??)
+  
+  // tell (all) clients about migrating caps.. mark STALE
+  for (map<int, Capability>::iterator it = in->client_caps.begin();
+       it != in->client_caps.end();
+       it++) {
+    dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl;
+    MClientFileCaps *m = new MClientFileCaps(in->inode, 
+                                             it->second.get_last_seq(), 
+                                             it->second.pending(),
+                                             it->second.wanted(),
+                                             MClientFileCaps::FILECAP_STALE);
+    mds->messenger->send_message(m, MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first),
+				 0, MDS_PORT_CACHE);
+  }
+
+  // relax locks?
+  if (!in->is_cached_by_anyone())
+    in->replicate_relax_locks();
+
+  // add inode  (encoded before the local state below is torn down)
+  assert(in->cached_by.count(mds->get_nodeid()) == 0);
+  CInodeExport istate( in );
+  istate._encode( enc_state );
+
+  // we're exporting this inode; fix inode state
+  dout(7) << "encode_export_inode " << *in << endl;
+  
+  if (in->is_dirty()) in->mark_clean();
+  // clear/unpin cached_by (we're no longer the authority)
+  in->cached_by_clear();
+  
+  // twiddle lock states for auth -> replica transition
+  // hard
+  in->hardlock.clear_gather();
+  if (in->hardlock.get_state() == LOCK_GLOCKR)
+    in->hardlock.set_state(LOCK_LOCK);
+
+  // file : we lost all our caps, so move to stable state!
+  in->filelock.clear_gather();
+  if (in->filelock.get_state() == LOCK_GLOCKR ||
+      in->filelock.get_state() == LOCK_GLOCKM ||
+      in->filelock.get_state() == LOCK_GLOCKL ||
+      in->filelock.get_state() == LOCK_GLONERR ||
+      in->filelock.get_state() == LOCK_GLONERM ||
+      in->filelock.get_state() == LOCK_LONER)
+    in->filelock.set_state(LOCK_LOCK);
+  if (in->filelock.get_state() == LOCK_GMIXEDR)
+    in->filelock.set_state(LOCK_MIXED);
+  // this looks like a step backwards, but it's what we want!
+  if (in->filelock.get_state() == LOCK_GSYNCM)
+    in->filelock.set_state(LOCK_MIXED);
+  if (in->filelock.get_state() == LOCK_GSYNCL)
+    in->filelock.set_state(LOCK_LOCK);
+  if (in->filelock.get_state() == LOCK_GMIXEDL)
+    in->filelock.set_state(LOCK_LOCK);
+    //in->filelock.set_state(LOCK_MIXED);
+  
+  // mark auth: from here on we only hold a non-auth copy
+  assert(in->is_auth());
+  in->set_auth(false);
+  in->replica_nonce = CINODE_EXPORT_NONCE;
+  
+  // *** other state too?
+
+  // move to end of LRU so we drop out of cache quickly!
+  cache->lru.lru_bottouch(in);
+}
+
+// Encode dir (and, recursively, the auth subdirs below it) into req, turning local copies into proxies; returns the number of dentries walked.
+int Migrator::export_dir_walk(MExportDir *req,
+                             C_Contexts *fin,
+                             CDir *basedir,
+                             CDir *dir,
+                             int newauth)
+{
+  int num_exported = 0;
+
+  dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl;
+  
+  // dir 
+  bufferlist enc_dir;
+  
+  CDirExport dstate(dir);
+  dstate._encode( enc_dir );
+  
+  // release open_by 
+  dir->open_by_clear();
+
+  // mark
+  assert(dir->is_auth());
+  dir->state_clear(CDIR_STATE_AUTH);
+  dir->replica_nonce = CDIR_NONCE_EXPORT;
+
+  // proxy  (pin and ino are recorded under basedir; handle_export_dir_notify_ack undoes them)
+  dir->state_set(CDIR_STATE_PROXY);
+  dir->get(CDIR_PIN_PROXY);
+  export_proxy_dirinos[basedir].push_back(dir->ino());
+
+  list<CDir*> subdirs;
+
+  if (dir->is_hashed()) {
+    // fix state
+    dir->state_clear( CDIR_STATE_AUTH );
+    // (AUTH was already cleared above; note that no dentries are encoded for a hashed dir)
+  } else {
+    
+    if (dir->is_dirty())
+      dir->mark_clean();
+    
+    // discard most dir state
+    dir->state &= CDIR_MASK_STATE_EXPORT_KEPT;  // i only retain a few things.
+    
+    // suck up all waiters
+    list<Context*> waiting;
+    dir->take_waiting(CDIR_WAIT_ANY, waiting);    // all dir waiters
+    fin->take(waiting);
+    
+    // inodes
+    // per dentry we append: name, 'D'|'C' (dirty|clean), then 'N' (null), 'L'+ino (remote) or 'I'+inode state
+    CDir_map_t::iterator it;
+    for (it = dir->begin(); it != dir->end(); it++) {
+      CDentry *dn = it->second;
+      CInode *in = dn->inode;
+      
+      num_exported++;   // counts every dentry, including null and remote ones
+      
+      // -- dentry
+      dout(7) << "export_dir_walk exporting " << *dn << endl;
+      _encode(it->first, enc_dir);
+      
+      if (dn->is_dirty()) 
+        enc_dir.append("D", 1);  // dirty
+      else 
+        enc_dir.append("C", 1);  // clean
+      
+      // null dentry?
+      if (dn->is_null()) {
+        enc_dir.append("N", 1);  // null dentry
+        assert(dn->is_sync());
+        continue;
+      }
+      
+      if (dn->is_remote()) {
+        // remote link
+        enc_dir.append("L", 1);  // remote link
+        
+        inodeno_t ino = dn->get_remote_ino();
+        enc_dir.append((char*)&ino, sizeof(ino));
+        continue;
+      }
+      
+      // primary link
+      // -- inode
+      enc_dir.append("I", 1);    // inode dentry
+      
+      encode_export_inode(in, enc_dir, newauth);  // encode, and (update state for) export
+      
+      // directory?
+      if (in->is_dir() && in->dir) { 
+        if (in->dir->is_auth()) {
+          // nested subdir
+          assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT);
+          subdirs.push_back(in->dir);  // it's ours, recurse (later)
+          
+        } else {
+          // nested export
+          assert(in->dir->get_dir_auth() >= 0);
+          dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl;
+          assert(cache->exports.count(in->dir) == 1); 
+          cache->exports.erase(in->dir);                    // discard nested export   (nested_exports updated above)
+          
+          in->dir->state_clear(CDIR_STATE_EXPORT);
+          in->dir->put(CDIR_PIN_EXPORT);
+          
+          // simplify dir_auth?
+          if (in->dir->get_dir_auth() == newauth)
+            in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+        } 
+      }
+      
+      // add to proxy
+      export_proxy_inos[basedir].push_back(in->ino());
+      in->state_set(CINODE_STATE_PROXY);
+      in->get(CINODE_PIN_PROXY);
+      
+      // waiters
+      list<Context*> waiters;
+      in->take_waiting(CINODE_WAIT_ANY, waiters);
+      fin->take(waiters);
+    }
+  }
+
+  req->add_dir( enc_dir );
+
+  // subdirs  (each adds its own encoded dir to req after this one)
+  for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); it++)
+    num_exported += export_dir_walk(req, fin, basedir, *it, newauth);
+
+  return num_exported;
+}
+
+
+/*
+ * i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack)
+ * when the last one arrives: finish the export, then drop the PROXY pin/state on every
+ * inode and dir recorded for this export and discard the null dentries we still hold.
+ */
+void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);   // an unknown ino would otherwise be a null dereference on the next line
+  CDir *dir = diri->dir;
+  assert(dir);
+  assert(dir->is_frozen_tree_root());  // i'm exporting!
+
+  // remove from waiting list
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(export_notify_ack_waiting[dir].count(from));
+  export_notify_ack_waiting[dir].erase(from);
+
+  // done?
+  if (!export_notify_ack_waiting[dir].empty()) {
+    dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from 
+            << ", still waiting for " << export_notify_ack_waiting[dir] << endl;
+  } else {
+    dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from 
+            << ", last one!" << endl;
+    // ok, we're finished!
+    export_notify_ack_waiting.erase(dir);
+    // finish export  (unfreeze, trigger finish context, etc.)
+    export_dir_finish(dir);
+
+    // unpin proxies: inodes
+    for (list<inodeno_t>::iterator it = export_proxy_inos[dir].begin();
+         it != export_proxy_inos[dir].end();
+         it++) {
+      CInode *in = cache->get_inode(*it);
+      assert(in);   // export_dir_walk took a PROXY pin on it, so it is expected to be cached
+      in->put(CINODE_PIN_PROXY);
+      assert(in->state_test(CINODE_STATE_PROXY));
+      in->state_clear(CINODE_STATE_PROXY);
+    }
+    export_proxy_inos.erase(dir);
+
+    // dirs  (proxy_dir / dn_it are named so they no longer shadow dir / it)
+    for (list<inodeno_t>::iterator it = export_proxy_dirinos[dir].begin();
+         it != export_proxy_dirinos[dir].end();
+         it++) {
+      CInode *proxy_in = cache->get_inode(*it);
+      assert(proxy_in && proxy_in->dir);
+      CDir *proxy_dir = proxy_in->dir;
+      proxy_dir->put(CDIR_PIN_PROXY);
+      assert(proxy_dir->state_test(CDIR_STATE_PROXY));
+      proxy_dir->state_clear(CDIR_STATE_PROXY);
+      // hose neg dentries, too, since we're no longer auth
+      CDir_map_t::iterator dn_it = proxy_dir->begin();
+      while (dn_it != proxy_dir->end()) {
+        CDentry *dn = dn_it->second;
+        dn_it++;   // step first: remove_dentry() below may drop the entry we just read
+        if (dn->is_null()) {
+          assert(dn->is_sync());
+          proxy_dir->remove_dentry(dn);
+        } else {
+          //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl;
+          if (dn->is_dirty())
+            dn->mark_clean();
+        }
+      }
+    }
+    export_proxy_dirinos.erase(dir);
+  }
+
+  delete m;
+}
+
+
+/*
+ * final step on the exporter, run once all the notify_acks are in:
+ * tell the new auth we're finished, unfreeze the subtree, and release
+ * the path pin taken when the export was started.
+ */
+void Migrator::export_dir_finish(CDir *dir)
+{
+  // exported!
+  // FIXME log it
+
+  // tell the new auth we're done
+  MExportDirFinish *fin = new MExportDirFinish(dir->ino());
+  mds->send_message_mds(fin, dir->authority(), MDS_PORT_MIGRATOR);
+
+  // unfreeze
+  dout(7) << "export_dir_finish " << *dir << ", unfreezing" << endl;
+  dir->unfreeze_tree();
+
+  // unpin path
+  dout(7) << "export_dir_finish unpinning path" << endl;
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  cache->path_unpin(path, 0);
+
+  // stats
+  mds->logger->set("nex", cache->exports.size());
+
+  show_imports();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+//  IMPORTS
+
+// Traversal-completion context for an incoming MExportDirDiscover.
+class C_MDC_ExportDirDiscover : public Context {
+  Migrator *mig;
+  MExportDirDiscover *m;   // the request being serviced
+public:
+  vector<CDentry*> trace;  // handed to path_traverse by the handler
+  C_MDC_ExportDirDiscover(Migrator *mig_, MExportDirDiscover *m_)
+    : mig(mig_), m(m_) {}
+  void finish(int r) {
+    CInode *in = (r < 0) ? 0 : trace.back()->get_inode();   // inode under the last dentry
+    mig->handle_export_dir_discover_2(m, in, r);
+  }
+};
+
+// MExportDirDiscover handler (importer side): traverse to the path named in the
+// request, discovering as needed; the result goes to handle_export_dir_discover_2().
+void Migrator::handle_export_dir_discover(MExportDirDiscover *m)
+{
+  assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
+  dout(7) << "handle_export_dir_discover on " << m->get_path() << endl;
+
+  // must discover it!
+  filepath target(m->get_path());
+  C_MDC_ExportDirDiscover *done = new C_MDC_ExportDirDiscover(this, m);
+  Context *retry = new C_MDS_RetryMessage(mds, m);   // on delay/retry
+  cache->path_traverse(target, done->trace, true, m, retry,
+                       MDS_TRAVERSE_DISCOVER, done);   // done: on completion|error
+}
+
+// Second half of the MExportDirDiscover handler, run when the path traversal ends.
+//   m  - the original request (deleted here unless it is re-queued)
+//   in - inode the traversal ended on, or null on failure
+//   r  - traversal result; negative means failure
+// On success the inode is pinned (cache + auth) and an ack is sent to the exporter.
+void Migrator::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r)
+{
+  // yay!
+  if (in) {
+    dout(7) << "handle_export_dir_discover_2 has " << *in << endl;
+  }
+
+  // test in for null before dereferencing it; a non-negative r alone does not guarantee it
+  if (r < 0 || !in || !in->is_dir()) {
+    dout(7) << "handle_export_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
+    assert(0);    // this shouldn't happen if the auth pins his path properly!!!! 
+    mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false),
+			  m->get_source().num(), MDS_PORT_MIGRATOR);    
+    delete m;
+    return;
+  }
+
+  if (in->is_frozen()) {
+    dout(7) << "frozen, waiting." << endl;
+    in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
+                   new C_MDS_RetryMessage(mds,m));   // presumably re-dispatches m later
+    return;
+  }
+
+  // pin inode in the cache (for now), and pin auth too, until the import completes.
+  in->get(CINODE_PIN_IMPORTING);
+  in->auth_pin();
+
+  // reply
+  dout(7) << " sending export_dir_discover_ack on " << *in << endl;
+  mds->send_message_mds(new MExportDirDiscoverAck(in->ino()),
+			m->get_source().num(), MDS_PORT_MIGRATOR);
+  delete m;
+}
+
+// MExportDirPrep handler (importer side): open the dir being imported, absorb the inode/dir
+// traces to its nested exports, open any nested export dirs we lack, then send MExportDirPrepAck.
+void Migrator::handle_export_dir_prep(MExportDirPrep *m)
+{
+  assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
+
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+
+  list<Context*> finished;   // waiters collected along the way; run at the very end
+
+  // assimilate root dir.
+  CDir *dir = diri->dir;
+  if (dir) {
+    dout(7) << "handle_export_dir_prep on " << *dir << " (had dir)" << endl;
+
+    if (!m->did_assim())
+      m->get_dir(diri->ino())->update_dir(dir);
+  } else {
+    assert(!m->did_assim());
+
+    // open dir i'm importing.
+    diri->set_dir( new CDir(diri, mds, false) );
+    dir = diri->dir;
+    m->get_dir(diri->ino())->update_dir(dir);
+    
+    dout(7) << "handle_export_dir_prep on " << *dir << " (opening dir)" << endl;
+
+    diri->take_waiting(CINODE_WAIT_DIR, finished);
+  }
+  assert(dir->is_auth() == false);
+  
+  show_imports();
+
+  // assimilate contents?  (did_assim() is set on the first pass, so retries of m skip this)
+  if (!m->did_assim()) {
+    dout(7) << "doing assim on " << *dir << endl;
+    m->mark_assim();  // only do this the first time!
+
+    // move pin to dir
+    diri->put(CINODE_PIN_IMPORTING);
+    dir->get(CDIR_PIN_IMPORTING);  
+
+    // auth pin too
+    dir->auth_pin();
+    diri->auth_unpin();
+    
+    // assimilate traces to exports
+    for (list<CInodeDiscover*>::iterator it = m->get_inodes().begin();
+         it != m->get_inodes().end();
+         it++) {
+      // inode
+      CInode *in = cache->get_inode( (*it)->get_ino() );
+      if (in) {
+        (*it)->update_inode(in);
+        dout(7) << " updated " << *in << endl;
+      } else {
+        in = new CInode(mds->mdcache, false);
+        (*it)->update_inode(in);
+        
+        // link to the containing dir
+        CInode *condiri = cache->get_inode( m->get_containing_dirino(in->ino()) );
+        assert(condiri && condiri->dir);
+		cache->add_inode( in );
+        condiri->dir->add_dentry( m->get_dentry(in->ino()), in );
+        
+        dout(7) << "   added " << *in << endl;
+      }
+      
+      assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) );
+      
+      // dir
+      if (m->have_dir(in->ino())) {
+        if (in->dir) {
+          m->get_dir(in->ino())->update_dir(in->dir);
+          dout(7) << " updated " << *in->dir << endl;
+        } else {
+          in->set_dir( new CDir(in, mds, false) );
+          m->get_dir(in->ino())->update_dir(in->dir);
+          dout(7) << "   added " << *in->dir << endl;
+          in->take_waiting(CINODE_WAIT_DIR, finished);
+        }
+      }
+    }
+
+    // open export dirs?
+    for (list<inodeno_t>::iterator it = m->get_exports().begin();
+         it != m->get_exports().end();
+         it++) {
+      dout(7) << "  checking dir " << hex << *it << dec << endl;
+      CInode *in = cache->get_inode(*it);
+      assert(in);
+      
+      if (!in->dir) {
+        dout(7) << "  opening nested export on " << *in << endl;
+        cache->open_remote_dir(in,
+			       new C_MDS_RetryMessage(mds, m));
+
+        // pin it!
+        in->get(CINODE_PIN_OPENINGDIR);
+        in->state_set(CINODE_STATE_OPENINGDIR);
+      }
+    }
+  } else {
+    dout(7) << " not doing assim on " << *dir << endl;
+  }
+  // NOTE(review): every open_remote_dir() above queues its own retry of m, and m is deleted on the
+  // pass that finds nothing missing -- confirm no queued retry can still fire after that delete.
+  // verify we have all exports
+  int waiting_for = 0;
+  for (list<inodeno_t>::iterator it = m->get_exports().begin();
+       it != m->get_exports().end();
+       it++) {
+    inodeno_t ino = *it;
+    CInode *in = cache->get_inode(ino);
+    if (!in) dout(0) << "** missing ino " << hex << ino << dec << endl;
+    assert(in);
+    if (in->dir) {
+      if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+        dout(7) << "  pinning nested export " << *in->dir << endl;
+        in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+        in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
+
+        if (in->state_test(CINODE_STATE_OPENINGDIR)) {
+          in->put(CINODE_PIN_OPENINGDIR);
+          in->state_clear(CINODE_STATE_OPENINGDIR);
+        }
+      } else {
+        dout(7) << "  already pinned nested export " << *in << endl;
+      }
+    } else {
+      dout(7) << "  waiting for nested export dir on " << *in << endl;
+      waiting_for++;
+    }
+  }
+  if (waiting_for) {
+    dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl;
+  } else {
+    // ok!
+    dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl;
+    mds->send_message_mds(new MExportDirPrepAck(dir->ino()),
+			  m->get_source().num(), MDS_PORT_MIGRATOR);
+    
+    // done 
+    delete m;
+  }
+
+  // finish waiters
+  finish_contexts(finished, 0);
+}
+
+
+
+
+/* this guy waits for the pre-import discovers on hashed directory dir inodes to finish.
+ * if it's the last one on the dir, it reprocesses the import.
+ */
+/*
+class C_MDS_ImportPrediscover : public Context {
+public:
+  MDS *mds;
+  MExportDir *m;
+  inodeno_t dir_ino;
+  string dentry;
+  C_MDS_ImportPrediscover(MDS *mds, MExportDir *m, inodeno_t dir_ino, const string& dentry) {
+    this->mds = mds;
+    this->m = m;
+    this->dir_ino = dir_ino;
+    this->dentry = dentry;
+  }
+  virtual void finish(int r) {
+    assert(r == 0);  // should never fail!
+    
+    m->remove_prediscover(dir_ino, dentry);
+    
+    if (!m->any_prediscovers()) 
+      mds->mdcache->handle_export_dir(m);
+  }
+};
+*/
+
+
+
+// Handle an MExportDir message: import the directory named by the message
+// from the sending mds (the old authority).  This updates dir_auth and the
+// imports/exports/nested_exports bookkeeping, decodes the shipped directory
+// state into the cache, acks the old authority, notifies the other nodes
+// that have the dir open, and finally deletes the message.
+void Migrator::handle_export_dir(MExportDir *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  const int oldauth = MSG_ADDR_NUM(m->get_source());
+  dout(7) << "handle_export_dir, import " << *dir << " from " << oldauth << endl;
+  assert(!dir->is_auth());
+
+  show_imports();
+
+  // note new authority (locally): explicit only if the inode is not ours
+  if (!dir->inode->is_auth())
+    dir->set_dir_auth( mds->get_nodeid() );
+  else
+    dir->set_dir_auth( CDIR_AUTH_PARENT );
+  dout(10) << " set dir_auth to " << dir->get_dir_auth() << endl;
+
+  // update imports/exports
+  CDir *containing_import = 0;
+  if (cache->exports.count(dir) == 0) {
+    // new import: register it and keep it pinned
+    cache->imports.insert(dir);
+    dir->state_set(CDIR_STATE_IMPORT);
+    dir->get(CDIR_PIN_IMPORT);
+
+    // imported exports nest directly under this dir
+    containing_import = dir;
+
+    dout(7) << " new import at " << *dir << endl;
+  } else {
+    // reimporting something we had exported earlier
+    dout(7) << " i'm reimporting " << *dir << endl;
+    cache->exports.erase(dir);
+
+    dir->state_clear(CDIR_STATE_EXPORT);
+    dir->put(CDIR_PIN_EXPORT);                // no longer an export
+
+    containing_import = cache->get_auth_container(dir);
+    dout(7) << "  it is nested under import " << *containing_import << endl;
+    cache->nested_exports[containing_import].erase(dir);
+  }
+
+  // drop the temporary pin held while the import was in flight
+  dir->put(CDIR_PIN_IMPORTING);
+
+  // add any inherited exports
+  for (list<inodeno_t>::iterator bound = m->get_exports().begin();
+       bound != m->get_exports().end();
+       ++bound) {
+    CInode *exi = cache->get_inode(*bound);
+    assert(exi && exi->dir);
+    CDir *ex = exi->dir;
+
+    dout(15) << " nested export " << *ex << endl;
+
+    // release the pin taken on this nested export before the import
+    ex->put(CDIR_PIN_IMPORTINGEXPORT);
+    ex->state_clear(CDIR_STATE_IMPORTINGEXPORT);
+
+    if (!ex->is_import()) {
+      // a dir that lives elsewhere: record it as an export under us
+      dout(7) << " importing export " << *ex << endl;
+
+      ex->state_set(CDIR_STATE_EXPORT);
+      ex->get(CDIR_PIN_EXPORT);           // all exports are pinned
+      cache->exports.insert(ex);
+      cache->nested_exports[containing_import].insert(ex);
+      mds->logger->inc("imex");
+    } else {
+      // one of our own imports is swallowed by the new subtree
+      dout(7) << " importing my import " << *ex << endl;
+      cache->imports.erase(ex);
+      ex->state_clear(CDIR_STATE_IMPORT);
+
+      mds->logger->inc("imex");
+
+      // re-home its nested exports under containing_import
+      for (set<CDir*>::iterator nested = cache->nested_exports[ex].begin();
+           nested != cache->nested_exports[ex].end();
+           ++nested) {
+        dout(7) << "     moving nested export " << **nested << " under " << *containing_import << endl;
+        cache->nested_exports[containing_import].insert(*nested);
+      }
+      cache->nested_exports.erase(ex);          // de-list under old import
+
+      ex->set_dir_auth( CDIR_AUTH_PARENT );
+      ex->put(CDIR_PIN_IMPORT);       // imports are pinned, no longer import
+    }
+  }
+
+  // decode the shipped directory state into my cache
+  list<inodeno_t> imported_subdirs;
+  bufferlist dir_state;
+  dir_state.claim( m->get_state() );
+  int off = 0;
+  int num_imported_inodes = 0;
+
+  for (int i = 0; i < m->get_ndirs(); i++) {
+    int got = import_dir_block(dir_state,
+                               off,
+                               oldauth,
+                               dir,                 // import root
+                               imported_subdirs);
+    num_imported_inodes += got;
+  }
+  dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl;
+  dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl;
+
+  // adjust popularity
+  mds->balancer->add_import(dir);
+
+  // ack to the old authority
+  dout(7) << "sending notifyack for " << *dir << " to old auth " << oldauth << endl;
+  mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+
+  // tell everybody else who has the dir open (never me, and not the old auth)
+  dout(7) << "sending notify to others" << endl;
+  for (set<int>::iterator who = dir->open_by.begin();
+       who != dir->open_by.end();
+       ++who) {
+    assert( *who != mds->get_nodeid() );
+    if ( *who == oldauth ) continue;
+
+    MExportDirNotify *notify = new MExportDirNotify(dir->ino(), oldauth, mds->get_nodeid());
+    notify->copy_exports(m->get_exports());
+
+    if (g_conf.mds_verify_export_dirauth)
+      notify->copy_subdirs(imported_subdirs);   // copy subdir list (DEBUG)
+
+    mds->send_message_mds(notify, *who, MDS_PORT_MIGRATOR);
+  }
+
+  // done with the message
+  delete m;
+
+  show_imports();
+
+  // an empty import whose inode is not ours gets handed straight back
+  if (dir->get_size() == 0 &&
+      !dir->inode->is_auth()) {
+    export_empty_import(dir);
+  }
+
+  // some stats
+  mds->logger->inc("im");
+  mds->logger->inc("iim", num_imported_inodes);
+  mds->logger->set("nim", cache->imports.size());
+
+
+  // FIXME LOG IT
+
+  /*
+    stupid hashing crap, FIXME
+
+  // wait for replicas in hashed dirs?
+  if (import_hashed_replicate_waiting.count(m->get_ino())) {
+    // it'll happen later!, when i get my inodegetreplicaack's back
+  } else {
+    // finish now
+    //not anymoreimport_dir_finish(dir);
+  }
+  */
+
+}
+
+
+
+// Handle an MExportDirFinish message: the import of the named directory is
+// complete.  Refreshes the export/import counters in the logger, drops the
+// auth pin on the dir, wakes anything waiting on CDIR_WAIT_IMPORTED, and
+// deletes the message.
+void Migrator::handle_export_dir_finish(MExportDirFinish *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);   // check before dereferencing, as the other handlers do
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_export_dir_finish on " << *dir << endl;
+  assert(dir->is_auth());
+
+  dout(5) << "done with import of " << *dir << endl;
+  show_imports();
+  mds->logger->set("nex", cache->exports.size());
+  mds->logger->set("nim", cache->imports.size());
+
+  // un auth pin (other exports can now proceed)
+  dir->auth_unpin();  
+  
+  // ok now finish contexts
+  dout(5) << "finishing any waiters on imported data" << endl;
+  dir->finish_waiting(CDIR_WAIT_IMPORTED);
+
+  delete m;
+}
+
+
+// Decode one exported inode from bl at offset off (off is advanced past it)
+// and attach it to dentry dn.  The inode is created if it is not already in
+// the cache.  oldauth is recorded in cached_by, gather sets on the hard and
+// file locks are trimmed, a FILECAP_REAP message is sent to every client
+// named in the merged caps set, and a dirty inode is journaled.
+void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth)
+{  
+  CInodeExport istate;
+  off = istate._decode(bl, off);
+  dout(15) << "got a cinodeexport " << endl;
+
+  // find the inode, or make a fresh one
+  CInode *in = cache->get_inode(istate.get_ino());
+  const bool added = (in == 0);
+  if (added)
+    in = new CInode(mds->mdcache);
+  else
+    in->set_auth(true);
+
+  // link first ...
+  if (dn->inode != in) {
+    assert(!dn->inode);
+    dn->dir->link_inode(dn, in);
+  }
+
+  // ... then apply the exported state
+  set<int> merged_client_caps;
+  istate.update_inode(in, merged_client_caps);
+
+  // a new inode is only added to the cache after its state is applied
+  if (!added) {
+    dout(10) << "  had " << *in << endl;
+  } else {
+    cache->add_inode(in);
+    dout(10) << "added " << *in << endl;
+  }
+
+  // cached_by: the old auth holds a copy now; i no longer count as one
+  assert(!in->is_cached_by(oldauth));
+  in->cached_by_add( oldauth, CINODE_EXPORT_NONCE );
+  if (in->is_cached_by(mds->get_nodeid()))
+    in->cached_by_remove(mds->get_nodeid());
+
+  // hard lock: drop me and the old auth from a pending gather
+  if (in->hardlock.get_state() == LOCK_GLOCKR) {
+    in->hardlock.gather_set.erase(mds->get_nodeid());
+    in->hardlock.gather_set.erase(oldauth);
+    if (in->hardlock.gather_set.empty())
+      mds->locker->inode_hard_eval(in);
+  }
+
+  // caps: tell each affected client to reap its caps from the old auth
+  for (set<int>::iterator client = merged_client_caps.begin();
+       client != merged_client_caps.end();
+       ++client) {
+    MClientFileCaps *caps = new MClientFileCaps(in->inode,
+                                                in->client_caps[*client].get_last_seq(),
+                                                in->client_caps[*client].pending(),
+                                                in->client_caps[*client].wanted(),
+                                                MClientFileCaps::FILECAP_REAP);
+    caps->set_mds( oldauth ); // reap from whom?
+    mds->messenger->send_message(caps,
+                                 MSG_ADDR_CLIENT(*client), mds->clientmap.get_inst(*client),
+                                 0, MDS_PORT_CACHE);
+  }
+
+  // file lock: same gather-set trimming as for the hard lock
+  if (!in->filelock.is_stable()) {
+    in->filelock.gather_set.erase(mds->get_nodeid());
+    in->filelock.gather_set.erase(oldauth);
+    if (in->filelock.gather_set.empty())  // necessary but not sufficient...
+      mds->locker->inode_file_eval(in);
+  }
+
+  // journal the inode if it arrived dirty
+  if (in->is_dirty()) {
+    dout(10) << "logging dirty import " << *in << endl;
+    mds->mdlog->submit_entry(new EInodeUpdate(in));
+  }
+}
+
+
+// Decode one exported directory (header plus dentries) from bl at offset off,
+// advancing off, and merge it into the local cache.
+//   oldauth          - mds the data came from; added to the dir's open_by set
+//   import_root      - root of the import; taken waiters are re-queued on it
+//   imported_subdirs - receives the ino of every dir other than import_root
+// Returns the number of dentries decoded (0 for a hashed dir, whose contents
+// are not decoded here).
+int Migrator::import_dir_block(bufferlist& bl,
+                              int& off,
+                              int oldauth,
+                              CDir *import_root,
+                              list<inodeno_t>& imported_subdirs)
+{
+  // decode the dir header and find (or open) the dir
+  CDirExport dstate;
+  off = dstate._decode(bl, off);
+
+  CInode *diri = cache->get_inode(dstate.get_ino());
+  assert(diri);
+  CDir *dir = diri->get_or_open_dir(mds);
+  assert(dir);
+ 
+  dout(7) << " import_dir_block " << *dir << " have " << dir->nitems << " items, importing " << dstate.get_nden() << " dentries" << endl;
+
+  // remember every dir below the root
+  if (import_root != dir)
+    imported_subdirs.push_back(dir->ino());
+
+  // assimilate state
+  dstate.update_dir( dir );
+  if (diri->is_auth()) 
+    dir->set_dir_auth( CDIR_AUTH_PARENT );   // update_dir may hose dir_auth
+
+  // mark  (may already be marked from get_or_open_dir() above)
+  if (!dir->is_auth())
+    dir->state_set(CDIR_STATE_AUTH);
+
+  // open_by: the old auth has it open now; i should not be listed
+  assert(!dir->is_open_by(oldauth));
+  dir->open_by_add(oldauth);
+  if (dir->is_open_by(mds->get_nodeid()))
+    dir->open_by_remove(mds->get_nodeid());
+
+  // do nothing more; dir is hashed
+  if (dir->is_hashed())
+    return 0;
+
+  // take all waiters on this dir and park them on the import root.
+  // NOTE: a pass of imported data is guaranteed to get all of my waiters because
+  // a replica's presence in my cache implies/forces its presence in authority's.
+  list<Context*> waiters;
+  dir->take_waiting(CDIR_WAIT_ANY, waiters);
+  for (list<Context*>::iterator w = waiters.begin();
+       w != waiters.end();
+       ++w) 
+    import_root->add_waiter(CDIR_WAIT_IMPORTED, *w);
+
+  dout(15) << "doing contents" << endl;
+
+  // contents: name, dirty flag byte, type code byte, then type-specific payload
+  int num_imported = 0;
+  for (long remaining = dstate.get_nden(); remaining > 0; remaining--) {
+    num_imported++;
+
+    string dname;
+    _decode(dname, bl, off);
+    dout(15) << "dname is " << dname << endl;
+
+    char dirty;
+    bl.copy(off, 1, &dirty);
+    off += 1;
+
+    char icode;
+    bl.copy(off, 1, &icode);
+    off += 1;
+
+    CDentry *dn = dir->lookup(dname);
+    if (!dn)
+      dn = dir->add_dentry(dname);  // null
+
+    switch (icode) {
+    case 'N':
+      // null dentry: nothing to link
+      assert(dn->is_null());
+      break;
+
+    case 'L':
+      {
+        // remote link: just the target ino follows
+        inodeno_t ino;
+        bl.copy(off, sizeof(ino), (char*)&ino);
+        off += sizeof(ino);
+        dir->link_inode(dn, ino);
+      }
+      break;
+
+    case 'I':
+      // primary link: a full exported inode follows
+      decode_import_inode(dn, bl, off, oldauth);
+      break;
+
+    default:
+      break;
+    }
+
+    // mark dentry dirty?  (only _after_ we link the inode!)
+    if (dirty == 'D') dn->mark_dirty();
+  }
+
+  if (dir->is_dirty()) 
+    mds->mdlog->submit_entry(new EDirUpdate(dir));
+
+  return num_imported;
+}
+
+
+
+
+
+// authority bystander
+
+// Handle an MExportDirWarning message: remember that a warning was seen for
+// this ino.  If the matching notify arrived earlier and was parked in
+// stray_export_notifies, process it now.  The warning message is deleted.
+void Migrator::handle_export_dir_warning(MExportDirWarning *m)
+{
+  // add to warning list
+  stray_export_warnings.insert( m->get_ino() );
+
+  // did i already see the notify?
+  map<inodeno_t, MExportDirNotify*>::iterator parked = stray_export_notifies.find(m->get_ino());
+  if (parked == stray_export_notifies.end()) {
+    dout(7) << "handle_export_dir_warning on " << m->get_ino() << ".  waiting for notify." << endl;
+  } else {
+    // i did, we're good.
+    dout(7) << "handle_export_dir_warning on " << m->get_ino() << ".  already got notify." << endl;
+
+    // process the parked notify, then forget it
+    handle_export_dir_notify(parked->second);
+    stray_export_notifies.erase(parked);
+  }
+
+  // done
+  delete m;
+}
+
+
+// Handle an MExportDirNotify message as a bystander to someone else's
+// export.  If the matching warning has not been seen yet, the notify is
+// parked in stray_export_notifies and nothing else happens.  Otherwise, if
+// the dir is open here, dir_auth is updated on the bounds and on the dir
+// itself; in all cases an ack goes to the old auth, the warning is
+// forgotten, and the message is deleted.
+void Migrator::handle_export_dir_notify(MExportDirNotify *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  CDir *dir = in ? in->dir : 0;
+
+  // did i see the warning yet?
+  if (stray_export_warnings.count(m->get_ino()) == 0) {
+    // no; park the notify until it shows up.
+    dout(7) << "export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl;
+    stray_export_notifies.insert(map<inodeno_t, MExportDirNotify*>::value_type( m->get_ino(), m ));
+    return;
+  }
+
+  // i did, we're all good.
+  dout(7) << "export_dir_notify on " << m->get_ino() << ", already saw warning." << endl;
+
+  // update dir_auth!
+  if (dir) {
+    dout(7) << "export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << " (old_auth " << m->get_old_auth() << ")" << endl;
+
+    // update bounds first, skipping any not open here
+    for (list<inodeno_t>::iterator b = m->get_exports().begin();
+         b != m->get_exports().end();
+         ++b) {
+      CInode *n = cache->get_inode(*b);
+      if (!n || !n->dir) continue;
+      CDir *ndir = n->dir;
+
+      int boundauth = ndir->authority();
+      dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl;
+      if (ndir->get_dir_auth() != CDIR_AUTH_PARENT) {
+        // explicit auth that now matches the parent's can become implicit
+        if (boundauth == m->get_new_auth())
+          ndir->set_dir_auth( CDIR_AUTH_PARENT );
+      } else {
+        // implicit auth: pin it down explicitly if it differs from the new auth
+        if (boundauth != m->get_new_auth())
+          ndir->set_dir_auth( boundauth );
+        else
+          assert(dir->authority() == m->get_new_auth());  // apparently we already knew!
+      }
+    }
+
+    // update dir_auth
+    if (in->authority() != m->get_new_auth()) {
+      dir->set_dir_auth( m->get_new_auth() );
+    } else {
+      dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl;
+      dir->set_dir_auth( CDIR_AUTH_PARENT );
+      assert(!in->is_auth());
+      assert(!dir->is_auth());
+    }
+    assert(dir->authority() != mds->get_nodeid());
+    assert(!dir->is_auth());
+
+    // DEBUG: verify subdirs
+    if (g_conf.mds_verify_export_dirauth) {
+
+      dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl;
+      for (list<inodeno_t>::iterator s = m->subdirs_begin();
+           s != m->subdirs_end();
+           ++s) {
+        CInode *diri = cache->get_inode(*s);
+        if (!diri || !diri->dir) continue;  // don't have it, don't care
+        CDir *sub = diri->dir;
+        dout(10) << "handle_export_dir_notify checking subdir " << *sub << " is auth " << sub->get_dir_auth() << endl;
+        assert(sub != dir);      // base shouldn't be in subdir list
+        if (sub->get_dir_auth() != CDIR_AUTH_PARENT) {
+          dout(7) << "*** weird value for dir_auth " << sub->get_dir_auth() << " on " << *sub << ", should have been -1 probably??? ******************" << endl;
+          assert(0);  // bad news!
+          //dir->set_dir_auth( CDIR_AUTH_PARENT );
+        }
+        assert(sub->authority() == m->get_new_auth());
+      }
+    }
+  }
+
+  // send notify ack to old auth
+  dout(7) << "handle_export_dir_notify sending ack to old_auth " << m->get_old_auth() << endl;
+  mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()),
+                        m->get_old_auth(), MDS_PORT_MIGRATOR);
+
+  // done
+  stray_export_warnings.erase( m->get_ino() );
+  delete m;
+}
+
+
+
+
+
+// =======================================================================
+// HASHING
+
+
+// Decode nden dentries of a hashed directory from bl (starting at offset 0)
+// into dir.  Each record is a name, a one-byte type code ('N' null,
+// 'L' remote link, 'I' inode) and a type-specific payload.  For an imported
+// inode whose dir is open here, the subdir is turned into either a plain
+// child (if it was my import) or an export.  Every decoded dentry is marked
+// dirty.
+void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth)
+{
+  int off = 0;
+
+  while (nden > 0) {
+    nden--;
+
+    // dentry
+    string dname;
+    _decode(dname, bl, off);
+    dout(15) << "dname is " << dname << endl;
+
+    char icode;
+    bl.copy(off, 1, &icode);
+    off += 1;
+
+    CDentry *dn = dir->lookup(dname);
+    if (!dn)
+      dn = dir->add_dentry(dname);  // null
+
+    switch (icode) {
+    case 'N':
+      // null dentry
+      assert(dn->is_null());
+      break;
+
+    case 'L':
+      {
+        // remote link
+        inodeno_t ino;
+        bl.copy(off, sizeof(ino), (char*)&ino);
+        off += sizeof(ino);
+        dir->link_inode(dn, ino);
+      }
+      break;
+
+    case 'I':
+      // inode
+      decode_import_inode(dn, bl, off, oldauth);
+
+      // fix up subdir export?
+      if (dn->inode->dir) {
+        CDir *sub = dn->inode->dir;
+
+        assert(sub->state_test(CDIR_STATE_IMPORTINGEXPORT));
+        sub->put(CDIR_PIN_IMPORTINGEXPORT);
+        sub->state_clear(CDIR_STATE_IMPORTINGEXPORT);
+
+        if (!sub->is_auth()) {
+          // not mine.  make it an export.
+          dout(7) << "making subdir into export " << *sub << endl;
+          sub->get(CDIR_PIN_EXPORT);
+          sub->state_set(CDIR_STATE_EXPORT);
+          cache->exports.insert(sub);
+          cache->nested_exports[dir].insert(sub);
+
+          if (sub->get_dir_auth() == CDIR_AUTH_PARENT)
+            sub->set_dir_auth( oldauth );          // no longer matches inode
+          assert(sub->get_dir_auth() >= 0);
+        } else {
+          // mine.  must have been an import.
+          assert(sub->is_import());
+          dout(7) << "unimporting subdir now that inode is mine " << *sub << endl;
+          sub->set_dir_auth( CDIR_AUTH_PARENT );
+          cache->imports.erase(sub);
+          sub->put(CDIR_PIN_IMPORT);
+          sub->state_clear(CDIR_STATE_IMPORT);
+
+          // move nested under hashdir
+          for (set<CDir*>::iterator nested = cache->nested_exports[sub].begin();
+               nested != cache->nested_exports[sub].end();
+               ++nested) 
+            cache->nested_exports[dir].insert(*nested);
+          cache->nested_exports.erase(sub);
+
+          // now it matches the inode
+          sub->set_dir_auth( CDIR_AUTH_PARENT );
+        }
+      }
+      break;
+
+    default:
+      break;
+    }
+
+    // mark the dentry dirty (only _after_ we link the inode!)
+    dn->mark_dirty();
+  }
+}
+
+/*
+ 
+ notes on interaction of hashing and export/import:
+
+  - dir->is_auth() is completely independent of hashing.  for a hashed dir,
+     - all nodes are partially authoritative
+     - all nodes dir->is_hashed() == true
+     - all nodes dir->inode->dir_is_hashed() == true
+     - one node dir->is_auth() == true, the rest == false
+  - dir_auth for all subdirs in a hashed dir will (likely?) be explicit.
+
+  - remember simple rule: dir auth follows inode, unless dir_auth is explicit.
+
+  - export_dir_walk and import_dir_block take care with dir_auth:   (for import/export)
+     - on export, -1 is changed to mds->get_nodeid()
+     - on import, nothing special, actually.
+
+  - hashed dir files aren't included in export; subdirs are converted to imports 
+    or exports as necessary.
+  - hashed dir subdirs are discovered on export. this is important
+    because dirs are needed to tie together auth hierarchy, for auth to know about
+    imports/exports, etc.
+
+  - dir state is maintained on auth.
+    - COMPLETE and HASHED are transferred to importers.
+    - DIRTY is set everywhere.
+
+  - hashed dir is like an import: hashed dir used for nested_exports map.
+    - nested_exports is updated appropriately on auth and replicas.
+    - a subtree terminates as a hashed dir, since the hashing explicitly
+      redelegates all inodes.  thus export_dir_walk includes hashed dirs, but 
+      not their inodes.
+*/
+
+// HASH on auth
+
+// Completion context handed to CDir::freeze_dir() by hash_dir(): when it
+// fires, the directory is passed on to Migrator::hash_dir_frozen().
+class C_MDC_HashFreeze : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_HashFreeze(Migrator *m, CDir *d) {
+    mig = m;
+    dir = d;
+  }
+  virtual void finish(int r) {
+    mig->hash_dir_frozen(dir);
+  }
+};
+
+// Completion context handed to fetch_dir() by hash_dir(): when it fires,
+// the directory is passed on to Migrator::hash_dir_complete().
+class C_MDC_HashComplete : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_HashComplete(Migrator *mig, CDir *dir) : mig(mig), dir(dir) {}
+  virtual void finish(int r) {
+    mig->hash_dir_complete(dir);
+  }
+};
+
+
+/** hash_dir(dir)
+ * start hashing a directory.
+ */
+void Migrator::hash_dir(CDir *dir)
+{
+  dout(-7) << "hash_dir " << *dir << endl;
+
+  // only the auth may hash, and only a dir that is not already hashed
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+
+  // bail out quietly if a freeze is already under way
+  if (dir->is_frozen() || dir->is_freezing()) {
+    dout(7) << " can't hash, freezing|frozen." << endl;
+    return;
+  }
+
+  // pin the path down to the dir's inode; give up if that fails
+  vector<CDentry*> trace;
+  cache->make_trace(trace, dir->inode);
+  if (!cache->path_pin(trace, 0, 0)) {
+    dout(7) << "hash_dir couldn't pin path, failing." << endl;
+    return;
+  }
+
+  // ok, go
+  dir->state_set(CDIR_STATE_HASHING);
+  dir->get(CDIR_PIN_HASHING);
+  assert(dir->hashed_subset.empty());
+
+  // ask every other mds to discover the dir; remember who we wait for
+  assert(hash_gather.count(dir) == 0);
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer != mds->get_nodeid()) {
+      hash_gather[dir].insert(peer);
+      mds->send_message_mds(new MHashDirDiscover(dir->inode), peer, MDS_PORT_MIGRATOR);
+    }
+  }
+  // pin until discovers are all acked (released in handle_hash_dir_discover_ack).
+  // NOTE(review): if there are no other mds nodes nothing will ever ack, so
+  // this pin is presumably never dropped -- confirm hashing is multi-mds only.
+  dir->auth_pin();
+
+  // start freeze
+  dir->freeze_dir(new C_MDC_HashFreeze(this, dir));
+
+  // make complete: fetch the dir if it is not fully in memory
+  if (dir->is_complete()) {
+    hash_dir_complete(dir);
+  } else {
+    dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_HashComplete(this, dir));
+  }
+}
+
+
+/*
+ * wait for everybody to discover and open the hashing dir
+ *  then auth_unpin, to let the freeze happen
+ */
+void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // tick the sender off the list of peers we are waiting for
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from));
+  hash_gather[dir].erase(from);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl;
+  } else {
+    // that was the last one
+    hash_gather.erase(dir);
+    dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
+    dir->auth_unpin();   // unpin to allow freeze to complete
+  }
+
+  delete m;  // done
+}
+
+
+
+/*
+ * once the dir is completely in memory,
+ *  mark all migrating inodes dirty (to pin in cache)
+ */
+void Migrator::hash_dir_complete(CDir *dir)
+{
+  dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  
+  // mark dirty to pin in cache
+  for (CDir_map_t::iterator it = dir->begin(); 
+       it != dir->end(); 
+       it++) {
+    CInode *in = it->second->inode;
+    // hash_dir_go() shows the dir can contain null dentries; a dentry with
+    // no linked inode has nothing to dirty, so skip it instead of crashing.
+    if (!in) continue;
+    in->mark_dirty();
+  }
+  
+  // if the freeze already finished, go straight to the hashing step.
+  // NOTE(review): this path calls hash_dir_go() directly and so skips the
+  // prep messages that hash_dir_frozen() sends -- confirm that is intended.
+  if (dir->is_frozen_dir())
+    hash_dir_go(dir);
+}
+
+
+/*
+ * once the dir is frozen,
+ *  make sure it's complete
+ *  send the prep messages!
+ */
+void Migrator::hash_dir_frozen(CDir *dir)
+{
+  dout(7) << "hash_dir_frozen " << *dir << endl;
+  
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  
+  // nothing to do until the dir is fully in memory
+  if (!dir->is_complete()) {
+    dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl;
+    return;  
+  }
+
+  // send prep messages w/ export directories to open.
+  // one slot per mds; a slot stays null until that mds has something to prep.
+  vector<MHashDirPrep*> msgs(mds->get_mds_map()->get_num_mds());
+
+  // check for subdirs
+  for (CDir_map_t::iterator it = dir->begin(); 
+       it != dir->end(); 
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+    
+    // hash_dir_go() shows the dir can contain null dentries; a dentry with
+    // no linked inode cannot be an open subdir, so skip it instead of
+    // dereferencing a null pointer.
+    if (!in) continue;
+    if (!in->is_dir()) continue;
+    if (!in->dir) continue;
+    
+    // dentries that hash to me stay put; no prep needed
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode == mds->get_nodeid()) continue;
+
+    // msg?
+    if (msgs[dentryhashcode] == 0) {
+      msgs[dentryhashcode] = new MHashDirPrep(dir->ino());
+    }
+    msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode));
+  }
+
+  // send them!  remember each recipient so we can gather the acks.
+  assert(hash_gather[dir].empty());
+  for (unsigned i=0; i<msgs.size(); i++) {
+    if (msgs[i]) {
+      mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
+      hash_gather[dir].insert(i);
+    }
+  }
+  
+  if (hash_gather[dir].empty()) {
+    // no subdirs!  continue!
+    hash_gather.erase(dir);
+    hash_dir_go(dir);
+  } else {
+    // wait!  handle_hash_dir_prep_ack() continues once the last ack is in.
+  }
+}
+
+/* 
+ * wait for peers to open all subdirs
+ */
+void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // tick the sender off the list of peers we sent a prep to
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;    
+  } else {
+    // everybody is ready; do the actual hashing
+    hash_gather.erase(dir);
+    dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl;
+    hash_dir_go(dir);
+  }
+
+  delete m;
+}
+
+
+/*
+ * once the dir is frozen and complete
+ *  (and any prep messages that were sent have been acked),
+ *  do the hashing!
+ */
+void Migrator::hash_dir_go(CDir *dir)
+{
+  // Does the actual hashing of a frozen dir on its auth: every dentry that
+  // hashes to another mds is encoded into that mds's MHashDir message,
+  // exported inodes become proxies here, subdir import/export bookkeeping
+  // is fixed up, the dir is marked hashed and journaled, and the messages
+  // are sent.  Acks and notifies are gathered by the handlers afterwards.
+  dout(7) << "hash_dir_go " << *dir << endl;
+  
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  // get messages to other nodes ready (my own slot stays null)
+  vector<MHashDir*> msgs(mds->get_mds_map()->get_num_mds());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i == mds->get_nodeid()) continue;
+    msgs[i] = new MHashDir(dir->ino());
+  }
+
+  // pick a hash seed.
+  dir->inode->inode.hash_seed = 1;//dir->ino();
+
+  // suck up all waiters
+  C_Contexts *fin = new C_Contexts;
+  list<Context*> waiting;
+  dir->take_waiting(CDIR_WAIT_ANY, waiting);    // all dir waiters
+  fin->take(waiting);
+  
+  // get containing import.  might be me.
+  CDir *containing_import = cache->get_auth_container(dir);
+  assert(containing_import != dir || dir->is_import());  
+
+  // divy up contents
+  for (CDir_map_t::iterator it = dir->begin(); 
+       it != dir->end(); 
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode == mds->get_nodeid()) {
+      continue;      // still mine!
+    }
+
+    bufferlist *bl = msgs[dentryhashcode]->get_state_ptr();
+    assert(bl);
+    
+    // -- dentry
+    dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
+    _encode(it->first, *bl);
+
+    // count every dentry we encode, whatever its type: the decoder
+    // (import_hashed_content) consumes one record per counted dentry and
+    // handles 'N', 'L' and 'I' records alike.
+    msgs[dentryhashcode]->inc_nden();
+    
+    // null dentry?
+    if (dn->is_null()) {
+      bl->append("N", 1);  // null dentry
+      assert(dn->is_sync());
+      continue;
+    }
+
+    if (dn->is_remote()) {
+      // remote link
+      bl->append("L", 1);  // remote link
+
+      inodeno_t ino = dn->get_remote_ino();
+      bl->append((char*)&ino, sizeof(ino));
+      continue;
+    }
+
+    // primary link
+    // -- inode
+    bl->append("I", 1);    // inode dentry
+    
+    encode_export_inode(in, *bl, dentryhashcode);  // encode, and (update state for) export
+    
+    if (dn->is_dirty()) 
+      dn->mark_clean();
+
+    // add to proxy (undone in hash_dir_finish)
+    hash_proxy_inos[dir].push_back(in);
+    in->state_set(CINODE_STATE_PROXY);
+    in->get(CINODE_PIN_PROXY);
+
+    // fix up subdirs
+    if (in->dir) {
+      if (in->dir->is_auth()) {
+        // mine.  make it into an import.
+        dout(7) << "making subdir into import " << *in->dir << endl;
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        cache->imports.insert(in->dir);
+        in->dir->get(CDIR_PIN_IMPORT);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+
+        // fix nested bits.  the iterator is advanced before the erase so
+        // that removing the current element does not invalidate it.
+        for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
+             p != cache->nested_exports[containing_import].end(); ) {
+          CDir *ex = *p;  
+          p++;
+          if (cache->get_auth_container(ex) == in->dir) {
+            dout(10) << "moving nested export " << *ex << endl;
+            cache->nested_exports[containing_import].erase(ex);
+            cache->nested_exports[in->dir].insert(ex);
+          }
+        }
+      }
+      else {
+        // not mine.
+        dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl;
+        assert(in->dir->is_export());
+        in->dir->put(CDIR_PIN_EXPORT);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        cache->exports.erase(in->dir);
+        cache->nested_exports[containing_import].erase(in->dir);
+        if (in->dir->authority() == dentryhashcode)
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+        else
+          in->dir->set_dir_auth( in->dir->authority() );
+      }
+    }
+    
+    // waiters
+    list<Context*> waiters;
+    in->take_waiting(CINODE_WAIT_ANY, waiters);
+    fin->take(waiters);
+  }
+
+  // dir state
+  dir->state_set(CDIR_STATE_HASHED);
+  dir->get(CDIR_PIN_HASHED);
+  cache->hashdirs.insert(dir);
+  dir->mark_dirty();
+  mds->mdlog->submit_entry(new EDirUpdate(dir));
+
+  // inode state
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+
+  // fix up nested_exports?
+  if (containing_import != dir) {
+    dout(7) << "moving nested exports under hashed dir" << endl;
+    for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
+         p != cache->nested_exports[containing_import].end(); ) {
+      CDir *ex = *p;
+      p++;
+      if (cache->get_auth_container(ex) == dir) {
+        dout(7) << " moving nested export under hashed dir: " << *ex << endl;
+        cache->nested_exports[containing_import].erase(ex);
+        cache->nested_exports[dir].insert(ex);
+      } else {
+        dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl;
+      }
+    }
+  }
+
+  // send hash messages
+  assert(hash_gather[dir].empty());
+  assert(hash_notify_gather[dir].empty());
+  assert(dir->hashed_subset.empty());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    // all nodes hashed locally..
+    dir->hashed_subset.insert(i);
+
+    if (i == mds->get_nodeid()) continue;
+
+    // init hash_gather and hash_notify_gather sets
+    hash_gather[dir].insert(i);
+    
+    // check with count(), not operator[]: indexing inside the assert would
+    // insert an empty entry for i as a side effect.  with no bystanders
+    // that entry would never be erased, so the ack handler would wait for
+    // notifies forever (and behaviour would differ under NDEBUG).
+    assert(hash_notify_gather[dir].count(i) == 0);
+    for (int j=0; j<mds->get_mds_map()->get_num_mds(); j++) {
+      if (j == mds->get_nodeid()) continue;
+      if (j == i) continue;
+      hash_notify_gather[dir][i].insert(j);
+    }
+
+    mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
+  }
+
+  // wait for all the acks.
+}
+
+
+// Handle an MHashDirAck message from a peer that has taken its share of a
+// hashed dir.  Once the last ack is in, and no notifies are outstanding,
+// the hash is finished via hash_dir_finish().  The message is deleted.
+void Migrator::handle_hash_dir_ack(MHashDirAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  assert(dir->is_hashed());
+  assert(dir->is_hashing());
+
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+  
+  if (hash_gather[dir].empty()) {
+    dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl;
+
+    // handle_hash_dir_notify() erases the dir's entry once the last notify
+    // is in, so test for presence first; indexing with operator[] would
+    // re-insert an empty entry that nothing removes afterwards.
+    if (hash_notify_gather.count(dir) == 0 ||
+        hash_notify_gather[dir].empty()) {
+      dout(7) << "got notifies too, all done" << endl;
+      hash_dir_finish(dir);
+    } else {
+      dout(7) << "waiting on notifies " << endl;
+    }
+
+  } else {
+    dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;    
+  }
+
+  delete m;
+}
+
+
+// Wrap up a hash on the auth once all acks and notifies are in: clear the
+// HASHING state and pin, un-proxy the inodes that were hashed away, unpin
+// the path, and unfreeze the dir.
+void Migrator::hash_dir_finish(CDir *dir)
+{
+  dout(7) << "hash_dir_finish finishing " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_hashing());
+
+  // dir state
+  hash_gather.erase(dir);
+  dir->state_clear(CDIR_STATE_HASHING);
+  dir->put(CDIR_PIN_HASHING);
+  dir->hashed_subset.clear();
+
+  // unproxy inodes
+  //  this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds.
+  for (list<CInode*>::iterator p = hash_proxy_inos[dir].begin();
+       p != hash_proxy_inos[dir].end();
+       ++p) {
+    CInode *proxied = *p;
+    assert(proxied->state_test(CINODE_STATE_PROXY));
+    proxied->state_clear(CINODE_STATE_PROXY);
+    proxied->put(CINODE_PIN_PROXY);
+  }
+  hash_proxy_inos.erase(dir);
+
+  // unpin the path that hash_dir() pinned
+  vector<CDentry*> trace;
+  cache->make_trace(trace, dir->inode);
+  cache->path_unpin(trace, 0);
+
+  // unfreeze
+  dir->unfreeze_dir();
+
+  show_imports();
+  assert(hash_gather.count(dir) == 0);
+
+  // stats
+  //mds->logger->inc("nh", 1);
+
+}
+
+
+
+
+// HASH on auth and non-auth
+
+void Migrator::handle_hash_dir_notify(MHashDirNotify *m)
+{
+  // a peer (m->get_from()) announces its part of a hashing dir.  handled on auth and non-auth.
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+  assert(dir->is_hashing());
+
+  dout(5) << "handle_hash_dir_notify " << *dir << endl;
+  int from = m->get_from();   // peer the notify is about
+  int source = MSG_ADDR_NUM(m->get_source());   // node that sent this copy to us
+  if (dir->is_auth()) {
+    // gather notifies
+    assert(dir->is_hashed());
+    // hash_notify_gather[dir][from] holds the senders we still expect from's notify through
+    assert(    hash_notify_gather[dir][from].count(source) );
+    hash_notify_gather[dir][from].erase(source);
+    
+    if (hash_notify_gather[dir][from].empty()) {
+      dout(7) << "last notify from " << from << endl;
+      hash_notify_gather[dir].erase(from);
+
+      if (hash_notify_gather[dir].empty()) {
+        dout(7) << "last notify!" << endl;
+        hash_notify_gather.erase(dir);
+        // notifies are complete; the hash finishes only once the MHashDirAcks are in too
+        if (hash_gather[dir].empty()) {
+          dout(7) << "got acks too, all done" << endl;
+          hash_dir_finish(dir);
+        } else {
+          dout(7) << "still waiting on acks from " << hash_gather[dir] << endl;
+        }
+      } else {
+        dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl;
+      }
+    } else {
+      dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl;
+    }
+
+    // delete msg
+    delete m;
+  } else {
+    // update dir hashed_subset 
+    assert(dir->hashed_subset.count(from) == 0);
+    dir->hashed_subset.insert(from);
+    
+    // update open subdirs
+    for (CDir_map_t::iterator it = dir->begin(); 
+         it != dir->end(); 
+         it++) {
+      CDentry *dn = it->second;
+      CInode *in = dn->get_inode();
+      if (!in) continue;
+      if (!in->dir) continue;
+      
+      int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+      if (dentryhashcode != from) continue;   // we'll import these in a minute
+      // NOTE(review): presumably re-derives the subdir's dir_auth now that its dentry hashes to `from` -- confirm set_dir_auth semantics
+      if (in->dir->authority() != dentryhashcode)
+        in->dir->set_dir_auth( in->dir->authority() );
+      else
+        in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+    }
+    
+    // remove from notify gather set
+    assert(hash_gather[dir].count(from));
+    hash_gather[dir].erase(from);
+
+    // last notify?
+    if (hash_gather[dir].empty()) {
+      dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl;
+      hash_gather.erase(dir);
+      // no more notifies expected: drop the HASHING state and pin taken in handle_hash_dir_discover_2
+      dir->state_clear(CDIR_STATE_HASHING);
+      dir->put(CDIR_PIN_HASHING);
+      dir->hashed_subset.clear();
+    } else {
+      dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
+    }
+
+    // fw notify to auth (the message object is handed on, not deleted here)
+    mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR);
+  }
+}
+
+
+
+
+// HASH on non-auth
+
+/*
+ * discover step:
+ *  each peer needs to open up the directory and pin it before we start
+ */
+class C_MDC_HashDirDiscover : public Context {
+  // completion for the discover path traversal started in handle_hash_dir_discover
+  Migrator *mig;
+  MHashDirDiscover *m;
+public:
+  vector<CDentry*> trace;   // handed to path_traverse by the creator
+
+  C_MDC_HashDirDiscover(Migrator *migrator, MHashDirDiscover *msg) : mig(migrator), m(msg) {}
+
+  // r >= 0: the target is the inode of the last dentry in the trace, or the root inode when
+  // the trace is empty.  r < 0: no inode; the error code is passed through.
+  void finish(int r) {
+    CInode *target = 0;
+    if (r >= 0 && !trace.empty())
+      target = trace.back()->get_inode();
+    else if (r >= 0)
+      target = mig->cache->get_root();
+    mig->handle_hash_dir_discover_2(m, target, r);
+  }
+};
+
+void Migrator::handle_hash_dir_discover(MHashDirDiscover *m)
+{
+  // peer: locate the dir named in the message; the completion continues in
+  // handle_hash_dir_discover_2 on success or error.
+  assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
+  dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl;
+
+  C_MDC_HashDirDiscover *fin = new C_MDC_HashDirDiscover(this, m);
+  filepath path(m->get_path());
+  cache->path_traverse(path, fin->trace, true,
+                       m, new C_MDS_RetryMessage(mds,m),   // on delay/retry
+                       MDS_TRAVERSE_DISCOVER,
+                       fin);                               // on completion|error
+}
+
+void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r)
+{
+  // peer: traversal finished.  open + pin the dir, mark it HASHING, prime the gather set for
+  // the other nodes' notifies, and ack the discover back to the sender.
+  if (in) {
+    dout(7) << "handle_hash_dir_discover_2 has " << *in << endl;
+  }
+
+  if (r < 0 || !in->is_dir()) {
+    dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
+    assert(0);    // this shouldn't happen if the auth pins his path properly!!!! 
+  }
+  assert(in->is_dir());
+
+  // is dir open?  if not, open it and hand the message to a retry context.
+  CDir *dir = in->dir;
+  if (!dir) {
+    dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl;
+    cache->open_remote_dir(in, new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // pin dir, set hashing flag
+  dir->state_set(CDIR_STATE_HASHING);
+  dir->get(CDIR_PIN_HASHING);
+  assert(dir->hashed_subset.empty());
+
+  // inode state: hash_seed becomes 1 (the unhash path resets it to 0); journal only if auth
+  CInode *diri = dir->inode;
+  diri->inode.hash_seed = 1;// dir->ino();
+  if (diri->is_auth()) {
+    diri->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(diri));
+  }
+
+  // get gather set ready for notifies: every node except ourselves and the dir's authority
+  assert(hash_gather[dir].empty());
+  for (int who = 0; who < mds->get_mds_map()->get_num_mds(); ++who) {
+    if (who == mds->get_nodeid() || who == dir->authority()) continue;
+    hash_gather[dir].insert(who);
+  }
+
+  // reply
+  dout(7) << " sending hash_dir_discover_ack on " << *dir << endl;
+  mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+  delete m;
+}
+
+/*
+ * prep step:
+ *  peers need to open up all subdirs of the hashed dir
+ */
+
+void Migrator::handle_hash_dir_prep(MHashDirPrep *m)
+{
+  // peer: take in the inodes listed in the message, open + pin their dirs, then send MHashDirPrepAck.
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+  dout(7) << "handle_hash_dir_prep " << *dir << endl;
+
+  if (!m->did_assim()) {
+    m->mark_assim();  // only do this the first time!
+
+    // assimilate dentry+inodes for exports
+    for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+         it != m->get_inodes().end();
+         it++) {
+      CInode *in = cache->get_inode( it->second->get_ino() );
+      if (in) {
+        it->second->update_inode(in);
+        dout(5) << " updated " << *in << endl;
+      } else {
+        in = new CInode(mds->mdcache, false);
+        it->second->update_inode(in);
+        cache->add_inode(in);
+        
+        // link 
+        dir->add_dentry( it->first, in );
+        dout(5) << "   added " << *in << endl;
+      }
+      // NOTE(review): every unopened dir below gets its own retry context for the same message m
+      // open!
+      if (!in->dir) {
+        dout(5) << "  opening nested export on " << *in << endl;
+        cache->open_remote_dir(in,
+			       new C_MDS_RetryMessage(mds, m));
+      }
+    }
+  }
+
+  // verify!
+  int waiting_for = 0;
+  for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+       it != m->get_inodes().end();
+       it++) {
+    CInode *in = cache->get_inode( it->second->get_ino() );
+    assert(in);
+    // pin each dir that is open (once); count the ones that are not open yet
+    if (in->dir) {
+      if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+        dout(5) << "  pinning nested export " << *in->dir << endl;
+        in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+        in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
+      } else {
+        dout(5) << "  already pinned nested export " << *in << endl;
+      }
+    } else {
+      dout(5) << "  waiting for nested export dir on " << *in << endl;
+      waiting_for++;
+    }
+  }
+
+  if (waiting_for) {
+    dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
+    return;
+  } 
+
+  // ack!
+  mds->send_message_mds(new MHashDirPrepAck(dir->ino()),
+			m->get_source().num(), MDS_PORT_MIGRATOR);
+  
+  // done.
+  delete m;
+}
+
+
+/*
+ * hash step:
+ */
+
+void Migrator::handle_hash_dir(MHashDir *m)
+{
+  // peer: the sender shipped us our part of the dir.  import it, mark the dir HASHED, tell
+  // every other node via MHashDirNotify and ack the sender.
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+  assert(!dir->is_auth());
+  assert(!dir->is_hashed());
+  assert(dir->is_hashing());
+
+  dout(5) << "handle_hash_dir " << *dir << endl;
+  const int oldauth = MSG_ADDR_NUM(m->get_source());   // the sender
+  const int whoami = mds->get_nodeid();
+
+  // content
+  import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth);
+
+  // dir state: hashed, pinned, registered, and our own node recorded in hashed_subset
+  dir->state_set(CDIR_STATE_HASHED);
+  dir->get(CDIR_PIN_HASHED);
+  cache->hashdirs.insert(dir);
+  dir->hashed_subset.insert(whoami);
+
+  // dir is complete; journal and commit it
+  dir->mark_complete();
+  dir->mark_dirty();
+  mds->mdlog->submit_entry(new EDirUpdate(dir));
+  mds->mdstore->commit_dir(dir, 0);
+
+  // send notifies to everyone but ourselves and the sender
+  dout(7) << "sending notifies" << endl;
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); ++peer) {
+    if (peer == whoami || peer == oldauth) continue;
+    mds->send_message_mds(new MHashDirNotify(dir->ino(), whoami),
+                          peer, MDS_PORT_MIGRATOR);
+  }
+
+  // ack
+  dout(7) << "acking" << endl;
+  mds->send_message_mds(new MHashDirAck(dir->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+
+  // done.
+  delete m;
+
+  show_imports();
+}
+
+
+
+
+
+// UNHASH on auth
+
+class C_MDC_UnhashFreeze : public Context {
+  // context handed to freeze_dir by unhash_dir; finishing it calls unhash_dir_frozen(dir)
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_UnhashFreeze(Migrator *migrator, CDir *target) : mig(migrator), dir(target) {}
+  // r is not consulted
+  virtual void finish(int r) { mig->unhash_dir_frozen(dir); }
+};
+
+class C_MDC_UnhashComplete : public Context {
+  // context handed to fetch_dir during unhash; finishing it calls unhash_dir_complete(dir)
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_UnhashComplete(Migrator *migrator, CDir *target) : mig(migrator), dir(target) {}
+  // r is not consulted
+  virtual void finish(int r) { mig->unhash_dir_complete(dir); }
+};
+
+
+void Migrator::unhash_dir(CDir *dir)
+{
+  // auth entry point: pin the path, mark the dir UNHASHING, then freeze it and make it
+  // complete; whichever of the two finishes last moves on to unhash_dir_prep.
+  dout(-7) << "unhash_dir " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(!dir->is_unhashing());
+  assert(dir->is_auth());
+  assert(hash_gather.count(dir)==0);
+
+  // pin path?
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  const bool pinned = cache->path_pin(path, 0, 0);
+  if (!pinned) {
+    dout(7) << "unhash_dir couldn't pin path, failing." << endl;
+    return;
+  }
+
+  // twiddle state, then start the freeze
+  dir->state_set(CDIR_STATE_UNHASHING);
+  dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    unhash_dir_complete(dir);
+    return;
+  }
+  dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
+  mds->mdstore->fetch_dir(dir, new C_MDC_UnhashComplete(this, dir));
+}
+
+void Migrator::unhash_dir_frozen(CDir *dir)
+{
+  // auth: freeze finished.  go on to the prep step only if the dir is complete as well.
+  dout(7) << "unhash_dir_frozen " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  if (dir->is_complete()) {
+    unhash_dir_prep(dir);
+    return;
+  }
+  dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl;
+}
+
+
+/*
+ * ask peers to freeze and complete hashed dir
+ */
+void Migrator::unhash_dir_prep(CDir *dir)
+{
+  // auth: dir is frozen and complete.  send MUnhashDirPrep to every other node and gather acks.
+  dout(7) << "unhash_dir_prep " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  assert(dir->is_complete());
+
+  // already been here..freeze must have been instantaneous
+  set<int>& waiting = hash_gather[dir];
+  if (!waiting.empty()) return;
+  assert(waiting.empty());
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); ++peer) {
+    if (peer == mds->get_nodeid()) continue;
+    waiting.insert(peer);
+    mds->send_message_mds(new MUnhashDirPrep(dir->ino()), peer, MDS_PORT_MIGRATOR);
+  }
+}
+
+/* 
+ * wait for peers to freeze and complete hashed dirs
+ */
+void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m)
+{
+  // auth: a peer is frozen + complete and lists its subdirs; open and pin them, then count the ack.
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+  int from = MSG_ADDR_NUM(m->get_source());
+  dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl;
+
+  if (!m->did_assim()) {
+    m->mark_assim();  // only do this the first time!
+    
+    // assimilate dentry+inodes for exports
+    for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+         it != m->get_inodes().end();
+         it++) {
+      CInode *in = cache->get_inode( it->second->get_ino() );
+      if (in) {
+        it->second->update_inode(in);
+        dout(5) << " updated " << *in << endl;
+      } else {
+        in = new CInode(mds->mdcache, false);
+        it->second->update_inode(in);
+        cache->add_inode(in);
+        
+        // link 
+        dir->add_dentry( it->first, in );
+        dout(5) << "   added " << *in << endl;
+      }
+      // NOTE(review): every unopened dir below gets its own retry context for the same message m
+      // open!
+      if (!in->dir) {
+        dout(5) << "  opening nested export on " << *in << endl;
+        cache->open_remote_dir(in,
+			       new C_MDS_RetryMessage(mds, m));
+      }
+    }
+  }
+  
+  // verify!
+  int waiting_for = 0;
+  for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+       it != m->get_inodes().end();
+       it++) {
+    CInode *in = cache->get_inode( it->second->get_ino() );
+    assert(in);
+    // pin each dir that is open (once); count the ones that are not open yet
+    if (in->dir) {
+      if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+        dout(5) << "  pinning nested export " << *in->dir << endl;
+        in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+        in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
+      } else {
+        dout(5) << "  already pinned nested export " << *in << endl;
+      }
+    } else {
+      dout(5) << "  waiting for nested export dir on " << *in << endl;
+      waiting_for++;
+    }
+  }
+  // m is neither counted nor deleted while dirs are still opening
+  if (waiting_for) {
+    dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
+    return;
+  } 
+  
+  // ok, done with this PrepAck
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+  
+  if (hash_gather[dir].empty()) {
+    hash_gather.erase(dir);
+    dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl;
+    unhash_dir_go(dir);
+  } else {
+    dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;    
+  }
+  
+  delete m;
+}
+
+
+/*
+ * auth:
+ *  send out MUnhashDir's to peers
+ */
+void Migrator::unhash_dir_go(CDir *dir)
+{
+  // auth: all prep acks are in.  ask every other node (MUnhashDir) to send its content back.
+  dout(7) << "unhash_dir_go " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  assert(dir->is_complete());
+
+  // one MUnhashDir per peer; remember whom we expect an MUnhashDirAck from
+  assert(hash_gather[dir].empty());
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); ++peer) {
+    if (peer == mds->get_nodeid()) continue;
+    hash_gather[dir].insert(peer);
+    mds->send_message_mds(new MUnhashDir(dir->ino()), peer, MDS_PORT_MIGRATOR);
+  }
+}
+
+/*
+ * auth:
+ *  assimilate unhashing content
+ */
+void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m)
+{
+  // auth: import one peer's content; after the last ack, unhash the dir and notify every other node.
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+  dout(7) << "handle_unhash_dir_ack " << *dir << endl;
+  assert(dir->is_hashed());
+
+  // assimilate content
+  int from = MSG_ADDR_NUM(m->get_source());
+  import_hashed_content(dir, m->get_state(), m->get_nden(), from);
+  delete m;
+  // m is gone from here on; only `from` is used below
+  // done?
+  assert(hash_gather[dir].count(from));
+  hash_gather[dir].erase(from);
+  
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl;
+    return;
+  } 
+
+  // done!
+  // (everything below runs once, on the last ack)
+  // fix up nested_exports
+  CDir *containing_import = cache->get_auth_container(dir);
+  if (containing_import != dir) {
+    for (set<CDir*>::iterator it = cache->nested_exports[dir].begin();
+         it != cache->nested_exports[dir].end();
+         it++) {
+      dout(7) << "moving nested export out from under hashed dir : " << **it << endl;
+      cache->nested_exports[containing_import].insert(*it);
+    }
+    cache->nested_exports.erase(dir);
+  }
+  
+  // dir state
+  //dir->state_clear(CDIR_STATE_UNHASHING); //later
+  dir->state_clear(CDIR_STATE_HASHED);
+  dir->put(CDIR_PIN_HASHED);
+  cache->hashdirs.erase(dir);
+  
+  // commit!
+  assert(dir->is_complete());
+  //dir->mark_complete();
+  dir->mark_dirty();
+  mds->mdstore->commit_dir(dir, 0);
+
+  // inode state
+  dir->inode->inode.hash_seed = 0;
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+  
+  // notify
+  assert(hash_gather[dir].empty());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i == mds->get_nodeid()) continue;
+    // hash_gather is reused: it now tracks the MUnhashDirNotifyAcks we expect
+    hash_gather[dir].insert(i);
+    
+    mds->send_message_mds(new MUnhashDirNotify(dir->ino()),
+			  i, MDS_PORT_MIGRATOR);
+  }
+}
+
+
+/*
+ * sent by peer to flush mds links.  unfreeze when all gathered.
+ */
+void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m)
+{
+  // auth: a peer reports that it gathered all its notifies for this unhash; finish on the last.
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+  dout(7) << "handle_unhash_dir_notify_ack " << *dir << endl;
+  assert(!dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(dir->is_frozen_dir());
+
+  // done?
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from));
+  hash_gather[dir].erase(from);
+  delete m;
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl;
+  } else {
+    unhash_dir_finish(dir);
+  }
+}
+
+
+/*
+ * all mds links are flushed.  unfreeze dir!
+ */
+void Migrator::unhash_dir_finish(CDir *dir)
+{
+  // auth: every notify ack is in.  drop the gather set, path pin, UNHASHING flag and freeze.
+  dout(7) << "unhash_dir_finish " << *dir << endl;
+  hash_gather.erase(dir);
+
+  // unpin path
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  cache->path_unpin(path, 0);
+
+  // state
+  dir->state_clear(CDIR_STATE_UNHASHING);
+
+  // unfreeze
+  dir->unfreeze_dir();
+}
+
+
+
+// UNHASH on all
+
+/*
+ * hashed dir is complete.  
+ *  mark all migrating inodes dirty (to pin in cache)
+ *  if frozen too, then go to next step (depending on auth)
+ */
+void Migrator::unhash_dir_complete(CDir *dir)
+{
+  // auth + peers: the dir is complete.  dirty the inodes we are auth for so they stay pinned,
+  // and move on to the next step if the freeze has finished too.
+  dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_complete());
+
+  // mark dirty to pin in cache
+  for (CDir_map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
+    CInode *in = it->second->inode;
+    // a dentry without an inode (cf. the is_null() case in handle_unhash_dir) has nothing to dirty
+    if (!in) continue;
+    if (in->is_auth()) {
+      in->mark_dirty();
+      mds->mdlog->submit_entry(new EInodeUpdate(in));
+    }
+  }
+
+  if (!dir->is_frozen_dir()) {
+    dout(7) << "dir complete but !frozen, waiting " << *dir << endl;
+  } else if (dir->is_auth()) {
+    unhash_dir_prep(dir);            // auth
+  } else {
+    unhash_dir_prep_finish(dir);     // nonauth
+  }
+}
+
+
+// UNHASH on non-auth
+
+class C_MDC_UnhashPrepFreeze : public Context {
+  // context handed to freeze_dir by handle_unhash_dir_prep; finishing it calls unhash_dir_prep_frozen(dir)
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_UnhashPrepFreeze(Migrator *migrator, CDir *target) : mig(migrator), dir(target) {}
+  // r is not consulted
+  virtual void finish(int r) { mig->unhash_dir_prep_frozen(dir); }
+};
+
+
+/*
+ * peers need to freeze their dir and make them complete
+ */
+void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m)
+{
+  // peer: the auth wants to unhash.  freeze our copy of the dir and make it complete; the
+  // prep ack goes out from unhash_dir_prep_finish once both have happened.
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir_prep " << *dir << endl;
+  assert(dir->is_hashed());
+
+  // freeze
+  dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    unhash_dir_complete(dir);
+  } else {
+    dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir, new C_MDC_UnhashComplete(this, dir));
+  }
+  delete m;
+}
+
+/*
+ * peer has hashed dir frozen.  
+ *  complete too?
+ */
+void Migrator::unhash_dir_prep_frozen(CDir *dir)
+{
+  // peer: freeze finished.  reply to the auth only if the dir is complete as well.
+  dout(7) << "unhash_dir_prep_frozen " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_frozen_dir());
+  assert(!dir->is_auth());
+  if (dir->is_complete()) {
+    unhash_dir_prep_finish(dir);
+    return;
+  }
+  dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl;
+}
+
+/*
+ * peer has hashed dir complete and frozen.  ack.
+ */
+void Migrator::unhash_dir_prep_finish(CDir *dir)
+{
+  // peer: dir is frozen and complete.  reply (once) to the auth with an MUnhashDirPrepAck
+  // listing the open subdirectories whose dentries hash to this node.
+  dout(7) << "unhash_dir_prep_finish " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(!dir->is_auth());
+  assert(dir->is_frozen());
+  assert(dir->is_complete());
+
+  // twiddle state
+  if (dir->is_unhashing())
+    return;  // already replied.
+  dir->state_set(CDIR_STATE_UNHASHING);
+
+  // send subdirs back to auth
+  MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino());
+  int auth = dir->authority();
+  for (CDir_map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+    // a dentry without an inode (cf. is_null() in handle_unhash_dir) has no subdir to report
+    if (!in) continue;
+    if (!in->is_dir()) continue;
+    if (!in->dir) continue;
+
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode != mds->get_nodeid()) continue;
+
+    // msg?
+    ack->add_inode(it->first, in->replicate_to(auth));
+  }
+
+  // ack
+  mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR);
+}
+
+
+
+/*
+ * peer needs to send hashed dir content back to auth.
+ *  unhash dir.
+ */
+void Migrator::handle_unhash_dir(MUnhashDir *m)
+{
+  // peer: encode the dentries that hash to this node into an MUnhashDirAck for the auth.
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+  dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(!dir->is_auth());
+  
+  // get message ready
+  bufferlist bl;
+  int nden = 0;
+  // NOTE(review): fin gathers the dir and inode waiters below, but nothing in this function finishes or queues it -- confirm.
+  // suck up all waiters
+  C_Contexts *fin = new C_Contexts;
+  list<Context*> waiting;
+  dir->take_waiting(CDIR_WAIT_ANY, waiting);    // all dir waiters
+  fin->take(waiting);
+  
+  // divy up contents
+  for (CDir_map_t::iterator it = dir->begin(); 
+       it != dir->end(); 
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+    // in may be null here (null dentries are handled further down), so test it before in->dir
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode != mds->get_nodeid()) {
+      // not mine!
+      // twiddle dir_auth?
+      if (in && in->dir) {
+        if (in->dir->authority() != dir->authority())
+          in->dir->set_dir_auth( in->dir->authority() );
+        else
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+      }
+      continue;
+    }
+    
+    // -- dentry
+    dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
+    _encode(it->first, bl);
+    
+    // null dentry?
+    if (dn->is_null()) {
+      bl.append("N", 1);  // null dentry
+      assert(dn->is_sync());
+      continue;
+    }
+
+    if (dn->is_remote()) {
+      // remote link
+      bl.append("L", 1);  // remote link
+
+      inodeno_t ino = dn->get_remote_ino();
+      bl.append((char*)&ino, sizeof(ino));
+      continue;
+    }
+
+    // primary link
+    // -- inode
+    bl.append("I", 1);    // inode dentry
+    
+    encode_export_inode(in, bl, dentryhashcode);  // encode, and (update state for) export
+    nden++;
+    // NOTE(review): nden counts only primary ("I") dentries although "N" and "L" ones are encoded too -- confirm import_hashed_content expects that
+    if (dn->is_dirty()) 
+      dn->mark_clean();
+
+    // proxy
+    in->state_set(CINODE_STATE_PROXY);
+    in->get(CINODE_PIN_PROXY);
+    hash_proxy_inos[dir].push_back(in);
+
+    if (in->dir) {
+      if (in->dir->is_auth()) {
+        // mine.  make it into an import.
+        dout(7) << "making subdir into import " << *in->dir << endl;
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        cache->imports.insert(in->dir);
+        in->dir->get(CDIR_PIN_IMPORT);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+      }
+      else {
+        // not mine.
+        dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl;
+        assert(in->dir->is_export());
+        in->dir->put(CDIR_PIN_EXPORT);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        cache->exports.erase(in->dir);
+        cache->nested_exports[dir].erase(in->dir);
+      }
+    }
+    
+    // waiters
+    list<Context*> waiters;
+    in->take_waiting(CINODE_WAIT_ANY, waiters);
+    fin->take(waiters);
+  }
+
+  // we should have no nested exports; we're not auth for the dir!
+  assert(cache->nested_exports[dir].empty());
+  cache->nested_exports.erase(dir);
+
+  // dir state
+  //dir->state_clear(CDIR_STATE_UNHASHING);  // later
+  dir->state_clear(CDIR_STATE_HASHED);
+  dir->put(CDIR_PIN_HASHED);
+  cache->hashdirs.erase(dir);
+  dir->mark_clean();
+
+  // inode state
+  dir->inode->inode.hash_seed = 0;
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+
+  // init gather set
+  hash_gather[dir] = mds->get_mds_map()->get_mds();
+  hash_gather[dir].erase(mds->get_nodeid());
+  // send unhash message
+  mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden),
+			dir->authority(), MDS_PORT_MIGRATOR);
+  delete m;   // the request is done; every sibling handler frees its message as well
+}
+
+
+/*
+ * first notify comes from auth.
+ *  send notifies to all other peers, with peer = self
+ * if we get notify from peer=other, remove from our gather list.
+ * when we've gotten notifies from everyone,
+ *  unpin proxies,
+ *  send notify_ack to auth.
+ * this ensures that all mds links are flushed of cache_expire type messages.
+ */
+void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m)
+{
+  // peer: gather a notify from every other node; when all are in, release local unhash state and ack.
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir_notify " << *dir << endl;
+  assert(!dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(!dir->is_auth());
+
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+  delete m;
+
+  // did we send our shout out?
+  if (from == dir->authority()) {
+    // send notify to everyone else in weird chatter storm
+    for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+      if (i == from) continue;
+      if (i == mds->get_nodeid()) continue;
+      mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR);
+    }
+  }
+
+  // are we done?
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
+    return;
+  }
+  hash_gather.erase(dir);
+  // all done!
+  dout(7) << "all mds links flushed, unpinning unhash proxies" << endl;
+  // unpin proxies, then forget the list so a later hash/unhash of this dir starts clean
+  list<CInode*>& proxied = hash_proxy_inos[dir];
+  for (list<CInode*>::iterator it = proxied.begin(); it != proxied.end(); ++it) {
+    CInode *in = *it;
+    assert(in->state_test(CINODE_STATE_PROXY));
+    in->state_clear(CINODE_STATE_PROXY);
+    in->put(CINODE_PIN_PROXY);
+  }
+  hash_proxy_inos.erase(dir);
+
+  // the "later" from handle_unhash_dir: a stale flag makes unhash_dir_prep_finish skip its reply
+  dir->state_clear(CDIR_STATE_UNHASHING);
+
+  // unfreeze
+  dir->unfreeze_dir();
+  // ack
+  dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl;
+  mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
+}
+
+
+
+
+void Migrator::show_imports()
+{
+  mds->balancer->show_imports();   // thin wrapper: delegates to the balancer's show_imports()
+}
diff --git a/branches/sage/cephmds2/mds/Migrator.h b/branches/sage/cephmds2/mds/Migrator.h
new file mode 100644
index 0000000000000..eac7d2046690b
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Migrator.h
@@ -0,0 +1,199 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MDS_MIGRATOR_H
+#define __MDS_MIGRATOR_H
+
+#include "include/types.h"
+
+#include <map>
+#include <list>
+#include <set>
+using std::map;
+using std::list;
+using std::set;
+
+
+class MDS;
+class CDir;
+class CInode;
+class CDentry;
+
+class MExportDirDiscover;
+class MExportDirDiscoverAck;
+class MExportDirPrep;
+class MExportDirPrepAck;
+class MExportDirWarning;
+class MExportDir;
+class MExportDirNotify;
+class MExportDirNotifyAck;
+class MExportDirFinish;
+
+class MHashDirDiscover;
+class MHashDirDiscoverAck;
+class MHashDirPrep;
+class MHashDirPrepAck;
+class MHashDir;
+class MHashDirAck;
+class MHashDirNotify;
+
+class MUnhashDirPrep;
+class MUnhashDirPrepAck;
+class MUnhashDir;
+class MUnhashDirAck;
+class MUnhashDirNotify;
+class MUnhashDirNotifyAck;
+
+class Migrator {
+private:
+  MDS *mds;         // set by the constructor
+  MDCache *cache;   // set by the constructor
+
+  // export fun
+  map<CDir*, set<int> >  export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from
+  map<CDir*, list<inodeno_t> > export_proxy_inos;
+  map<CDir*, list<inodeno_t> > export_proxy_dirinos;
+  
+  set<inodeno_t>                    stray_export_warnings; // notifies i haven't seen
+  map<inodeno_t, MExportDirNotify*> stray_export_notifies;
+  
+  // hashing madness
+  multimap<CDir*, int>   unhash_waiting;  // nodes i am waiting for UnhashDirAck's from
+  multimap<inodeno_t, inodeno_t>    import_hashed_replicate_waiting;  // nodes i am waiting to discover to complete my import of a hashed dir
+  // maps frozen_dir_ino's to waiting-for-discover ino's.
+  multimap<inodeno_t, inodeno_t>    import_hashed_frozen_waiting;    // dirs i froze (for the above)
+    
+public:
+  // -- cons --
+  Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {}
+
+  void dispatch(Message*);
+
+  // -- import/export --
+  // exporter
+ public:
+  void export_dir(CDir *dir,
+                  int mds);
+  void export_empty_import(CDir *dir);
+
+  void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth);
+  void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth);
+
+ protected:
+  map< CDir*, set<int> > export_gather;
+  void handle_export_dir_discover_ack(MExportDirDiscoverAck *m);
+  void export_dir_frozen(CDir *dir, int dest);
+  void handle_export_dir_prep_ack(MExportDirPrepAck *m);
+  void export_dir_go(CDir *dir,
+                     int dest);
+  int export_dir_walk(MExportDir *req,
+                      class C_Contexts *fin,
+                      CDir *basedir,
+                      CDir *dir,
+                      int newauth);
+  void export_dir_finish(CDir *dir);
+  void handle_export_dir_notify_ack(MExportDirNotifyAck *m);
+  
+  
+  friend class C_MDC_ExportFreeze;
+
+  // importer
+  void handle_export_dir_discover(MExportDirDiscover *m);
+  void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r);
+  void handle_export_dir_prep(MExportDirPrep *m);
+  void handle_export_dir(MExportDir *m);
+  void import_dir_finish(CDir *dir);
+  void handle_export_dir_finish(MExportDirFinish *m);
+  int import_dir_block(bufferlist& bl,
+                       int& off,
+                       int oldauth,
+                       CDir *import_root,
+                       list<inodeno_t>& imported_subdirs);
+  void got_hashed_replica(CDir *import,
+                          inodeno_t dir_ino,
+                          inodeno_t replica_ino);
+
+
+  friend class C_MDC_ExportDirDiscover;
+
+  // bystander
+  void handle_export_dir_warning(MExportDirWarning *m);
+  void handle_export_dir_notify(MExportDirNotify *m);
+
+  void show_imports();   // forwards to the balancer's show_imports()
+
+  // -- hashed directories --
+  // hash and unhash are multi-step message exchanges between a dir's auth and every other mds
+  // HASH
+ public:
+  void hash_dir(CDir *dir);  // on auth
+ protected:
+  map< CDir*, set<int> >             hash_gather;          // per dir: nodes whose reply for the current hash/unhash step is outstanding
+  map< CDir*, map< int, set<int> > > hash_notify_gather;   // auth: per dir, per hashing peer: senders whose copy of that peer's notify is outstanding
+  map< CDir*, list<CInode*> >        hash_proxy_inos;      // inodes pinned CINODE_PIN_PROXY until the hash/unhash finishes
+
+  // hash on auth
+  void handle_hash_dir_discover_ack(MHashDirDiscoverAck *m);
+  void hash_dir_complete(CDir *dir);
+  void hash_dir_frozen(CDir *dir);
+  void handle_hash_dir_prep_ack(MHashDirPrepAck *m);
+  void hash_dir_go(CDir *dir);
+  void handle_hash_dir_ack(MHashDirAck *m);          // counts acks; finishes once notifies are in too
+  void hash_dir_finish(CDir *dir);                   // clears HASHING, unproxies, unpins, unfreezes
+  friend class C_MDC_HashFreeze;
+  friend class C_MDC_HashComplete;
+
+  // auth and non-auth
+  void handle_hash_dir_notify(MHashDirNotify *m);    // non-auth forwards the message to the auth
+
+  // hash on non-auth
+  void handle_hash_dir_discover(MHashDirDiscover *m);
+  void handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r);   // r < 0 asserts
+  void handle_hash_dir_prep(MHashDirPrep *m);
+  void handle_hash_dir(MHashDir *m);
+  friend class C_MDC_HashDirDiscover;
+
+  // UNHASH
+ public:
+  void unhash_dir(CDir *dir);   // on auth
+ protected:
+  map< CDir*, list<MUnhashDirAck*> > unhash_content;
+  void import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth);
+
+  // unhash on auth
+  void unhash_dir_frozen(CDir *dir);                 // freeze done; continues if complete
+  void unhash_dir_prep(CDir *dir);                   // sends MUnhashDirPrep to every other node
+  void handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m);
+  void unhash_dir_go(CDir *dir);                     // sends MUnhashDir to every other node
+  void handle_unhash_dir_ack(MUnhashDirAck *m);      // imports content; on last ack sends MUnhashDirNotify
+  void handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m);
+  void unhash_dir_finish(CDir *dir);                 // unpins path, clears UNHASHING, unfreezes
+  friend class C_MDC_UnhashFreeze;
+  friend class C_MDC_UnhashComplete;
+
+  // unhash on all
+  void unhash_dir_complete(CDir *dir);
+
+  // unhash on non-auth
+  void handle_unhash_dir_prep(MUnhashDirPrep *m);
+  void unhash_dir_prep_frozen(CDir *dir);
+  void unhash_dir_prep_finish(CDir *dir);            // sends MUnhashDirPrepAck (once) to the auth
+  void handle_unhash_dir(MUnhashDir *m);             // sends MUnhashDirAck with this node's content
+  void handle_unhash_dir_notify(MUnhashDirNotify *m);
+  friend class C_MDC_UnhashPrepFreeze;
+
+
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/OSDMonitor.cc b/branches/sage/cephmds2/mds/OSDMonitor.cc
new file mode 100644
index 0000000000000..0c7cadbce3a6d
--- /dev/null
+++ b/branches/sage/cephmds2/mds/OSDMonitor.cc
@@ -0,0 +1,523 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "OSDMonitor.h"
+
+#include "osd/OSDMap.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+
+#include "common/Timer.h"
+#include "common/Clock.h"
+
+#include "config.h"
+#undef dout
+// Monitor-local debug streams.  Output is produced only when level l is within
+// g_conf.debug or g_conf.debug_mon; each line is prefixed with "mon<whoami>"
+// and the current osdmap epoch (0 while osdmap is still null).
+// NOTE: both macros expand to a bare `if (...) stream`, so using one as the
+// unbraced body of an if/else will capture the following `else`.
+#define  dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " "
+#define  derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " "
+
+
+/**
+ * Timer callback: sends a fresh MPing to MSG_ADDR_MON(0) through the given
+ * messenger.  OSDMonitor::dispatch() answers MSG_PING by calling tick().
+ */
+class C_OM_PingTick : public Context {
+public:
+  Messenger *msgr;
+
+  C_OM_PingTick(Messenger *m)
+    : msgr(m)
+  {
+  }
+
+  void finish(int r)
+  {
+    MPing *ping = new MPing;
+    msgr->send_message(ping, MSG_ADDR_MON(0));
+  }
+};
+
+/** Timer callback that invokes OSDMonitor::fake_reorg() on the given monitor. */
+class C_OM_Faker : public Context {
+public:
+  OSDMonitor *om;
+
+  C_OM_Faker(OSDMonitor *m) : om(m) {}
+
+  void finish(int r)
+  {
+    om->fake_reorg();
+  }
+};
+
+/**
+ * Timer callback that invokes OSDMonitor::fake_osd_failure(osd, down) on the
+ * given monitor.
+ */
+class C_OM_FakeOSDFailure : public Context {
+  OSDMonitor *mon;
+  int osd;
+  bool down;
+
+public:
+  C_OM_FakeOSDFailure(OSDMonitor *m, int o, bool d)
+    : mon(m),
+      osd(o),
+      down(d)
+  {
+  }
+
+  void finish(int r)
+  {
+    mon->fake_osd_failure(osd, down);
+  }
+};
+
+
+
+// Testing hack: commit whatever is pending as a new epoch and push the
+// resulting incremental to one randomly chosen osd.
+void OSDMonitor::fake_osdmap_update()
+{
+  dout(1) << "fake_osdmap_update" << endl;
+
+  accept_pending();
+
+  // tell a random osd
+  const int lucky = rand() % g_conf.num_osd;
+  send_incremental_map(osdmap->get_epoch()-1,                    // ick! FIXME
+                       MSG_ADDR_OSD(lucky));
+}
+
+
+// Testing hack: pick a random osd, flip its in/out state, commit the change
+// as a new epoch and send that osd the resulting incremental.
+void OSDMonitor::fake_reorg()
+{
+  const int victim = rand() % g_conf.num_osd;
+
+  if (!osdmap->is_out(victim)) {
+    dout(1) << "fake_reorg marking osd" << victim << " out" << endl;
+    pending.new_out.push_back(victim);
+  } else {
+    dout(1) << "fake_reorg marking osd" << victim << " in" << endl;
+    pending.new_in.push_back(victim);
+  }
+
+  accept_pending();
+
+  // tell him!
+  send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(victim));
+}
+
+
+// Build the initial OSDMap from g_conf, schedule any configured fake
+// reorg/failure events (monitor 0 only), register this object as the
+// messenger's dispatcher and arm the first tick.
+void OSDMonitor::init()
+{
+  dout(1) << "init" << endl;
+
+
+  // <HACK set up OSDMap from g_conf>
+  osdmap = new OSDMap();
+  osdmap->set_pg_bits(g_conf.osd_pg_bits);
+
+  // start at epoch 0 until all osds boot
+  //osdmap->inc_epoch();  // = 1
+  //assert(osdmap->get_epoch() == 1);
+
+
+  //if (g_conf.mkfs) osdmap->set_mkfs();
+
+  // one uniform bucket holding osds 0..num_osd-1, each added with weight 1
+  Bucket *b = new UniformBucket(1, 0);
+  int root = osdmap->crush.add_bucket(b);
+  for (int i=0; i<g_conf.num_osd; i++) {
+    osdmap->osds.insert(i);
+    b->add_item(i, 1);
+  }
+  
+  // rules 1..4: take the root bucket, choose i items, emit
+  for (int i=1; i<5; i++) {
+    osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+    osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
+    osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+  }
+
+  if (g_conf.mds_local_osd) {
+    // add mds osds, but don't put them in the crush mapping func
+    // (they are numbered 10000 + mds index)
+    for (int i=0; i<g_conf.num_mds; i++) 
+      osdmap->osds.insert(i+10000);
+  }
+
+  // </HACK>
+
+
+  
+  // optional one-shot fake reorg, only on mon0 and only with more than 4 osds
+  if (whoami == 0 &&
+      g_conf.num_osd > 4 &&
+      g_conf.fake_osdmap_expand) {
+    dout(1) << "scheduling OSD map reorg at " << g_conf.fake_osdmap_expand << endl;
+    g_timer.add_event_after(g_conf.fake_osdmap_expand,
+                            new C_OM_Faker(this));
+  }
+
+  if (whoami == 0) {
+    // fake osd failures
+    // g_fake_osd_down / g_fake_osd_out map osd id -> delay passed to the timer
+    for (map<int,float>::iterator i = g_fake_osd_down.begin();
+         i != g_fake_osd_down.end();
+         i++) {
+      dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl;
+      g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 1));
+    }
+    for (map<int,float>::iterator i = g_fake_osd_out.begin();
+         i != g_fake_osd_out.end();
+         i++) {
+      dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl;
+      g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 0));
+    }
+  }
+
+  
+  // i'm ready!
+  messenger->set_dispatcher(this);
+  
+  // start ticker
+  g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger));
+}
+
+
+// Route an incoming message to its handler by message type.  MSG_PING is
+// consumed right here (tick(), then delete); every other known type is passed
+// to the matching handle_*() method.  An unknown type is logged and asserts.
+void OSDMonitor::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+  // timer-driven tick request
+  case MSG_PING:
+    tick();
+    delete m;
+    return;
+
+  case MSG_PING_ACK:
+    handle_ping_ack((MPingAck*)m);
+    return;
+
+  // osd state reports
+  case MSG_OSD_BOOT:
+    handle_osd_boot((MOSDBoot*)m);
+    return;
+
+  case MSG_OSD_FAILURE:
+    handle_osd_failure((MOSDFailure*)m);
+    return;
+
+  case MSG_OSD_IN:
+    handle_osd_in((MOSDIn*)m);
+    return;
+
+  case MSG_OSD_OUT:
+    handle_osd_out((MOSDOut*)m);
+    return;
+
+  // map requests
+  case MSG_OSD_GETMAP:
+    handle_osd_getmap((MOSDGetMap*)m);
+    return;
+
+  case MSG_SHUTDOWN:
+    handle_shutdown(m);
+    return;
+
+  default:
+    dout(0) << "unknown message " << *m << endl;
+    assert(0);
+  }
+}
+
+
+// Shut the messenger down and destroy it, then free the request.
+// The `messenger` member is left pointing at the deleted object.
+void OSDMonitor::handle_shutdown(Message *m)
+{
+  dout(1) << "shutdown from " << m->get_source() << endl;
+
+  // stop, then tear down, our messenger
+  messenger->shutdown();
+  delete messenger;
+
+  // done with the request
+  delete m;
+}
+
+// Ping acks carry nothing the monitor acts on yet; just free the message.
+void OSDMonitor::handle_ping_ack(MPingAck *m)
+{
+  // ...
+  delete m;
+}
+
+// A peer reports that an osd has failed.  If the map still shows that osd up,
+// and the reported instance matches the one on record (or none is on record),
+// mark it down in a new epoch, start its down->out timer if it is currently
+// in, and tell the mds nodes.  The reporter always gets the incrementals
+// since the epoch named in its message.  Frees m.
+void OSDMonitor::handle_osd_failure(MOSDFailure *m)
+{
+  dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl;
+
+  // FIXME?
+
+  // take their word for it
+  int failed = m->get_failed().num();
+
+  // only touch osd_inst[] when an entry already exists
+  bool believe = osdmap->is_up(failed);
+  if (believe && osdmap->osd_inst.count(failed) != 0)
+    believe = (osdmap->osd_inst[failed] == m->get_inst());
+
+  if (believe) {
+    pending.new_down[failed] = m->get_inst();
+
+    if (osdmap->is_in(failed))
+      pending_out[failed] = g_clock.now();
+
+    //awaiting_maps[pending.epoch][m->get_source()] = 
+
+    accept_pending();
+    bcast_latest_osd_map_mds();
+    //bcast_latest_osd_map_osd();   // FIXME: which osds can i tell?
+  }
+
+  send_incremental_map(m->get_epoch(), m->get_source());
+
+  delete m;
+}
+
+
+
+// Testing hack: mark the given osd DOWN (down == true) or OUT (down == false),
+// commit that as a new epoch, and broadcast it to osds and mds nodes.
+void OSDMonitor::fake_osd_failure(int osd, bool down)
+{
+  if (!down) {
+    dout(1) << "fake_osd_failure OUT osd" << osd << endl;
+    pending.new_out.push_back(osd);
+  } else {
+    dout(1) << "fake_osd_failure DOWN osd" << osd << endl;
+    pending.new_down[osd] = osdmap->osd_inst[osd];
+  }
+
+  accept_pending();
+
+  bcast_latest_osd_map_osd();
+  bcast_latest_osd_map_mds();
+}
+
+
+// Handle a boot announcement from an osd.
+//
+// While the map is still at epoch 0 the monitor only records the sender's
+// instance; once as many instances are recorded as there are entries in
+// osdmap->osds, it bumps the map to epoch 1, stores the full encoding, and
+// publishes it to waiters, osds and mds nodes.
+//
+// After that, a boot marks the osd up (first marking the previous instance
+// down in its own epoch if the map still shows it up), marks it in again if
+// it was out, sends the booting osd the incrementals since
+// m->sb.current_epoch, and informs the mds nodes.
+//
+// Owns m: the message is freed on every path.
+void OSDMonitor::handle_osd_boot(MOSDBoot *m)
+{
+  dout(7) << "osd_boot from " << m->get_source() << endl;
+  assert(m->get_source().is_osd());
+  int from = m->get_source().num();
+
+  if (osdmap->get_epoch() == 0) {
+    // waiting for boot!
+    osdmap->osd_inst[from] = m->get_source_inst();
+
+    if (osdmap->osd_inst.size() == osdmap->osds.size()) {
+      dout(-7) << "osd_boot all osds booted." << endl;
+      osdmap->inc_epoch();
+      osdmap->encode(maps[osdmap->get_epoch()]); // 1
+      pending.epoch = osdmap->get_epoch()+1;     // 2
+
+      send_map();
+      bcast_latest_osd_map_osd();
+      bcast_latest_osd_map_mds();
+    } else {
+      dout(7) << "osd_boot waiting for " 
+              << (osdmap->osds.size() - osdmap->osd_inst.size())
+              << " osds to boot" << endl;
+    }
+
+    // the instance was copied into osd_inst above; nothing else keeps m,
+    // so free it here (this early return used to leak the message).
+    delete m;
+    return;
+  }
+
+  // already up?  mark down first?
+  if (osdmap->is_up(from)) {
+    assert(m->get_source_inst() > osdmap->osd_inst[from]);   // this better be newer!  
+    pending.new_down[from] = osdmap->osd_inst[from];
+    accept_pending();
+  }
+  
+  // mark up.
+  pending_out.erase(from);
+  assert(osdmap->is_down(from));
+  pending.new_up[from] = m->get_source_inst();
+  
+  // mark in?
+  if (osdmap->out_osds.count(from)) 
+    pending.new_in.push_back(from);
+  
+  accept_pending();
+
+  // the booting osd will spread word
+  send_incremental_map(m->sb.current_epoch, m->get_source());
+  delete m;
+
+  // tell mds
+  bcast_latest_osd_map_mds();
+}
+
+// An osd asks to be marked IN.  If the map currently has it out, commit the
+// change as a new epoch and send the requester the incrementals since
+// m->map_epoch.  Owns m: the message is freed on every path (it used to be
+// leaked, unlike the other handlers in this file).
+void OSDMonitor::handle_osd_in(MOSDIn *m)
+{
+  dout(7) << "osd_in from " << m->get_source() << endl;
+  int from = m->get_source().num();
+  if (osdmap->is_out(from)) {
+    pending.new_in.push_back(from);
+    accept_pending();
+    send_incremental_map(m->map_epoch, m->get_source());
+  }
+  delete m;
+}
+
+// An osd asks to be marked OUT.  If the map currently has it in, commit the
+// change as a new epoch and send the requester the incrementals since
+// m->map_epoch.  Owns m: the message is freed on every path (it used to be
+// leaked, unlike the other handlers in this file).
+void OSDMonitor::handle_osd_out(MOSDOut *m)
+{
+  dout(7) << "osd_out from " << m->get_source() << endl;
+  int from = m->get_source().num();
+  if (osdmap->is_in(from)) {
+    pending.new_out.push_back(from);
+    accept_pending();
+    send_incremental_map(m->map_epoch, m->get_source());
+  }
+  delete m;
+}
+
+
+// Answer a map request.  While the map is still at epoch 0 the requester is
+// queued in awaiting_map[1]; otherwise it gets incrementals since the epoch
+// it named, or a full map if it named none (since == 0).  Frees m.
+void OSDMonitor::handle_osd_getmap(MOSDGetMap *m)
+{
+  dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl;
+
+  const bool have_map = (osdmap->get_epoch() != 0);
+
+  if (!have_map) {
+    // nothing to hand out yet; remember who asked, and from which epoch
+    awaiting_map[1][m->get_source()] = m->get_since();
+  } else if (m->get_since()) {
+    send_incremental_map(m->get_since(), m->get_source());
+  } else {
+    send_full_map(m->get_source());
+  }
+
+  delete m;
+}
+
+
+
+// Commit the pending incremental: store its encoding under its epoch, apply
+// it to the live map, log every state change (and mirror up/down changes
+// into the messenger), then start a fresh pending incremental for the epoch
+// after the new current one.
+void OSDMonitor::accept_pending()
+{
+  dout(-10) << "accept_pending " << osdmap->get_epoch() << " -> " << pending.epoch << endl;
+
+  // accept pending into a new map, and advance!
+  pending.encode( inc_maps[ pending.epoch ] );
+  osdmap->apply_incremental(pending);
+
+  // tell me about it
+  typedef map<int,entity_inst_t>::iterator inst_iter;
+  typedef list<int>::iterator              osd_iter;
+
+  for (inst_iter up = pending.new_up.begin(); up != pending.new_up.end(); ++up) {
+    dout(0) << "osd" << up->first << " UP " << up->second << endl;
+    derr(0) << "osd" << up->first << " UP " << up->second << endl;
+    messenger->mark_up(MSG_ADDR_OSD(up->first), up->second);
+  }
+
+  for (inst_iter dn = pending.new_down.begin(); dn != pending.new_down.end(); ++dn) {
+    dout(0) << "osd" << dn->first << " DOWN " << dn->second << endl;
+    derr(0) << "osd" << dn->first << " DOWN " << dn->second << endl;
+    messenger->mark_down(MSG_ADDR_OSD(dn->first), dn->second);
+  }
+
+  for (osd_iter o = pending.new_in.begin(); o != pending.new_in.end(); ++o) {
+    dout(0) << "osd" << *o << " IN" << endl;
+    derr(0) << "osd" << *o << " IN" << endl;
+  }
+
+  for (osd_iter o = pending.new_out.begin(); o != pending.new_out.end(); ++o) {
+    dout(0) << "osd" << *o << " OUT" << endl;
+    derr(0) << "osd" << *o << " OUT" << endl;
+  }
+
+  // clear new pending
+  OSDMap::Incremental fresh(osdmap->get_epoch() + 1);
+  pending = fresh;
+}
+
+// Serve everyone queued in awaiting_map for the current epoch: each waiter
+// gets the incrementals since the epoch it asked from.  The queue entry for
+// this epoch is removed before anything is sent.
+void OSDMonitor::send_map()
+{
+  dout(10) << "send_map " << osdmap->get_epoch() << endl;
+
+  // detach the waiter list for this epoch
+  map<msg_addr_t,epoch_t> waiters;
+  waiters.swap( awaiting_map[osdmap->get_epoch()] );
+  awaiting_map.erase(osdmap->get_epoch());
+
+  map<msg_addr_t,epoch_t>::iterator w = waiters.begin();
+  while (w != waiters.end()) {
+    send_incremental_map(w->second, w->first);
+    ++w;
+  }
+}
+
+
+// Send `who` a MOSDMap constructed from the current osdmap.
+void OSDMonitor::send_full_map(msg_addr_t who)
+{
+  MOSDMap *full = new MOSDMap(osdmap);
+  messenger->send_message(full, who);
+}
+
+// Send `dest` one MOSDMap carrying every epoch in (since, current].
+// For each epoch the stored incremental is preferred; if only a full map is
+// stored for that epoch (as for epoch 1 in handle_osd_boot) the full encoding
+// is attached instead.  Asserts if an epoch in the range has neither.
+// If since >= current epoch the message is sent with no maps attached.
+void OSDMonitor::send_incremental_map(epoch_t since, msg_addr_t dest)
+{
+  dout(-10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
+           << " to " << dest << endl;
+  
+  MOSDMap *m = new MOSDMap;
+  
+  // walk back from the newest epoch down to (but not including) `since`
+  for (epoch_t e = osdmap->get_epoch();
+       e > since;
+       e--) {
+    if (inc_maps.count(e)) {
+      dout(-10) << "send_incremental_map    inc " << e << endl;
+      m->incremental_maps[e] = inc_maps[e];
+    } else if (maps.count(e)) {
+      dout(-10) << "send_incremental_map   full " << e << endl;
+      m->maps[e] = maps[e];
+      //if (!full) break;
+    }
+    else {
+      assert(0);  // we should have all maps.
+    }
+  }
+  
+  messenger->send_message(m, dest);
+}
+
+
+
+// Send the newest incremental (since current epoch - 1) to mds 0 through
+// g_conf.num_mds - 1.
+void OSDMonitor::bcast_latest_osd_map_mds()
+{
+  epoch_t e = osdmap->get_epoch();
+  dout(1) << "bcast_latest_osd_map_mds epoch " << e << endl;
+
+  // tell mds
+  int rank = 0;
+  while (rank < g_conf.num_mds) {
+    //send_full_map(MSG_ADDR_MDS(rank));
+    send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_MDS(rank));
+    ++rank;
+  }
+}
+
+// Send the newest incremental (since current epoch - 1) to every osd
+// returned by get_all_osds() that the map does not show as down.
+void OSDMonitor::bcast_latest_osd_map_osd()
+{
+  epoch_t e = osdmap->get_epoch();
+  dout(1) << "bcast_latest_osd_map_osd epoch " << e << endl;
+
+  // tell osds
+  set<int> everyone;
+  osdmap->get_all_osds(everyone);
+
+  set<int>::iterator p = everyone.begin();
+  while (p != everyone.end()) {
+    const int osd = *p;
+    ++p;
+
+    if (!osdmap->is_down(osd))
+      send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(osd));
+  }
+}
+
+
+
+// Periodic housekeeping.  Every osd whose entry in pending_out is at least
+// g_conf.mon_osd_down_out_interval seconds old is marked OUT, each in its own
+// new epoch.  Re-arms the tick timer before returning.
+void OSDMonitor::tick()
+{
+  dout(10) << "tick" << endl;
+
+  // mark down osds out?
+  utime_t now = g_clock.now();
+
+  // pass 1: collect expired entries (pending_out is not modified while iterating)
+  list<int> expired;
+  map<int,utime_t>::iterator p = pending_out.begin();
+  while (p != pending_out.end()) {
+    utime_t elapsed = now;
+    elapsed -= p->second;
+
+    if (elapsed.sec() >= g_conf.mon_osd_down_out_interval) {
+      dout(10) << "tick marking osd" << p->first << " OUT after " << elapsed << " sec" << endl;
+      expired.push_back(p->first);
+    }
+    ++p;
+  }
+
+  // pass 2: commit one epoch per expired osd
+  while (!expired.empty()) {
+    const int osd = expired.front();
+    expired.pop_front();
+
+    pending_out.erase(osd);
+    pending.new_out.push_back(osd);
+    accept_pending();
+  }
+
+  // next!
+  g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger));
+}
diff --git a/branches/sage/cephmds2/mds/OSDMonitor.h b/branches/sage/cephmds2/mds/OSDMonitor.h
new file mode 100644
index 0000000000000..cd8babc054225
--- /dev/null
+++ b/branches/sage/cephmds2/mds/OSDMonitor.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __OSDMONITOR_H
+#define __OSDMONITOR_H
+
+#include <time.h>
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "osd/OSDMap.h"
+
+// Monitor-side keeper of the OSD map.  Receives osd boot / failure / in / out
+// reports and map requests through dispatch(), folds state changes into a
+// pending incremental, commits them as new epochs, and distributes the
+// resulting maps to osds and mds nodes.
+class OSDMonitor : public Dispatcher {
+  // me
+  int whoami;              // monitor id (used in log prefixes; 0 runs the fake-event hooks)
+  Messenger *messenger;    // supplied by the caller; deleted in handle_shutdown()
+
+  // maps
+  OSDMap *osdmap;                       // current map; null until init()
+  map<epoch_t, bufferlist> maps;        // encoded full maps, by epoch
+  map<epoch_t, bufferlist> inc_maps;    // encoded incrementals, by epoch
+
+  OSDMap::Incremental pending;          // changes accumulated for the next epoch
+
+  // requesters waiting for an epoch to exist: epoch -> (requester -> "since" epoch)
+  map<epoch_t, map<msg_addr_t, epoch_t> > awaiting_map;
+
+  // osd down -> out
+  map<int,utime_t>  pending_out;        // osd -> time it was reported down while still in
+
+  
+  void tick();  // check state, take actions
+
+  // maps
+  void accept_pending();   // accept pending, new map.
+  void send_map();         // send current map to waiters.
+  void send_full_map(msg_addr_t dest);
+  void send_incremental_map(epoch_t since, msg_addr_t dest);
+  void bcast_latest_osd_map_mds();
+  void bcast_latest_osd_map_osd();
+
+
+ public:
+  // w: monitor id, m: messenger to send through.  The map is created in init().
+  OSDMonitor(int w, Messenger *m) : 
+    whoami(w),
+    messenger(m),
+    osdmap(0) {
+  }
+
+  void init();
+
+  // message entry point and per-type handlers
+  void dispatch(Message *m);
+  void handle_shutdown(Message *m);
+
+  void handle_osd_boot(class MOSDBoot *m);
+  void handle_osd_in(class MOSDIn *m);
+  void handle_osd_out(class MOSDOut *m);
+  void handle_osd_failure(class MOSDFailure *m);
+  void handle_osd_getmap(class MOSDGetMap *m);
+
+  void handle_ping_ack(class MPingAck *m);
+
+  // hack
+  // (testing hooks; invoked from timer callbacks defined in OSDMonitor.cc)
+  void fake_osd_failure(int osd, bool down);
+  void fake_osdmap_update();
+  void fake_reorg();
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/Renamer.cc b/branches/sage/cephmds2/mds/Renamer.cc
new file mode 100644
index 0000000000000..dfea8d6336803
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Renamer.cc
@@ -0,0 +1,915 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDCache.h"
+#include "MDStore.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "MDS.h"
+#include "MDSMap.h"
+#include "MDLog.h"
+#include "AnchorClient.h"
+#include "Migrator.h"
+#include "Renamer.h"
+
+#include "include/filepath.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EUnlink.h"
+
+#include "messages/MRenameWarning.h"
+#include "messages/MRenameNotify.h"
+#include "messages/MRenameNotifyAck.h"
+#include "messages/MRename.h"
+#include "messages/MRenameAck.h"
+#include "messages/MRenameReq.h"
+#include "messages/MRenamePrep.h"
+
+
+
+// Route a rename-protocol message to its handler by message type.
+// Any other type trips an assert.
+void Renamer::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+  // requests that drive a rename
+  case MSG_MDS_RENAMEPREP:
+    handle_rename_prep((MRenamePrep*)m);
+    return;
+  case MSG_MDS_RENAMEREQ:
+    handle_rename_req((MRenameReq*)m);
+    return;
+  case MSG_MDS_RENAME:
+    handle_rename((MRename*)m);
+    return;
+  case MSG_MDS_RENAMEACK:
+    handle_rename_ack((MRenameAck*)m);
+    return;
+
+  // bystander traffic
+  case MSG_MDS_RENAMEWARNING:
+    handle_rename_warning((MRenameWarning*)m);
+    return;
+  case MSG_MDS_RENAMENOTIFY:
+    handle_rename_notify((MRenameNotify*)m);
+    return;
+  case MSG_MDS_RENAMENOTIFYACK:
+    handle_rename_notify_ack((MRenameNotifyAck*)m);
+    return;
+
+  default:
+    assert(0);
+  }
+}
+
+
+// renaming!
+
+
+/*
+ fix_renamed_dir():
+
+ caller has already:
+   - relinked inode in new location
+   - fixed in->is_auth()
+   - set dir_auth, if appropriate
+
+ caller has not:
+   - touched in->dir
+   - updated import/export tables
+*/
+// Repair import/export bookkeeping for a directory inode that was just renamed.
+//
+//   srcdir      - directory the inode was renamed out of
+//   in          - the renamed directory inode (in->dir must be open)
+//   destdir     - directory the inode was renamed into (not referenced in the body)
+//   authchanged - true if the _inode's_ authority changed as part of the rename
+//   dir_auth    - authority to record for in->dir; only consulted in the
+//                 "dir not ours" cases below
+//
+// The body is a four-way case split on (in->dir is auth?, in is auth?), each
+// split again on authchanged.  Each case updates cache->imports / exports /
+// nested_exports and in->dir's state, pins and dir_auth accordingly.
+void Renamer::fix_renamed_dir(CDir *srcdir,
+                              CInode *in,
+                              CDir *destdir,
+                              bool authchanged,   // _inode_ auth
+                              int dir_auth)        // dir auth (for certain cases)
+{
+  dout(7) << "fix_renamed_dir on " << *in << endl;
+  dout(7) << "fix_renamed_dir on " << *in->dir << endl;
+
+  if (in->dir->is_auth()) {
+    // dir ours
+    dout(7) << "dir is auth" << endl;
+    assert(!in->dir->is_export());
+
+    if (in->is_auth()) {
+      // inode now ours
+
+      if (authchanged) {
+        // inode _was_ replica, now ours
+        dout(7) << "inode was replica, now ours.  removing from import list." << endl;
+        assert(in->dir->is_import());
+        
+        // not import anymore!
+        cache->imports.erase(in->dir);
+        in->dir->state_clear(CDIR_STATE_IMPORT);
+        in->dir->put(CDIR_PIN_IMPORT);
+
+        in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+        dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
+
+        // move my nested imports to in's containing import
+        CDir *con = cache->get_auth_container(in->dir);
+        assert(con);
+        for (set<CDir*>::iterator p = cache->nested_exports[in->dir].begin();
+             p != cache->nested_exports[in->dir].end();
+             p++) {
+          dout(7) << "moving nested export under new container " << *con << endl;
+          cache->nested_exports[con].insert(*p);
+        }
+        cache->nested_exports.erase(in->dir);
+        
+      } else {
+        // inode was ours, still ours.
+        dout(7) << "inode was ours, still ours." << endl;
+        assert(!in->dir->is_import());
+        assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT);
+        
+        // move any exports nested beneath me?
+        CDir *newcon = cache->get_auth_container(in->dir);
+        assert(newcon);
+        CDir *oldcon = cache->get_auth_container(srcdir);
+        assert(oldcon);
+        if (newcon != oldcon) {
+          dout(7) << "moving nested exports under new container" << endl;
+          set<CDir*> nested;
+          cache->find_nested_exports_under(oldcon, in->dir, nested);
+          for (set<CDir*>::iterator it = nested.begin();
+               it != nested.end();
+               it++) {
+            dout(7) << "moving nested export " << *it << " under new container" << endl;
+            cache->nested_exports[oldcon].erase(*it);
+            cache->nested_exports[newcon].insert(*it);
+          }
+        }
+      }
+
+    } else {
+      // inode now replica
+
+      if (authchanged) {
+        // inode was ours, but now replica
+        dout(7) << "inode was ours, now replica.  adding to import list." << endl;
+
+        // i am now an import
+        cache->imports.insert(in->dir);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+        in->dir->get(CDIR_PIN_IMPORT);
+
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
+
+        // find old import
+        CDir *oldcon = cache->get_auth_container(srcdir);
+        assert(oldcon);
+        dout(7) << " oldcon is " << *oldcon << endl;
+
+        // move nested exports under me 
+        set<CDir*> nested;
+        cache->find_nested_exports_under(oldcon, in->dir, nested);  
+        for (set<CDir*>::iterator it = nested.begin();
+             it != nested.end();
+             it++) {
+          dout(7) << "moving nested export " << *it << " under me" << endl;
+          cache->nested_exports[oldcon].erase(*it);
+          cache->nested_exports[in->dir].insert(*it);
+        }
+
+      } else {
+        // inode was replica, still replica
+        dout(7) << "inode was replica, still replica.  doing nothing." << endl;
+        assert(in->dir->is_import());
+
+        // verify dir_auth
+        assert(in->dir->get_dir_auth() == mds->get_nodeid()); // me, because i'm auth for dir.
+        assert(in->authority() != in->dir->get_dir_auth());   // inode not me.
+      }
+
+      assert(in->dir->is_import());
+    }
+
+  } else {
+    // dir is not ours
+    dout(7) << "dir is not auth" << endl;
+
+    if (in->is_auth()) {
+      // inode now ours
+
+      if (authchanged) {
+        // inode was replica, now ours
+        dout(7) << "inode was replica, now ours.  now an export." << endl;
+        assert(!in->dir->is_export());
+        
+        // now export
+        cache->exports.insert(in->dir);
+        in->dir->state_set(CDIR_STATE_EXPORT);
+        in->dir->get(CDIR_PIN_EXPORT);
+        
+        assert(dir_auth >= 0);  // better be defined
+        in->dir->set_dir_auth( dir_auth );
+        dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
+        
+        CDir *newcon = cache->get_auth_container(in->dir);
+        assert(newcon);
+        cache->nested_exports[newcon].insert(in->dir);
+
+      } else {
+        // inode was ours, still ours
+        dout(7) << "inode was ours, still ours.  did my import change?" << endl;
+
+        // sanity
+        assert(in->dir->is_export());
+        assert(in->dir->get_dir_auth() >= 0);              
+        assert(in->dir->get_dir_auth() != in->authority());
+
+        // moved under new import?
+        CDir *oldcon = cache->get_auth_container(srcdir);
+        CDir *newcon = cache->get_auth_container(in->dir);
+        if (oldcon != newcon) {
+          dout(7) << "moving myself under new import " << *newcon << endl;
+          cache->nested_exports[oldcon].erase(in->dir);
+          cache->nested_exports[newcon].insert(in->dir);
+        }
+      }
+
+      assert(in->dir->is_export());
+    } else {
+      // inode now replica
+
+      if (authchanged) {
+        // inode was ours, now replica
+        dout(7) << "inode was ours, now replica.  removing from export list." << endl;
+        assert(in->dir->is_export());
+
+        // remove from export list
+        cache->exports.erase(in->dir);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        in->dir->put(CDIR_PIN_EXPORT);
+        
+        CDir *oldcon = cache->get_auth_container(srcdir);
+        assert(oldcon);
+        assert(cache->nested_exports[oldcon].count(in->dir) == 1);
+        cache->nested_exports[oldcon].erase(in->dir);
+
+        // simplify dir_auth
+        if (in->authority() == in->dir->authority()) {
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+          dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl;
+        } else {
+          assert(in->dir->get_dir_auth() >= 0);    // someone else's export,
+        }
+
+      } else {
+        // inode was replica, still replica
+        dout(7) << "inode was replica, still replica.  do nothing." << endl;
+        
+        // fix dir_auth?
+        if (in->authority() == dir_auth)
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+        else
+          in->dir->set_dir_auth( dir_auth );
+        // log the value actually stored (it may have been collapsed to
+        // CDIR_AUTH_PARENT above), matching the other branches
+        dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
+
+        // do nothing.
+      }
+      
+      assert(!in->dir->is_export());
+    }  
+  }
+
+  cache->show_imports();
+}
+
+/*
+ * when initiator gets an ack back for a foreign rename
+ */
+
+// Waiter context: when finished, calls Renamer::file_rename_ack(in, initiator).
+class C_MDC_RenameNotifyAck : public Context {
+  Renamer *rn;
+  CInode *in;
+  int initiator;
+
+public:
+  C_MDC_RenameNotifyAck(Renamer *r, CInode *i, int init)
+    : rn(r),
+      in(i),
+      initiator(init)
+  {
+  }
+
+  void finish(int r)
+  {
+    rn->file_rename_ack(in, initiator);
+  }
+};
+
+
+
+/************** initiator ****************/
+
+/*
+ * when we get MRenameAck (and rename is done, notifies gone out+acked, etc.)
+ */
+// Waiter context: when finished, calls
+// Renamer::file_rename_finish(srcdir, in, c) with the values captured here.
+class C_MDC_RenameAck : public Context {
+  Renamer *mdc;
+  CDir *srcdir;
+  CInode *in;
+  Context *c;
+
+public:
+  C_MDC_RenameAck(Renamer *renamer, CDir *sdir, CInode *inode, Context *onfinish)
+    : mdc(renamer),
+      srcdir(sdir),
+      in(inode),
+      c(onfinish)
+  {
+  }
+
+  void finish(int r)
+  {
+    mdc->file_rename_finish(srcdir, in, c);
+  }
+};
+
+
+// Start a rename of srcdn to destdn on behalf of the initiator (this node).
+// Both dentries must already be xlocked.
+//
+// Foreign rename (this node is not authoritative for both dentries):
+//   - if we are not the dest auth, send MRenamePrep to the dest auth;
+//   - otherwise (we are dest auth but not src auth) send MRenameReq to the
+//     src auth;
+//   then park onfinish on the inode until CINODE_WAIT_RENAMEACK fires.
+//
+// Local rename (we are auth for both): perform the rename in the cache, fix
+// import/export state for directories, mark everything dirty, and warn/notify
+// every node that has either directory open.  onfinish runs via
+// file_rename_finish(): immediately if nobody needs notifying, otherwise once
+// CINODE_WAIT_RENAMEACK fires.
+void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish)
+{
+  assert(srcdn->is_xlocked());  // by me
+  assert(destdn->is_xlocked());  // by me
+
+  CDir *srcdir = srcdn->dir;
+  string srcname = srcdn->name;
+  
+  CDir *destdir = destdn->dir;
+  string destname = destdn->name;
+
+  CInode *in = srcdn->inode;
+  //Message *req = srcdn->xlockedby;
+
+
+  // determine the players
+  int srcauth = srcdir->dentry_authority(srcdn->name);
+  int destauth = destdir->dentry_authority(destname);
+
+
+  // FOREIGN rename?
+  if (srcauth != mds->get_nodeid() ||
+      destauth != mds->get_nodeid()) {
+    dout(7) << "foreign rename.  srcauth " << srcauth << ", destauth " << destauth << ", isdir " << srcdn->inode->is_dir() << endl;
+    
+    string destpath;
+    destdn->make_path(destpath);
+
+    if (destauth != mds->get_nodeid()) { 
+      // make sure dest has dir open.
+      dout(7) << "file_rename i'm not dest auth.  sending MRenamePrep to " << destauth << endl;
+      
+      // prep dest first, they must have the dir open!  rest will follow.
+      string srcpath;
+      srcdn->make_path(srcpath);
+      
+      MRenamePrep *m = new MRenamePrep(mds->get_nodeid(),  // i'm the initiator
+                                       srcdir->ino(), srcname, srcpath, 
+                                       destdir->ino(), destname, destpath,
+                                       srcauth);  // tell dest who src is (maybe even me)
+      mds->send_message_mds(m, destauth, MDS_PORT_CACHE);
+      
+      cache->show_imports();
+      
+    }
+    
+    else if (srcauth != mds->get_nodeid()) {
+      // we only get here when destauth is this node, so the former
+      // "neither src auth nor dest auth" sub-case could never run.
+      dout(7) << "file_rename dest auth, not src auth.  sending MRenameReq" << endl;    
+      
+      // srcdn not important on destauth, just request
+      MRenameReq *m = new MRenameReq(mds->get_nodeid(),  // i'm the initiator
+                                     srcdir->ino(), srcname, 
+                                     destdir->ino(), destname, destpath, destauth);  // tell src who dest is (they may not know)
+      mds->send_message_mds(m, srcauth, MDS_PORT_CACHE);
+    }
+    
+    else
+      assert(0);
+
+    // set waiter on the inode (is this the best place?)
+    in->add_waiter(CINODE_WAIT_RENAMEACK, 
+                   new C_MDC_RenameAck(this, 
+                                       srcdir, in, onfinish));
+    return;
+  }
+
+  // LOCAL rename!
+  assert(srcauth == mds->get_nodeid() && destauth == mds->get_nodeid());
+  dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl;
+  
+  // update our cache
+  if (destdn->inode && destdn->inode->is_dirty())
+    destdn->inode->mark_clean();
+
+  cache->rename_file(srcdn, destdn);
+  
+  // update imports/exports?
+  if (in->is_dir() && in->dir) 
+    fix_renamed_dir(srcdir, in, destdir, false);  // auth didnt change
+
+  // mark dentries dirty
+  srcdn->mark_dirty();
+  destdn->mark_dirty();
+  in->mark_dirty();
+ 
+ 
+  // local, restrict notify to ppl with open dirs
+  // (union of both open_by sets; set::insert already ignores duplicates)
+  set<int> notify = srcdir->get_open_by();
+  for (set<int>::iterator it = destdir->open_by_begin();
+       it != destdir->open_by_end();
+       it++)
+    notify.insert(*it);
+  
+  if (notify.size()) {
+    // warn + notify
+    file_rename_warn(in, notify);
+    file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid());
+
+    // wait for MRenameNotifyAck's
+    in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK,
+                   new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid()));  // i am initiator
+
+    // wait for finish
+    in->add_waiter(CINODE_WAIT_RENAMEACK,
+                   new C_MDC_RenameAck(this, srcdir, in, onfinish));
+  } else {
+    // sweet, no notify necessary, we're done!
+    file_rename_finish(srcdir, in, onfinish);
+  }
+}
+
+// The rename identified by m->get_ino() has completed: fire everything
+// parked on that inode under CINODE_WAIT_RENAMEACK, then free the message.
+// The inode must be in our cache.
+void Renamer::handle_rename_ack(MRenameAck *m)
+{
+  CInode *renamed = cache->get_inode(m->get_ino());
+  assert(renamed);
+
+  dout(7) << "handle_rename_ack on " << *renamed << endl;
+
+  // all done!
+  renamed->finish_waiting(CINODE_WAIT_RENAMEACK);
+
+  delete m;
+}
+
+// Final step of a rename on the initiator: hand an emptied non-root import
+// back via the migrator, then complete and free the caller's context (if any).
+void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c)
+{
+  dout(10) << "file_rename_finish on " << *in << endl;
+
+  // did i empty out an imported dir?  FIXME this check should go somewhere else???
+  const bool emptied_import =
+    srcdir->is_import() &&
+    !srcdir->inode->is_root() &&
+    srcdir->get_size() == 0;
+  if (emptied_import)
+    cache->migrator->export_empty_import(srcdir);
+
+  // finish our caller
+  if (!c)
+    return;
+  c->finish(0);
+  delete c;
+}
+
+
+/************* src **************/
+
+
+/** handle_rename_req
+ * received by auth of src dentry (from init, or destauth if dir).  
+ * src may not have dest dir open.
+ * src will export inode, unlink|rename, and send MRename to dest.
+ */
+// Look up the source dentry named in the request and hand it to
+// file_rename_foreign_src() together with the destination details and the
+// initiator carried in the message.  Frees m afterwards.
+//
+// The source dir inode, its open dir and the dentry are all expected to be
+// present here; each is now asserted before being dereferenced (previously
+// only the dentry was checked).
+void Renamer::handle_rename_req(MRenameReq *m)
+{
+  // i am auth, i will have it.
+  CInode *srcdiri = cache->get_inode(m->get_srcdirino());
+  assert(srcdiri);
+  CDir *srcdir = srcdiri->dir;
+  assert(srcdir);
+  CDentry *srcdn = srcdir->lookup(m->get_srcname());
+  assert(srcdn);
+  
+  // do it
+  file_rename_foreign_src(srcdn, 
+                          m->get_destdirino(), m->get_destname(), m->get_destpath(), m->get_destauth(), 
+                          m->get_initiator());
+  delete m;
+}
+
+
+// Source-auth side of a foreign rename.
+//
+//   srcdn      - source dentry; its inode must exist and be auth here
+//   destdirino - inode number of the destination directory
+//   destname   - destination dentry name
+//   destpath   - destination path, used for discovery if destdn is not cached
+//   destauth   - node the inode state and MRename are sent to
+//   initiator  - node that started the rename (forwarded in MRename and to
+//                the notify-ack waiter)
+//
+// Encodes the inode's exported state, sends it to destauth in an MRename,
+// then (if the destination dentry is known locally) applies the rename in the
+// local cache, marks the inode PROXY, warns every other mds, and waits for
+// their notify acks.
+void Renamer::file_rename_foreign_src(CDentry *srcdn, 
+                                      inodeno_t destdirino, string& destname, string& destpath, int destauth, 
+                                      int initiator)
+{
+  dout(7) << "file_rename_foreign_src " << *srcdn << endl;
+
+  CDir *srcdir = srcdn->dir;
+  string srcname = srcdn->name;
+
+  // (we're basically exporting this inode)
+  CInode *in = srcdn->inode;
+  assert(in);
+  assert(in->is_auth());
+
+  if (in->is_dir()) cache->show_imports();
+
+  // encode and export inode state
+  bufferlist inode_state;
+  cache->migrator->encode_export_inode(in, inode_state, destauth);
+
+  // send
+  MRename *m = new MRename(initiator,
+                           srcdir->ino(), srcdn->name, destdirino, destname,
+                           inode_state);
+  mds->send_message_mds(m, destauth, MDS_PORT_CACHE);
+  
+  // have dest?
+  // NOTE(review): m is read below after it was handed to send_message_mds();
+  // if the send path takes ownership of the message this is a use-after-free.
+  // The destdirino/destname parameters look like they hold the same values -- confirm.
+  CInode *destdiri = cache->get_inode(m->get_destdirino());
+  CDir *destdir = 0;
+  if (destdiri) destdir = destdiri->dir;
+  CDentry *destdn = 0;
+  if (destdir) destdn = destdir->lookup(m->get_destname());
+
+  // discover src
+  // (what is actually traversed/discovered here is the destination path)
+  if (!destdn) {
+    dout(7) << "file_rename_foreign_src doesn't have destdn, discovering " << destpath << endl;
+
+    // NOTE(review): the already-sent MRename is also queued for retry here,
+    // and the inode state was already encoded for export above -- verify
+    // this path is what was intended.
+    filepath destfilepath = destpath;
+    vector<CDentry*> trace;
+    int r = cache->path_traverse(destfilepath, trace, true,
+								 m, new C_MDS_RetryMessage(mds, m), 
+								 MDS_TRAVERSE_DISCOVER);
+    assert(r>0);
+    return;
+  }
+
+  assert(destdn);
+
+  // update our cache
+  cache->rename_file(srcdn, destdn);
+  
+  // update imports/exports?
+  if (in->is_dir() && in->dir) 
+    fix_renamed_dir(srcdir, in, destdir, true);  // auth changed
+
+  srcdn->mark_dirty();
+
+  // proxy!
+  in->state_set(CINODE_STATE_PROXY);
+  in->get(CINODE_PIN_PROXY);
+  
+  // generate notify list (everybody but src|dst) and send warnings
+  set<int> notify;
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i != mds->get_nodeid() &&  // except the source
+        i != destauth)             // and the dest
+      notify.insert(i);
+  }
+  file_rename_warn(in, notify);
+
+
+  // wait for MRenameNotifyAck's
+  in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK,
+                 new C_MDC_RenameNotifyAck(this, in, initiator));
+}
+
+/** file_rename_warn
+ * record which nodes we expect an ack from for this inode, then send
+ * each of them an MRenameWarning.
+ */
+void Renamer::file_rename_warn(CInode *in,
+                               set<int>& notify)
+{
+  // remember the gather set, keyed by ino
+  rename_waiting_for_ack[in->ino()] = notify;
+
+  // one warning per node in the set
+  set<int>::iterator who = notify.begin();
+  while (who != notify.end()) {
+    dout(10) << "file_rename_warn to " << *who << " for " << *in << endl;
+    MRenameWarning *warning = new MRenameWarning(in->ino());
+    mds->send_message_mds(warning, *who, MDS_PORT_CACHE);
+    ++who;
+  }
+}
+
+
+/** handle_rename_notify_ack
+ * an ack for a rename on this inode arrived.  cross the sender off the
+ * gather set recorded by file_rename_warn(); once the set is empty,
+ * forget it and wake the CINODE_WAIT_RENAMENOTIFYACK waiters.
+ * the message is freed here.
+ */
+void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  dout(7) << "handle_rename_notify_ack on " << *in << endl;
+
+  int source = MSG_ADDR_NUM(m->get_source());
+
+  // look the gather set up once.  (the reference is not used again
+  // after the map entry is erased.)
+  set<int>& waiting = rename_waiting_for_ack[in->ino()];
+  waiting.erase(source);
+  if (waiting.empty()) {
+    // last one!
+    rename_waiting_for_ack.erase(in->ino());
+    in->finish_waiting(CINODE_WAIT_RENAMENOTIFYACK, 0);
+  } else {
+    dout(7) << "still waiting for " << waiting << endl;
+  }
+
+  // done with the message; it used to be leaked here
+  delete m;
+}
+
+
+/** file_rename_ack
+ * called once all the MNotifyAck's are in.  drops the proxy state (if
+ * we had it) and then either finishes the local RENAMEACK waiters or
+ * sends an MRenameAck to the initiator.
+ */
+void Renamer::file_rename_ack(CInode *in, int initiator) 
+{
+  // was i proxy (if not, it's cuz this was a local rename)
+  if (in->state_test(CINODE_STATE_PROXY)) {
+    dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl;
+    in->state_clear(CINODE_STATE_PROXY);
+    in->put(CINODE_PIN_PROXY);
+  }
+
+  // somebody else started this?  tell them.
+  if (initiator != mds->get_nodeid()) {
+    dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl;
+    MRenameAck *ack = new MRenameAck(in->ino());
+    mds->send_message_mds(ack, initiator, MDS_PORT_CACHE);
+    return;
+  }
+
+  // it's me, finish
+  dout(7) << "file_rename_ack i am initiator, finishing" << endl;
+  in->finish_waiting(CINODE_WAIT_RENAMEACK);
+}
+
+
+
+
+/************ dest *************/
+
+/** handle_rename_prep
+ * received by auth of dest dentry.  makes sure the src inode (and, for
+ * a directory, its dir) is open here, so exports etc can be updated
+ * properly once the inode and dir arrive.  then pins the src inode and
+ * sends an MRenameReq to the src auth.
+ *
+ * if anything has to be discovered or opened first, the message is
+ * handed to a C_MDS_RetryMessage and we return early.
+ */
+void Renamer::handle_rename_prep(MRenamePrep *m)
+{
+  // open src
+  vector<CDentry*> trace;
+  filepath srcpath = m->get_srcpath();
+  Context *retry = new C_MDS_RetryMessage(mds, m);
+  int r = cache->path_traverse(srcpath, trace, false,
+                               m, retry,
+                               MDS_TRAVERSE_DISCOVER);
+  if (r > 0) return;
+
+  // ok!
+  // NOTE(review): a negative r (or an empty trace) is not handled here -- confirm it cannot happen.
+  CInode *srcin = trace.back()->inode;
+  assert(srcin);
+
+  dout(7) << "handle_rename_prep have srcin " << *srcin << endl;
+
+  const bool is_dir = srcin->is_dir();
+  if (is_dir && !srcin->dir) {
+    dout(7) << "handle_rename_prep need to open dir" << endl;
+    cache->open_remote_dir(srcin,
+                           new C_MDS_RetryMessage(mds,m));
+    return;
+  }
+  if (is_dir) {
+    dout(7) << "handle_rename_prep have dir " << *srcin->dir << endl;
+  }
+
+  // pin
+  srcin->get(CINODE_PIN_RENAMESRC);
+
+  // build and send the rename request; we name ourselves as the dest
+  MRenameReq *req = new MRenameReq(m->get_initiator(),
+                                   m->get_srcdirino(), m->get_srcname(),
+                                   m->get_destdirino(), m->get_destname(), m->get_destpath(),
+                                   mds->get_nodeid());
+  mds->send_message_mds(req, m->get_srcauth(), MDS_PORT_CACHE);
+  delete m;
+}
+
+
+
+/** handle_rename
+ * received by auth of dest dentry.   includes exported inode info.
+ * dest may not have srcdir open.
+ *
+ * renames the replica into place, imports the inode state carried in
+ * the message onto the dest dentry, fixes up dir import/export state
+ * for directories, drops the RENAMESRC pin, and notifies every mds
+ * other than the sender and ourselves.  the message is freed here.
+ */
+void Renamer::handle_rename(MRename *m)
+{
+  // srcdn (required)
+  // check each pointer before it is used; the asserts used to come
+  // after ->name had already been read.
+  CInode *srcdiri = cache->get_inode(m->get_srcdirino());
+  assert(srcdiri);
+  CDir *srcdir = srcdiri->dir;
+  assert(srcdir);
+  CDentry *srcdn = srcdir->lookup(m->get_srcname());
+  assert(srcdn && srcdn->inode);
+  string srcname = srcdn->name;
+
+  dout(7) << "handle_rename srcdn " << *srcdn << endl;
+
+  // destdn (required).  i am auth, so i will have it.
+  CInode *destdiri = cache->get_inode(m->get_destdirino());
+  assert(destdiri);
+  CDir *destdir = destdiri->dir;
+  assert(destdir);
+  CDentry *destdn = destdir->lookup(m->get_destname());
+  assert(destdn);
+  string destname = destdn->name;
+  
+  dout(7) << "handle_rename destdn " << *destdn << endl;
+
+  // note old dir auth
+  int old_dir_auth = -1;
+  if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority();
+    
+  // rename replica into position
+  if (destdn->inode && destdn->inode->is_dirty())
+    destdn->inode->mark_clean();
+
+  cache->rename_file(srcdn, destdn);
+
+  // decode + import inode (into new location start)
+  int off = 0;
+  // HACK
+  bufferlist bufstate;
+  bufstate.claim_append(m->get_inode_state());
+  cache->migrator->decode_import_inode(destdn, bufstate, off, MSG_ADDR_NUM(m->get_source()));
+
+  CInode *in = destdn->inode;
+  assert(in);
+
+  // update imports/exports?
+  if (in->is_dir()) {
+    assert(in->dir);  // i had better already have it open.. see MRenamePrep
+    fix_renamed_dir(srcdir, in, destdir, true,  // auth changed
+                    old_dir_auth);              // src is possibly new dir auth.
+  }
+  
+  // mark dirty
+  destdn->mark_dirty();
+  in->mark_dirty();
+
+  // unpin
+  in->put(CINODE_PIN_RENAMESRC);
+
+  // ok, send notifies.
+  set<int> notify;
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i != MSG_ADDR_NUM(m->get_source()) &&  // except the source
+        i != mds->get_nodeid())  // and the dest
+      notify.insert(i);
+  }
+  file_rename_notify(in, srcdir, srcname, destdir, destname, notify, MSG_ADDR_NUM(m->get_source()));
+
+  delete m;
+}
+
+
+/** file_rename_notify
+ * send an MRenameNotify describing this rename (src dir + name, dest
+ * dir + path + name, and the src auth) to every node in the notify set.
+ */
+void Renamer::file_rename_notify(CInode *in, 
+                                 CDir *srcdir, string& srcname, CDir *destdir, string& destname,
+                                 set<int>& notify,
+                                 int srcauth)
+{
+  /* NOTE: notify list might include myself */
+
+  // each notify carries the dest dir's path; build it once
+  string destdirpath;
+  destdir->inode->make_path(destdirpath);
+
+  // tell
+  set<int>::iterator who = notify.begin();
+  while (who != notify.end()) {
+    dout(10) << "file_rename_notify to " << *who << " for " << *in << endl;
+    MRenameNotify *notice = new MRenameNotify(in->ino(),
+                                              srcdir->ino(),
+                                              srcname,
+                                              destdir->ino(),
+                                              destdirpath,
+                                              destname,
+                                              srcauth);
+    mds->send_message_mds(notice, *who, MDS_PORT_CACHE);
+    ++who;
+  }
+}
+
+
+
+/************** bystanders ****************/
+
+/** handle_rename_warning
+ * record that a rename warning was seen for this ino.  if the matching
+ * notify already arrived (and was parked in stray_rename_notifies),
+ * process it now and drop it from the parked map.  the warning message
+ * is freed here.
+ */
+void Renamer::handle_rename_warning(MRenameWarning *m)
+{
+  // add to warning list
+  stray_rename_warnings.insert( m->get_ino() );
+
+  // did i already see the notify?
+  const bool have_notify = stray_rename_notifies.count(m->get_ino()) != 0;
+  if (!have_notify) {
+    dout(7) << "handle_rename_warning on " << m->get_ino() << ".  waiting for notify." << endl;
+  } else {
+    // i did, we're good.
+    dout(7) << "handle_rename_warning on " << m->get_ino() << ".  already got notify." << endl;
+
+    MRenameNotify *parked = stray_rename_notifies[m->get_ino()];
+    handle_rename_notify(parked);
+    stray_rename_notifies.erase(m->get_ino());
+  }
+
+  // done
+  delete m;
+}
+
+/** handle_rename_notify
+ * bystander handler (Renamer.h: "dest -> bystanders").  applies a rename
+ * to whatever part of it we have cached, then acks to the src auth.
+ *
+ *  - no warning recorded for this ino yet: park the message in
+ *    stray_rename_notifies and return (m is kept, not freed).
+ *  - have srcdn and destdir: make a null, xlocked destdn if there is
+ *    none; if the src dentry has an inode, rename it in the cache and
+ *    call fix_renamed_dir() for an open directory.
+ *  - have srcdn but no destdir: open or discover the dest dir and
+ *    return without acking; m is handed to a C_MDS_RetryMessage.
+ *  - have destdn only: unlink the inode it holds, if any.
+ *  - have neither: nothing to update.
+ *
+ * except for the two early returns, an MRenameNotifyAck is sent to
+ * m->get_srcauth(), the warning is erased, and m is freed.
+ */
+void Renamer::handle_rename_notify(MRenameNotify *m)
+{
+  // FIXME: when we do hard links, i think we need to 
+  // have srcdn and destdn both, or neither,  always!
+
+  // did i see the warning yet?
+  if (!stray_rename_warnings.count(m->get_ino())) {
+    // wait for it.
+    dout(7) << "handle_rename_notify on " << m->get_ino() << ", waiting for warning." << endl;
+    stray_rename_notifies[m->get_ino()] = m;  // handle_rename_warning() passes it back to us later
+    return;
+  }
+
+  dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl;
+  
+  // src
+  CInode *srcdiri = cache->get_inode(m->get_srcdirino());
+  CDir *srcdir = 0;
+  if (srcdiri) srcdir = srcdiri->dir;
+  CDentry *srcdn = 0;
+  if (srcdir) srcdn = srcdir->lookup(m->get_srcname());
+
+  // dest
+  CInode *destdiri = cache->get_inode(m->get_destdirino());
+  CDir *destdir = 0;
+  if (destdiri) destdir = destdiri->dir;
+  CDentry *destdn = 0;
+  if (destdir) destdn = destdir->lookup(m->get_destname());
+
+  // have both?
+  list<Context*> finished;  // NOTE(review): nothing in this function adds to this list
+  if (srcdn && destdir) {
+    CInode *in = srcdn->inode;
+
+    int old_dir_auth = -1;
+    if (in && in->dir) old_dir_auth = in->dir->authority();
+
+    if (!destdn) {
+      destdn = destdir->add_dentry(m->get_destname());  // create null dentry
+      destdn->lockstate = DN_LOCK_XLOCK;                // that's xlocked!
+    }
+
+    dout(7) << "handle_rename_notify renaming " << *srcdn << " to " << *destdn << endl;
+    
+    if (in) {
+      cache->rename_file(srcdn, destdn);
+
+      // update imports/exports?
+      if (in && in->is_dir() && in->dir) {  // 'in' is already known to be non-null here
+        fix_renamed_dir(srcdir, in, destdir, false, old_dir_auth);  // auth didnt change
+      }
+    } else {
+      dout(7) << " i don't have the inode (just null dentries)" << endl;
+    }
+    
+  }
+
+  else if (srcdn) {
+    dout(7) << "handle_rename_notify no dest, but have src" << endl;
+    dout(7) << "srcdn is " << *srcdn << endl;
+
+    if (destdiri) {
+      dout(7) << "have destdiri, opening dir " << *destdiri << endl;
+      cache->open_remote_dir(destdiri,
+							 new C_MDS_RetryMessage(mds,m));
+    } else {
+      filepath destdirpath = m->get_destdirpath();
+      dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl;
+      
+      vector<CDentry*> trace;
+      int r = cache->path_traverse(destdirpath, trace, true,
+								   m, new C_MDS_RetryMessage(mds, m), 
+								   MDS_TRAVERSE_DISCOVER);
+      assert(r>0);  // the traverse must not complete right away here
+    }
+    return;  // no ack yet, and the warning stays recorded
+  }
+
+  else if (destdn) {
+    dout(7) << "handle_rename_notify unlinking dst only " << *destdn << endl;
+    if (destdn->inode) {
+      destdir->unlink_inode(destdn);
+    }
+  }
+  
+  else {
+    dout(7) << "handle_rename_notify didn't have srcdn or destdn" << endl;
+    assert(srcdn == 0 && destdn == 0);
+  }
+  
+  mds->queue_finished(finished);  // 'finished' is always empty at this point
+
+
+  // ack
+  dout(10) << "sending RenameNotifyAck back to srcauth " << m->get_srcauth() << endl;
+  MRenameNotifyAck *ack = new MRenameNotifyAck(m->get_ino());
+  mds->send_message_mds(ack, m->get_srcauth(), MDS_PORT_CACHE);
+  
+
+  stray_rename_warnings.erase( m->get_ino() );
+  delete m;
+}
+
+
+
+
diff --git a/branches/sage/cephmds2/mds/Renamer.h b/branches/sage/cephmds2/mds/Renamer.h
new file mode 100644
index 0000000000000..1005971df986f
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Renamer.h
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MDS_RENAMER_H
+#define __MDS_RENAMER_H
+
+#include "include/types.h"
+
+#include <map>
+#include <set>
+using std::map;
+using std::set;
+
+class MDS;
+class MDCache;
+class CDentry;
+class CInode;
+class CDir;
+
+class Message;
+class MRenameWarning;
+class MRenameNotify;
+class MRenameNotifyAck;
+class MRename;
+class MRenamePrep;
+class MRenameReq;
+class MRenameAck;
+/** Renamer
+ * rename handling for the mds, split by the role a node plays in a
+ * given rename: initiator, src, dest, or bystander (see the section
+ * comments below for who sends what to whom).
+ *
+ * NOTE(review): 'string' and 'Context' are used below but are not
+ * declared in this header; presumably they come in via
+ * include/types.h -- confirm.
+ */
+class Renamer {
+  MDS *mds;
+  MDCache *cache;
+
+  // rename fun
+  set<inodeno_t>                    stray_rename_warnings; // notifies i haven't seen
+  map<inodeno_t, MRenameNotify*>    stray_rename_notifies; // notifies that arrived before their warning, by ino
+
+  map<inodeno_t, set<int> >         rename_waiting_for_ack; // by ino: nodes we still await an MRenameNotifyAck from
+
+
+
+  void fix_renamed_dir(CDir *srcdir,
+                       CInode *in,
+                       CDir *destdir,
+                       bool authchanged,   // _inode_ auth changed
+                       int dirauth=-1);    // dirauth (for certain cases)
+  
+
+public:
+  Renamer(MDS *m, MDCache *c) : mds(m), cache(c) {}
+  
+  void dispatch(Message *m);
+
+  // RENAME
+  // initiator
+ public:
+  void file_rename(CDentry *srcdn, CDentry *destdn, Context *c);
+ protected:
+  void handle_rename_ack(MRenameAck *m);              // dest -> init (almost always)
+  void file_rename_finish(CDir *srcdir, CInode *in, Context *c);
+  friend class C_MDC_RenameAck;
+
+  // src
+  void handle_rename_req(MRenameReq *m);              // dest -> src
+  void file_rename_foreign_src(CDentry *srcdn, 
+                               inodeno_t destdirino, string& destname, string& destpath, int destauth, 
+                               int initiator);
+  void file_rename_warn(CInode *in, set<int>& notify);
+  void handle_rename_notify_ack(MRenameNotifyAck *m); // bystanders -> src
+  void file_rename_ack(CInode *in, int initiator);
+  friend class C_MDC_RenameNotifyAck;
+
+  // dest
+  void handle_rename_prep(MRenamePrep *m);            // init -> dest
+  void handle_rename(MRename *m);                     // src -> dest
+  void file_rename_notify(CInode *in, 
+                          CDir *srcdir, string& srcname, CDir *destdir, string& destname,
+                          set<int>& notify, int srcauth);
+
+  // bystander
+  void handle_rename_warning(MRenameWarning *m);      // src -> bystanders
+  void handle_rename_notify(MRenameNotify *m);        // dest -> bystanders
+
+
+};
+
+#endif
+
+
diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc
new file mode 100644
index 0000000000000..28ebb826e1a3a
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Server.cc
@@ -0,0 +1,2151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDS.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "Migrator.h"
+#include "MDBalancer.h"
+#include "Renamer.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MClientMount.h"
+#include "messages/MClientMountAck.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientReply.h"
+#include "messages/MHashReaddir.h"
+#include "messages/MHashReaddirReply.h"
+
+#include "messages/MLock.h"
+
+#include "messages/MInodeLink.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EMknod.h"
+#include "events/EMkdir.h"
+
+#include "include/filepath.h"
+#include "common/Timer.h"
+#include "common/Logger.h"
+#include "common/LogType.h"
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include <list>
+#include <iostream>
+using namespace std;
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << ".server "
+#define  derr(l)    if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << ".server "
+
+
+/*
+ * entry point for messages addressed to the server.
+ * if the mds is not active yet the message is queued for retry;
+ * otherwise it is routed by type.  an unknown type is a bug.
+ */
+void Server::dispatch(Message *m) 
+{
+  // active?
+  if (!mds->is_active()) {
+    dout(3) << "not active yet, waiting" << endl;
+    mds->queue_waitfor_active(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  switch (m->get_type()) {
+    // mount / unmount
+  case MSG_CLIENT_MOUNT:
+    handle_client_mount((MClientMount*)m);
+    break;
+  case MSG_CLIENT_UNMOUNT:
+    handle_client_unmount(m);
+    break;
+
+    // client requests
+  case MSG_CLIENT_REQUEST:
+    handle_client_request((MClientRequest*)m);
+    break;
+
+    // hashed readdir traffic between mds's
+  case MSG_MDS_HASHREADDIR:
+    handle_hash_readdir((MHashReaddir*)m);
+    break;
+  case MSG_MDS_HASHREADDIRREPLY:
+    handle_hash_readdir_reply((MHashReaddirReply*)m);
+    break;
+
+  default:
+    dout(1) << " main unknown message " << m->get_type() << endl;
+    assert(0);
+  }
+}
+
+
+
+
+
+/*
+ * a client is mounting: record it in the client map and reply with an
+ * MClientMountAck built from the request and the mds and osd maps.
+ * the request is freed here.
+ */
+void Server::handle_client_mount(MClientMount *m)
+{
+  const int client = MSG_ADDR_NUM(m->get_source());
+  dout(3) << "mount by client" << client << endl;
+  mds->clientmap.add_mount(client, m->get_source_inst());
+
+  assert(whoami == 0);  // mds0 mounts/unmounts
+
+  // ack
+  MClientMountAck *ack = new MClientMountAck(m, mds->mdsmap, mds->osdmap);
+  messenger->send_message(ack,
+                          m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+/*
+ * a client is unmounting: drop it from the client map, start shutdown
+ * if that was the last mounted client, and ack by sending the very
+ * same message back to the client.
+ */
+void Server::handle_client_unmount(Message *m)
+{
+  const int client = MSG_ADDR_NUM(m->get_source());
+  dout(3) << "unmount by client" << client << endl;
+
+  assert(whoami == 0);  // mds0 mounts/unmounts
+
+  mds->clientmap.rem_mount(client);
+
+  const bool nobody_left = mds->clientmap.get_mount_set().empty();
+  if (nobody_left) {
+    dout(3) << "all clients done, initiating shutdown" << endl;
+    mds->shutdown_start();
+  }
+
+  // ack by sending back to client.
+  // the source inst is copied first, since m itself is what we send.
+  entity_inst_t srcinst = m->get_source_inst();
+  messenger->send_message(m, m->get_source(), srcinst);
+}
+
+
+
+/*******
+ * some generic stuff for finishing off requests
+ */
+
+/** C_MDS_CommitRequest
+ * context that finishes off a client request.  on a nonzero result the
+ * code is stored in the reply first.  then, if an event was supplied,
+ * the request goes through Server::commit_request(); if not, the reply
+ * is sent directly with Server::reply_request().
+ */
+
+class C_MDS_CommitRequest : public Context {
+  Server *server;
+  MClientRequest *req;
+  MClientReply *reply;
+  CInode *tracei;    // inode to include a trace for
+  LogEvent *event;
+
+public:
+  C_MDS_CommitRequest(Server *server,
+                      MClientRequest *req, MClientReply *reply, CInode *tracei, 
+                      LogEvent *event=0)
+    : server(server),
+      req(req),
+      reply(reply),
+      tracei(tracei),
+      event(event) {
+  }
+
+  void finish(int r) {
+    // failure?  set failure code in the reply.
+    if (r != 0) {
+      reply->set_result(r);
+    }
+
+    if (!event) {
+      // nothing to journal; just reply.
+      server->reply_request(req, reply, tracei);
+      return;
+    }
+    server->commit_request(req, reply, tracei, event);
+  }
+};
+
+
+/*
+ * send generic response (just an error code):
+ * wraps r in a fresh MClientReply and sends that.
+ */
+void Server::reply_request(MClientRequest *req, int r, CInode *tracei)
+{
+  MClientReply *reply = new MClientReply(req, r);
+  reply_request(req, reply, tracei);
+}
+
+
+/*
+ * send the given reply to the client that issued req.
+ * if tracei is set, a trace to that inode is attached to the reply.
+ * the request is then finished in the cache and the op counter bumped.
+ */
+void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei)
+{
+  dout(10) << "reply_request r=" << reply->get_result() << " " << *req << endl;
+
+  // include trace
+  if (tracei) {
+    reply->set_trace_dist( tracei, whoami );
+  }
+
+  // send reply
+  messenger->send_message(reply,
+                          MSG_ADDR_CLIENT(req->get_client()),
+                          req->get_client_inst());
+
+  // discard request
+  mdcache->request_finish(req);
+
+  // stupid stats crap (FIXME)
+  stat_ops++;
+}
+
+
+/*
+ * submit the given event(s) to the metadata journal and reply.
+ *
+ * in "safe" mode (g_conf.mds_log_before_reply and g_conf.mds_log both
+ * set, and a first event given) the reply is deferred until the log has
+ * synced; otherwise the reply goes out right away.
+ */
+void Server::commit_request(MClientRequest *req,
+                         MClientReply *reply,
+                         CInode *tracei,
+                         LogEvent *event,
+                         LogEvent *event2) 
+{
+  // log
+  if (event) mdlog->submit_entry(event);
+  if (event2) mdlog->submit_entry(event2);
+
+  const bool safe_mode =
+    g_conf.mds_log_before_reply && g_conf.mds_log && event;
+
+  if (!safe_mode) {
+    // just reply
+    reply_request(req, reply, tracei);
+    return;
+  }
+
+  // SAFE mode!
+
+  // pin inode so it doesn't go away!
+  if (tracei) mdcache->request_pin_inode(req, tracei);
+
+  // wait for log sync
+  mdlog->wait_for_sync(new C_MDS_CommitRequest(this, req, reply, tracei));
+}
+
+
+
+/***
+ * process a client request
+ *
+ * rough flow, as implemented below:
+ *  - if the message came straight from a client, remember the client's
+ *    inst on the request and clear its payload.
+ *  - drop the request if the mds is not active; if the root inode is
+ *    not in cache, open it and retry the request.
+ *  - RELEASE, FSYNC, and TRUNCATE-with-an-ino look the inode up by ino.
+ *    if we don't have it, the request is passed to the next mds
+ *    (whoami+1, wrapping to 0).
+ *  - everything else traverses the request's filepath.  for ops that
+ *    act on a name inside a directory (create-style OPEN, MKNOD, MKDIR,
+ *    SYMLINK, LINK, UNLINK, RMDIR, RENAME) the last path component is
+ *    removed first, so the reference inode is the containing dir.
+ *  - -ENOENT / -ENOTDIR / -EISDIR from the traverse are sent to the
+ *    client as the reply; after that the ".ceph.*" debug hack may run,
+ *    and the request is freed.
+ *  - otherwise the request is registered with the cache and passed to
+ *    dispatch_request() together with the reference inode.
+ *
+ * NOTE(review): follow_trailing_symlink starts out false and is only
+ * ever assigned false, so the traverse is always asked not to follow.
+ * NOTE(review): in the debug hack, 'dir' is taken from an inode's ->dir
+ * and dereferenced without a null check -- confirm it is always open.
+ */
+
+void Server::handle_client_request(MClientRequest *req)
+{
+  dout(4) << "req " << *req << endl;
+
+  // note original client addr
+  if (req->get_source().is_client()) {
+    req->set_client_inst( req->get_source_inst() );
+    req->clear_payload();
+  }
+
+  if (!mds->is_active()) {
+    dout(5) << " not active, discarding client request." << endl;
+    delete req;
+    return;
+  }
+  
+  if (!mdcache->get_root()) {
+    dout(5) << "need to open root" << endl;
+    mdcache->open_root(new C_MDS_RetryMessage(mds, req));
+    return;
+  }
+
+  // okay, i want
+  CInode           *ref = 0;
+  vector<CDentry*> trace;      // might be blank, for fh guys
+
+  bool follow_trailing_symlink = false;  // NOTE(review): never set to true in this function
+
+  // operations on fh's or other non-files
+  switch (req->get_op()) {
+    /*
+  case MDS_OP_FSTAT:
+    reply = handle_client_fstat(req, cur);
+    break; ****** fiX ME ***
+    */
+    
+  case MDS_OP_TRUNCATE:
+    if (!req->get_ino()) break;   // can be called w/ either fh OR path
+    // have an ino: fall through and look it up
+  case MDS_OP_RELEASE:
+  case MDS_OP_FSYNC:
+    ref = mdcache->get_inode(req->get_ino());   // fixme someday no ino needed?
+
+    if (!ref) {
+      int next = whoami + 1;
+      if (next >= mds->mdsmap->get_num_mds()) next = 0;
+      dout(10) << "got request on ino we don't have, passing buck to " << next << endl;
+      mds->send_message_mds(req, next, MDS_PORT_SERVER);
+      return;
+    }
+  }
+
+  if (!ref) {
+    // we need to traverse a path
+    filepath refpath = req->get_filepath();
+    
+    // ops on non-existing files --> directory paths
+    switch (req->get_op()) {
+    case MDS_OP_OPEN:
+      if (!(req->get_iarg() & O_CREAT)) break;
+      // O_CREAT open: fall through, treated like the ops below
+    case MDS_OP_MKNOD:
+    case MDS_OP_MKDIR:
+    case MDS_OP_SYMLINK:
+    case MDS_OP_LINK:
+    case MDS_OP_UNLINK:   // also wrt parent dir, NOT the unlinked inode!!
+    case MDS_OP_RMDIR:
+    case MDS_OP_RENAME:
+      // remove last bit of path
+      refpath = refpath.prefixpath(refpath.depth()-1);
+      break;
+    }
+    dout(10) << "refpath = " << refpath << endl;
+    
+    Context *ondelay = new C_MDS_RetryMessage(mds, req);
+    
+    if (req->get_op() == MDS_OP_LSTAT) {
+      follow_trailing_symlink = false;
+    }
+
+    // do trace
+    int r = mdcache->path_traverse(refpath, trace, follow_trailing_symlink,
+                                   req, ondelay,
+                                   MDS_TRAVERSE_FORWARD,
+                                   0,
+                                   true); // is MClientRequest
+    
+    if (r > 0) return; // delayed
+    if (r == -ENOENT ||
+        r == -ENOTDIR ||
+        r == -EISDIR) {
+      // error! 
+      dout(10) << " path traverse error " << r << ", replying" << endl;
+      
+      // send error
+      messenger->send_message(new MClientReply(req, r),
+                              MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst());
+
+      // <HACK>
+      // is this a special debug command?
+      if (refpath.depth() - 1 == trace.size() &&
+	  refpath.last_bit().find(".ceph.") == 0) {
+	CDir *dir = 0;
+	if (trace.empty())
+	  dir = mdcache->get_root()->dir;
+	else
+	  dir = trace[trace.size()-1]->get_inode()->dir;
+
+	dout(1) << "** POSSIBLE CEPH DEBUG COMMAND '" << refpath.last_bit() << "' in " << *dir << endl;
+
+	if (refpath.last_bit() == ".ceph.hash" &&
+	    refpath.depth() > 1) {
+	  dout(1) << "got explicit hash command " << refpath << endl;
+	  CDir *dir = trace[trace.size()-1]->get_inode()->dir;  // shadows the outer 'dir'; same expression as its non-empty-trace case
+	  if (!dir->is_hashed() &&
+	      !dir->is_hashing() &&
+	      dir->is_auth())
+	    mdcache->migrator->hash_dir(dir);
+	}
+	else if (refpath.last_bit() == ".ceph.commit") {
+	  dout(1) << "got explicit commit command on  " << *dir << endl;
+	  mds->mdstore->commit_dir(dir, 0);
+	}
+      }
+      // </HACK>
+
+
+      delete req;
+      return;
+    }
+    
+    if (trace.size()) 
+      ref = trace[trace.size()-1]->inode;
+    else
+      ref = mdcache->get_root();
+  }
+  
+  dout(10) << "ref is " << *ref << endl;
+  
+  // rename doesn't pin src path (initially)
+  if (req->get_op() == MDS_OP_RENAME) trace.clear();
+
+  // register
+  if (!mdcache->request_start(req, ref, trace))
+    return;
+  
+  // process
+  dispatch_request(req, ref);
+}
+
+
+
+/*
+ * route a message that already has its reference inode.
+ * an MLock goes to the locker's dentry lock handler; a client request
+ * is handed to the handle_client_*() method matching its op.
+ * any other message type, or an op without a handler, is a bug.
+ */
+void Server::dispatch_request(Message *m, CInode *ref)
+{
+  MClientRequest *req = 0;
+
+  // MLock or MClientRequest?
+  /* this is a little weird.
+     client requests and mlocks both initial dentry xlocks, path pins, etc.,
+     and thus both make use of the context C_MDS_RetryRequest.
+  */
+  const int type = m->get_type();
+  if (type == MSG_MDS_LOCK) {
+    mds->locker->handle_lock_dn((MLock*)m);
+    return; // done
+  }
+  if (type == MSG_CLIENT_REQUEST) {
+    req = (MClientRequest*)m;
+  } else {
+    assert(0);  // shouldn't get here
+  }
+
+  // MClientRequest.
+  // (MDS_OP_FSYNC and MDS_OP_RELEASE have no handler here; they hit the default case.)
+  switch (req->get_op()) {
+
+    // files
+  case MDS_OP_OPEN:
+    if (req->get_iarg() & O_CREAT) {
+      handle_client_openc(req, ref);
+    } else {
+      handle_client_open(req, ref);
+    }
+    break;
+  case MDS_OP_TRUNCATE:
+    handle_client_truncate(req, ref);
+    break;
+
+    // inodes
+  case MDS_OP_STAT:
+  case MDS_OP_LSTAT:
+    handle_client_stat(req, ref);
+    break;
+  case MDS_OP_UTIME:
+    handle_client_utime(req, ref);
+    break;
+  case MDS_OP_CHMOD:
+    handle_client_chmod(req, ref);
+    break;
+  case MDS_OP_CHOWN:
+    handle_client_chown(req, ref);
+    break;
+
+    // namespace
+  case MDS_OP_READDIR:
+    handle_client_readdir(req, ref);
+    break;
+  case MDS_OP_MKNOD:
+    handle_client_mknod(req, ref);
+    break;
+  case MDS_OP_MKDIR:
+    handle_client_mkdir(req, ref);
+    break;
+  case MDS_OP_SYMLINK:
+    handle_client_symlink(req, ref);
+    break;
+  case MDS_OP_LINK:
+    handle_client_link(req, ref);
+    break;
+  case MDS_OP_UNLINK:
+  case MDS_OP_RMDIR:
+    // both go through the unlink handler
+    handle_client_unlink(req, ref);
+    break;
+  case MDS_OP_RENAME:
+    handle_client_rename(req, ref);
+    break;
+
+  default:
+    dout(1) << " unknown client op " << req->get_op() << endl;
+    assert(0);
+  }
+}
+
+
+
+
+// STAT
+
+/*
+ * stat/lstat.  if the requested mask includes size or mtime, a file
+ * read is started on the inode first (returning early if that does not
+ * succeed immediately) and finished right away.  then reply with a
+ * trace to ref.
+ */
+void Server::handle_client_stat(MClientRequest *req,
+				CInode *ref)
+{
+  // do I need file info?
+  const int wanted = req->get_iarg();
+  const bool need_file_info = (wanted & (INODE_MASK_SIZE|INODE_MASK_MTIME)) != 0;
+  if (need_file_info) {
+    // yes.  do a full stat.
+    if (!mds->locker->inode_file_read_start(ref, req)) {
+      return;  // syncing
+    }
+    mds->locker->inode_file_read_finish(ref);
+  }
+
+  mds->balancer->hit_inode(ref, META_POP_IRD);
+
+  // reply
+  dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl;
+  reply_request(req, new MClientReply(req), ref);
+}
+
+
+
+// INODE UPDATES
+
+// utime
+
+/*
+ * utime: set mtime and atime on the inode from the request's two time
+ * args, mark the inode dirty if we are auth, and commit with an
+ * EInodeUpdate.
+ */
+void Server::handle_client_utime(MClientRequest *req,
+                              CInode *cur)
+{
+  // write
+  const bool started = mds->locker->inode_file_write_start(cur, req);
+  if (!started) {
+    return;  // fw or (wait for) sync
+  }
+
+  // do update
+  cur->inode.mtime = req->get_targ();
+  cur->inode.atime = req->get_targ2();
+  if (cur->is_auth()) {
+    cur->mark_dirty();
+  }
+
+  mds->locker->inode_file_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // init reply
+  MClientReply *reply = new MClientReply(req, 0);
+  reply->set_result(0);
+
+  // commit
+  LogEvent *update = new EInodeUpdate(cur);
+  commit_request(req, reply, cur, update);
+}
+
+                           
+
+// HARD
+
+// chmod
+
+/*
+ * chmod: replace the low mode bits (mask 04777) of the inode with the
+ * ones from the request, mark the inode dirty, and commit with an
+ * EInodeUpdate.  no permission check is made here.
+ */
+void Server::handle_client_chmod(MClientRequest *req,
+                              CInode *cur)
+{
+  // write
+  const bool started = mds->locker->inode_hard_write_start(cur, req);
+  if (!started) {
+    return;  // fw or (wait for) lock
+  }
+
+  // check permissions
+  //  (nothing yet)
+
+  // do update: drop the old bits under the mask, then add the new ones
+  int mode = req->get_iarg();
+  cur->inode.mode &= ~04777;
+  cur->inode.mode |= (mode & 04777);
+  cur->mark_dirty();
+
+  mds->locker->inode_hard_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // reply + commit
+  MClientReply *reply = new MClientReply(req, 0);
+  LogEvent *update = new EInodeUpdate(cur);
+  commit_request(req, reply, cur, update);
+}
+
+// chown
+
+/*
+ * chown: set uid and gid on the inode from the request's two int args,
+ * mark the inode dirty, and commit with an EInodeUpdate.
+ * no permission check is made here.
+ */
+void Server::handle_client_chown(MClientRequest *req,
+                              CInode *cur)
+{
+  // write
+  const bool started = mds->locker->inode_hard_write_start(cur, req);
+  if (!started) {
+    return;  // fw or (wait for) lock
+  }
+
+  // check permissions
+  //  (nothing yet)
+
+  // do update
+  int new_uid = req->get_iarg();
+  int new_gid = req->get_iarg2();
+  cur->inode.uid = new_uid;
+  cur->inode.gid = new_gid;
+  cur->mark_dirty();
+
+  mds->locker->inode_hard_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // reply + commit
+  MClientReply *reply = new MClientReply(req, 0);
+  LogEvent *update = new EInodeUpdate(cur);
+  commit_request(req, reply, cur, update);
+}
+
+
+
+/*
+ * make sure in's dir is open.
+ * returns true once get_or_open_dir() has been called.  returns false
+ * if the dir is not open and the inode is a frozen dir; in that case a
+ * retry of the request is queued on the parent dir's unfreeze.
+ */
+bool Server::try_open_dir(CInode *in, MClientRequest *req)
+{
+  const bool must_wait = !in->dir && in->is_frozen_dir();
+  if (!must_wait) {
+    in->get_or_open_dir(mds);
+    return true;
+  }
+
+  // doh!
+  dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl;
+  assert(in->get_parent_dir());
+  in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE,
+                                   new C_MDS_RetryRequest(mds, req, in));
+  return false;
+}
+
+
+// DIRECTORY and NAMESPACE OPS
+
+// READDIR
+
+/*
+ * append one (InodeStat, name) pair to inls/dnls for every dentry in
+ * dir that has an inode.  for a hashed dir, only dentries that hash to
+ * this mds are included.  returns the number of entries added.
+ */
+int Server::encode_dir_contents(CDir *dir, 
+                             list<InodeStat*>& inls,
+                             list<string>& dnls)
+{
+  int added = 0;
+
+  CDir_map_t::iterator p = dir->begin();
+  for (; p != dir->end(); ++p) {
+    // hashed?  then skip names that belong to another mds.
+    if (dir->is_hashed() &&
+        whoami != mds->hash_dentry( dir->ino(), p->first )) {
+      continue;
+    }
+
+    CDentry *dn = p->second;
+
+    // is dentry readable?
+    if (dn->is_xlocked()) {
+      // ***** FIXME *****
+      // ?
+      dout(10) << "warning, returning xlocked dentry, we _may_ be fudging on POSIX consistency" << endl;
+    }
+
+    CInode *in = dn->inode;
+    if (in) {
+      dout(12) << "including inode " << *in << endl;
+
+      // add this item
+      // note: InodeStat makes note of whether inode data is readable.
+      dnls.push_back( p->first );
+      inls.push_back( new InodeStat(in, whoami) );
+      added++;
+    }
+    // (no inode: null dentry, nothing to add)
+  }
+  return added;
+}
+
+
+/*
+ * note: this is pretty sloppy, but should work just fine i think...
+ *
+ * another mds asks for our part of a hashed directory.
+ * the dir must be open and hashed here.  if its contents are not
+ * complete yet, fetch them and retry the message.  otherwise encode our
+ * entries and send them back in an MHashReaddirReply.
+ * the request is freed once the reply has been sent.
+ */
+void Server::handle_hash_readdir(MHashReaddir *m)
+{
+  CInode *cur = mdcache->get_inode(m->get_ino());
+  assert(cur);
+
+  if (!cur->dir ||
+      !cur->dir->is_hashed()) {
+    assert(0);
+    dout(7) << "handle_hash_readdir don't have dir open, or not hashed.  giving up!" << endl;
+    delete m;
+    return;    
+  }
+  CDir *dir = cur->dir;
+  assert(dir);
+  assert(dir->is_hashed());
+
+  // complete?
+  if (!dir->is_complete()) {
+    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl;
+    mds->mdstore->fetch_dir(dir, new C_MDS_RetryMessage(mds, m));
+    return;
+  }  
+  
+  // get content
+  list<InodeStat*> inls;
+  list<string> dnls;
+  int num = encode_dir_contents(dir, inls, dnls);
+  
+  // sent it back!
+  messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num),
+                          m->get_source(), m->get_source_inst(), MDS_PORT_CACHE);
+
+  // done with the request; it used to be leaked here
+  delete m;
+}
+
+
+// Handle one peer's MHashReaddirReply during a hashed-readdir gather.
+// The reply's inode stats and names are moved into dir->hashed_readdir[from];
+// once the map has an entry for every mds in the mdsmap, the waiters for this
+// gather are finished, the gathered results are freed, and the dir is unpinned.
+void Server::handle_hash_readdir_reply(MHashReaddirReply *m)
+{
+  CInode *cur = mdcache->get_inode(m->get_ino());
+  assert(cur);
+
+  if (!cur->dir || !cur->dir->is_hashed()) {
+    dout(7) << "handle_hash_readdir_reply don't have dir open, or not hashed.  giving up!" << endl;
+    assert(0);
+    delete m;
+    return;
+  }
+  CDir *dir = cur->dir;
+
+  // move items to hashed_readdir gather
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(dir->hashed_readdir.count(from) == 0);   // no earlier entry for this sender
+  pair< list<InodeStat*>, list<string> >& bits = dir->hashed_readdir[from];
+  bits.first.splice(bits.first.begin(), m->get_in());
+  bits.second.splice(bits.second.begin(), m->get_dn());
+  delete m;
+
+  // gather finished?
+  if (dir->hashed_readdir.size() < (unsigned)mds->mdsmap->get_num_mds()) {
+    dout(7) << "still waiting for more hashed readdir bits" << endl;
+    return;
+  }
+
+  dout(7) << "got last bit!  finishing waiters" << endl;
+
+  // do these finishers.  they'll copy the results.
+  list<Context*> finished;
+  dir->take_waiting(CDIR_WAIT_THISHASHEDREADDIR, finished);
+  finish_contexts(finished);
+
+  // now discard these results: free each gathered InodeStat, then the map
+  for (map<int, pair< list<InodeStat*>, list<string> > >::iterator it = dir->hashed_readdir.begin();
+       it != dir->hashed_readdir.end();
+       ++it) {
+    for (list<InodeStat*>::iterator ci = it->second.first.begin();
+         ci != it->second.first.end();
+         ++ci)
+      delete *ci;
+  }
+  dir->hashed_readdir.clear();
+
+  // unpin dir (we're done!)
+  dir->auth_unpin();
+
+  // trigger any waiters for next hashed readdir cycle
+  dir->take_waiting(CDIR_WAIT_NEXTHASHEDREADDIR, mds->finished_queue);
+}
+
+
+// Context that completes a hashed readdir: finish() passes the saved request
+// and dir to Server::finish_hash_readdir(); the result code r is not used.
+class C_MDS_HashReaddir : public Context {
+  Server *server;
+  MClientRequest *req;
+  CDir *dir;
+public:
+  C_MDS_HashReaddir(Server *server, MClientRequest *req, CDir *dir)
+    : server(server), req(req), dir(dir) {}
+
+  void finish(int r) {
+    server->finish_hash_readdir(req, dir);
+  }
+};
+
+// Build and send the client reply for a completed hashed-readdir gather by
+// copying each mds's gathered items out of dir->hashed_readdir.
+void Server::finish_hash_readdir(MClientRequest *req, CDir *dir) 
+{
+  dout(7) << "finish_hash_readdir on " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->hashed_readdir.size() == (unsigned)mds->mdsmap->get_num_mds());
+
+  MClientReply *reply = new MClientReply(req);
+  reply->set_result(0);
+
+  // walk mds numbers 0..num_mds-1; the gather map is keyed by mds number
+  int rank = 0;
+  while (rank < mds->mdsmap->get_num_mds()) {
+    pair< list<InodeStat*>, list<string> >& part = dir->hashed_readdir[rank++];
+    reply->copy_dir_items(part.first, part.second);
+  }
+  reply_request(req, reply, dir->inode);
+}
+
+// Serve a client READDIR on 'cur': reply directly for a non-hashed dir, or start a cross-MDS gather for a hashed one.
+void Server::handle_client_readdir(MClientRequest *req,
+                                CInode *cur)
+{
+  // it's a directory, right?
+  if (!cur->is_dir()) {
+    // not a dir
+    dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl;
+    reply_request(req, -ENOTDIR);
+    return;
+  }
+
+  // auth?  if not, forward; the open dir's authority (if any) overrides the inode's
+  if (!cur->dir_is_auth()) {
+    int dirauth = cur->authority();
+    if (cur->dir)
+      dirauth = cur->dir->authority();
+    assert(dirauth >= 0);
+    assert(dirauth != whoami);
+    
+    // forward to authority
+    dout(10) << " forwarding readdir to authority " << dirauth << endl;
+    mdcache->request_forward(req, dirauth);
+    return;
+  }
+  
+  if (!try_open_dir(cur, req))
+    return;
+  assert(cur->dir->is_auth());
+
+  // unhashing?  wait!  (the request is retried from the UNFREEZE waiter)
+  if (cur->dir->is_hashed() &&
+      cur->dir->is_unhashing()) {
+    dout(10) << "unhashing, waiting" << endl;
+    cur->dir->add_waiter(CDIR_WAIT_UNFREEZE,
+                         new C_MDS_RetryRequest(mds, req, cur));
+    return;
+  }
+
+  // check perm: the hard read lock is taken and released right away
+  if (!mds->locker->inode_hard_read_start(cur,req))
+    return;
+  mds->locker->inode_hard_read_finish(cur);
+
+  CDir *dir = cur->dir;
+  assert(dir);
+
+  if (!dir->is_complete()) {
+    // fetch the dir contents, then retry this request
+    dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl;
+    mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, cur));
+    return;
+  }
+
+  if (dir->is_hashed()) {
+    // HASHED
+    dout(7) << "hashed dir" << endl;
+    if (!dir->can_auth_pin()) {
+      dout(7) << "can't auth_pin dir " << *dir << " waiting" << endl;
+      dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur));
+      return;
+    }
+
+    if (!dir->hashed_readdir.empty()) {
+      dout(7) << "another readdir gather in progres, waiting" << endl;
+      dir->add_waiter(CDIR_WAIT_NEXTHASHEDREADDIR, new C_MDS_RetryRequest(mds, req, cur));
+      return;
+    }
+
+    // start new readdir gather
+    dout(7) << "staring new hashed readdir gather" << endl;
+
+    // pin auth for process!  (handle_hash_readdir_reply() unpins when the gather ends)
+    dir->auth_pin();
+    
+    // get local bits; they go straight into this mds's slot of the gather map
+    encode_dir_contents(cur->dir, 
+                        dir->hashed_readdir[whoami].first,
+                        dir->hashed_readdir[whoami].second);
+    
+    // request other bits from every other mds
+    for (int i=0; i<mds->mdsmap->get_num_mds(); i++) {
+      if (i == whoami) continue;
+      mds->send_message_mds(new MHashReaddir(dir->ino()), i, MDS_PORT_SERVER);
+    }
+
+    // wait: this waiter is finished by handle_hash_readdir_reply() on the last reply
+    dir->add_waiter(CDIR_WAIT_THISHASHEDREADDIR, 
+                    new C_MDS_HashReaddir(this, req, dir));
+  } else {
+    // NON-HASHED
+    // build dir contents
+    list<InodeStat*> inls;
+    list<string> dnls;
+    int numfiles = encode_dir_contents(cur->dir, inls, dnls);
+    
+    // . too
+    dnls.push_back(".");
+    inls.push_back(new InodeStat(cur, whoami));
+    ++numfiles;
+
+    // yay, reply
+    MClientReply *reply = new MClientReply(req);
+    reply->take_dir_items(inls, dnls, numfiles);
+    
+    dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl;
+    reply->set_result(0);
+    
+    //balancer->hit_dir(cur->dir);
+    
+    // reply
+    reply_request(req, reply, cur);
+  }
+}
+
+
+// MKNOD
+
+// MKNOD: create the dentry+inode via mknod(), force the inode's type bits to
+// INODE_MODE_FILE, and commit the request with an EMknod event.
+void Server::handle_client_mknod(MClientRequest *req, CInode *ref)
+{
+  // make dentry and inode, link.  on 0, mknod() has already dealt with req.
+  CInode *created = mknod(req, ref);
+  if (created == 0)
+    return;
+
+  // it's a file!  take the mode from the request, replace its type bits
+  created->inode.mode = (req->get_iarg() & ~INODE_TYPE_MASK) | INODE_MODE_FILE;
+
+  mds->balancer->hit_inode(created, META_POP_IWR);
+
+  MClientReply *reply = new MClientReply(req, 0);
+  commit_request(req, reply, ref, new EMknod(created));
+}
+
+// mknod(): used by handle_client_mkdir, handle_client_mknod, which are mostly identical.
+// Returns the new inode (or, with okexist, the one already linked), or 0 once req was answered, forwarded or deferred.
+CInode *Server::mknod(MClientRequest *req, CInode *diri, bool okexist) 
+{
+  dout(10) << "mknod " << req->get_filepath() << " in " << *diri << endl;
+
+  // get containing directory (without last bit); dirpath is only used for logging below
+  filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1);
+  string name = req->get_filepath().last_bit();
+  
+  // did we get to parent?
+  dout(10) << "dirpath is " << dirpath << " depth " << dirpath.depth() << endl;
+
+  // make sure parent is a dir?
+  if (!diri->is_dir()) {
+    dout(7) << "not a dir" << endl;
+    reply_request(req, -ENOTDIR);
+    return 0;
+  }
+
+  // am i not open, not auth?  then forward to the inode's authority
+  if (!diri->dir && !diri->is_auth()) {
+    int dirauth = diri->authority();
+    dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl;
+    mdcache->request_forward(req, dirauth);
+    return 0;
+  }
+  
+  if (!try_open_dir(diri, req)) return 0;
+  CDir *dir = diri->dir;
+  
+  // make sure it's my dentry
+  int dnauth = dir->dentry_authority(name);  
+  if (dnauth != whoami) {
+    // fw
+    
+    dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir << " dn " << name << " not mine, fw to " << dnauth << endl;
+    mdcache->request_forward(req, dnauth);
+    return 0;
+  }
+  // ok, done passing buck.
+
+
+  // frozen?  wait for unfreeze, then retry
+  if (dir->is_frozen()) {
+    dout(7) << "dir is frozen " << *dir << endl;
+    dir->add_waiter(CDIR_WAIT_UNFREEZE,
+                    new C_MDS_RetryRequest(mds, req, diri));
+    return 0;
+  }
+
+  // make sure name doesn't already exist
+  CDentry *dn = dir->lookup(name);
+  if (dn) {
+    if (!dn->can_read(req)) {
+      dout(10) << "waiting on (existing!) dentry " << *dn << endl;
+      dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri));
+      return 0;
+    }
+
+    if (!dn->is_null()) {
+      // name already exists
+      if (okexist) {
+        dout(10) << "dentry " << name << " exists in " << *dir << endl;
+        return dn->inode;
+      } else {
+        dout(10) << "dentry " << name << " exists in " << *dir << endl;
+        reply_request(req, -EEXIST);
+        return 0;
+      }
+    }
+  }
+
+  // make sure dir is complete (reached only when no non-null dentry was found above)
+  if (!dir->is_complete()) {
+    dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl;
+    mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, diri));
+    return 0;
+  }
+
+  // create!  owner comes from the caller, all three timestamps are set to now
+  CInode *newi = mdcache->create_inode();
+  newi->inode.uid = req->get_caller_uid();
+  newi->inode.gid = req->get_caller_gid();
+  newi->inode.ctime = newi->inode.mtime = newi->inode.atime = g_clock.gettime();   // now
+
+  // link: add a new dentry, or reuse the existing null dentry
+  if (!dn) 
+    dn = dir->add_dentry(name, newi);
+  else
+    dir->link_inode(dn, newi);
+  
+  // bump modify pop
+  mds->balancer->hit_dir(dir, META_POP_DWR);
+  
+  // mark dirty
+  dn->mark_dirty();
+  newi->mark_dirty();
+  
+  // journal it (disabled here; handle_client_mknod passes an EMknod to commit_request)
+  //mdlog->submit_entry(new EMknod(newi));
+
+  // ok!
+  return newi;
+}
+
+
+// LINK
+
+// Completion context for the link-target traverse.  'trace' is public because
+// handle_client_link() passes it to path_traverse(); finish() forwards it.
+class C_MDS_LinkTraverse : public Context {
+  Server *server;
+  MClientRequest *req;
+  CInode *ref;
+public:
+  vector<CDentry*> trace;
+
+  C_MDS_LinkTraverse(Server *server, MClientRequest *req, CInode *ref)
+    : server(server), req(req), ref(ref) {}
+  void finish(int r) {
+    server->handle_client_link_2(r, req, ref, trace);
+  }
+};
+
+// LINK, first stage: check the new name (last bit of the request's filepath)
+// in directory inode 'ref', then start a traverse to the link target given by
+// the request's sarg.  C_MDS_LinkTraverse continues in handle_client_link_2().
+void Server::handle_client_link(MClientRequest *req, CInode *ref)
+{
+  // figure out name
+  string dname = req->get_filepath().last_bit();
+  dout(7) << "dname is " << dname << endl;
+  
+  // make sure parent is a dir?  (-ENOTDIR, matching mknod() and unlink)
+  if (!ref->is_dir()) {
+    dout(7) << "not a dir " << *ref << endl;
+    reply_request(req, -ENOTDIR);
+    return;
+  }
+
+  // am i not open, not auth?
+  if (!ref->dir && !ref->is_auth()) {
+    int dirauth = ref->authority();
+    dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl;
+    mdcache->request_forward(req, dirauth);
+    return;
+  }
+  
+  if (!try_open_dir(ref, req)) return;
+  CDir *dir = ref->dir;
+  dout(7) << "handle_client_link dir is " << *dir << endl;
+
+  // make sure it's my dentry
+  int dauth = dir->dentry_authority(dname);  
+  if (dauth != whoami) {
+    // fw
+    dout(7) << "link on " << req->get_path() << ", dn " << dname << " in " << *dir << " not mine, fw to " << dauth << endl;
+    mdcache->request_forward(req, dauth);
+    return;
+  }
+  // ok, done passing buck.
+
+  // exists?  (a dentry xlocked by another request also counts as existing)
+  CDentry *dn = dir->lookup(dname);
+  if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) {
+    dout(7) << "handle_client_link dn exists " << *dn << endl;
+    reply_request(req, -EEXIST);
+    return;
+  }
+
+  // keep src dir in memory
+  mdcache->request_pin_dir(req, dir);
+
+  // discover link target
+  filepath target = req->get_sarg();
+
+  dout(7) << "handle_client_link discovering target " << target << endl;
+
+  C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref);
+  Context *ondelay = new C_MDS_RetryRequest(mds, req, ref);
+  
+  mdcache->path_traverse(target, onfinish->trace, false,
+                         req, ondelay,
+                         MDS_TRAVERSE_DISCOVER,  //XLOCK, 
+                         onfinish);
+}
+
+
+// Waiter registered under CINODE_WAIT_LINK by handle_client_link_2() after it
+// sends MInodeLink.  finish(r): r > 0 completes the link, r < 0 fails the
+// request with r, and r == 0 is not expected (asserts, then retries).
+class C_MDS_RemoteLink : public Context {
+  Server *server;
+  MClientRequest *req;
+  CInode *ref;
+  CDentry *dn;
+  CInode *targeti;
+public:
+  C_MDS_RemoteLink(Server *server, MClientRequest *req, CInode *ref, CDentry *dn, CInode *targeti)
+    : server(server), req(req), ref(ref), dn(dn), targeti(targeti) {}
+
+  void finish(int r) {
+    if (r < 0) {
+      // link failed
+      server->reply_request(req, r);
+      return;
+    }
+    if (r > 0) {
+      // success: yay
+      server->handle_client_link_finish(req, ref, dn, targeti);
+      return;
+    }
+    // r == 0: huh?  retry!
+    assert(0);
+    server->dispatch_request(req, ref);      
+  }
+};
+// LINK, second stage (called with the target traverse result r): xlock the new dentry, then bump the target's nlink.
+void Server::handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector<CDentry*>& trace)
+{
+  // target dne?
+  if (r < 0) {
+    dout(7) << "target " << req->get_sarg() << " dne" << endl;
+    reply_request(req, r);
+    return;
+  }
+  assert(r == 0);
+
+  CInode *targeti = mdcache->get_root();   // an empty trace means the target is the root
+  if (trace.size()) targeti = trace[trace.size()-1]->inode;
+  assert(targeti);
+
+  // dir?  linking to a directory is refused
+  dout(7) << "target is " << *targeti << endl;
+  if (targeti->is_dir()) {
+    dout(7) << "target is a dir, failing" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+  
+  // keep target inode in memory
+  mdcache->request_pin_inode(req, targeti);
+
+  dout(7) << "dir is " << *ref << endl;
+
+  // xlock the dentry
+  CDir *dir = ref->dir;
+  assert(dir);
+  
+  string dname = req->get_filepath().last_bit();
+  int dauth = dir->dentry_authority(dname);
+  if (whoami != dauth) {
+    // ugh, exported out from under us
+    dout(7) << "ugh, forwarded out from under us, dentry auth is " << dauth << endl;
+    mdcache->request_forward(req, dauth);
+    return;
+  }
+  
+  CDentry *dn = dir->lookup(dname);
+  if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) {
+    dout(7) << "handle_client_link dn exists " << *dn << endl;
+    reply_request(req, -EEXIST);
+    return;
+  }
+
+  if (!dn) dn = dir->add_dentry(dname);
+  
+  if (!dn->is_xlockedbyme(req)) {
+    if (!mds->locker->dentry_xlock_start(dn, req, ref)) {
+      if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn);   // drop the unused null dentry again
+      return;
+    }
+  }
+
+  // ok xlocked!  local target: nlink++ here; remote target: ask its authority
+  
+  // (an un-anchored local target is anchored first, and the request retried)
+  if (targeti->is_auth()) {
+    // mine
+    if (targeti->is_anchored()) {
+      dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl;
+    } else {
+      assert(targeti->inode.nlink == 1);
+      dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl;
+      
+      mdcache->anchor_inode(targeti,
+                            new C_MDS_RetryRequest(mds, req, ref));
+      return;
+    }
+
+    // ok, inc link!
+    targeti->inode.nlink++;
+    dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl;
+    targeti->mark_dirty();
+    
+  } else {
+    // remote: send nlink++ request, wait
+    dout(7) << "target is remote, sending InodeLink" << endl;
+    mds->send_message_mds(new MInodeLink(targeti->ino(), whoami), targeti->authority(), MDS_PORT_CACHE);
+    
+    // wait; C_MDS_RemoteLink calls handle_client_link_finish() on success
+    targeti->add_waiter(CINODE_WAIT_LINK,
+                        new C_MDS_RemoteLink(this, req, ref, dn, targeti));
+    return;
+  }
+
+  handle_client_link_finish(req, ref, dn, targeti);
+}
+
+// Final step of LINK: point dn at targeti as a remote link, mark it dirty,
+// and commit the request with a success reply.
+void Server::handle_client_link_finish(MClientRequest *req, CInode *ref, CDentry *dn, CInode *targeti)
+{
+  CDir *parent = dn->dir;
+  // create remote link (by ino), then attach the in-memory inode since we have it
+  parent->link_inode(dn, targeti->ino());
+  dn->link_remote(targeti);
+  dn->mark_dirty();
+  mds->balancer->hit_dir(parent, META_POP_DWR);
+
+  MClientReply *reply = new MClientReply(req, 0);
+  commit_request(req, reply, ref, 0);   // FIXME i should log something
+}
+
+
+// UNLINK
+
+// UNLINK / RMDIR: remove the dentry named by the last bit of the request's
+// filepath from directory inode 'diri' (rmdir when the op is MDS_OP_RMDIR).
+void Server::handle_client_unlink(MClientRequest *req, CInode *diri)
+{
+  // rmdir or unlink
+  bool rmdir = (req->get_op() == MDS_OP_RMDIR);
+  
+  // find it
+  if (req->get_filepath().depth() == 0) {
+    dout(7) << "can't rmdir root" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+  string name = req->get_filepath().last_bit();
+  
+  // make sure parent is a dir?
+  if (!diri->is_dir()) {
+    dout(7) << "not a dir" << endl;
+    reply_request(req, -ENOTDIR);
+    return;
+  }
+
+  // am i not open, not auth?
+  if (!diri->dir && !diri->is_auth()) {
+    int dirauth = diri->authority();
+    dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl;
+    mdcache->request_forward(req, dirauth);
+    return;
+  }
+  
+  if (!try_open_dir(diri, req)) return;
+  CDir *dir = diri->dir;
+  int dnauth = dir->dentry_authority(name);  
+
+  // does it exist?
+  CDentry *dn = dir->lookup(name);
+  if (!dn) {
+    if (dnauth == whoami) {
+      dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl;
+      reply_request(req, -ENOENT);
+    } else {
+      // send to authority!
+      dout(7) << "handle_client_rmdir/unlink fw, don't have " << name << " in " << *dir << endl;
+      mdcache->request_forward(req, dnauth);
+    }
+    return;
+  }
+
+  // have it.  locked?
+  if (!dn->can_read(req)) {
+    dout(10) << " waiting on " << *dn << endl;
+    dir->add_waiter(CDIR_WAIT_DNREAD,
+                    name,
+                    new C_MDS_RetryRequest(mds, req, diri));
+    return;
+  }
+
+  // null?
+  if (dn->is_null()) {
+    dout(10) << "unlink on null dn " << *dn << endl;
+    reply_request(req, -ENOENT);
+    return;
+  }
+
+  // ok!
+  CInode *in = dn->inode;
+  assert(in);
+  if (rmdir) {
+    dout(7) << "handle_client_rmdir on dir " << *in << endl;
+  } else {
+    dout(7) << "handle_client_unlink on non-dir " << *in << endl;
+  }
+
+  // dir stuff 
+  if (in->is_dir()) {
+    if (rmdir) {
+      // rmdir
+      
+      // open dir?
+      if (in->is_auth() && !in->dir) {
+        if (!try_open_dir(in, req)) return;
+      }
+
+      // not dir auth?  (or not open, which implies the same!)
+      if (!in->dir) {
+        dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to dn auth " << dnauth << endl;
+        mdcache->request_forward(req, dnauth);
+        return;
+      }
+      if (!in->dir->is_auth()) {
+        int dirauth = in->dir->authority();
+        dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dirauth << endl;
+        mdcache->request_forward(req, dirauth);
+        return;
+      }
+
+      assert(in->dir);
+      assert(in->dir->is_auth());
+
+      // dir size check on dir auth (but not necessarily dentry auth)?
+
+      // should be empty; an incomplete dir must be fetched before its size can be trusted
+      if (in->dir->get_size() == 0 && !in->dir->is_complete()) {
+        dout(7) << "handle_client_rmdir on dir " << *in->dir << ", empty but not complete, fetching" << endl;
+        mds->mdstore->fetch_dir(in->dir, 
+				new C_MDS_RetryRequest(mds, req, diri));
+        return;
+      }
+      if (in->dir->get_size() > 0) {
+        dout(7) << "handle_client_rmdir on dir " << *in->dir << ", not empty" << endl;
+        reply_request(req, -ENOTEMPTY);
+        return;
+      }
+        
+      dout(7) << "handle_client_rmdir dir is empty!" << endl;
+
+      // export sanity check
+      if (!in->is_auth()) {
+        // i should be exporting this now/soon, since the dir is empty.
+        dout(7) << "handle_client_rmdir dir is auth, but not inode." << endl;
+        if (!in->dir->is_freezing() && in->dir->is_frozen()) {   // NOTE(review): is_frozen() vs. the "already exporting" else branch -- confirm intent
+          assert(in->dir->is_import());
+          mdcache->migrator->export_empty_import(in->dir);          
+        } else {
+          dout(7) << "apparently already exporting" << endl;
+        }
+        in->dir->add_waiter(CDIR_WAIT_UNFREEZE,
+                            new C_MDS_RetryRequest(mds, req, diri));
+        return;
+      }
+
+    } else {
+      // unlink
+      dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl;
+      reply_request(req, -EISDIR);
+      return;
+    }
+  } else {
+    if (rmdir) {
+      // rmdir on something that is not a directory
+      dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl;
+      reply_request(req, -ENOTDIR);
+      return;
+    }
+  }
+
+  // am i dentry auth?
+  if (dnauth != whoami) {
+    // not auth; forward!
+    dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << dnauth << endl;
+    mdcache->request_forward(req, dnauth);
+    return;
+  }
+    
+  dout(7) << "handle_client_unlink/rmdir on " << *in << endl;
+  
+  // xlock dentry
+  if (!mds->locker->dentry_xlock_start(dn, req, diri))
+    return;
+
+  // is this a remote link?  NOTE(review): assert(in) above already requires dn->inode -- confirm this is reachable
+  if (dn->is_remote() && !dn->inode) {
+    CInode *remotei = mdcache->get_inode(dn->get_remote_ino());
+    if (remotei) {
+      dn->link_remote(remotei);
+    } else {
+      // open inode
+      dout(7) << "opening target inode first, ino is " << dn->get_remote_ino() << endl;
+      mdcache->open_remote_ino(dn->get_remote_ino(), req, 
+                               new C_MDS_RetryRequest(mds, req, diri));
+      return;
+    }
+  }
+
+    
+  mds->balancer->hit_dir(dn->dir, META_POP_DWR);
+
+  // it's locked, unlink!
+  MClientReply *reply = new MClientReply(req,0);
+  mdcache->dentry_unlink(dn,
+                         new C_MDS_CommitRequest(this, req, reply, diri,
+                                                 new EInodeUpdate(diri))); // FIXME WRONG EVENT
+  return;
+}
+
+
+
+
+
+
+// RENAME
+
+// Completion context for the destination traverse of a rename.  It carries the
+// source-side state found by handle_client_rename() and, in finish(), passes
+// it together with the traverse result r to handle_client_rename_2().
+class C_MDS_RenameTraverseDst : public Context {
+  Server *server;
+  MClientRequest *req;
+  CInode *ref;
+  CInode *srcdiri;
+  CDir *srcdir;
+  CDentry *srcdn;
+  filepath destpath;   // copied from the constructor argument
+public:
+  // public: handle_client_rename() passes this to path_traverse()
+  vector<CDentry*> trace;
+
+  C_MDS_RenameTraverseDst(Server *server, MClientRequest *req, CInode *ref,
+                          CInode *srcdiri, CDir *srcdir, CDentry *srcdn,
+                          filepath& destpath)
+    : server(server),
+      req(req),
+      ref(ref),
+      srcdiri(srcdiri),
+      srcdir(srcdir),
+      srcdn(srcdn),
+      destpath(destpath) {}
+
+  void finish(int r) {
+    server->handle_client_rename_2(req, ref,
+				   srcdiri, srcdir, srcdn, destpath,
+				   trace, r);
+  }
+};
+
+
+/*
+ * RENAME, first stage: sanity-check the paths, re-traverse to the source
+ * directory, find the source dentry, then start the destination traverse.
+ * weirdness with rename:
+ *   - ref inode is what was originally srcdiri, but that may change by the time
+ *     the rename actually happens.  for all practical purposes, ref is useless
+ *     except for C_MDS_RetryRequest.
+ */
+void Server::handle_client_rename(MClientRequest *req,
+                               CInode *ref)
+{
+  dout(7) << "handle_client_rename on " << *req << endl;
+
+  // sanity checks
+  if (req->get_filepath().depth() == 0) {
+    dout(7) << "can't rename root" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+  // mv a/b a/b/c  -- meaningless (dest starts with the source path followed by '/')
+  if (req->get_sarg().compare( 0, req->get_path().length(), req->get_path()) == 0 &&
+      req->get_sarg().c_str()[ req->get_path().length() ] == '/') {
+    dout(7) << "can't rename to underneath myself" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // mv blah blah  -- also meaningless
+  if (req->get_sarg() == req->get_path()) {
+    dout(7) << "can't rename something to itself (or into itself)" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+  
+  // traverse to source
+  /*
+    this is abnormal, just for rename.  since we don't pin source path 
+    (because we don't want to screw up the lock ordering) the ref inode 
+    (normally/initially srcdiri) may move, and this may fail.
+ -> so, re-traverse path.  and make sure we request_finish in the case of a forward!
+   */
+  filepath refpath = req->get_filepath();
+  string srcname = refpath.last_bit();
+  refpath = refpath.prefixpath(refpath.depth()-1);
+
+  dout(7) << "handle_client_rename src traversing to srcdir " << refpath << endl;
+  vector<CDentry*> trace;
+  int r = mdcache->path_traverse(refpath, trace, true,
+                                 req, new C_MDS_RetryRequest(mds, req, ref),
+                                 MDS_TRAVERSE_FORWARD);
+  if (r == 2) {
+    dout(7) << "path traverse forwarded, ending request, doing manual request_cleanup" << endl;
+    dout(7) << "(pseudo) request_forward to 9999 req " << *req << endl;
+    mdcache->request_cleanup(req);  // not _finish (deletes) or _forward (path_traverse did that)
+    return;
+  }
+  if (r > 0) return;
+  if (r < 0) {   // dne or something.  got renamed out from under us, probably!
+    dout(7) << "traverse r=" << r << endl;
+    reply_request(req, r);
+    return;
+  }
+  
+  CInode *srcdiri;
+  if (trace.size()) 
+    srcdiri = trace[trace.size()-1]->inode;
+  else
+    srcdiri = mdcache->get_root();
+
+  dout(7) << "handle_client_rename srcdiri is " << *srcdiri << endl;
+
+  dout(7) << "handle_client_rename srcname is " << srcname << endl;
+
+  // make sure parent is a dir?
+  if (!srcdiri->is_dir()) {
+    dout(7) << "srcdiri not a dir " << *srcdiri << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // am i not open, not auth?
+  if (!srcdiri->dir && !srcdiri->is_auth()) {
+    int dirauth = srcdiri->authority();
+    dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl;
+    mdcache->request_forward(req, dirauth);
+    return;
+  }
+  
+  if (!try_open_dir(srcdiri, req)) return;
+  CDir *srcdir = srcdiri->dir;
+  dout(7) << "handle_client_rename srcdir is " << *srcdir << endl;
+  
+  // make sure it's my dentry
+  int srcauth = srcdir->dentry_authority(srcname);  
+  if (srcauth != whoami) {
+    // fw
+    dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl;
+    mdcache->request_forward(req, srcauth);
+    return;
+  }
+  // ok, done passing buck.
+
+  // src dentry
+  CDentry *srcdn = srcdir->lookup(srcname);
+
+  // xlocked?
+  if (srcdn && !srcdn->can_read(req)) {
+    dout(10) << " waiting on " << *srcdn << endl;
+    srcdir->add_waiter(CDIR_WAIT_DNREAD,
+                       srcname,
+                       new C_MDS_RetryRequest(mds, req, srcdiri));
+    return;
+  }
+  
+  if ((srcdn && !srcdn->inode) ||
+      (!srcdn && srcdir->is_complete())) {
+    dout(10) << "handle_client_rename src dne " << endl;
+    reply_request(req, -ENOENT);   // the source is missing: "no such entry", not "exists"
+    return;
+  }
+  
+  if (!srcdn && !srcdir->is_complete()) {
+    dout(10) << "readding incomplete dir" << endl;
+    mds->mdstore->fetch_dir(srcdir,
+			    new C_MDS_RetryRequest(mds, req, srcdiri));
+    return;
+  }
+  assert(srcdn && srcdn->inode);
+
+
+  dout(10) << "handle_client_rename srcdn is " << *srcdn << endl;
+  dout(10) << "handle_client_rename srci is " << *srcdn->inode << endl;
+
+  // pin src in cache (so it won't expire)
+  mdcache->request_pin_inode(req, srcdn->inode);
+  
+  // find the destination, normalize
+  // discover, etc. on the way... just get it on the local node.
+  filepath destpath = req->get_sarg();   
+
+  C_MDS_RenameTraverseDst *onfinish = new C_MDS_RenameTraverseDst(this, req, ref, srcdiri, srcdir, srcdn, destpath);
+  Context *ondelay = new C_MDS_RetryRequest(mds, req, ref);
+  
+  /*
+   * use DISCOVERXLOCK mode:
+   *   the dest may not exist, and may be xlocked from a remote host
+   *   we want to succeed if we find the xlocked dentry
+   * ??  (the call below actually passes MDS_TRAVERSE_DISCOVER)
+   */
+  mdcache->path_traverse(destpath, onfinish->trace, false,
+                         req, ondelay,
+                         MDS_TRAVERSE_DISCOVER,  //XLOCK, 
+                         onfinish);
+}
+
+// RENAME, second stage (called with the destination traverse result r and
+// trace): work out the destination dir and name, reject invalid
+// source/destination combinations, then hand off to
+// handle_client_rename_local().  destpath may be extended in place.
+void Server::handle_client_rename_2(MClientRequest *req, CInode *ref,
+                                 CInode *srcdiri, CDir *srcdir, CDentry *srcdn,
+                                 filepath& destpath, vector<CDentry*>& trace,
+                                 int r)
+{
+  dout(7) << "handle_client_rename_2 on " << *req << endl;
+  dout(12) << " r = " << r << " trace depth " << trace.size() << "  destpath depth " << destpath.depth() << endl;
+
+  CInode *srci = srcdn->inode;
+  assert(srci);
+  CDir*  destdir = 0;
+  string destname;
+  
+  // what is the dest?  (dir or file or complete filename)
+  // note: trace includes root, destpath doesn't (include leading /)
+  if (trace.size() && trace[trace.size()-1]->inode == 0) {
+    dout(10) << "dropping null dentry from tail of trace" << endl;
+    trace.pop_back();    // drop it!
+  }
+  
+  CInode *d;
+  if (trace.size()) 
+    d = trace[trace.size()-1]->inode;
+  else
+    d = mdcache->get_root();
+  assert(d);
+  dout(10) << "handle_client_rename_2 traced to " << *d << ", trace size = " << trace.size() << ", destpath = " << destpath.depth() << endl;
+  
+  // make sure i can open the dir?
+  if (d->is_dir() && !d->dir_is_auth() && !d->dir) {
+    // discover it
+    mdcache->open_remote_dir(d,
+                             new C_MDS_RetryRequest(mds, req, ref));
+    return;
+  }
+
+  if (trace.size() == destpath.depth()) {
+    if (d->is_dir()) {
+      // mv /some/thing /to/some/dir 
+      if (!try_open_dir(d, req)) return;
+      destdir = d->dir;                           // /to/some/dir
+      destname = req->get_filepath().last_bit();  // thing
+      destpath.add_dentry(destname);
+    } else {
+      // mv /some/thing /to/some/existing_filename
+      destdir = trace[trace.size()-1]->dir;       // /to/some
+      destname = destpath.last_bit();             // existing_filename
+    }
+  }
+  else if (trace.size() == destpath.depth()-1) {
+    if (d->is_dir()) {
+      // mv /some/thing /to/some/place_that_maybe_dne     (we might be replica)
+      if (!try_open_dir(d, req)) return;
+      destdir = d->dir;                  // /to/some
+      destname = destpath.last_bit();    // place_that_MAYBE_dne
+    } else {
+      dout(7) << "dest dne" << endl;
+      reply_request(req, -EINVAL);
+      return;
+    }
+  }
+  else {
+    assert(trace.size() < destpath.depth()-1);
+    // check traverse return value
+    if (r > 0) {
+      return;  // discover, readdir, etc.
+    }
+
+    // ??
+    assert(r < 0 || trace.size() == 0);  // musta been an error
+
+    // error out
+    dout(7) << " rename dest " << destpath << " dne" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  string srcpath = req->get_path();
+  dout(10) << "handle_client_rename_2 srcpath " << srcpath << endl;
+  dout(10) << "handle_client_rename_2 destpath " << destpath << endl;
+
+  // src == dest?
+  if (srcdn->get_dir() == destdir && srcdn->name == destname) {
+    dout(7) << "rename src=dest, same file " << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // does destination exist?  (is this an overwrite?)
+  CDentry *destdn = destdir->lookup(destname);
+  CInode  *oldin = 0;
+  if (destdn) {
+    oldin = destdn->get_inode();
+    
+    if (oldin) {
+      // make sure it's also a file!
+      // this can happen, e.g. "mv /some/thing /a/dir" where /a/dir/thing exists and is a dir.
+      if (oldin->is_dir()) {
+        // fail!
+        dout(7) << "dest exists and is dir" << endl;
+        reply_request(req, -EISDIR);
+        return;
+      }
+
+      if (srcdn->inode->is_dir() &&
+          !oldin->is_dir()) {
+        dout(7) << "cannot overwrite non-directory with directory" << endl;
+        reply_request(req, -ENOTDIR);   // dir over non-dir is ENOTDIR; EISDIR is the reverse case
+        return;
+      }
+    }
+
+    dout(7) << "dest exists " << *destdn << endl;
+    if (destdn->get_inode()) {
+      dout(7) << "destino is " << *destdn->get_inode() << endl;
+    } else {
+      dout(7) << "dest dn is a NULL stub" << endl;
+    }
+  } else {
+    dout(7) << "dest dn dne (yet)" << endl;
+  }
+  
+
+  // local or remote?
+  int srcauth = srcdir->dentry_authority(srcdn->name);
+  int destauth = destdir->dentry_authority(destname);
+  dout(7) << "handle_client_rename_2 destname " << destname << " destdir " << *destdir << " auth " << destauth << endl;
+  
+  // the two cases only differ in logging; both fall through to the same call below
+  if (srcauth != whoami || 
+      destauth != whoami) {
+    dout(7) << "rename has remote dest " << destauth << endl;
+    dout(7) << "FOREIGN RENAME" << endl;
+    
+    // punt?  (disabled by the constant false)
+    if (false && srcdn->inode->is_dir()) {
+      reply_request(req, -EINVAL);  
+      return; 
+    }
+
+  } else {
+    dout(7) << "rename is local" << endl;
+  }
+
+  handle_client_rename_local(req, ref,
+                             srcpath, srcdiri, srcdn, 
+                             destpath.get_path(), destdir, destdn, destname);
+  return;
+}
+
+
+
+
+void Server::handle_client_rename_local(MClientRequest *req,
+                                     CInode *ref,
+                                     string& srcpath,
+                                     CInode *srcdiri,
+                                     CDentry *srcdn,
+                                     string& destpath,
+                                     CDir *destdir,
+                                     CDentry *destdn,
+                                     string& destname)
+{
+  // Take xlocks on the src and dest dentries (one at a time, in lexicographic
+  // path order), then hand the rename to mdcache->renamer->file_rename().
+    /* overkill warning: lock w/ everyone for simplicity.  FIXME someday!  along with the foreign rename crap!
+       i could limit this to cases where something beneath me is exported.
+       could possibly limit the list.    (maybe.)
+       Underlying constraint is that, regardless of the order i do the xlocks, and whatever
+       imports/exports might happen in the process, the destdir _must_ exist on any node
+       importing something beneath me when rename finishes, or else mayhem ensues when
+       their import is dangling in the cache.
+     */
+    /*
+      having made a proper mess of this on the first pass, here is my plan:
+      
+      - xlocks of src, dest are done in lex order
+      - xlock is optional.. if you have the dentry, lock it, if not, don't.
+      - if you discover an xlocked dentry, you get the xlock.
+
+      possible trouble:
+      - you have an import beneath the source, and don't have the dest dir.
+        - when the actual rename happens, you discover the dest
+        - actually, do this on any open dir, so we don't detach whole swaths
+          of our cache.
+      
+      notes:
+      - xlocks are initiated from authority, as are discover_replies, so replicas are 
+        guaranteed to either not have dentry, or to have it xlocked. 
+      - 
+      - foreign xlocks are eventually unraveled by the initiator on success or failure.
+
+      todo to make this work:
+      - hose bool everybody param crap
+      /- make handle_lock_dn not discover, clean up cases
+      /- put dest path in MRenameNotify
+      /- make rename_notify discover if its a dir
+      /  - this will catch nested imports too, obviously
+      /- notify goes to merged list on local rename
+      /- notify goes to everybody on a foreign rename 
+      /- handle_notify needs to gracefully ignore spurious notifies
+    */
+  // (The commented-out "everybody" code that used to sit here did the xlocks
+  //  with _all_ nodes; the todo list above still mentions hosing that param.)
+  //
+
+  bool srclocal = srcdn->dir->dentry_authority(srcdn->name) == whoami;   // true when this MDS is the authority for the src dentry
+  bool destlocal = destdir->dentry_authority(destname) == whoami;        // likewise for the dest name (destdn itself may still be null)
+
+  dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl;
+  if (destdn) {
+    dout(7) << "handle_client_rename_local: dest local=" << destlocal << " " << *destdn << endl;
+  } else {
+    dout(7) << "handle_client_rename_local: dest local=" << destlocal << " dn dne yet" << endl;
+  }
+
+  /* lock source and dest dentries, in lexicographic order.
+   * each pass returns early if its xlock is not held yet (remote case: after queueing a C_MDS_RetryRequest). */
+  bool dosrc = srcpath < destpath;   // string compare: true means the src path sorts first and is locked on the first pass
+  for (int i=0; i<2; i++) {          // two passes, one per dentry; dosrc is flipped at the bottom
+    if (dosrc) {
+
+      // src
+      if (srclocal) {
+        if (!srcdn->is_xlockedbyme(req) &&
+            !mds->locker->dentry_xlock_start(srcdn, req, ref))
+          return;  
+      } else {
+        if (!srcdn || srcdn->xlockedby != req) {   // NOTE(review): srcdn was already dereferenced above, so the null test looks dead
+          mds->locker->dentry_xlock_request(srcdn->dir, srcdn->name, false, req, new C_MDS_RetryRequest(mds, req, ref));
+          return;
+        }
+      }
+      dout(7) << "handle_client_rename_local: srcdn is xlock " << *srcdn << endl;
+      
+    } else {
+
+      if (destlocal) {
+        // dest
+        if (!destdn) destdn = destdir->add_dentry(destname);   // no dentry yet: add one so there is something to xlock
+        if (!destdn->is_xlockedbyme(req) &&
+            !mds->locker->dentry_xlock_start(destdn, req, ref)) {
+          if (destdn->is_clean() && destdn->is_null() && destdn->is_sync()) destdir->remove_dentry(destdn);   // back out a clean, null, sync dentry before returning
+          return;
+        }
+      } else {
+        if (!destdn || destdn->xlockedby != req) {
+          /* NOTE: require that my xlocked item be a leaf/file, NOT a dir.  in case
+           * my traverse and determination of dest vs dest/srcfilename was out of date.
+           */
+          mds->locker->dentry_xlock_request(destdir, destname, true, req, new C_MDS_RetryRequest(mds, req, ref));
+          return;
+        }
+      }
+      dout(7) << "handle_client_rename_local: destdn is xlock " << *destdn << endl;
+
+    }
+    
+    dosrc = !dosrc;
+  }
+
+  
+  // final check: verify if dest exists that src is a file
+
+  // FIXME: is this necessary?
+
+  if (destdn->inode) {
+    if (destdn->inode->is_dir()) {
+      dout(7) << "handle_client_rename_local failing, dest exists and is a dir: " << *destdn->inode << endl;
+      assert(0);   // NOTE(review): with asserts enabled this aborts before the reply below is sent
+      reply_request(req, -EINVAL);  
+      return; 
+    }
+    if (srcdn->inode->is_dir()) {
+      dout(7) << "handle_client_rename_local failing, dest exists and src is a dir: " << *destdn->inode << endl;
+      assert(0);   // NOTE(review): same as above, the reply is unreachable while asserts are on
+      reply_request(req, -EINVAL);  
+      return; 
+    }
+  } else {
+    // if destdn->inode is null, then we know it's a non-existent dest,
+    // why?  because if it's local, it dne.  and if it's remote, we xlocked with 
+    // REQXLOCKC, which will only allow you to lock a file.
+    // so we know dest is a file, or non-existent
+    if (!destlocal) {
+      if (srcdn->inode->is_dir()) { 
+        // help: maybe the dest exists and is a file?   ..... FIXME
+      } else {
+        // we're fine, src is file, dest is file|dne
+      }
+    }
+  }
+  
+  mds->balancer->hit_dir(srcdn->dir, META_POP_DWR);
+  mds->balancer->hit_dir(destdn->dir, META_POP_DWR);
+
+  // we're golden.
+  // everything is xlocked by us, we rule, etc.
+  MClientReply *reply = new MClientReply(req, 0);   // handed to C_MDS_CommitRequest together with the journal event
+  mdcache->renamer->file_rename( srcdn, destdn,
+				 new C_MDS_CommitRequest(this, req, reply, srcdn->inode,
+							 new EInodeUpdate(srcdn->inode)) );  // FIXME WRONG EVENT
+}
+
+
+
+
+
+
+
+// MKDIR
+
+void Server::handle_client_mkdir(MClientRequest *req, CInode *diri)
+{
+  // Create and link the dentry + inode; mknod() returning null means there is
+  // nothing more to do here.
+  CInode *dirin = mknod(req, diri);
+  if (dirin == 0)
+    return;
+
+  // Requested mode bits from the client, with the type field forced to "directory".
+  dirin->inode.mode = (req->get_iarg() & ~INODE_TYPE_MASK) | INODE_MODE_DIR;
+  dirin->inode.layout = g_OSD_MDDirLayout;   // directories use the dir layout
+
+  // Open the dir and flag it complete and dirty (init dir to be empty).
+  assert(!dirin->is_frozen_dir());  // bc mknod worked
+  CDir *freshdir = dirin->get_or_open_dir(mds);
+  freshdir->mark_complete();
+  freshdir->mark_dirty();
+
+  mds->balancer->hit_dir(freshdir, META_POP_DWR);
+
+  // When the parent dir is auth + replicated and the new dir is auth and not
+  // hashing, pick a random MDS and export the new dir there unless it is us.
+  CDir *parentdir = diri->dir;
+  const bool spread = parentdir->is_auth() &&
+                      parentdir->is_rep() &&
+                      freshdir->is_auth() &&
+                      !freshdir->is_hashing();
+  if (spread) {
+    const int target = rand() % mds->mdsmap->get_num_mds();
+    if (target != whoami) {
+      dout(10) << "exporting new dir " << *freshdir << " in replicated parent " << *parentdir << endl;
+      mdcache->migrator->export_dir(freshdir, target);
+    }
+  }
+
+  // commit to log, handing the reply over to commit_request.
+  // FIXME: weird performance regression here w/ double log (EInodeUpdate +
+  // EDirUpdate); somewhat of a mystery!
+  commit_request(req, new MClientReply(req, 0), diri,
+                 new EMkdir(freshdir));
+}
+
+
+
+
+
+// SYMLINK
+
+void Server::handle_client_symlink(MClientRequest *req, CInode *diri)
+{
+  // Create and link the dentry + inode; bail out if mknod() gave us nothing.
+  CInode *linkin = mknod(req, diri);
+  if (linkin == 0)
+    return;
+
+  // Swap the inode's type bits for "symlink", leaving the other mode bits alone.
+  linkin->inode.mode = (linkin->inode.mode & ~INODE_TYPE_MASK) | INODE_MODE_SYMLINK;
+
+  // The link target comes from the request's string argument.
+  linkin->symlink = req->get_sarg();
+
+  mds->balancer->hit_dir(diri->dir, META_POP_DWR);
+
+  // commit, handing over the reply.  FIXME should be different log entry
+  commit_request(req, new MClientReply(req, 0), diri,
+                 new EInodeUpdate(linkin));
+}
+
+
+
+
+
+
+
+// ===================================
+// TRUNCATE, FSYNC
+
+/*
+ * FIXME: this truncate implementation is WRONG WRONG WRONG
+ */
+
+void Server::handle_client_truncate(MClientRequest *req, CInode *cur)
+{
+  // We need the file write lock first.  A false return means the request was
+  // forwarded or is waiting for the lock, so there is nothing to do for now.
+  const bool writable = mds->locker->inode_file_write_start(cur, req);
+  if (!writable)
+    return;
+
+  // TODO: check permissions
+
+  // Record the new size on the inode and flag it dirty, then finish the write.
+  cur->inode.size = req->get_sizearg();
+  cur->mark_dirty();
+  mds->locker->inode_file_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // Build the reply first, then the journal event, and commit both.
+  MClientReply *ack = new MClientReply(req, 0);
+  EInodeUpdate *update = new EInodeUpdate(cur);
+
+  commit_request(req, ack, cur, update);
+}
+
+
+
+// ===========================
+// open, openc, close
+
+void Server::handle_client_open(MClientRequest *req,
+                             CInode *cur)
+{
+  // The request carries the open flags in iarg and the file mode in iarg2.
+  const int oflags = req->get_iarg();
+  int fmode = req->get_iarg2();
+
+  dout(7) << "open " << oflags << " on " << *cur << endl;
+  dout(10) << "open flags = " << oflags << "  mode = " << fmode << endl;
+
+  // Refuse anything whose inode mode has none of the INODE_MODE_FILE bits set.
+  const bool regular = (cur->inode.mode & INODE_MODE_FILE) != 0;
+  if (!regular) {
+    dout(7) << "not a regular file" << endl;
+    reply_request(req, -EINVAL);                 // FIXME what error do we want?
+    return;
+  }
+
+  // Modes other than R and LAZY are served by the authority only; if we are
+  // not auth for this inode, forward the request to whoever is.
+  const bool replica_ok = (fmode == FILE_MODE_R || fmode == FILE_MODE_LAZY);
+  if (!replica_ok && !cur->is_auth()) {
+    int auth = cur->authority();
+    assert(auth != whoami);
+    dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl;
+    mdcache->request_forward(req, auth);
+    return;
+  }
+
+  // TODO: check permissions or something.
+  // Grab the file data version, then try to issue the caps; no capability
+  // back means we can't issue (yet), so wait!
+  version_t fdv = mds->locker->issue_file_data_version(cur);
+  Capability *cap = mds->locker->issue_new_caps(cur, fmode, req);
+  if (cap == 0)
+    return;
+
+  dout(12) << "open gets caps " << cap_string(cap->pending()) << " for " << req->get_source() << " on " << *cur << endl;
+  mds->balancer->hit_inode(cur, META_POP_IRD);
+
+  // Fill in the reply (caps, caps seq, data version) and send it.
+  MClientReply *ack = new MClientReply(req, 0);
+  ack->set_file_caps(cap->pending());
+  ack->set_file_caps_seq(cap->get_last_seq());
+  ack->set_file_data_version(fdv);
+  reply_request(req, ack, cur);
+}
+
+
+
+void Server::handle_client_openc(MClientRequest *req, CInode *ref)
+{
+  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl;
+
+  // mknod() is called with okexist=true; a null result ends our work here.
+  CInode *target = mknod(req, ref, true);
+  if (target) {
+    // Regular file with the mode hard-wired to 0644 (wtf FIXME), then open it.
+    target->inode.mode = 0644 | INODE_MODE_FILE;
+    handle_client_open(req, target);
+  }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h
new file mode 100644
index 0000000000000..912af31ca909a
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Server.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MDS_SERVER_H
+#define __MDS_SERVER_H
+
+#include "MDS.h"
+
+class LogEvent;
+
+class Server {   // client-request handling for one MDS
+  MDS *mds;
+  MDCache *mdcache;
+  MDLog *mdlog;
+  Messenger *messenger;
+  int whoami;            // this MDS's node id, taken from MDS::get_nodeid()
+
+  __uint64_t stat_ops;   // starts at 0
+
+
+public:
+  explicit Server(MDS *m) :        // explicit: no silent MDS* -> Server conversion
+    mds(m),
+    mdcache(m->mdcache), mdlog(m->mdlog),               // read from the argument m, not the mds member,
+    messenger(m->messenger), whoami(m->get_nodeid()),   // so member declaration order cannot matter
+    stat_ops(0) {
+  }
+
+  void dispatch(Message *m);
+
+  // generic request helpers
+  void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0);
+  void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei);
+  void commit_request(MClientRequest *req,
+                      MClientReply *reply,
+                      CInode *tracei,
+                      LogEvent *event,
+                      LogEvent *event2 = 0);
+
+  bool try_open_dir(CInode *in, MClientRequest *req);
+
+
+  // clients
+  void handle_client_mount(class MClientMount *m);
+  void handle_client_unmount(Message *m);
+
+  void handle_client_request(MClientRequest *m);
+  void handle_client_request_2(MClientRequest *req,
+                               vector<CDentry*>& trace,
+                               int r);
+
+  // fs ops
+  void handle_client_fstat(MClientRequest *req);
+
+  // requests
+  void dispatch_request(Message *m, CInode *ref);
+
+  // inode ops; each takes (MClientRequest *req, CInode *ref)
+  void handle_client_stat(MClientRequest *req, CInode *ref);
+  void handle_client_utime(MClientRequest *req, CInode *ref);
+  void handle_client_inode_soft_update_2(MClientRequest *req,
+                                         MClientReply *reply,
+                                         CInode *ref);
+  void handle_client_chmod(MClientRequest *req, CInode *ref);
+  void handle_client_chown(MClientRequest *req, CInode *ref);
+  void handle_client_inode_hard_update_2(MClientRequest *req,
+                                         MClientReply *reply,
+                                         CInode *ref);
+
+  // readdir
+  void handle_client_readdir(MClientRequest *req, CInode *ref);
+  int encode_dir_contents(CDir *dir,
+                          list<class InodeStat*>& inls,
+                          list<string>& dnls);
+  void handle_hash_readdir(MHashReaddir *m);
+  void handle_hash_readdir_reply(MHashReaddirReply *m);
+  void finish_hash_readdir(MClientRequest *req, CDir *dir);
+
+  // namespace changes
+  void handle_client_mknod(MClientRequest *req, CInode *ref);
+  void handle_client_link(MClientRequest *req, CInode *ref);
+  void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector<CDentry*>& trace);
+  void handle_client_link_finish(MClientRequest *req, CInode *ref,
+                                 CDentry *dn, CInode *targeti);
+
+  void handle_client_unlink(MClientRequest *req, CInode *ref);
+  void handle_client_rename(MClientRequest *req, CInode *ref);
+  void handle_client_rename_2(MClientRequest *req,
+                              CInode *ref,
+                              CInode *srcdiri,
+                              CDir *srcdir,
+                              CDentry *srcdn,
+                              filepath& destpath,
+                              vector<CDentry*>& trace,
+                              int r);
+  void handle_client_rename_local(MClientRequest *req, CInode *ref,
+                                  string& srcpath, CInode *srcdiri, CDentry *srcdn,
+                                  string& destpath, CDir *destdir, CDentry *destdn, string& destname);
+
+  void handle_client_mkdir(MClientRequest *req, CInode *ref);
+  void handle_client_rmdir(MClientRequest *req, CInode *ref);
+  void handle_client_symlink(MClientRequest *req, CInode *ref);
+
+  // file
+  void handle_client_open(MClientRequest *req, CInode *ref);
+  void handle_client_openc(MClientRequest *req, CInode *ref);
+  void handle_client_release(MClientRequest *req, CInode *in);
+  void handle_client_truncate(MClientRequest *req, CInode *in);
+  void handle_client_fsync(MClientRequest *req, CInode *in);
+
+  CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false);  // used by mknod, symlink, mkdir, openc
+
+
+};
+
+class C_MDS_RetryRequest : public Context {
+  MDS *mds;       // used to reach mds->server when the context fires
+  Message *req;   // MClientRequest or MLock
+  CInode *ref;    // passed back to dispatch_request(); asserted non-null
+ public:
+  // Remember a request so that it can be dispatched again later.
+  C_MDS_RetryRequest(MDS *mds, Message *req, CInode *ref)
+    : mds(mds), req(req), ref(ref) {
+    assert(ref);
+  }
+  // Feed the saved request back into the server; r is not looked at.
+  virtual void finish(int r) {
+    mds->server->dispatch_request(req, ref);
+  }
+};
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EAlloc.h b/branches/sage/cephmds2/mds/events/EAlloc.h
new file mode 100644
index 0000000000000..b3b5f21f84038
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EAlloc.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MDS_EALLOC_H
+#define __MDS_EALLOC_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "../IdAllocator.h"
+
+#define EALLOC_EV_ALLOC  1
+#define EALLOC_EV_FREE   2
+
+class EAlloc : public LogEvent {
+ protected:
+  int  idtype;
+  idno_t id;
+  int  what;  // EALLOC_EV_ALLOC or EALLOC_EV_FREE
+  version_t table_version;
+
+ public:
+  // The fields are left unset here; decode_payload() fills them in.
+  EAlloc() : LogEvent(EVENT_ALLOC) { }
+  EAlloc(int idtype, idno_t id, int what, version_t v) :
+    LogEvent(EVENT_ALLOC),
+    idtype(idtype), id(id), what(what), table_version(v) { }
+
+  // Payload is the four fields as raw bytes: idtype, id, what, table_version.
+  void encode_payload(bufferlist& bl) {
+    bl.append((char*)&idtype, sizeof(idtype));
+    bl.append((char*)&id, sizeof(id));
+    bl.append((char*)&what, sizeof(what));
+    bl.append((char*)&table_version, sizeof(table_version));
+  }
+  // Reads the fields back in that same order, advancing off past each one.
+  void decode_payload(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(idtype), (char*)&idtype);  off += sizeof(idtype);
+    bl.copy(off, sizeof(id), (char*)&id);  off += sizeof(id);
+    bl.copy(off, sizeof(what), (char*)&what);  off += sizeof(what);
+    bl.copy(off, sizeof(table_version), (char*)&table_version);  off += sizeof(table_version);
+  }
+
+  // Anything that is not EALLOC_EV_ALLOC is printed as a dealloc.
+  void print(ostream& out) {
+    const char *verb = (what == EALLOC_EV_ALLOC) ? "alloc " : "dealloc ";
+    out << verb << hex << id << dec << " tablev " << table_version;
+  }
+
+  // live journal
+
+  // Expirable once the id table's committed version has reached ours
+  // (already flushed); before that it is still dirty.
+  bool can_expire(MDS *mds) {
+    return mds->idalloc->get_committed_version() >= table_version;
+  }
+
+  // Ask the id allocator to save(c, table_version).
+  void retire(MDS *mds, Context *c) {
+    mds->idalloc->save(c, table_version);
+  }
+
+
+  // recovery
+
+  // True (with a note on cout) when the id table is already at or past our version.
+  bool has_happened(MDS *mds) {
+    if (mds->idalloc->get_version() < table_version)
+      return false;
+
+    cout << " event " << table_version << " <= table " << mds->idalloc->get_version() << endl;
+    return true;
+  }
+
+  // Re-apply the alloc/free.  The table is asserted to sit exactly one version
+  // behind us on entry and exactly at our version on exit.
+  void replay(MDS *mds) {
+    assert(table_version-1 == mds->idalloc->get_version());
+
+    switch (what) {
+    case EALLOC_EV_ALLOC:
+      {
+        idno_t nid = mds->idalloc->alloc_id(true);
+        assert(nid == id);       // this should match.
+      }
+      break;
+    case EALLOC_EV_FREE:
+      mds->idalloc->reclaim_id(id, true);
+      break;
+    default:
+      assert(0);
+    }
+
+    assert(table_version == mds->idalloc->get_version());
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EDirUpdate.h b/branches/sage/cephmds2/mds/events/EDirUpdate.h
new file mode 100644
index 0000000000000..9c8881d4c91b9
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EDirUpdate.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __EDIRUPDATE_H
+#define __EDIRUPDATE_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+#include "../CDir.h"
+#include "../MDCache.h"
+#include "../MDStore.h"
+
+
+
+// Journal event for a directory update, keyed by the dir's ino and version.
+class EDirUpdate : public LogEvent {
+ protected:
+  ETrace trace;
+  inodeno_t dirino;
+  version_t version;
+
+ public:
+  // Snapshot the dir's inode trace, its ino and its current version.
+  EDirUpdate(CDir *dir) :
+    LogEvent(EVENT_DIRUPDATE),
+    trace(dir->inode),
+    dirino(dir->ino()),
+    version(dir->get_version()) { }
+  // Blank event; decode_payload() supplies the fields.
+  EDirUpdate() : LogEvent(EVENT_DIRUPDATE) { }
+
+  // One-line text form: "up dir <ino> <trace>/ v <version>".
+  void print(ostream& out) {
+    out << "up dir " << dirino << " " << trace << "/ v " << version;
+  }
+
+  // Payload order: trace, then version, then dirino (raw bytes for the last two).
+  virtual void encode_payload(bufferlist& bl) {
+    trace.encode(bl);
+    bl.append((char*)&version, sizeof(version));
+    bl.append((char*)&dirino, sizeof(dirino));
+  }
+  // Reads the same three items back, moving off past each of them.
+  void decode_payload(bufferlist& bl, int& off) {
+    trace.decode(bl, off);
+    bl.copy(off, sizeof(version), (char*)&version);  off += sizeof(version);
+    bl.copy(off, sizeof(dirino), (char*)&dirino);  off += sizeof(dirino);
+  }
+
+
+  // am i obsolete?  Yes when the dir is no longer in cache, is not ours, is
+  // frozen, is clean, or is committing a version newer than the recorded one.
+  virtual bool can_expire(MDS *mds) {
+    CInode *in = mds->mdcache->get_inode(dirino);
+    CDir *dir = in ? in->dir : 0;
+    if (!dir)
+      return true;
+
+    dout(10) << "EDirUpdate v " << version << " on dir " << *dir << endl;
+
+    if (!dir->is_auth() ||       // not mine!
+        dir->is_frozen() ||      // frozen -> exporting -> obsolete? FIXME
+        !dir->is_dirty())
+      return true;
+
+    return dir->get_committing_version() > version;
+  }
+
+  // Commit the directory through MDStore.  Both the inode and its open dir
+  // are asserted to be in cache.
+  virtual void retire(MDS *mds, Context *c) {
+    CInode *in = mds->mdcache->get_inode(dirino);
+    assert(in);
+    CDir *dir = in->dir;
+    assert(dir);
+
+    dout(10) << "EDirUpdate committing dir " << *dir << endl;
+    mds->mdstore->commit_dir(dir, c);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EInodeUpdate.h b/branches/sage/cephmds2/mds/events/EInodeUpdate.h
new file mode 100644
index 0000000000000..dba233c833883
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EInodeUpdate.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __EINODEUPDATE_H
+#define __EINODEUPDATE_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+
+
+class EInodeUpdate : public LogEvent {   // journal event of type EVENT_INODEUPDATE
+ protected:
+  ETrace trace;   // path trace built from the inode passed to the constructor
+
+ public:
+  EInodeUpdate(CInode *in) : LogEvent(EVENT_INODEUPDATE),
+			     trace(in) {
+  }
+  EInodeUpdate() : LogEvent(EVENT_INODEUPDATE) { }   // empty trace until decode_payload() runs
+  // Prints the ino and version of the last trace segment, with the trace in between.
+  void print(ostream& out) {   // NOTE(review): trace.back() on an empty trace (default ctor, before decode) looks unsafe
+    out << "up inode " << trace.back().inode.ino 
+	<< " " << trace 
+	<< " v " << trace.back().inode.version;    
+  }
+  // The payload is just the encoded trace.
+  virtual void encode_payload(bufferlist& bl) {
+    trace.encode(bl);
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    trace.decode(bl, off);
+  }
+  // Declared here only; the bodies are not in this header.
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+  bool has_happened(MDS *mds);  
+  void replay(MDS *mds);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EMkdir.h b/branches/sage/cephmds2/mds/events/EMkdir.h
new file mode 100644
index 0000000000000..f7f9c05c2207c
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EMkdir.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __EMKDIR_H
+#define __EMKDIR_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "ETrace.h"
+#include "../MDS.h"
+#include "../MDStore.h"
+
+
+class EMkdir : public LogEvent {   // journal event of type EVENT_MKDIR
+ protected:
+  ETrace trace;   // path trace built from dir->inode in the constructor
+  //version_t pdirv;   (disabled field; see the matching commented-out lines below)
+
+ public:
+  EMkdir(CDir *dir) : LogEvent(EVENT_MKDIR),
+		      trace(dir->inode) {
+    //pdirv = dir->inode->get_parent_dir()->get_version();
+  }
+  EMkdir() : LogEvent(EVENT_MKDIR) { }   // empty trace until decode_payload() runs
+  // Writes "mkdir " followed by the trace.
+  void print(ostream& out) {
+    out << "mkdir ";
+    trace.print(out);
+  }
+  // The payload is only the encoded trace; pdirv is disabled on both the encode and decode side.
+  virtual void encode_payload(bufferlist& bl) {
+    trace.encode(bl);
+    //bl.append((char*)&pdirv, sizeof(pdirv));
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    trace.decode(bl, off);
+    //bl.copy(off, sizeof(pdirv), (char*)&pdirv);
+    //off += sizeof(pdirv);
+  }
+  // Declared here only; the bodies are not in this header.
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+
+  // recovery
+  bool has_happened(MDS *mds);  
+  void replay(MDS *mds);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EMknod.h b/branches/sage/cephmds2/mds/events/EMknod.h
new file mode 100644
index 0000000000000..27ade4671a0c7
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EMknod.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __EMKNOD_H
+#define __EMKNOD_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+#include "../MDS.h"
+#include "../MDStore.h"
+
+
+class EMknod : public LogEvent {   // journal event of type EVENT_MKNOD
+ protected:
+  ETrace trace;   // path trace built from the inode passed to the constructor
+  //version_t pdirv;   (disabled field; see the matching commented-out lines below)
+
+ public:
+  EMknod(CInode *in) : LogEvent(EVENT_MKNOD), 
+		       trace(in) {
+    //pdirv = in->get_parent_dir()->get_version();
+  }
+  EMknod() : LogEvent(EVENT_MKNOD) { }   // empty trace until decode_payload() runs
+  // Writes "mknod " followed by the trace.
+  void print(ostream& out) {
+    out << "mknod " << trace;
+  }
+  // The payload is only the encoded trace; pdirv is disabled on both the encode and decode side.
+  virtual void encode_payload(bufferlist& bl) {
+    trace.encode(bl);
+    //bl.append((char*)&pdirv, sizeof(pdirv));
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    trace.decode(bl, off);
+    //bl.copy(off, sizeof(pdirv), (char*)&pdirv);
+    //off += sizeof(pdirv);
+  }  
+  // Declared here only; the bodies are not in this header.
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+  bool has_happened(MDS *mds);  
+  void replay(MDS *mds);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EPurgeFinish.h b/branches/sage/cephmds2/mds/events/EPurgeFinish.h
new file mode 100644
index 0000000000000..bacfa8e93c737
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EPurgeFinish.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __EPURGE_H
+#define __EPURGE_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+class EPurgeFinish : public LogEvent {   // journal event of type EVENT_PURGEFINISH, carrying one inode number
+ protected:
+  inodeno_t ino;
+
+ public:
+  EPurgeFinish(inodeno_t i) :
+	LogEvent(EVENT_PURGEFINISH),
+	ino(i) { }
+  EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { }   // ino is filled in by decode_payload()
+
+  void print(ostream& out) {
+    out << "purgefinish " << ino;
+  }
+  // The payload is the raw bytes of ino.
+  virtual void encode_payload(bufferlist& bl) {
+    bl.append((char*)&ino, sizeof(ino));
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(ino), (char*)&ino);  off += sizeof(ino);   // step past ino so the caller's offset stays in sync
+  }
+  // Declared here only; the bodies are not in this header.
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+  bool has_happened(MDS *mds);
+  void replay(MDS *mds);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EString.h b/branches/sage/cephmds2/mds/events/EString.h
new file mode 100644
index 0000000000000..6bd10030549ba
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EString.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __ESTRING_H
+#define __ESTRING_H
+
+#include <stdlib.h>
+#include <string>
+using namespace std;
+
+#include "../LogEvent.h"
+
+// generic log event
+class EString : public LogEvent {
+ protected:
+  string event;   // the text carried by this event
+
+ public:
+  EString() : LogEvent(EVENT_STRING) { }
+  EString(string e) : LogEvent(EVENT_STRING), event(e) { }
+
+  // The payload is the text followed by its terminating NUL byte.
+  void encode_payload(bufferlist& bl) {
+    const size_t nbytes = event.length() + 1;
+    bl.append(event.c_str(), nbytes);
+  }
+
+  // Pick the NUL-terminated text up at off, then step off past the NUL.
+  void decode_payload(bufferlist& bl, int& off) {
+    const char *start = bl.c_str() + off;
+    event = start;
+    off += event.length() + 1;
+  }
+
+  // Shows the text wrapped in double quotes.
+  void print(ostream& out) {
+    out << '"' << event;
+    out << '"';
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/ETrace.h b/branches/sage/cephmds2/mds/events/ETrace.h
new file mode 100644
index 0000000000000..a320137512178
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/ETrace.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MDS_ETRACE_H
+#define __MDS_ETRACE_H
+
+#include <stdlib.h>
+#include <string>
+using namespace std;
+
+#include "../CInode.h"
+#include "../CDir.h"
+#include "../CDentry.h"
+
+
+// Path trace for use in journal events: an ordered list of <dir, dentry, inode> segments.
+
+class ETrace {
+
+  // <dir, dn, inode> segment.
+  struct bit {
+    inodeno_t dirino;  // ino of the containing directory
+    version_t dirv;    // that directory's get_version() when the segment was captured
+    string dn;         // dentry name inside the directory
+    inode_t inode;     // copy of the inode the dentry points at
+    
+    bit(bufferlist& bl, int& off) { _decode(bl,off); }  // build a segment by decoding from bl at off
+    bit(inodeno_t di, version_t dv, const string& d, inode_t i) :
+      dirino(di), dirv(dv), dn(d), inode(i) {}
+    
+    void _encode(bufferlist& bl) {  // field order here must stay in step with _decode()
+      bl.append((char*)&dirino, sizeof(dirino));
+      bl.append((char*)&dirv, sizeof(dirv));
+      ::_encode(dn, bl);
+      bl.append((char*)&inode, sizeof(inode));
+    }
+    void _decode(bufferlist& bl, int& off) {  // reads the four fields in _encode() order, advancing off after each
+      bl.copy(off, sizeof(dirino), (char*)&dirino);  off += sizeof(dirino);
+      bl.copy(off, sizeof(dirv), (char*)&dirv);  off += sizeof(dirv);
+      ::_decode(dn, bl, off);
+      bl.copy(off, sizeof(inode), (char*)&inode);  off += sizeof(inode);
+    }
+  };
+
+ public:
+  list<bit> trace;  // segments, front = topmost directory, back = the traced inode
+
+  ETrace(CInode *in = 0) {   // a null in yields an empty trace
+    if (in) {
+      CDir *dir;
+      CDentry *dn;
+      do {  // walk upward from in, prepending one segment per level
+	dn = in->get_parent_dn();
+	if (!dn) break;  // no parent dentry: nothing more to record
+	dir = dn->get_dir();
+	if (!dir) break;      
+	
+	trace.push_front(bit(dir->ino(),
+			     dir->get_version(),
+			     dn->get_name(),
+			     in->inode));
+	
+	in = dir->get_inode();  // continue from the directory's own inode
+      } while (!dir->is_import());  // the first dir for which is_import() is true is the last one recorded
+    }
+  }
+  
+  bit& back() {  // last segment; NOTE(review): undefined on an empty trace, callers must check first
+    return trace.back();
+  }
+
+  void decode(bufferlist& bl, int& off) {
+    int n;  // segment count, stored as a raw int
+    bl.copy(off, sizeof(n), (char*)&n);
+    off += sizeof(n);
+    for (int i=0; i<n; i++) 
+      trace.push_back( bit(bl, off) );  // appends; segments already in trace are kept
+  }
+  
+  void encode(bufferlist& bl) {
+    int n = trace.size();  // count first, then every segment front to back
+    bl.append((char*)&n, sizeof(n));
+    for (list<bit>::iterator i = trace.begin();
+	 i != trace.end();
+	 i++)
+      i->_encode(bl);
+  }
+  
+  void print(ostream& out) const {  // "[dirino]/dn" for the first segment, "/dn" for each later one
+    for (list<bit>::const_iterator p = trace.begin();
+	 p != trace.end();
+	 p++) {
+      if (p == trace.begin()) 
+	out << "[" << p->dirino << "]/" << p->dn;
+      else 
+	out << "/" << p->dn;
+    }
+  }
+  
+  CInode *restore_trace(MDS *mds);  // declared only; no body in this header
+  
+};
+
+inline ostream& operator<<(ostream& out, const ETrace& t) {  // stream an ETrace by delegating to print()
+  t.print(out);
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EUnlink.h b/branches/sage/cephmds2/mds/events/EUnlink.h
new file mode 100644
index 0000000000000..9b7484174886a
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EUnlink.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __EUNLINK_H
+#define __EUNLINK_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+
+#include "../CInode.h"
+#include "../CDentry.h"
+#include "../CDir.h"
+
+class EUnlink : public LogEvent {  // journal event for an unlink: directory trace, dir version, dentry name, inode trace
+ protected:
+  ETrace diritrace;   // trace built from dir->inode
+  version_t dirv;     // dir->get_version() at construction
+  string dname;       // dn->get_name()
+  ETrace inodetrace;  // trace built from in
+
+ public:
+  EUnlink(CDir *dir, CDentry* dn, CInode *in) :  // dir and dn are dereferenced here and must be non-null
+    LogEvent(EVENT_UNLINK),
+    diritrace(dir->inode), 
+    dirv(dir->get_version()),
+    dname(dn->get_name()),
+    inodetrace(in) {}
+  EUnlink() : LogEvent(EVENT_UNLINK) { }  // dirv is not assigned by this ctor
+  
+  virtual void encode_payload(bufferlist& bl) {  // order: diritrace, dirv, dname, inodetrace
+    diritrace.encode(bl);
+    bl.append((char*)&dirv, sizeof(dirv));
+    ::_encode(dname, bl);
+    inodetrace.encode(bl);
+  }
+  void decode_payload(bufferlist& bl, int& off) {  // mirrors encode_payload(); off ends after the last field
+    diritrace.decode(bl,off);
+    bl.copy(off, sizeof(dirv), (char*)&dirv);
+    off += sizeof(dirv);
+    ::_decode(dname, bl, off);
+    inodetrace.decode(bl, off);
+  }
+  
+  bool can_expire(MDS *mds);  // the four hooks below are declared only; no bodies in this header
+  void retire(MDS *mds, Context *c);
+  bool has_happened(MDS *mds);  
+  void replay(MDS *mds);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/journal.cc b/branches/sage/cephmds2/mds/journal.cc
new file mode 100644
index 0000000000000..9ac2406e2cbc2
--- /dev/null
+++ b/branches/sage/cephmds2/mds/journal.cc
@@ -0,0 +1,345 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "events/ETrace.h"
+#include "events/EMknod.h"
+#include "events/EMkdir.h"
+#include "events/EInodeUpdate.h"
+#include "events/EPurgeFinish.h"
+#include "events/EUnlink.h"
+
+#include "MDS.h"
+#include "MDCache.h"
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
+#define  derr(l)    if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
+
+
+// -----------------------
+// ETrace
+
+CInode *ETrace::restore_trace(MDS *mds) 
+{  // Re-create in mds->mdcache any dir inode, inode and dentry named by the trace; returns the last segment's inode.
+  CInode *in = 0;  // stays null when the trace is empty
+  for (list<bit>::iterator p = trace.begin();
+       p != trace.end();
+       ++p) {
+    // the dir 
+    CInode *diri = mds->mdcache->get_inode(p->dirino);
+    if (!diri) {
+      dout(10) << "ETrace.restore_trace adding dir " << p->dirino << endl;
+      diri = new CInode(mds->mdcache);
+      diri->inode.ino = p->dirino;
+      diri->inode.mode = INODE_MODE_DIR;
+      mds->mdcache->add_inode(diri);
+
+      CDir *dir = diri->get_or_open_dir(mds);
+
+      // root?  import?  (only a dir created here for the first segment is registered as an import)
+      if (p == trace.begin()) {
+	mds->mdcache->add_import(dir);
+	if (dir->ino() == 1) 
+	  mds->mdcache->set_root(diri);
+      }
+    } else {
+      dout(20) << "ETrace.restore_trace had dir " << p->dirino << endl;
+      diri->get_or_open_dir(mds);
+    }
+    assert(diri->dir);
+    dout(20) << "ETrace.restore_trace dir is " << *diri->dir << endl;
+    
+    // the inode
+    in = mds->mdcache->get_inode(p->inode.ino);
+    if (!in) {
+      dout(10) << "ETrace.restore_trace adding dn '" << p->dn << "' inode " << p->inode.ino << endl;
+      in = new CInode(mds->mdcache);
+      in->inode = p->inode;
+      mds->mdcache->add_inode(in);
+      
+      // the dentry
+      CDentry *dn = diri->dir->add_dentry( p->dn, in );
+      assert(dn);  // check the result before its first dereference
+      dn->mark_dirty();
+    } else {
+      dout(20) << "ETrace.restore_trace had dn '" << p->dn << "' inode " << p->inode.ino << endl;
+      in->inode = p->inode;  // already cached: overwrite its inode data with the journaled copy
+    }
+    dout(20) << "ETrace.restore_trace in is " << *in << endl;
+  }
+  return in;
+}
+
+
+// -----------------------
+// EMkdir
+// - trace goes to new dir's inode.
+
+bool EMkdir::can_expire(MDS *mds) 
+{
+  // am i obsolete?  (true when the new dir's inode, or its CDir, is not in the cache)
+  CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
+  if (!in) return true;
+  CDir *dir = in->dir;
+  if (!dir) return true;
+  CDir *pdir = in->get_parent_dir();
+  assert(pdir);
+  
+  dout(10) << "EMkdir.can_expire  in is " << *in << endl;
+  dout(10) << "EMkdir.can_expire inv is " << trace.back().inode.version << endl;
+  dout(10) << "EMkdir.can_expire dir is " << *dir << endl;
+  bool commitparent = in->get_last_committed_version() < trace.back().inode.version;  // committed inode version is behind the journaled one
+  bool commitnew = dir->get_last_committed_version() == 0;  // presumably 0 means the new dir was never committed; confirm
+
+  if (commitparent || commitnew) return false;  // something still needs committing, so the event must stay
+  return true;
+}
+
+void EMkdir::retire(MDS *mds, Context *c) 
+{
+  // commit parent dir AND my dir, as needed; c is handed to commit_dir (directly, or through a C_Gather for both)
+  CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+  CDir *pdir = in->get_parent_dir();
+  assert(pdir);
+  
+  dout(10) << "EMkdir.retire  in is " << *in << endl;
+  dout(10) << "EMkdir.retire inv is " << trace.back().inode.version << endl;
+  dout(10) << "EMkdir.retire dir is " << *dir << endl;
+  bool commitparent = in->get_last_committed_version() < trace.back().inode.version;  // same tests as can_expire()
+  bool commitnew = dir->get_last_committed_version() == 0;
+  
+  if (commitparent && commitnew) {
+    // both
+    dout(10) << "EMkdir.retire committing parent+new dir " << *dir << endl;
+    C_Gather *gather = new C_Gather(c);
+    mds->mdstore->commit_dir(pdir, gather->new_sub());
+    mds->mdstore->commit_dir(dir, gather->new_sub());
+  } else if (commitparent) {
+    // just parent  (log the directory actually being committed)
+    dout(10) << "EMkdir.retire committing parent dir " << *pdir << endl;
+    mds->mdstore->commit_dir(pdir, c);
+  } else {
+    // just new dir  (this branch is also taken when neither flag is set)
+    dout(10) << "EMkdir.retire committing new dir " << *dir << endl;
+    mds->mdstore->commit_dir(dir, c);
+  }
+}
+
+bool EMkdir::has_happened(MDS *mds) 
+{
+  return false;     // unconditional; mds is unused
+}
+  
+void EMkdir::replay(MDS *mds) 
+{
+  dout(10) << "EMkdir.replay " << *this << endl;
+  CInode *in = trace.restore_trace(mds);  // NOTE(review): null if the trace is empty; dereferenced below without a check
+
+  // mark dir inode dirty
+  in->mark_dirty();
+
+  // mark parent dir dirty, and set version.  
+  // this may end up being below water when dir is fetched from disk.
+  CDir *pdir = in->get_parent_dir();
+  if (!pdir->is_dirty()) pdir->mark_dirty();
+  pdir->set_version(trace.back().dirv);  // version journaled for the last segment's directory
+ 
+  // mark new dir dirty + complete
+  CDir *dir = in->get_or_open_dir(mds);
+  dir->mark_dirty();
+  dir->mark_complete();
+}
+
+
+
+// -----------------------
+// EMknod
+  
+bool EMknod::can_expire(MDS *mds) 
+{
+  // am i obsolete?  (true when the inode is no longer cached)
+  CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
+  if (!in) return true;
+
+  if (!in->is_auth()) return true;  // not my inode anymore!
+  if (in->get_version() != trace.back().inode.version)
+    return true;  // i'm obsolete!  (another log entry follows)
+
+  if (in->get_last_committed_version() >= trace.back().inode.version)
+    return true;  // the journaled version has already been committed
+
+  return false;  // current, authoritative and not yet committed: keep the event
+}
+
+void EMknod::retire(MDS *mds, Context *c) 
+{
+  // commit parent directory (the dir of the trace's last segment); c is passed on to commit_dir
+  CInode *diri = mds->mdcache->get_inode( trace.back().dirino );
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(10) << "EMknod.retire committing parent dir " << *dir << endl;
+  mds->mdstore->commit_dir(dir, c);
+}
+
+bool EMknod::has_happened(MDS *mds) 
+{
+  return false;  // unconditional; mds is unused
+}
+  
+void EMknod::replay(MDS *mds) 
+{
+  dout(10) << "EMknod.replay " << *this << endl;
+  CInode *in = trace.restore_trace(mds);  // NOTE(review): null if the trace is empty; dereferenced below without a check
+  in->mark_dirty();
+
+  // mark parent dir dirty, and set version.  
+  // this may end up being below water when dir is fetched from disk.
+  CDir *pdir = in->get_parent_dir();
+  if (!pdir->is_dirty()) pdir->mark_dirty();
+  pdir->set_version(trace.back().dirv);  // version journaled for the last segment's directory
+}
+
+
+
+// -----------------------
+// EInodeUpdate
+
+bool EInodeUpdate::can_expire(MDS *mds) 
+{
+  CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
+  if (!in) return true;  // inode is no longer cached
+
+  if (!in->is_auth()) return true;  // not my inode anymore!
+  if (in->get_version() != trace.back().inode.version)
+    return true;  // i'm obsolete!  (another log entry follows)
+
+  /*
+  // frozen -> exporting -> obsolete    (FOR NOW?)
+  if (in->is_frozen())
+  return true; 
+  */
+
+  if (in->get_last_committed_version() >= trace.back().inode.version)
+    return true;  // the journaled version has already been committed
+
+  return false;  // current, authoritative and not yet committed: keep the event
+}
+
+void EInodeUpdate::retire(MDS *mds, Context *c) 
+{
+   // commit parent directory (the dir of the trace's last segment); c is passed on to commit_dir
+  CInode *diri = mds->mdcache->get_inode( trace.back().dirino );
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(10) << "EInodeUpdate.retire committing parent dir " << *dir << endl;  // label names this event type
+  mds->mdstore->commit_dir(dir, c);
+}
+  
+bool EInodeUpdate::has_happened(MDS *mds)
+{
+  return false;  // unconditional; mds is unused
+}
+
+void EInodeUpdate::replay(MDS *mds) 
+{
+  dout(10) << "EInodeUpdate.replay " << *this << endl;
+  CInode *in = trace.restore_trace(mds);  // NOTE(review): null if the trace is empty; dereferenced below without a check
+  in->mark_dirty();
+
+  // mark parent dir dirty, and set version.  
+  // this may end up being below water when dir is fetched from disk.
+  CDir *pdir = in->get_parent_dir();
+  if (!pdir->is_dirty()) pdir->mark_dirty();
+  pdir->set_version(trace.back().dirv);  // version journaled for the last segment's directory
+}
+
+
+
+// -----------------------
+// EUnlink
+
+bool EUnlink::can_expire(MDS *mds)
+{
+  // dir  (NOTE(review): diritrace.back() is undefined if diritrace is empty; confirm it never is)
+  CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino );
+  CDir *dir = 0;
+  if (diri) dir = diri->dir;
+
+  if (dir && dir->get_last_committed_version() < dirv) return false;  // dir's committed version is still behind dirv
+
+  if (!inodetrace.trace.empty()) {
+    // inode  (only consulted when an inode trace was recorded)
+    CInode *in = mds->mdcache->get_inode( inodetrace.back().inode.ino );
+    if (in && in->get_last_committed_version() < inodetrace.back().inode.version)
+      return false;
+  }
+
+  return true;  // nothing cached is behind the journaled versions
+}
+
+void EUnlink::retire(MDS *mds, Context *c)
+{
+  CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino );
+  assert(diri);  // can_expire() tolerates a missing dir inode; here it is dereferenced, so check it first
+  CDir *dir = diri->dir;
+  assert(dir);
+  // okay!  commit the dir, passing the version recorded in this event
+  dout(7) << "commiting dirty (from unlink) dir " << *dir << endl;
+  mds->mdstore->commit_dir(dir, dirv, c);
+}
+
+bool EUnlink::has_happened(MDS *mds)
+{
+  return true;  // unconditional; mds is unused
+}
+
+void EUnlink::replay(MDS *mds)
+{  // intentionally empty in this revision: replaying an unlink does nothing
+}
+
+
+
+
+// -----------------------
+// EPurgeFinish
+
+
+bool EPurgeFinish::can_expire(MDS *mds)
+{
+  return true;  // unconditional; mds is unused
+}
+
+void EPurgeFinish::retire(MDS *mds, Context *c)
+{  // empty body: c is neither completed nor stored here
+}
+
+bool EPurgeFinish::has_happened(MDS *mds)
+{
+  return true;  // unconditional; mds is unused
+}
+
+void EPurgeFinish::replay(MDS *mds)
+{  // empty body: replay does nothing for this event in this revision
+}
+
+
+
+
diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h
new file mode 100644
index 0000000000000..b448123bf929e
--- /dev/null
+++ b/branches/sage/cephmds2/mds/mdstypes.h
@@ -0,0 +1,135 @@
+#ifndef __MDSTYPES_H
+#define __MDSTYPES_H
+
+
+#include <math.h>
+#include <ostream>
+using namespace std;
+
+#include "config.h"
+#include "common/DecayCounter.h"
+
+#include <cassert>
+
+
+/* meta_load_t
+ * hierarchical load for an inode/dir and its children
+ */
+#define META_POP_IRD    0
+#define META_POP_IWR    1
+#define META_POP_DWR    2
+//#define META_POP_LOG   3
+//#define META_POP_FDIR  4
+//#define META_POP_CDIR  4
+#define META_NPOP      3
+
+class meta_load_t {
+ public:
+  DecayCounter pop[META_NPOP];  // one counter per META_POP_* index
+
+  double meta_load() {  // scalar load: IRD + 2*IWR (the DWR counter is not part of the sum)
+    return pop[META_POP_IRD].get() + 2*pop[META_POP_IWR].get();
+  }
+
+  void take(meta_load_t& other) {  // copy every counter from other, then reset other's counters
+    for (int i=0; i<META_NPOP; i++) {
+      pop[i] = other.pop[i];
+      other.pop[i].reset();
+    }
+  }
+};
+
+inline ostream& operator<<( ostream& out, meta_load_t& load )  // prints "metaload<rd R, wr W>"; only IRD and IWR are shown
+{
+  return out << "metaload<rd " << load.pop[META_POP_IRD].get()
+             << ", wr " << load.pop[META_POP_IWR].get()
+             << ">";
+}
+
+
+inline meta_load_t& operator-=(meta_load_t& l, meta_load_t& r)  // adjust each counter of l by minus r's current get() value
+{
+  for (int i=0; i<META_NPOP; i++)
+    l.pop[i].adjust(- r.pop[i].get());
+  return l;
+}
+
+inline meta_load_t& operator+=(meta_load_t& l, meta_load_t& r)  // adjust each counter of l by r's current get() value
+{
+  for (int i=0; i<META_NPOP; i++)
+    l.pop[i].adjust(r.pop[i].get());
+  return l;
+}
+
+
+
+/* mds_load_t
+ * mds load
+ */
+
+// popularity classes
+#define MDS_POP_JUSTME  0   // just me (this dir or inode)
+#define MDS_POP_NESTED  1   // me + children, auth or not
+#define MDS_POP_CURDOM  2   // me + children in current auth domain
+#define MDS_POP_ANYDOM  3   // me + children in any (nested) auth domain
+//#define MDS_POP_DIRMOD  4   // just this dir, modifications only
+#define MDS_NPOP        4
+
+class mds_load_t {
+ public:
+  meta_load_t root;  // metadata load counters; feeds mds_load() in mode 0
+
+  double req_rate;        // the three rates below all start at 0
+  double cache_hit_rate;  // printed by operator<<, not used by mds_load()
+  double queue_len;       // weighted by 10.0 in mds_load()
+
+  mds_load_t() : 
+    req_rate(0), cache_hit_rate(0), queue_len(0) { }    
+
+  double mds_load() {  // scalar load; the formula is selected by g_conf.mds_bal_mode
+    switch(g_conf.mds_bal_mode) {
+    case 0: 
+      return root.meta_load()
+        + req_rate
+        + 10.0*queue_len;
+
+    case 1:
+      return req_rate + 10.0*queue_len;
+    }
+    assert(0);  // any mode other than 0 or 1 is unsupported
+    return 0;   // reached only when assert is compiled out
+  }
+
+};
+
+
+inline ostream& operator<<( ostream& out, mds_load_t& load )  // prints "mdsload<ROOT, req R, hr H, qlen Q>"
+{
+  return out << "mdsload<" << load.root
+             << ", req " << load.req_rate 
+             << ", hr " << load.cache_hit_rate
+             << ", qlen " << load.queue_len
+             << ">";
+}
+
+/*
+inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) 
+{
+  l.root_pop += r.root_pop;
+  l.req_rate += r.req_rate;
+  l.queue_len += r.queue_len;
+  return l;
+}
+
+inline mds_load_t operator/( mds_load_t& a, double d ) 
+{
+  mds_load_t r;
+  r.root_pop = a.root_pop / d;
+  r.req_rate = a.req_rate / d;
+  r.queue_len = a.queue_len / d;
+  return r;
+}
+*/
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/oldcachestuff.cc b/branches/sage/cephmds2/mds/oldcachestuff.cc
new file mode 100644
index 0000000000000..31bb9eaa81e3d
--- /dev/null
+++ b/branches/sage/cephmds2/mds/oldcachestuff.cc
@@ -0,0 +1,944 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+/*
+
+
+OLD LOCK CRAP:
+ (old):
+  sync -  soft metadata.. no reads/writes can proceed.  (eg no stat)
+  lock -  hard(+soft) metadata.. path traversals stop etc.  (??)
+
+
+ replication consistency modes:
+  hard+soft - hard and soft are defined on all replicas.
+              all reads proceed (in absence of sync lock)
+              writes require sync lock, fw to auth
+   -> normal behavior.
+
+  hard      - hard only, soft is undefined
+              reads require a sync
+              writes proceed if field updates are monotonic (e.g. size, m/c/atime)
+   -> 'softasync'
+
+ types of access by cache users:
+
+   hard   soft
+    R      -    read_hard_try       path traversal
+    R  <=  R    read_soft_start     stat
+    R  <=  W    write_soft_start    touch
+    W  =>  W    write_hard_start    chmod
+
+   note on those implications:
+     read_soft_start() calls read_hard_try()
+     write_soft_start() calls read_hard_try()
+     a hard lock implies/subsumes a soft sync  (read_soft_start() returns true if a 
+                      lock is held)
+
+
+ relationship with frozen directories:
+
+   read_hard_try - can proceed, because any hard changes require a lock, which 
+      requires an active authority, which implies things are unfrozen.
+   write_hard_start - waits (has to; only auth can initiate)
+   read_soft_start  - ???? waits for now.  (FIXME: if !softasync & !syncbyauth)
+   write_soft_start - ???? waits for now.  (FIXME: if (softasync & !syncbyauth))
+
+   if sticky is on, an export_dir will drop any sync or lock so that the freeze will 
+   proceed (otherwise, deadlock!).  likewise, a sync will not stick if is_freezing().
+   
+
+
+NAMESPACE:
+
+ none right now.
+
+ 
+*/
+
+
+/* soft sync locks: mtime, size, etc. 
+ */
+
+bool MDCache::read_soft_start(CInode *in, Message *m)
+{  // Returns true when the read may proceed now; otherwise m is queued for retry or forwarded and false is returned.
+  //  if (!read_hard_try(in, m))
+  //    return false;
+
+  // if frozen: i can't proceed (for now, see above)
+  if (in->is_frozen()) {
+    dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl;
+    in->add_waiter(CDIR_WAIT_UNFREEZE,
+                   new C_MDS_RetryMessage(mds, m));
+    return false;
+  }
+
+
+  dout(5) << "read_soft_start " << *in << endl;
+
+  // what soft sync mode?
+
+  if (in->is_softasync()) {
+    // softasync: hard consistency only
+
+    if (in->is_auth()) {
+      // i am auth: i need sync
+      if (in->is_syncbyme()) goto yes;
+      if (in->is_lockbyme()) goto yes;   // lock => sync
+      if (!in->is_cached_by_anyone() &&
+          !in->is_open_write()) goto yes;  // i'm alone
+    } else {
+      // i am replica: fw to auth
+      int auth = in->authority();
+      dout(5) << "read_soft_start " << *in << " is softasync, fw to auth " << auth << endl;
+      assert(auth != mds->get_nodeid());
+      mds->messenger->send_message(m,
+                                   MSG_ADDR_MDS(auth), m->get_dest_port(),
+                                   MDS_PORT_CACHE);
+      return false;      // m was forwarded to the auth MDS rather than queued here
+    }
+  } else {
+    // normal: soft+hard consistency
+
+    if (in->is_syncbyauth()) {
+      // wait for sync
+    } else {
+      // i'm consistent 
+      goto yes;
+    }
+  }
+
+  // we need sync  (only two combinations are expected here; anything else trips the assert)
+  if (in->is_syncbyauth() && !in->is_softasync()) {
+    dout(5) << "read_soft_start " << *in << " is normal+replica+syncbyauth" << endl;
+  } else if (in->is_softasync() && in->is_auth()) {
+    dout(5) << "read_soft_start " << *in << " is softasync+auth, waiting on sync" << endl;
+  } else 
+    assert(2+2==5);  // always-false assert marking an unexpected state
+
+  if (!in->can_auth_pin()) {
+    dout(5) << "read_soft_start " << *in << " waiting to auth_pin" << endl;
+    in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
+                   new C_MDS_RetryMessage(mds,m));
+    return false;
+  }
+
+  if (in->is_auth()) {
+    // wait for sync
+    in->add_waiter(CINODE_WAIT_SYNC,
+                   new C_MDS_RetryMessage(mds, m));
+
+    if (!in->is_presync())
+      inode_sync_start(in);  // start a sync only if one is not already pending
+  } else {
+    // wait for unsync
+    in->add_waiter(CINODE_WAIT_UNSYNC,
+                   new C_MDS_RetryMessage(mds, m));
+
+    assert(in->is_syncbyauth());
+
+    if (!in->is_waitonunsync())
+      inode_sync_wait(in);  // register the wait only once
+  }
+  
+  return false;
+
+ yes:
+  mds->balancer->hit_inode(in, MDS_POP_SOFTRD);  // record the access with the balancer before allowing it
+  mds->balancer->hit_inode(in, MDS_POP_ANY);
+  return true;
+}
+
+
+int MDCache::read_soft_finish(CInode *in)
+{  // counterpart of read_soft_start(): logs the inode and always returns 0
+  dout(5) << "read_soft_finish " << *in << endl;   // " soft_sync_count " << in->soft_sync_count << endl;
+  return 0;  // do nothing, actually..
+}
+
+
+bool MDCache::write_soft_start(CInode *in, Message *m)
+{  // Returns true when the write may proceed now; otherwise m is queued for retry or forwarded and false is returned.
+  //  if (!read_hard_try(in, m))
+  //return false;
+
+  // if frozen: i can't proceed (for now, see above)
+  if (in->is_frozen()) {
+    dout(7) << "write_soft_start " << *in << " is frozen, waiting" << endl;
+    in->add_waiter(CDIR_WAIT_UNFREEZE,
+                   new C_MDS_RetryMessage(mds, m));
+    return false;
+  }
+
+  dout(5) << "write_soft_start " << *in << endl;
+  // what soft sync mode?
+
+  if (in->is_softasync()) {
+    // softasync: hard consistency only
+
+    if (in->is_syncbyauth()) {
+      // wait for sync release
+    } else {
+      // i'm inconsistent; write away!
+      goto yes;
+    }
+
+  } else {
+    // normal: soft+hard consistency
+    
+    if (in->is_auth()) {
+      // i am auth: i need sync
+      if (in->is_syncbyme()) goto yes;
+      if (in->is_lockbyme()) goto yes;   // lock => sync
+      if (!in->is_cached_by_anyone() &&
+          !in->is_open_write()) goto yes;  // i'm alone
+    } else {
+      // i am replica: fw to auth
+      int auth = in->authority();
+      dout(5) << "write_soft_start " << *in << " is !softasync, fw to auth " << auth << endl;
+      assert(auth != mds->get_nodeid());
+      mds->messenger->send_message(m,
+                                   MSG_ADDR_MDS(auth), m->get_dest_port(),
+                                   MDS_PORT_CACHE);
+      return false;      // m was forwarded to the auth MDS rather than queued here
+    }
+  }
+
+  // we need sync  (only two combinations are expected here; anything else trips the assert)
+  if (in->is_syncbyauth() && in->is_softasync() && !in->is_auth()) {
+    dout(5) << "write_soft_start " << *in << " is softasync+replica+syncbyauth" << endl;
+  } else if (!in->is_softasync() && in->is_auth()) {
+    dout(5) << "write_soft_start " << *in << " is normal+auth, waiting on sync" << endl;
+  } else 
+    assert(2+2==5);  // always-false assert marking an unexpected state
+
+  if (!in->can_auth_pin()) {
+    dout(5) << "write_soft_start " << *in << " waiting to auth_pin" << endl;
+    in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
+                   new C_MDS_RetryMessage(mds,m));
+    return false;
+  }
+
+  if (in->is_auth()) {
+    // wait for sync
+    in->add_waiter(CINODE_WAIT_SYNC, 
+                   new C_MDS_RetryMessage(mds, m));
+
+    if (!in->is_presync())
+      inode_sync_start(in);  // start a sync only if one is not already pending
+  } else {
+    // wait for unsync
+    in->add_waiter(CINODE_WAIT_UNSYNC, 
+                   new C_MDS_RetryMessage(mds, m));
+
+    assert(in->is_syncbyauth());
+    assert(in->is_softasync());
+    
+    if (!in->is_waitonunsync())
+      inode_sync_wait(in);  // register the wait only once
+  }
+  
+  return false;
+
+ yes:
+  mds->balancer->hit_inode(in, MDS_POP_SOFTWR);  // record the access with the balancer before allowing it
+  mds->balancer->hit_inode(in, MDS_POP_ANY);
+  return true;
+}
+
+
+int MDCache::write_soft_finish(CInode *in)
+{  // counterpart of write_soft_start(): logs the inode and always returns 0
+  dout(5) << "write_soft_finish " << *in << endl;  //" soft_sync_count " << in->soft_sync_count << endl;
+  return 0;  // do nothing, actually..
+}
+
+
+
+
+
+
+
+
+/* hard locks: owner, mode 
+ */
+
+/*
+bool MDCache::read_hard_try(CInode *in,
+                            Message *m)
+{
+  //dout(5) << "read_hard_try " << *in << endl;
+  
+  if (in->is_auth()) {
+    // auth
+    goto yes;      // fine
+  } else {
+    // replica
+    if (in->is_lockbyauth()) {
+      // locked by auth; wait!
+      dout(7) << "read_hard_try waiting on " << *in << endl;
+      in->add_waiter(CINODE_WAIT_UNLOCK, new C_MDS_RetryMessage(mds, m));
+      if (!in->is_waitonunlock())
+        inode_lock_wait(in);
+      return false;
+    } else {
+      // not locked.
+      goto yes;
+    }
+  }
+
+ yes:
+  mds->balancer->hit_inode(in, MDS_POP_HARDRD);
+  mds->balancer->hit_inode(in, MDS_POP_ANY);
+  return true;
+}
+
+
+bool MDCache::write_hard_start(CInode *in, 
+                               Message *m)
+{
+  // if frozen: i can't proceed; only auth can initiate lock
+  if (in->is_frozen()) {
+    dout(7) << "write_hard_start " << *in << " is frozen, waiting" << endl;
+    in->add_waiter(CDIR_WAIT_UNFREEZE,
+                   new C_MDS_RetryMessage(mds, m));
+    return false;
+  }
+
+  // NOTE: if freezing, and locked, we must proceed, to avoid deadlock (where
+  // the freeze is waiting for our lock to be released)
+
+
+  if (in->is_auth()) {
+    // auth
+    if (in->is_lockbyme()) goto success;
+    if (!in->is_cached_by_anyone()) goto success;
+    
+    // need lock
+    if (!in->can_auth_pin()) {
+      dout(5) << "write_hard_start " << *in << " waiting to auth_pin" << endl;
+      in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryMessage(mds, m));
+      return false;
+    }
+    
+    in->add_waiter(CINODE_WAIT_LOCK, new C_MDS_RetryMessage(mds, m));
+    
+    if (!in->is_prelock())
+      inode_lock_start(in);
+    
+    return false;
+  } else {
+    // replica
+    // fw to auth
+    int auth = in->authority();
+    dout(5) << "write_hard_start " << *in << " on replica, fw to auth " << auth << endl;
+    assert(auth != mds->get_nodeid());
+    mds->messenger->send_message(m,
+                                 MSG_ADDR_MDS(auth), m->get_dest_port(),
+                                 MDS_PORT_CACHE);
+    return false;
+  }
+
+ success:
+  in->lock_active_count++;
+  dout(5) << "write_hard_start " << *in << " count now " << in->lock_active_count << endl;
+  assert(in->lock_active_count > 0);
+
+  mds->balancer->hit_inode(in, MDS_POP_HARDWR);
+  mds->balancer->hit_inode(in, MDS_POP_ANY);
+  return true;
+}
+
+void MDCache::write_hard_finish(CInode *in)
+{
+  in->lock_active_count--;
+  dout(5) << "write_hard_finish " << *in << " count now " << in->lock_active_count << endl;
+  assert(in->lock_active_count >= 0);
+
+  // release lock?
+  if (in->lock_active_count == 0 &&
+      in->is_lockbyme() &&
+      !g_conf.mdcache_sticky_lock) {
+    dout(7) << "write_hard_finish " << *in << " !sticky, releasing lock immediately" << endl;
+    inode_lock_release(in);
+  }
+}
+
+
+void MDCache::inode_lock_start(CInode *in)
+{
+  dout(5) << "lock_start on " << *in << ", waiting for " << in->cached_by << endl;
+
+  assert(in->is_auth());
+  assert(!in->is_prelock());
+  assert(!in->is_lockbyme());
+  assert(!in->is_lockbyauth());
+
+  in->lock_waiting_for_ack = in->cached_by;
+  in->dist_state |= CINODE_DIST_PRELOCK;
+  in->get(CINODE_PIN_PRELOCK);
+  in->auth_pin();
+
+  // send messages
+  for (set<int>::iterator it = in->cached_by_begin(); 
+       it != in->cached_by_end(); 
+       it++) {
+    mds->messenger->send_message(new MInodeLockStart(in->inode.ino, mds->get_nodeid()),
+                                 MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+  }
+}
+
+
+void MDCache::inode_lock_release(CInode *in)
+{
+  dout(5) << "lock_release on " << *in << ", messages to " << in->get_cached_by() << endl;
+  
+  assert(in->is_lockbyme());
+  assert(in->is_auth());
+
+  in->dist_state &= ~CINODE_DIST_LOCKBYME;
+
+  for (set<int>::iterator it = in->cached_by_begin(); 
+       it != in->cached_by_end(); 
+       it++) {
+    mds->messenger->send_message(new MInodeLockRelease(in),
+                                 MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+  }
+
+  in->auth_unpin();
+}
+
+void MDCache::inode_lock_wait(CInode *in)
+{
+  dout(5) << "lock_wait on " << *in << endl;
+  assert(!in->is_auth());
+  assert(in->is_lockbyauth());
+  
+  in->dist_state |= CINODE_DIST_WAITONUNLOCK;
+  in->get(CINODE_PIN_WAITONUNLOCK);
+}
+
+
+void MDCache::handle_inode_lock_start(MInodeLockStart *m)
+{
+  // authority is requesting a lock
+  CInode *in = get_inode(m->get_ino());
+  if (!in) {
+    // don't have it anymore!
+    dout(7) << "handle_lock_start " << m->get_ino() << ": don't have it anymore, nak" << endl;
+    mds->messenger->send_message(new MInodeLockAck(m->get_ino(), false),
+                                 MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+    delete m; // done
+    return;
+  }
+  
+  // we shouldn't be authoritative...
+  assert(!in->is_auth());
+  
+  dout(7) << "handle_lock_start " << *in << ", sending ack" << endl;
+  
+  // lock it
+  in->dist_state |= CINODE_DIST_LOCKBYAUTH;
+
+  // sanity check: make sure we know who _is_ authoritative! 
+  assert(m->get_asker() == in->authority());
+  
+  // send ack
+  mds->messenger->send_message(new MInodeLockAck(in->ino()),
+                               MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+                               MDS_PORT_CACHE);
+
+  delete m;  // done
+}
+
+
+void MDCache::handle_inode_lock_ack(MInodeLockAck *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  int from = m->get_source();
+  dout(7) << "handle_lock_ack from " << from << " on " << *in << endl;
+
+  assert(in);
+  assert(in->is_auth());
+  assert(in->dist_state & CINODE_DIST_PRELOCK);
+
+  // remove it from waiting list
+  in->lock_waiting_for_ack.erase(from);
+  
+  if (!m->did_have()) {
+    // erase from cached_by too!
+    in->cached_by_remove(from);
+  }
+
+  if (in->lock_waiting_for_ack.size()) {
+
+    // more coming
+    dout(7) << "handle_lock_ack " << *in << " from " << from << ", still waiting for " << in->lock_waiting_for_ack << endl;
+    
+  } else {
+    
+    // yay!
+    dout(7) << "handle_lock_ack " << *in << " from " << from << ", last one" << endl;
+
+    in->dist_state &= ~CINODE_DIST_PRELOCK;
+    in->dist_state |= CINODE_DIST_LOCKBYME;
+    in->put(CINODE_PIN_PRELOCK);
+
+    // do waiters!
+    in->finish_waiting(CINODE_WAIT_LOCK);
+  }
+
+  delete m; // done
+}
+
+
+void MDCache::handle_inode_lock_release(MInodeLockRelease *m)
+{
+  CInode *in = get_inode(m->get_ino());
+
+  if (!in) {
+    dout(7) << "handle_lock_release " << m->get_ino() << ", don't have it, dropping" << endl;
+    delete m;  // done
+    return;
+  }
+  
+  if (!in->is_lockbyauth()) {
+    dout(7) << "handle_lock_release " << m->get_ino() << ", not flagged as locked, wtf" << endl;
+    assert(0);   // i should have it, locked, or not have it at all!
+    delete m;  // done
+    return;
+  }
+  
+  dout(7) << "handle_lock_release " << *in << endl;
+  assert(!in->is_auth());
+  
+  // release state
+  in->dist_state &= ~CINODE_DIST_LOCKBYAUTH;
+
+  // waiters?
+  if (in->is_waitonunlock()) {
+    in->put(CINODE_PIN_WAITONUNLOCK);
+    in->dist_state &= ~CINODE_DIST_WAITONUNLOCK;
+    
+    // finish
+    in->finish_waiting(CINODE_WAIT_UNLOCK);
+  }
+  
+  // done
+  delete m;
+}
+*/
+
+
+
+
+
+
+
+
+
+// sync interface
+
+void MDCache::inode_sync_wait(CInode *in)
+{
+  // Replica side: register interest in the authority dropping its sync.
+  assert(!in->is_auth());
+  const int auth_mds = in->authority();
+  dout(5) << "inode_sync_wait on " << *in << ", auth " << auth_mds << endl;
+
+  // must currently be synced by the auth, and not already waiting
+  assert(in->is_syncbyauth());
+  assert(!in->is_waitonunsync());
+
+  // flag + pin the inode while the wait is outstanding
+  in->dist_state |= CINODE_DIST_WAITONUNSYNC;
+  in->get(CINODE_PIN_WAITONUNSYNC);
+
+  // pick the sticky setting that matches this inode's softasync mode
+  const bool sticky = in->is_softasync() ? g_conf.mdcache_sticky_sync_softasync
+                                         : g_conf.mdcache_sticky_sync_normal;
+  if (!sticky) return;   // no recall needed: if !sticky, auth will immediately release.
+  dout(5) << "inode_sync_wait on " << *in << " sticky, recalling from auth" << endl;
+  mds->messenger->send_message(new MInodeSyncRecall(in->inode.ino), MSG_ADDR_MDS(auth_mds), MDS_PORT_CACHE, MDS_PORT_CACHE);
+}
+
+
+void MDCache::inode_sync_start(CInode *in)
+{
+  // Authority side: begin a sync on 'in' and wait for all replicas and write-clients to ack.
+  dout(5) << "inode_sync_start on " << *in << ", waiting for " << in->cached_by << " " << in->get_open_write()<< endl;
+  // preconditions: we are the authority and no sync is pending or already held
+  assert(in->is_auth());
+  assert(!in->is_presync());
+  assert(!in->is_sync());
+  // enter PRESYNC: reset the expected-ack set, pin the inode, and auth_pin it
+  in->sync_waiting_for_ack.clear();
+  in->dist_state |= CINODE_DIST_PRESYNC;
+  in->get(CINODE_PIN_PRESYNC);
+  in->auth_pin();
+  
+  in->sync_replicawantback = false;
+
+  // send messages to every MDS in cached_by, recording each as an expected ack
+  for (set<int>::iterator it = in->cached_by_begin(); 
+       it != in->cached_by_end(); 
+       it++) {
+    in->sync_waiting_for_ack.insert(MSG_ADDR_MDS(*it));
+    mds->messenger->send_message(new MInodeSyncStart(in->inode.ino, mds->get_nodeid()),
+                                 MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+  }
+
+  // sync clients
+  int last = -1;
+  for (multiset<int>::iterator it = in->get_open_write().begin();
+       it != in->get_open_write().end();
+       it++) {
+    if (*it == last) continue;  last = *it;   // only 1 per client (even if open multiple times)
+    in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it));
+    mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()),
+                                 MSG_ADDR_CLIENT(*it), 0,
+                                 MDS_PORT_CACHE);
+  }
+  // NOTE(review): with no replicas and no writers nothing is sent and no ack will arrive; presumably callers avoid that case - confirm
+}
+
+void MDCache::inode_sync_release(CInode *in)
+{
+  dout(5) << "inode_sync_release on " << *in << ", messages to " << in->get_cached_by() << " " << in->get_open_write() << endl;
+  // only the authority, and only while it holds the sync, may release it
+  assert(in->is_syncbyme());
+  assert(in->is_auth());
+
+  in->dist_state &= ~CINODE_DIST_SYNCBYME;
+
+  // release replicas
+  for (set<int>::iterator it = in->cached_by_begin(); 
+       it != in->cached_by_end(); 
+       it++) {
+    mds->messenger->send_message(new MInodeSyncRelease(in),
+                                 MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+  }
+  
+  // release writers: only 1 message per client (even if open multiple times), as in inode_sync_start
+  int last = -1;   // open_write is a multiset, so equal client ids are adjacent when iterating
+  for (multiset<int>::iterator it = in->get_open_write().begin(); it != in->get_open_write().end(); it++) {
+    if (*it == last) continue;   // this client was already released
+    last = *it;
+    mds->messenger->send_message(new MInodeSyncRelease(in),
+                                 MSG_ADDR_CLIENT(*it), 0, MDS_PORT_CACHE);
+  }
+
+  in->auth_unpin();   // balances the auth_pin() taken in inode_sync_start
+}
+
+
+
+
+// messages
+void MDCache::handle_inode_sync_start(MInodeSyncStart *m)
+{
+  // assume asker == authority for now.
+  // m is freed on the nak path, consumed by inode_sync_ack, or kept as pending_sync_request.
+  // authority is requesting a lock
+  CInode *in = get_inode(m->get_ino());
+  if (!in) {
+    // don't have it anymore!
+    dout(7) << "handle_sync_start " << m->get_ino() << ": don't have it anymore, nak" << endl;
+    mds->messenger->send_message(new MInodeSyncAck(m->get_ino(), false),
+                                 MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+    delete m; // done
+    return;
+  }
+  
+  dout(10) << "handle_sync_start " << *in << endl;
+
+  // we shouldn't be authoritative...
+  assert(!in->is_auth());
+  
+  // sanity check: make sure we know who _is_ authoritative! 
+  assert(m->get_asker() == in->authority());
+
+  // lock it
+  in->dist_state |= CINODE_DIST_SYNCBYAUTH;
+
+  // open for write by clients?
+  if (in->is_open_write()) {
+    dout(7) << "handle_sync_start " << *in << " syncing write clients " << in->get_open_write() << endl;
+    // only 1 message per client (even if open multiple times): an extra ack would find pending_sync_request cleared
+    // sync clients
+    in->sync_waiting_for_ack.clear();
+    int last = -1;   // open_write is a multiset, so equal client ids are adjacent when iterating
+    for (multiset<int>::iterator it = in->get_open_write().begin(); it != in->get_open_write().end(); it++) {
+      if (*it == last) continue;   // already asked this client
+      last = *it;
+      in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it));
+      mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()),
+                                   MSG_ADDR_CLIENT(*it), 0, MDS_PORT_CACHE);
+    }
+
+    in->pending_sync_request = m;    
+  } else {
+    // no writers, ack.
+    dout(7) << "handle_sync_start " << *in << ", sending ack" << endl;
+  
+    inode_sync_ack(in, m);
+  }
+}
+
+void MDCache::inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback)
+{
+  // Positive ack for the sync request 'm'; the request is consumed (freed) here.
+  dout(7) << "sending inode_sync_ack " << *in << endl;
+
+  // the reply goes back to whoever asked, on the cache port
+  MInodeSyncAck *ack = new MInodeSyncAck(in->ino(), true, wantback);
+  mds->messenger->send_message(ack, MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, MDS_PORT_CACHE);
+
+  delete m;
+}
+
+void MDCache::handle_inode_sync_ack(MInodeSyncAck *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+  // one ack per call: drop the sender from the waiting set and finish up on the last one
+  dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << endl;
+  // auth: acks complete our presync.  replica: acks answer the auth's request we stashed.
+  if (in->is_auth()) {
+    assert(in->is_presync());
+  } else {
+    assert(in->is_syncbyauth());
+    assert(in->pending_sync_request);
+  }
+
+  // remove it from waiting list
+  in->sync_waiting_for_ack.erase(m->get_source());
+  // NOTE(review): the did_have()==false nak is sent by an MDS in handle_inode_sync_start, yet this only fires for client sources - confirm
+  if (MSG_ADDR_ISCLIENT(m->get_source()) && !m->did_have()) {
+    // erase from cached_by too!
+    in->cached_by_remove(m->get_source());
+  }
+
+  if (m->replica_wantsback())
+    in->sync_replicawantback = true;
+
+  if (in->sync_waiting_for_ack.size()) {
+
+    // more coming
+    dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", still waiting for " << in->sync_waiting_for_ack << endl;
+    
+  } else {
+    
+    // yay!
+    dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", last one" << endl;
+
+    if (!in->is_auth()) {
+      // replica, sync ack back to auth
+      assert(in->pending_sync_request);
+      inode_sync_ack(in, in->pending_sync_request, true);
+      in->pending_sync_request = 0;
+      delete m;
+      return;
+    }
+    // auth: all acks are in; PRESYNC becomes SYNCBYME and the presync pin is dropped
+    in->dist_state &= ~CINODE_DIST_PRESYNC;
+    in->dist_state |= CINODE_DIST_SYNCBYME;
+    in->put(CINODE_PIN_PRESYNC);
+
+    // do waiters!
+    in->finish_waiting(CINODE_WAIT_SYNC);
+
+    // (the waiters just run may already have released the sync, hence the re-check below)
+    // release sync right away?
+    if (in->is_syncbyme()) {
+      if (in->is_freezing()) {
+        dout(7) << "handle_sync_ack freezing " << *in << ", dropping sync immediately" << endl;
+        inode_sync_release(in);
+      } 
+      else if (in->sync_replicawantback) {
+        dout(7) << "handle_sync_ack replica wantback, releasing sync immediately" << endl;
+        inode_sync_release(in);
+      }
+      else if ((in->is_softasync() && !g_conf.mdcache_sticky_sync_softasync) ||
+               (!in->is_softasync() && !g_conf.mdcache_sticky_sync_normal)) {
+        dout(7) << "handle_sync_ack !sticky, releasing sync immediately" << endl;
+        inode_sync_release(in);
+      } 
+      else {
+        dout(7) << "handle_sync_ack sticky sync is on, keeping sync for now" << endl;
+      }
+    } else {
+      dout(7) << "handle_sync_ack don't have sync anymore, something must have just released it?" << endl;
+    }
+  }
+
+  delete m; // done
+}
+
+
+void MDCache::handle_inode_sync_release(MInodeSyncRelease *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  // Replica side: the sync on this inode has been released; clear our state and pass it on.
+  if (!in) {
+    dout(7) << "handle_sync_release " << m->get_ino() << ", don't have it, dropping" << endl;
+    delete m;  // done
+    return;
+  }
+  
+  if (!in->is_syncbyauth()) {
+    dout(7) << "handle_sync_release " << *in << ", not flagged as sync" << endl;
+    assert(0);  // this shouldn't happen.
+    delete m;  // done (only reached when asserts are compiled out)
+    return;
+  }
+  
+  dout(7) << "handle_sync_release " << *in << endl;
+  assert(!in->is_auth());
+  
+  // release state
+  in->dist_state &= ~CINODE_DIST_SYNCBYAUTH;
+
+  // waiters?
+  if (in->is_waitonunsync()) {
+    in->put(CINODE_PIN_WAITONUNSYNC);
+    in->dist_state &= ~CINODE_DIST_WAITONUNSYNC;
+
+    // finish
+    in->finish_waiting(CINODE_WAIT_UNSYNC);
+  }
+
+  // clients with the file open for write?  pass the release on to them
+  if (in->is_open_write()) {
+    dout(7) << "handle_sync_release releasing clients " << in->get_open_write() << endl;
+    for (multiset<int>::iterator it = in->get_open_write().begin();
+         it != in->get_open_write().end();
+         it++) {
+      mds->messenger->send_message(new MInodeSyncRelease(in),
+                                   MSG_ADDR_CLIENT(*it), 0,
+                                   MDS_PORT_CACHE);
+    }
+  }
+
+  
+  // done
+  delete m;
+}
+
+
+void MDCache::handle_inode_sync_recall(MInodeSyncRecall *m)
+{
+  // Recall request for the sync on an inode; acted on only by the authority.
+  CInode *in = get_inode(m->get_ino());
+  if (in == 0) {
+    dout(7) << "handle_sync_recall " << m->get_ino() << ", don't have it, wtf" << endl;
+    assert(0); // shouldn't happen
+    delete m;
+    return;
+  }
+
+  if (!in->is_auth()) {
+    // not the authority: pass the message on via do_ino_proxy (m is not freed here)
+    do_ino_proxy(in, m);
+    return;
+  }
+
+  const bool holding = in->is_syncbyme();
+  const bool pending = !holding && in->is_presync();
+  if (holding) {
+    dout(7) << "handle_sync_recall " << *in << ", releasing" << endl;
+    inode_sync_release(in);
+  } else if (pending) {
+    dout(7) << "handle_sync_recall " << *in << " is presync, flagging" << endl;
+    in->sync_replicawantback = true;   // handle_inode_sync_ack checks this once the sync is acquired
+  } else {
+    dout(7) << "handle_sync_recall " << m->get_ino() << ", not flagged as sync or presync, dropping" << endl;
+  }
+  delete m;
+}
+
+
+
+
+
+
+
+
+
+
+// DIR SYNC
+
+/*
+
+ dir sync
+
+ - these are used when a directory is HASHED only.  namely,
+   - to stat the dir inode we need an accurate directory size  (????)
+   - for a readdir 
+
+*/
+
+void MDCache::dir_sync_start(CDir *dir)
+{
+  // wait for all replicas
+  dout(5) << "sync_start on " << *dir << endl;
+  // preconditions: hashed dir, we are auth, and no sync pending or already held
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(!dir->is_presync());
+  assert(!dir->is_sync());
+  // expect an ack from every MDS in the cluster's mds set; mark PRESYNC and auth_pin the dir
+  dir->sync_waiting_for_ack = mds->get_cluster()->get_mds_set();
+  dir->state_set(CDIR_STATE_PRESYNC);
+  dir->auth_pin();
+  
+  //dir->sync_replicawantback = false;
+
+  // send messages (NOTE(review): if get_mds_set() includes this node, one goes to ourselves - confirm)
+  for (set<int>::iterator it = dir->sync_waiting_for_ack.begin();
+       it != dir->sync_waiting_for_ack.end();
+       it++) {
+    mds->messenger->send_message(new MDirSyncStart(dir->ino(), mds->get_nodeid()),
+                                 MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+  }
+}
+
+
+void MDCache::dir_sync_release(CDir *dir)
+{
+  // stub: releasing a dir sync is not implemented; the body is empty
+
+}
+
+void MDCache::dir_sync_wait(CDir *dir)
+{
+  // stub: waiting on a dir sync is not implemented; the body is empty
+}
+
+
+void handle_dir_sync_start(MDirSyncStart *m)   // NOTE(review): not qualified with MDCache:: like the other handlers - confirm intent
+{   // stub: empty body; unlike the inode handlers, m is not deleted here
+}
+
+
+
+
diff --git a/branches/sage/cephmds2/messages/MAnchorReply.h b/branches/sage/cephmds2/messages/MAnchorReply.h
new file mode 100644
index 0000000000000..0186118f53260
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MAnchorReply.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MANCHORREPLY_H
+#define __MANCHORREPLY_H
+
+#include <vector>
+
+#include "msg/Message.h"
+#include "mds/AnchorTable.h"
+
+#include "MAnchorRequest.h"
+
+
+class MAnchorReply : public Message {
+  int op;
+  inodeno_t ino;
+  vector<Anchor*> trace;   // owned by this message: every element is deleted in the destructor
+
+ public:
+  MAnchorReply() : op(0), ino(0) {}   // fields start zeroed, not indeterminate, until decode_payload() fills them
+  MAnchorReply(MAnchorRequest *req) : Message(MSG_MDS_ANCHORREPLY) {
+    this->op = req->get_op();     // echo the request's op ...
+    this->ino = req->get_ino();   // ... and the ino it was about
+  }
+  ~MAnchorReply() {
+    for (unsigned i=0; i<trace.size(); i++) delete trace[i];
+  }
+  virtual char *get_type_name() { return "arep"; }
+  // shallow copy of the pointers: the message deletes them later, so the caller must not
+  void set_trace(vector<Anchor*>& trace) { this->trace = trace; }
+
+  int get_op() { return op; }
+  inodeno_t get_ino() { return ino; }
+  vector<Anchor*>& get_trace() { return trace; }
+
+  // payload layout: op, ino, anchor count, then each anchor's own encoding
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(op), (char*)&op);
+    off += sizeof(op);
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    int n = 0;
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    for (int i=0; i<n; i++) {
+      Anchor *a = new Anchor;
+      a->_decode(payload, off);
+      trace.push_back(a);   // appended: decoding is expected to start from an empty trace
+    }
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&op, sizeof(op));
+    payload.append((char*)&ino, sizeof(ino));
+    int n = trace.size();
+    payload.append((char*)&n, sizeof(int));
+    for (int i=0; i<n; i++) 
+      trace[i]->_encode(payload);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MAnchorRequest.h b/branches/sage/cephmds2/messages/MAnchorRequest.h
new file mode 100644
index 0000000000000..2a2d0088978b4
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MAnchorRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MANCHORREQUEST_H
+#define __MANCHORREQUEST_H
+
+#include <vector>
+
+#include "msg/Message.h"
+#include "mds/AnchorTable.h"
+
+#define ANCHOR_OP_CREATE   1
+#define ANCHOR_OP_DESTROY  2
+#define ANCHOR_OP_LOOKUP   3
+#define ANCHOR_OP_UPDATE   4
+
+class MAnchorRequest : public Message {
+  int op;                  // one of the ANCHOR_OP_* codes
+  inodeno_t ino;
+  vector<Anchor*> trace;   // owned by this message: every element is deleted in the destructor
+
+ public:
+  MAnchorRequest() : op(0), ino(0) {}   // fields start zeroed, not indeterminate, until decode_payload() fills them
+  MAnchorRequest(int op, inodeno_t ino) : Message(MSG_MDS_ANCHORREQUEST) {
+    this->op = op;
+    this->ino = ino;
+  }
+  ~MAnchorRequest() {
+    for (unsigned i=0; i<trace.size(); i++) delete trace[i];
+  }
+  virtual char *get_type_name() { return "areq"; }
+  // shallow copy of the pointers: the message deletes them later, so the caller must not
+  void set_trace(vector<Anchor*>& trace) { this->trace = trace; }
+
+  int get_op() { return op; }
+  inodeno_t get_ino() { return ino; }
+  vector<Anchor*>& get_trace() { return trace; }
+
+  // payload layout: op, ino, anchor count, then each anchor's own encoding
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(op), (char*)&op);
+    off += sizeof(op);
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    int n = 0;
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    for (int i=0; i<n; i++) {
+      Anchor *a = new Anchor;
+      a->_decode(payload, off);
+      trace.push_back(a);   // appended: decoding is expected to start from an empty trace
+    }
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&op, sizeof(op));
+    payload.append((char*)&ino, sizeof(ino));
+    int n = trace.size();
+    payload.append((char*)&n, sizeof(int));
+    for (int i=0; i<n; i++) 
+      trace[i]->_encode(payload);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MCacheExpire.h b/branches/sage/cephmds2/messages/MCacheExpire.h
new file mode 100644
index 0000000000000..11d941f5131d1
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MCacheExpire.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MCACHEEXPIRE_H
+#define __MCACHEEXPIRE_H
+
+
+// Message carrying two ino -> nonce maps (inodes and dirs) plus a
+// sender id ('from').
+class MCacheExpire : public Message {
+  map<inodeno_t, int> inodes;
+  map<inodeno_t, int> dirs;
+  int from;
+
+  // Inverse of rope_map(): read a count at 'off', then that many
+  // (ino, nonce) records, inserting each into 'mp' and advancing 'off'.
+  void unrope_map(crope& s, int& off, map<inodeno_t,int>& mp) {
+    int count;
+    s.copy(off, sizeof(int), (char*)&count);
+    off += sizeof(int);
+    for (int k = 0; k < count; ++k) {
+      inodeno_t ino;
+      int nonce;
+      s.copy(off, sizeof(ino), (char*)&ino);
+      off += sizeof(ino);
+      s.copy(off, sizeof(int), (char*)&nonce);
+      off += sizeof(int);
+      mp.insert(pair<inodeno_t, int>(ino, nonce));
+    }
+  }
+
+ public:
+  // the default ctor does not set 'from'
+  MCacheExpire() {}
+  MCacheExpire(int from) : Message(MSG_MDS_CACHEEXPIRE) {
+    this->from = from;
+  }
+  virtual char *get_type_name() { return "CEx";}
+
+  // accessors
+  int get_from() { return from; }
+  map<inodeno_t,int>& get_inodes() { return inodes; }
+  map<inodeno_t,int>& get_dirs() { return dirs; }
+
+  // Record an inode / dir with its nonce.  An ino that is already
+  // present keeps its existing nonce (map::insert does not overwrite).
+  void add_inode(inodeno_t ino, int nonce) {
+    inodes.insert(pair<inodeno_t,int>(ino, nonce));
+  }
+  void add_dir(inodeno_t ino, int nonce) {
+    dirs.insert(pair<inodeno_t,int>(ino, nonce));
+  }
+
+  // wire format: from, then the inode map, then the dir map
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(from), (char*)&from);
+    off += sizeof(from);
+    unrope_map(s, off, inodes);
+    unrope_map(s, off, dirs);
+  }
+
+  // Append 'mp' to 's' as a count followed by (ino, nonce) records.
+  void rope_map(crope& s, map<inodeno_t,int>& mp) {
+    int n = mp.size();
+    s.append((char*)&n, sizeof(int));
+    map<inodeno_t,int>::iterator it = mp.begin();
+    while (it != mp.end()) {
+      inodeno_t ino = it->first;
+      int nonce = it->second;
+      s.append((char*)&ino, sizeof(ino));
+      s.append((char*)&nonce, sizeof(nonce));
+      ++it;
+    }
+  }
+
+  // mirror of decode_payload()
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&from, sizeof(from));
+    rope_map(s, inodes);
+    rope_map(s, dirs);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MClientFileCaps.h b/branches/sage/cephmds2/messages/MClientFileCaps.h
new file mode 100644
index 0000000000000..7fde047b02655
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MClientFileCaps.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MCLIENTFILECAPS_H
+#define __MCLIENTFILECAPS_H
+
+#define CLIENT_FILECAP_RELEASE 1  // mds closed the cap
+#define CLIENT_FILECAP_STALE   2  // mds has exported the cap
+#define CLIENT_FILECAP_REAP    3  // mds has imported the cap from get_mds()
+
+class MClientFileCaps : public Message {
+ public:
+  // presumably the values carried in 'special' (they match the CLIENT_FILECAP_* defines)
+  static const int FILECAP_RELEASE = 1;
+  static const int FILECAP_STALE = 2;
+  static const int FILECAP_REAP = 3;
+
+ private:
+  inode_t   inode;
+  int       caps;
+  long      seq;
+  int       wanted;
+  //int       client;
+
+  int       special;   // stale || reap;  in conjunction w/ mds value
+  int       mds;
+
+ public:
+  MClientFileCaps() {}
+  MClientFileCaps(inode_t& inode,
+                  long seq,
+                  int caps,
+                  int wanted,
+                  int special=0,
+                  int mds=0) :
+    Message(MSG_CLIENT_FILECAPS),
+    inode(inode), caps(caps), seq(seq), wanted(wanted),
+    special(special), mds(mds) {
+  }
+  virtual char *get_type_name() { return "Cfcap";}
+
+  // read access
+  inode_t&  get_inode() { return inode; }
+  inodeno_t get_ino() { return inode.ino; }
+  long      get_seq() { return seq; }
+  int       get_caps() { return caps; }
+  int       get_wanted() { return wanted; }
+  //int       get_client() { return client; }
+
+  // for cap migration
+  int       get_mds() { return mds; }
+  int       get_special() { return special; }
+
+  // write access (there are no setters for inode or seq)
+  void set_caps(int c) { caps = c; }
+  void set_wanted(int w) { wanted = w; }
+  void set_mds(int m) { mds = m; }
+  void set_special(int s) { special = s; }
+  //void set_client(int c) { client = c; }
+
+  // wire order: seq, inode, caps, wanted, mds, special, each as raw in-memory bytes.
+  // NOTE(review): seq is a 'long' copied with sizeof(seq), so its size on the
+  // wire follows the host's sizeof(long) - confirm both ends agree.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(seq), (char*)&seq);
+    off += sizeof(seq);
+    s.copy(off, sizeof(inode), (char*)&inode);
+    off += sizeof(inode);
+    s.copy(off, sizeof(caps), (char*)&caps);
+    off += sizeof(caps);
+    s.copy(off, sizeof(wanted), (char*)&wanted);
+    off += sizeof(wanted);
+    //s.copy(off, sizeof(client), (char*)&client);
+    //off += sizeof(client);
+    s.copy(off, sizeof(mds), (char*)&mds);
+    off += sizeof(mds);
+    s.copy(off, sizeof(special), (char*)&special);
+    off += sizeof(special);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&seq, sizeof(seq));
+    s.append((char*)&inode, sizeof(inode));
+    s.append((char*)&caps, sizeof(caps));
+    s.append((char*)&wanted, sizeof(wanted));
+    //s.append((char*)&client, sizeof(client));
+    s.append((char*)&mds,sizeof(mds));
+    s.append((char*)&special,sizeof(special));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MClientInodeAuthUpdate.h b/branches/sage/cephmds2/messages/MClientInodeAuthUpdate.h
new file mode 100644
index 0000000000000..e9083f6abc575
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MClientInodeAuthUpdate.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MCLIENTINODEAUTHUPDATE_H
+#define __MCLIENTINODEAUTHUPDATE_H
+
+// Message carrying an inode number and a new-authority value for it.
+class MClientInodeAuthUpdate : public Message {
+  inodeno_t ino;       // inode concerned
+  int       newauth;   // value reported by get_auth()
+
+ public:
+  MClientInodeAuthUpdate() {}
+  MClientInodeAuthUpdate(inodeno_t ino, int newauth) :
+    Message(MSG_CLIENT_INODEAUTHUPDATE), ino(ino), newauth(newauth) {
+  }
+  virtual char *get_type_name() { return "Ciau";}
+
+  inodeno_t get_ino() { return ino; }
+  int       get_auth() { return newauth; }
+
+  // payload layout: ino followed by newauth, both as raw bytes
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino,sizeof(ino));
+    s.append((char*)&newauth,sizeof(newauth));
+  }
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(newauth), (char*)&newauth);
+    off += sizeof(newauth);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MClientMount.h b/branches/sage/cephmds2/messages/MClientMount.h
new file mode 100644
index 0000000000000..fd253baed0f24
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MClientMount.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MCLIENTMOUNT_H
+#define __MCLIENTMOUNT_H
+
+#include "msg/Message.h"
+
+class MClientMount : public Message {
+  long pcid;
+  int mkfs;
+
+ public:
+  MClientMount() : Message(MSG_CLIENT_MOUNT), pcid(0), mkfs(0) { }
+
+  char *get_type_name() { return "Cmnt"; }
+
+  // pcid accessors
+  long get_pcid() { return pcid; }
+  void set_pcid(long pcid) { this->pcid = pcid; }
+
+  // mkfs flag accessors
+  int get_mkfs() { return mkfs; }
+  void set_mkfs(int m) { mkfs = m; }
+
+  // payload: pcid then mkfs, copied as raw bytes
+  virtual void encode_payload(crope& s) {  
+    s.append((char*)&pcid, sizeof(pcid));
+    s.append((char*)&mkfs, sizeof(mkfs));
+  }
+  virtual void decode_payload(crope& s, int& off) {  
+    s.copy(off, sizeof(pcid), (char*)&pcid);
+    off += sizeof(pcid);
+    s.copy(off, sizeof(mkfs), (char*)&mkfs);
+    off += sizeof(mkfs);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MClientMountAck.h b/branches/sage/cephmds2/messages/MClientMountAck.h
new file mode 100644
index 0000000000000..6b1b7cb2a901b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MClientMountAck.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MCLIENTMOUNTACK_H
+#define __MCLIENTMOUNTACK_H
+
+#include "msg/Message.h"
+#include "MClientMount.h"
+#include "mds/MDSMap.h"
+#include "osd/OSDMap.h"
+
+
+class MClientMountAck : public Message {
+  long pcid;
+  bufferlist osd_map_state;
+  bufferlist mds_map_state;
+
+ public:
+  MClientMountAck() {}
+  // Echoes the mount request's pcid and captures both maps in encoded form.
+  MClientMountAck(MClientMount *mnt, MDSMap *mdsmap, OSDMap *osdmap) :
+    Message(MSG_CLIENT_MOUNTACK), pcid(mnt->get_pcid()) {
+    mdsmap->encode(mds_map_state);
+    osdmap->encode(osd_map_state);
+  }
+  char *get_type_name() { return "CmntA"; }
+
+  long get_pcid() { return pcid; }
+  void set_pcid(long pcid) { this->pcid = pcid; }
+  bufferlist& get_osd_map_state() { return osd_map_state; }
+  bufferlist& get_mds_map_state() { return mds_map_state; }
+
+  // payload: pcid, then the mds map state, then the osd map state
+  virtual void encode_payload() {  
+    payload.append((char*)&pcid, sizeof(pcid));
+    ::_encode( mds_map_state, payload );
+    ::_encode( osd_map_state, payload );
+  }
+  virtual void decode_payload() {  
+    int pos = 0;
+    payload.copy(pos, sizeof(pcid), (char*)&pcid);
+    pos += sizeof(pcid);
+    ::_decode( mds_map_state, payload, pos);
+    ::_decode( osd_map_state, payload, pos);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MClientReply.h b/branches/sage/cephmds2/messages/MClientReply.h
new file mode 100644
index 0000000000000..6206b909b0c05
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MClientReply.h
@@ -0,0 +1,302 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MCLIENTREPLY_H
+#define __MCLIENTREPLY_H
+
+#include "include/types.h"
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "mds/CDir.h"
+#include "mds/CDentry.h"
+
+#include <vector>
+using namespace std;
+
+class CInode;
+
+/***
+ *
+ * MClientReply - container message for MDS reply to a client's MClientRequest
+ *
+ * key fields:
+ *  long tid - transaction id, so the client can match up with pending request
+ *  int result - error code, or fh if it was open
+ *
+ * for most requests:
+ *  trace is a vector of c_inode_info's tracing from root to the file/dir/whatever
+ *  the operation referred to, so that the client can update its info about what
+ *  metadata lives on what MDS.
+ *
+ * for readdir replies:
+ *  dir_contents is a vector of c_inode_info*'s.  
+ * 
+ * that's mostly it, i think!
+ *
+ */
+
+class InodeStat {
+  // Per-inode record placed in client replies: the inode, its symlink target, and MDS distribution hints.
+ public:
+  inode_t inode;
+  string  symlink;   // symlink content (if symlink)
+
+
+  // mds distribution hints
+  int      dir_auth;
+  bool     hashed, replicated;
+  bool     spec_defined;
+  set<int> dist;    // where am i replicated?
+
+ public:
+  InodeStat() : inode(), dir_auth(-1), hashed(false), replicated(false), spec_defined(false) {}  // defined values even before _decode()
+  InodeStat(CInode *in, int whoami) :
+    inode(in->inode)
+  {
+    // inode.mask: start from BASE and add field groups only while the matching lock is readable
+    inode.mask = INODE_MASK_BASE;
+    if (in->filelock.can_read(in->is_auth()))
+      inode.mask |= INODE_MASK_PERM;
+    if (in->hardlock.can_read(in->is_auth()))
+      inode.mask |= INODE_MASK_SIZE | INODE_MASK_MTIME;      // fixme when we separate this out.
+    // NOTE(review): filelock gating PERM and hardlock gating SIZE/MTIME looks swapped - confirm
+    // symlink content?
+    if (in->is_symlink()) 
+      symlink = in->symlink;
+      
+    // replicated where?
+    if (in->dir && in->dir->is_auth()) {
+      spec_defined = true;
+      in->dir->get_dist_spec(this->dist, whoami);
+    } else 
+      spec_defined = false;
+
+    if (in->dir)
+      dir_auth = in->dir->get_dir_auth();
+    else
+      dir_auth = -1;
+
+    // dir info
+    hashed = (in->dir && in->dir->is_hashed());   // FIXME not quite right.
+    replicated = (in->dir && in->dir->is_rep());
+  }
+  
+  void _encode(bufferlist &bl) {
+    bl.append((char*)&inode, sizeof(inode));
+    bl.append((char*)&spec_defined, sizeof(spec_defined));
+    bl.append((char*)&dir_auth, sizeof(dir_auth));
+    bl.append((char*)&hashed, sizeof(hashed));
+    bl.append((char*)&replicated, sizeof(replicated));
+
+    ::_encode(symlink, bl);
+    ::_encode(dist, bl);    // distn
+  }
+  
+  void _decode(bufferlist &bl, int& off) {
+    bl.copy(off, sizeof(inode), (char*)&inode);
+    off += sizeof(inode);
+    bl.copy(off, sizeof(spec_defined), (char*)&spec_defined);
+    off += sizeof(spec_defined);
+    bl.copy(off, sizeof(dir_auth), (char*)&dir_auth);
+    off += sizeof(dir_auth);
+    bl.copy(off, sizeof(hashed), (char*)&hashed);
+    off += sizeof(hashed);
+    bl.copy(off, sizeof(replicated), (char*)&replicated);
+    off += sizeof(replicated);
+
+    ::_decode(symlink, bl, off);
+    ::_decode(dist, bl, off);
+  }
+};
+
+
+typedef struct {
+  long pcid;     // procedure call id, copied from the request
+  long tid;      // transaction id, copied from the request
+  int op;        // op code, copied from the request
+  int result;  // error code
+  unsigned char file_caps;  // for open
+  long          file_caps_seq;
+  __uint64_t file_data_version;  // for client buffercache consistency
+
+  int _num_trace_in;   // number of InodeStats in the encoded trace
+  int _dir_size;       // number of (InodeStat, name) dir entries encoded after the trace
+} MClientReply_st;
+
+class MClientReply : public Message {
+  // reply data
+  MClientReply_st st;
+ 
+  string path;
+  list<InodeStat*> trace_in;   // owned: deleted in the destructor
+  list<string>     trace_dn;   // dentry names linking consecutive trace_in entries
+
+  list<InodeStat*> dir_in;     // owned: deleted in the destructor
+  list<string>     dir_dn;     // one name per dir_in entry
+
+ public:
+  void set_pcid(long pcid) { this->st.pcid = pcid; }
+  long get_pcid() { return st.pcid; }
+
+  long get_tid() { return st.tid; }
+  int get_op() { return st.op; }
+
+  int get_result() { return st.result; }
+  const string& get_path() { return path; }
+
+  inodeno_t get_ino() { return trace_in.back()->inode.ino; }   // NOTE(review): requires a non-empty trace
+  const inode_t& get_inode() { return trace_in.back()->inode; }   // NOTE(review): requires a non-empty trace
+
+  const list<InodeStat*>& get_trace_in() { return trace_in; }
+  const list<string>&     get_trace_dn() { return trace_dn; }
+
+  const list<InodeStat*>& get_dir_in() { return dir_in; }
+  const list<string>&     get_dir_dn() { return dir_dn; }
+
+  unsigned char get_file_caps() { return st.file_caps; }
+  long get_file_caps_seq() { return st.file_caps_seq; }
+  __uint64_t get_file_data_version() { return st.file_data_version; }
+  
+  void set_result(int r) { st.result = r; }
+  void set_file_caps(unsigned char c) { st.file_caps = c; }
+  void set_file_caps_seq(long s) { st.file_caps_seq = s; }
+  void set_file_data_version(__uint64_t v) { st.file_data_version = v; }
+
+  MClientReply() {};
+  MClientReply(MClientRequest *req, int result = 0) : 
+    Message(MSG_CLIENT_REPLY) {
+    memset(&st, 0, sizeof(st));
+    this->st.pcid = req->get_pcid();    // match up procedure call id!!!
+    this->st.tid = req->get_tid();
+    this->st.op = req->get_op();
+    this->path = req->get_path();
+
+    this->st.result = result;
+
+    st._dir_size = 0;
+    st._num_trace_in = 0;
+  }
+  virtual ~MClientReply() {
+    list<InodeStat*>::iterator it;
+    
+    for (it = trace_in.begin(); it != trace_in.end(); ++it) 
+      delete *it;
+    for (it = dir_in.begin(); it != dir_in.end(); ++it) 
+      delete *it;
+  }
+  virtual char *get_type_name() { return "creply"; }
+
+
+  // serialization
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+
+    _decode(path, payload, off);
+
+    for (int i=0; i<st._num_trace_in; ++i) {
+      if (i) {   // every inode but the first is preceded by its dentry name
+        string ref_dn;
+        ::_decode(ref_dn, payload, off);
+        trace_dn.push_back(ref_dn);
+      }        
+      InodeStat *ci = new InodeStat;
+      ci->_decode(payload, off);
+      trace_in.push_back(ci);
+    }
+
+    for (int i=0; i<st._dir_size; ++i) {
+      InodeStat *ci = new InodeStat;
+      ci->_decode(payload, off);
+      dir_in.push_back(ci);
+      string dn;
+      ::_decode(dn, payload, off);
+      dir_dn.push_back(dn);
+    }
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    _encode(path, payload);
+
+    // trace
+    list<string>::iterator pdn = trace_dn.begin();
+    list<InodeStat*>::iterator pin;
+    for (pin = trace_in.begin();
+         pin != trace_in.end();
+         ++pin) {
+      if (pin != trace_in.begin()) {   // mirrors decode: a name precedes every inode except the first
+        ::_encode(*pdn, payload);
+        ++pdn;
+      }
+      (*pin)->_encode(payload);
+    }
+
+    // dir contents
+    pdn = dir_dn.begin();
+    for (pin = dir_in.begin();
+         pin != dir_in.end();
+         ++pin, ++pdn) {
+      (*pin)->_encode(payload);
+      ::_encode(*pdn, payload);
+    }
+  }
+
+  // builders
+  /*
+  void add_dir_item(string& dn, InodeStat *in) {
+    dir_dn.push_back(dn);
+    dir_in.push_back(in);
+    ++st._dir_size;
+    }*/
+  void take_dir_items(list<InodeStat*>& inls,
+                      list<string>& dnls,
+                      int num) {
+    dir_in.swap(inls);   // swap: we take the caller's items, the caller receives our previous contents
+    dir_dn.swap(dnls);
+    st._dir_size = num;  // NOTE(review): trusts the caller; must equal dir_in.size() for decode to match encode - confirm
+  }
+  void copy_dir_items(const list<InodeStat*>& inls,
+                      const list<string>& dnls) {
+    list<string>::const_iterator pdn = dnls.begin();
+    list<InodeStat*>::const_iterator pin = inls.begin();
+    while (pin != inls.end()) {
+      // copy!
+      InodeStat *i = new InodeStat;
+      *i = **pin;
+      dir_in.push_back(i);
+      dir_dn.push_back(*pdn);
+      ++pin;
+      ++pdn;
+      ++st._dir_size;
+    }
+  }
+
+  void set_trace_dist(CInode *in, int whoami) {
+    st._num_trace_in = 0;
+    while (in) {   // walk up via get_parent_inode(), pushing to the front so the trace ends at 'in'
+      // add this inode to trace, along with referring dentry name
+      if (in->get_parent_dn()) 
+        trace_dn.push_front(in->get_parent_dn()->get_name());
+      trace_in.push_front(new InodeStat(in, whoami));
+      ++st._num_trace_in;
+      
+      in = in->get_parent_inode();
+    }
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MClientRequest.h b/branches/sage/cephmds2/messages/MClientRequest.h
new file mode 100644
index 0000000000000..dff2af23deb5f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MClientRequest.h
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MCLIENTREQUEST_H
+#define __MCLIENTREQUEST_H
+
+#include <vector>
+
+#include "msg/Message.h"
+#include "include/filepath.h"
+#include "mds/MDS.h"
+
+/**
+ *
+ * MClientRequest - container for a client METADATA request.  created/sent by clients.  
+ *    can be forwarded around between MDS's.
+ *
+ *   int client - the originating client
+ *   long pcid  - procedure call id, used to match request+response.
+ *   long tid   - transaction id, unique among requests for that client.  probably just a counter!
+ *                -> the MDS passes the Request to the Reply constructor, so this always matches.
+ *  
+ *   int op - the metadata op code.  MDS_OP_RENAME, etc.
+ *   int caller_uid, _gid - guess
+ * 
+ * arguments:  one or more of these are defined, depending on the metadata op:
+ *   inodeno_t ino - used by close(), along with fh.  not strictly necessary except MDS is currently coded lame.
+ *   filepath path - main file argument (almost everything)
+ *   string   sarg - string argument (if a second arg is needed, e.g. rename, symlink)
+ *   int  iarg     - int arg... file mode for open, fh for close, mode for mkdir, etc.
+ *   int  iarg2    - second int arg... gid for chown (iarg is uid)
+ *   time_t targ, targ2  - time args, used by utime
+ *
+ * That's basically it!
+ *  
+ */
+
+
+typedef struct {
+  long tid;                  // transaction id (set_tid/get_tid)
+  int client;                // originating client, set by the constructor
+  int op;                    // metadata op code (MDS_OP_*), set by the constructor
+  
+  entity_inst_t client_inst;
+
+  int caller_uid, caller_gid;
+  inodeno_t ino;
+
+  int    iarg, iarg2;        // generic int arguments; meaning depends on op
+  time_t targ, targ2;        // time arguments
+
+  inodeno_t  mds_wants_replica_in_dirino;
+
+  size_t sizearg;
+} MClientRequest_st;         // MClientRequest copies this struct to/from the payload as raw bytes
+
+
+class MClientRequest : public Message {   // client metadata request; payload = st + path + sarg + sarg2
+  MClientRequest_st st;       // fixed-size fields; encoded as raw bytes
+  filepath path;              // main path argument
+  string sarg;                // first string argument
+  string sarg2;               // second string argument
+
+
+ public:
+  MClientRequest() {}         // does not zero st; decode_payload() overwrites it
+  MClientRequest(int op, int client) : Message(MSG_CLIENT_REQUEST) {
+    memset(&st, 0, sizeof(st));   // zero the whole struct before filling it in
+    this->st.op = op;
+    this->st.client = client;
+    this->st.iarg = 0;            // already zero from the memset above
+  }
+  virtual char *get_type_name() { return "creq"; }
+
+  // keep a pcid (procedure call id) to match up request+reply
+  //void set_pcid(long pcid) { this->st.pcid = pcid; }
+  //long get_pcid() { return st.pcid; }
+
+  // normal fields
+  void set_tid(long t) { st.tid = t; }
+  void set_path(string& p) { path.set_path(p); }
+  void set_path(const char *p) { path.set_path(p); }
+  void set_path(const filepath& fp) { path = fp; }
+  void set_caller_uid(int u) { st.caller_uid = u; }
+  void set_caller_gid(int g) { st.caller_gid = g; }
+  void set_ino(inodeno_t ino) { st.ino = ino; }
+  void set_iarg(int i) { st.iarg = i; }
+  void set_iarg2(int i) { st.iarg2 = i; }
+  void set_targ(time_t& t) { st.targ = t; }
+  void set_targ2(time_t& t) { st.targ2 = t; }
+  void set_sarg(string& arg) { this->sarg = arg; }
+  void set_sarg(const char *arg) { this->sarg = arg; }
+  void set_sarg2(string& arg) { this->sarg2 = arg; }
+  void set_sizearg(size_t s) { st.sizearg = s; }
+  void set_mds_wants_replica_in_dirino(inodeno_t dirino) { 
+    st.mds_wants_replica_in_dirino = dirino; }
+  
+  void set_client_inst(const entity_inst_t& i) { st.client_inst = i; }
+  const entity_inst_t& get_client_inst() { return st.client_inst; }
+
+  int get_client() { return st.client; }
+  long get_tid() { return st.tid; }
+  int get_op() { return st.op; }
+  int get_caller_uid() { return st.caller_uid; }
+  int get_caller_gid() { return st.caller_gid; }
+  inodeno_t get_ino() { return st.ino; }
+  string& get_path() { return path.get_path(); }
+  filepath& get_filepath() { return path; }
+  int get_iarg() { return st.iarg; }
+  int get_iarg2() { return st.iarg2; }
+  time_t get_targ() { return st.targ; }
+  time_t get_targ2() { return st.targ2; }
+  string& get_sarg() { return sarg; }
+  string& get_sarg2() { return sarg2; }
+  size_t get_sizearg() { return st.sizearg; }
+  inodeno_t get_mds_wants_replica_in_dirino() { 
+    return st.mds_wants_replica_in_dirino; }
+
+  virtual void decode_payload() {
+    int off = 0;                                 // running read offset into payload
+    payload.copy(off, sizeof(st), (char*)&st);   // raw struct bytes come first
+    off += sizeof(st);
+    path._decode(payload, off);                  // then path, sarg, sarg2 -- same order as encode_payload()
+    _decode(sarg, payload, off);
+    _decode(sarg2, payload, off);
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));      // raw struct bytes, exactly sizeof(st) of them
+    path._encode(payload);
+    _encode(sarg, payload);
+    _encode(sarg2, payload);
+  }
+
+  void print(ostream& out) {   // writes clientreq(client<id>.<tid>:<op>[=<path>][ <sarg>])
+    out << "clientreq(client" << get_client() 
+	<< "." << get_tid() 
+      //<< ".pcid=" << get_pcid() 
+	<< ":";
+    switch(get_op()) {         // op codes without a case fall to "unknown=<op>"
+    case MDS_OP_STAT: 
+      out << "stat"; break;
+    case MDS_OP_LSTAT: 
+      out << "lstat"; break;
+    case MDS_OP_UTIME: 
+      out << "utime"; break;
+    case MDS_OP_CHMOD: 
+      out << "chmod"; break;
+    case MDS_OP_CHOWN: 
+      out << "chown"; break;
+      
+    case MDS_OP_READDIR: 
+      out << "readdir"; break;
+    case MDS_OP_MKNOD: 
+      out << "mknod"; break;
+    case MDS_OP_LINK: 
+      out << "link"; break;
+    case MDS_OP_UNLINK:
+      out << "unlink"; break;
+    case MDS_OP_RENAME:
+      out << "rename"; break;
+      
+    case MDS_OP_MKDIR: 
+      out << "mkdir"; break;
+    case MDS_OP_RMDIR: 
+      out << "rmdir"; break;
+    case MDS_OP_SYMLINK: 
+      out << "symlink"; break;
+      
+    case MDS_OP_OPEN: 
+      out << "open"; break;
+    case MDS_OP_TRUNCATE: 
+      out << "truncate"; break;
+    case MDS_OP_FSYNC: 
+      out << "fsync"; break;
+    case MDS_OP_RELEASE: 
+      out << "release"; break;
+    default: 
+      out << "unknown=" << get_op();
+    }
+    if (get_path().length())   // path and sarg are printed only when non-empty
+      out << "=" << get_path();
+    if (get_sarg().length())
+      out << " " << get_sarg();
+    out << ")";
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MDentryUnlink.h b/branches/sage/cephmds2/messages/MDentryUnlink.h
new file mode 100644
index 0000000000000..ec1503eeadf00
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MDentryUnlink.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MDENTRYUNLINK_H
+#define __MDENTRYUNLINK_H
+
+// Message (type MSG_MDS_DENTRYUNLINK) naming a dentry by dir ino + name.
+class MDentryUnlink : public Message {
+  inodeno_t dirino;
+  string dn;
+
+ public:
+  MDentryUnlink() {}
+  MDentryUnlink(inodeno_t dirino, string& dn) :
+    Message(MSG_MDS_DENTRYUNLINK), dirino(dirino), dn(dn) {}
+
+  virtual char *get_type_name() { return "Dun";}
+
+  inodeno_t get_dirino() { return dirino; }
+  string& get_dn() { return dn; }
+
+  // payload layout: raw dirino bytes, then the dentry name
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(dirino), (char*)&dirino);
+    off += sizeof(dirino);
+    _unrope(dn, s, off);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&dirino,sizeof(dirino));
+    _rope(dn, s);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MDirExpire.h b/branches/sage/cephmds2/messages/MDirExpire.h
new file mode 100644
index 0000000000000..a81de3d538365
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MDirExpire.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MDIREXPIRE_H
+#define __MDIREXPIRE_H
+
+typedef struct {
+  inodeno_t ino;     // exposed via MDirExpire::get_ino()
+  int nonce;         // exposed via MDirExpire::get_nonce()
+  int from;          // exposed via MDirExpire::get_from()
+} MDirExpire_st;     // MDirExpire copies this struct to/from the rope as raw bytes
+
+// Message (type MSG_MDS_DIREXPIRE) whose payload is one MDirExpire_st.
+class MDirExpire : public Message {
+  MDirExpire_st st;
+
+ public:
+  MDirExpire() {}
+  MDirExpire(inodeno_t ino, int from, int nonce) :
+    Message(MSG_MDS_DIREXPIRE) {
+    this->st.ino = ino;
+    this->st.nonce = nonce;
+    this->st.from = from;
+  }
+  virtual char *get_type_name() { return "DirEx";}
+
+  inodeno_t get_ino() { return st.ino; }
+  int get_nonce() { return st.nonce; }
+  int get_from() { return st.from; }
+  // st is copied to/from the rope as raw bytes
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MDirExpireReq.h b/branches/sage/cephmds2/messages/MDirExpireReq.h
new file mode 100644
index 0000000000000..604a55265c723
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MDirExpireReq.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MDIREXPIREREQ_H
+#define __MDIREXPIREREQ_H
+
+typedef struct {
+  inodeno_t ino;       // exposed via get_ino() of the class below
+  int nonce;           // exposed via get_nonce()
+  int from;            // exposed via get_from()
+} MDirExpireReq_st;    // copied to/from the rope as raw bytes by the class below
+
+class MDirExpire : public Message {   // message type MSG_MDS_DIREXPIREREQ; payload is one MDirExpireReq_st
+  MDirExpireReq_st st;
+  // NOTE(review): MDirExpire.h defines a class with this same name; this header (guard __MDIREXPIREREQ_H) presumably meant MDirExpireReq -- confirm, since including both headers redefines the class.
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+  int get_nonce() { return st.nonce; }
+
+  MDirExpire() {}   // st is left unset here
+  MDirExpire(inodeno_t ino, int from, int nonce) :
+    Message(MSG_MDS_DIREXPIREREQ) {
+    st.ino = ino;
+    st.from = from;
+    st.nonce = nonce;
+  }
+  virtual char *get_type_name() { return "DirExR";}
+  // unlike MDirExpire.h's decode_payload, this one takes no running offset and always reads at 0
+  virtual void decode_payload(crope& s) {
+    s.copy(0, sizeof(st), (char*)&st);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&st,sizeof(st));   // raw struct bytes
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MDirUpdate.h b/branches/sage/cephmds2/messages/MDirUpdate.h
new file mode 100644
index 0000000000000..9bac721654c22
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MDirUpdate.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MDIRUPDATE_H
+#define __MDIRUPDATE_H
+
+#include "msg/Message.h"
+
+typedef struct {
+  inodeno_t ino;
+  int dir_rep;
+  int discover;      // remaining discover attempts: should_discover() tests > 0, tried_discover() decrements
+} MDirUpdate_st;     // MDirUpdate copies this struct to/from the rope as raw bytes
+
+// Message (type MSG_MDS_DIRUPDATE): dir ino, dir_rep, dir_rep_by set, path, and a discover countdown.
+class MDirUpdate : public Message {
+  MDirUpdate_st st;
+  set<int> dir_rep_by;
+  string path;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_dir_rep() { return st.dir_rep; }
+  set<int>& get_dir_rep_by() { return dir_rep_by; } 
+  bool should_discover() { return st.discover > 0; }
+  string& get_path() { return path; }
+
+  // consume one discover attempt; a no-op once the counter is zero
+  void tried_discover() { if (st.discover) st.discover--; }
+
+  MDirUpdate() {}
+  MDirUpdate(inodeno_t ino,
+             int dir_rep,
+             set<int>& dir_rep_by,
+             string& path,
+             bool discover = false) :
+    Message(MSG_MDS_DIRUPDATE) {
+    this->st.ino = ino;
+    this->st.dir_rep = dir_rep;
+    this->dir_rep_by = dir_rep_by;
+    // always assign: st is encoded as raw bytes and read by should_discover()
+    this->st.discover = discover ? 5 : 0;
+    this->path = path;
+  }
+  virtual char *get_type_name() { return "dup"; }
+  // payload layout: raw st bytes, then dir_rep_by, then path
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+    _unrope(dir_rep_by, s, off);
+    _unrope(path, s, off);
+  }
+  virtual void encode_payload(crope& r) {
+    r.append((char*)&st, sizeof(st));
+    _rope(dir_rep_by, r);
+    _rope(path, r);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MDiscover.h b/branches/sage/cephmds2/messages/MDiscover.h
new file mode 100644
index 0000000000000..d207ab28cc143
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MDiscover.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MDISCOVER_H
+#define __MDISCOVER_H
+
+#include "msg/Message.h"
+#include "mds/CDir.h"
+#include "include/filepath.h"
+
+#include <vector>
+#include <string>
+using namespace std;
+
+
+class MDiscover : public Message {   // message type MSG_MDS_DISCOVER
+  int             asker;
+  inodeno_t       base_ino;          // 0 -> none, want root
+  bool            want_base_dir;
+  bool            want_root_inode;   // NOTE(review): set by the constructor but never encoded/decoded and has no accessor
+  
+  filepath        want;   // ... [/]need/this/stuff
+
+ public:
+  int       get_asker() { return asker; }
+  inodeno_t get_base_ino() { return base_ino; }
+  filepath& get_want() { return want; }
+  const string&   get_dentry(int n) { return want[n]; }   // n-th component of want
+  bool      wants_base_dir() { return want_base_dir; }
+
+  MDiscover() { }   // members are left unset here
+  MDiscover(int asker, 
+            inodeno_t base_ino,
+            filepath& want,
+            bool want_base_dir = true,
+            bool want_root_inode = false) :
+    Message(MSG_MDS_DISCOVER) {
+    this->asker = asker;
+    this->base_ino = base_ino;
+    this->want = want;
+    this->want_base_dir = want_base_dir;
+    this->want_root_inode = want_root_inode;
+  }
+  virtual char *get_type_name() { return "Dis"; }
+  // payload layout: asker, base_ino, want_base_dir (one bool), then the want filepath
+  virtual void decode_payload(crope& r, int& off) {
+    r.copy(off, sizeof(asker), (char*)&asker);
+    off += sizeof(asker);
+    r.copy(off, sizeof(base_ino), (char*)&base_ino);
+    off += sizeof(base_ino);
+    r.copy(off, sizeof(bool), (char*)&want_base_dir);
+    off += sizeof(bool);
+    want._unrope(r, off);
+  }
+  virtual void encode_payload(crope& r) {
+    r.append((char*)&asker, sizeof(asker));
+    r.append((char*)&base_ino, sizeof(base_ino));
+    r.append((char*)&want_base_dir, sizeof(want_base_dir));
+    want._rope(r);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MDiscoverReply.h b/branches/sage/cephmds2/messages/MDiscoverReply.h
new file mode 100644
index 0000000000000..78e5d001086ec
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MDiscoverReply.h
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MDISCOVERREPLY_H
+#define __MDISCOVERREPLY_H
+
+#include "msg/Message.h"
+#include "mds/CDir.h"
+#include "mds/CInode.h"
+#include "include/filepath.h"
+
+#include <vector>
+#include <string>
+using namespace std;
+
+#define max(a,b)  ((a)>(b) ? (a):(b))  // function-like macro: the larger argument is evaluated twice; used by get_depth()
+
+
+/**
+ * MDiscoverReply - return new replicas (of inodes, dirs, dentries)
+ *
+ * we group returned items by (dir, dentry, inode).  each
+ * item in each set shares an index (its "depth").
+ *
+ * we can start and end with any type.
+ *   no_base_dir    = true if the first group has an inode but no dir
+ *   no_base_dentry = true if the first group has an inode but no dentry
+ * they are false if there is no returned data, ie the first group is empty.
+ *
+ * we also return errors:
+ *   flag_error_dn (+ error_dentry) - the specified dentry dne
+ *   flag_error_dir                 - the last item wasn't a dir, so we couldn't continue.
+ *
+ * depth() gives us the number of depth units/indices for which we have 
+ * information.  this INCLUDES those for which we have errors but no data.
+ *
+ * see MDCache::handle_discover, handle_discover_reply.
+ *
+  
+ old crap, maybe not accurate:
+
+  // dir [ + ... ]                 : discover want_base_dir=true
+  
+  // dentry [ + inode [ + ... ] ]  : discover want_base_dir=false
+  //                                 no_base_dir=true
+  //  -> we only exclude inode if dentry is null+xlock
+
+  // inode [ + ... ], base_ino = 0 : discover base_ino=0, start w/ root ino,
+  //                                 no_base_dir=no_base_dentry=true
+  
+ * 
+ */
+
+class MDiscoverReply : public Message {
+  inodeno_t    base_ino;
+  bool         no_base_dir;     // no base dir (but IS dentry+inode)
+  bool         no_base_dentry;  // no base dentry (but IS inode)
+  bool        flag_error_dn;
+  bool        flag_error_dir;
+  string      error_dentry;   // dentry that was not found (to trigger waiters on asker)
+
+  // dirs and inodes hold pointers that the destructor deletes.
+  vector<CDirDiscover*>   dirs;      // not inode-aligned if no_base_dir = true.
+  filepath                path;      // not inode-aligned if no_base_dentry = true
+  vector<bool>            path_xlock;  
+  vector<CInodeDiscover*> inodes;
+
+ public:
+  // accessors
+  inodeno_t get_base_ino() { return base_ino; }
+  int       get_num_inodes() { return inodes.size(); }
+  int       get_num_dentries() { return path.depth(); }
+  int       get_num_dirs() { return dirs.size(); }
+
+  int       get_depth() {   // return depth of deepest object (in dir/dentry/inode units)
+    return max( inodes.size(),                                 // at least this many
+           max( no_base_dentry + path.depth() + flag_error_dn, // inode start + path + possible error
+                dirs.size() + no_base_dir ));                  // dn/inode + dirs
+  }
+
+  bool      has_base_dir() { return !no_base_dir && dirs.size(); }
+  bool      has_base_dentry() { return !no_base_dentry && path.depth(); }
+  bool has_root() {   // true exactly when base_ino is 0
+    if (base_ino == 0) {
+      assert(no_base_dir && no_base_dentry);
+      return true;
+    }
+    return false;
+  }
+  const string& get_path() { return path.get_path(); }
+  bool get_path_xlock(int i) { return path_xlock[i]; }   // raw index into path_xlock (no no_base_dentry adjustment)
+
+  //  bool is_flag_forward() { return flag_forward; }
+  bool is_flag_error_dn() { return flag_error_dn; }
+  bool is_flag_error_dir() { return flag_error_dir; }
+  string& get_error_dentry() { return error_dentry; }
+
+  // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set.
+  CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); }
+  const string& get_dentry(int n) { return path[n - no_base_dentry]; }
+  bool get_dentry_xlock(int n) { return path_xlock[n - no_base_dentry]; }
+  CInodeDiscover& get_inode(int n) { return *(inodes[n]); }
+  inodeno_t get_ino(int n) { return inodes[n]->get_ino(); }
+
+  // cons
+  MDiscoverReply() {}   // flags and base_ino are left unset; decode_payload() fills them
+  MDiscoverReply(inodeno_t base_ino) :
+    Message(MSG_MDS_DISCOVERREPLY) {
+    this->base_ino = base_ino;
+    flag_error_dn = false;
+    flag_error_dir = false;
+    no_base_dir = no_base_dentry = false;
+  }
+  ~MDiscoverReply() {   // deletes every object pointed to by dirs and inodes
+    for (vector<CDirDiscover*>::iterator it = dirs.begin();
+         it != dirs.end();
+         it++) 
+      delete *it;
+    for (vector<CInodeDiscover*>::iterator it = inodes.begin();
+         it != inodes.end();
+         it++) 
+      delete *it;
+  }
+  virtual char *get_type_name() { return "DisR"; }
+  
+  // builders
+  bool is_empty() {   // nothing added and no error flag set
+    return dirs.empty() && path.depth() == 0 && 
+      inodes.empty() && 
+      !flag_error_dn &&
+      !flag_error_dir;
+  }
+  void set_path(const filepath& dp) { path = dp; }
+  void add_dentry(const string& dn, bool xlock) { 
+    if (path.depth() == 0 && dirs.empty()) no_base_dir = true;   // first dentry arrives before any dir
+    path.add_dentry(dn);
+    path_xlock.push_back(xlock);
+  }
+
+  void add_inode(CInodeDiscover* din) {
+    if (inodes.empty() && path.depth() == 0) no_base_dir = no_base_dentry = true;   // first inode arrives before any dentry
+    inodes.push_back( din );
+  }
+
+  void add_dir(CDirDiscover* dir) {   // the stored pointer is deleted by the destructor
+    dirs.push_back( dir );
+  }
+
+  //  void set_flag_forward() { flag_forward = true; }
+  void set_flag_error_dn(const string& dn) { 
+    flag_error_dn = true; 
+    error_dentry = dn; 
+  }
+  void set_flag_error_dir() { 
+    flag_error_dir = true; 
+  }
+
+  // wire order: base_ino, no_base_dir, no_base_dentry, flag_error_dn, error_dentry, flag_error_dir,
+  // then counted dirs, counted inodes, the path, and counted path_xlock bools.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(base_ino), (char*)&base_ino);
+    off += sizeof(base_ino);
+    payload.copy(off, sizeof(bool), (char*)&no_base_dir);
+    off += sizeof(bool);
+    payload.copy(off, sizeof(bool), (char*)&no_base_dentry);
+    off += sizeof(bool);
+    //    payload.copy(off, sizeof(bool), (char*)&flag_forward);
+    //off += sizeof(bool);
+    payload.copy(off, sizeof(bool), (char*)&flag_error_dn);
+    off += sizeof(bool);
+    
+    _decode(error_dentry, payload, off);
+    payload.copy(off, sizeof(bool), (char*)&flag_error_dir);
+    off += sizeof(bool);
+    
+    // dirs
+    int n;
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    for (int i=0; i<n; i++) {
+      dirs.push_back( new CDirDiscover() );
+      dirs[i]->_decode(payload, off);   // indexes by i, so dirs is expected to be empty on entry
+    }
+    //dout(12) << n << " dirs out" << endl;
+
+    // inodes
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    for (int i=0; i<n; i++) {
+      inodes.push_back( new CInodeDiscover() );
+      inodes[i]->_decode(payload, off);   // same indexing assumption as dirs above
+    }
+    //dout(12) << n << " inodes out" << endl;
+
+    // filepath
+    path._decode(payload, off);
+    //dout(12) << path.depth() << " dentries out" << endl;
+
+    // path_xlock
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    for (int i=0; i<n; i++) {
+      bool b;
+      payload.copy(off, sizeof(bool), (char*)&b);
+      off += sizeof(bool);
+      path_xlock.push_back(b);
+    }
+  }
+  void encode_payload() {   // writes the fields in the order decode_payload() reads them
+    payload.append((char*)&base_ino, sizeof(base_ino));
+    payload.append((char*)&no_base_dir, sizeof(bool));
+    payload.append((char*)&no_base_dentry, sizeof(bool));
+    //    payload.append((char*)&flag_forward, sizeof(bool));
+    payload.append((char*)&flag_error_dn, sizeof(bool));
+
+    _encode(error_dentry, payload);
+    payload.append((char*)&flag_error_dir, sizeof(bool));
+
+    // dirs
+    int n = dirs.size();
+    payload.append((char*)&n, sizeof(int));
+    for (vector<CDirDiscover*>::iterator it = dirs.begin();
+         it != dirs.end();
+         it++) 
+      (*it)->_encode( payload );
+    //dout(12) << n << " dirs in" << endl;
+    
+    // inodes
+    n = inodes.size();
+    payload.append((char*)&n, sizeof(int));
+    for (vector<CInodeDiscover*>::iterator it = inodes.begin();
+         it != inodes.end();
+         it++) 
+       (*it)->_encode( payload );
+    //dout(12) << n << " inodes in" << endl;
+
+    // path
+    path._encode( payload );
+    //dout(12) << path.depth() << " dentries in" << endl;
+
+    // path_xlock
+    n = path_xlock.size();
+    payload.append((char*)&n, sizeof(int));
+    for (vector<bool>::iterator it = path_xlock.begin();
+         it != path_xlock.end();
+         it++) {
+      bool b = *it;   // copy out first: a vector<bool> element is not an addressable bool
+      payload.append((char*)&b, sizeof(bool));
+    }
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDir.h b/branches/sage/cephmds2/messages/MExportDir.h
new file mode 100644
index 0000000000000..2879579f6929f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDir.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIR_H
+#define __MEXPORTDIR_H
+
+#include "msg/Message.h"
+
+
+// Message (type MSG_MDS_EXPORTDIR) carrying a dir ino, a count of added
+// dir blobs, the list of nested export inos, and the blob data itself.
+class MExportDir : public Message {
+  inodeno_t       ino;
+  int             ndirs;     // incremented by each add_dir()
+  bufferlist      state;     // add_dir() claim_append()s into this
+  list<inodeno_t> exports;
+
+  // hashed pre-discovers
+  //map<inodeno_t, set<string> > hashed_prediscover;
+
+ public:
+  MExportDir() {}
+  MExportDir(CInode *in) :
+    Message(MSG_MDS_EXPORTDIR), ino(in->inode.ino), ndirs(0) {}
+
+  virtual char *get_type_name() { return "Ex"; }
+
+  // accessors
+  inodeno_t        get_ino()     { return ino; }
+  int              get_ndirs()   { return ndirs; }
+  bufferlist&      get_state()   { return state; }
+  list<inodeno_t>& get_exports() { return exports; }
+
+  // builders
+  void add_export(CDir *dir) { exports.push_back(dir->ino()); }
+  void add_dir(bufferlist& dir) {
+    state.claim_append( dir );
+    ++ndirs;
+  }
+
+  // payload layout: ino, ndirs, export count + export inos, state length + state
+  virtual void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(ino), (char*)&ino);
+    pos += sizeof(ino);
+    payload.copy(pos, sizeof(ndirs), (char*)&ndirs);
+    pos += sizeof(ndirs);
+
+    // nested exports: int count followed by that many inos
+    int nex;
+    payload.copy(pos, sizeof(nex), (char*)&nex);
+    pos += sizeof(nex);
+    dout(12) << nex << " nested exports out" << endl;
+    for (int left = nex; left > 0; --left) {
+      inodeno_t bound;
+      payload.copy(pos, sizeof(bound), (char*)&bound);
+      pos += sizeof(bound);
+      exports.push_back(bound);
+    }
+
+    // dir data: size_t length followed by the bytes
+    size_t nbytes;
+    payload.copy(pos, sizeof(nbytes), (char*)&nbytes);
+    pos += sizeof(nbytes);
+    state.substr_of(payload, pos, nbytes);
+    pos += nbytes;
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&ndirs, sizeof(ndirs));
+
+    // nested exports
+    int nex = exports.size();
+    dout(12) << nex << " nested exports in" << endl;
+    payload.append((char*)&nex, sizeof(nex));
+    list<inodeno_t>::iterator p = exports.begin();
+    while (p != exports.end()) {
+      inodeno_t bound = *p;
+      payload.append((char*)&bound, sizeof(bound));
+      ++p;
+    }
+
+    // dir data
+    size_t nbytes = state.length();
+    payload.append((char*)&nbytes, sizeof(nbytes));
+    payload.claim_append(state);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirAck.h b/branches/sage/cephmds2/messages/MExportDirAck.h
new file mode 100644
index 0000000000000..35691bf94e2a7
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirAck.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRACK_H
+#define __MEXPORTDIRACK_H
+
+#include "MExportDir.h"
+
+// Ack for an MExportDir; carries only the ino taken from the request.
+class MExportDirAck : public Message {
+  inodeno_t ino;
+
+ public:
+  MExportDirAck() {}
+  MExportDirAck(MExportDir *req) :
+    Message(MSG_MDS_EXPORTDIRACK), ino(req->get_ino()) {}
+
+  virtual char *get_type_name() { return "ExAck"; }
+
+  inodeno_t get_ino() { return ino; }
+
+  // the payload is just the raw bytes of ino, read back from offset 0
+  virtual void decode_payload(crope& s) {
+    s.copy(0, sizeof(ino), (char*)&ino);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirDiscover.h b/branches/sage/cephmds2/messages/MExportDirDiscover.h
new file mode 100644
index 0000000000000..24f77036455f4
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirDiscover.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRDISCOVER_H
+#define __MEXPORTDIRDISCOVER_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Carries an inode's ino plus the path string filled in by in->make_path().
+class MExportDirDiscover : public Message {
+  inodeno_t ino;
+  string path;
+
+ public:
+  MExportDirDiscover() {}
+  MExportDirDiscover(CInode *in) :
+    Message(MSG_MDS_EXPORTDIRDISCOVER) {
+    in->make_path(this->path);
+    this->ino = in->ino();
+  }
+  virtual char *get_type_name() { return "ExDis"; }
+
+  inodeno_t get_ino() { return ino; }
+  string& get_path() { return path; }
+
+  // payload layout: raw ino bytes, then the path string
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    _unrope(path, s, off);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+    _rope(path, s);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h b/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h
new file mode 100644
index 0000000000000..a25e3b46672e3
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRDISCOVERACK_H
+#define __MEXPORTDIRDISCOVERACK_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Ack message (type MSG_MDS_EXPORTDIRDISCOVERACK) carrying an ino and a success flag.
+class MExportDirDiscoverAck : public Message {
+  inodeno_t ino;
+  bool success;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  bool is_success() { return success; }
+
+  MExportDirDiscoverAck() {}
+  MExportDirDiscoverAck(inodeno_t ino, bool success=true) : 
+    Message(MSG_MDS_EXPORTDIRDISCOVERACK) {
+    this->ino = ino;
+    this->success = success;   // store the caller's flag so is_success() reflects it
+  }
+  virtual char *get_type_name() { return "ExDisA"; }
+
+  // payload layout: raw ino bytes followed by the raw bool
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(success), (char*)&success);
+    off += sizeof(success);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+    s.append((char*)&success, sizeof(success));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirFinish.h b/branches/sage/cephmds2/messages/MExportDirFinish.h
new file mode 100644
index 0000000000000..89c9e5290c4b2
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirFinish.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRFINISH_H
+#define __MEXPORTDIRFINISH_H
+
+#include "MExportDir.h"
+
+// Message (type MSG_MDS_EXPORTDIRFINISH) carrying a single ino.
+class MExportDirFinish : public Message {
+  inodeno_t ino;
+
+ public:
+  MExportDirFinish() {}
+  MExportDirFinish(inodeno_t ino) :
+    Message(MSG_MDS_EXPORTDIRFINISH), ino(ino) {}
+
+  virtual char *get_type_name() { return "ExFin"; }
+
+  inodeno_t get_ino() { return ino; }
+
+  // the payload is the raw bytes of ino
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirNotify.h b/branches/sage/cephmds2/messages/MExportDirNotify.h
new file mode 100644
index 0000000000000..9d6532cad478c
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirNotify.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRNOTIFY_H
+#define __MEXPORTDIRNOTIFY_H
+
+#include "msg/Message.h"
+#include <string>
+using namespace std;
+
+// Constructed with MSG_MDS_EXPORTDIRNOTIFY: carries a dir inode number, old
+// and new authority ids, and two inode lists (export bounds and subdirs).
+class MExportDirNotify : public Message {
+  int       new_auth;
+  int       old_auth;
+  inodeno_t ino;
+
+  list<inodeno_t> exports;  // bounds; these dirs are _not_ included (tho the inodes are)
+  list<inodeno_t> subdirs;
+
+ public:
+  MExportDirNotify() {}
+  MExportDirNotify(inodeno_t ino, int old_auth, int new_auth) :
+    Message(MSG_MDS_EXPORTDIRNOTIFY),
+    new_auth(new_auth), old_auth(old_auth), ino(ino) {}
+
+  virtual char *get_type_name() { return "ExNot"; }
+
+  // accessors
+  inodeno_t get_ino() { return ino; }
+  int get_new_auth() { return new_auth; }
+  int get_old_auth() { return old_auth; }
+  list<inodeno_t>& get_exports() { return exports; }
+  list<inodeno_t>::iterator subdirs_begin() { return subdirs.begin(); }
+  list<inodeno_t>::iterator subdirs_end() { return subdirs.end(); }
+  int num_subdirs() { return subdirs.size(); }
+
+  // Replace the stored lists with copies of the caller's lists.
+  void copy_subdirs(list<inodeno_t>& s) { subdirs = s; }
+  void copy_exports(list<inodeno_t>& ex) { exports = ex; }
+
+  // Payload layout (raw in-memory bytes, in this order):
+  //   new_auth, old_auth, ino,
+  //   int count + that many inodeno_t (exports),
+  //   int count + that many inodeno_t (subdirs).
+  // decode appends to exports/subdirs without clearing them first.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(int), (char*)&new_auth);
+    off += sizeof(int);
+    s.copy(off, sizeof(int), (char*)&old_auth);
+    off += sizeof(int);
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+
+    int count;
+
+    // export bounds
+    s.copy(off, sizeof(int), (char*)&count);
+    off += sizeof(int);
+    for (int left = count; left > 0; left--) {
+      inodeno_t bound;
+      s.copy(off, sizeof(bound), (char*)&bound);
+      off += sizeof(inodeno_t);
+      exports.push_back(bound);
+    }
+
+    // subdirs
+    s.copy(off, sizeof(int), (char*)&count);
+    off += sizeof(int);
+    for (int left = count; left > 0; left--) {
+      inodeno_t sub;
+      s.copy(off, sizeof(sub), (char*)&sub);
+      off += sizeof(inodeno_t);
+      subdirs.push_back(sub);
+    }
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&new_auth, sizeof(int));
+    s.append((char*)&old_auth, sizeof(int));
+    s.append((char*)&ino, sizeof(ino));
+
+    // export bounds
+    int count = exports.size();
+    s.append((char*)&count, sizeof(int));
+    for (list<inodeno_t>::iterator p = exports.begin(); p != exports.end(); ++p) {
+      inodeno_t bound = *p;
+      s.append((char*)&bound, sizeof(bound));
+    }
+
+    // subdirs
+    count = subdirs.size();
+    s.append((char*)&count, sizeof(int));
+    for (list<inodeno_t>::iterator p = subdirs.begin(); p != subdirs.end(); ++p) {
+      inodeno_t sub = *p;
+      s.append((char*)&sub, sizeof(sub));
+    }
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirNotifyAck.h b/branches/sage/cephmds2/messages/MExportDirNotifyAck.h
new file mode 100644
index 0000000000000..3179fd4f544f1
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirNotifyAck.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRNOTIFYACK_H
+#define __MEXPORTDIRNOTIFYACK_H
+
+#include "msg/Message.h"
+#include <string>
+using namespace std;
+
+// Message constructed with MSG_MDS_EXPORTDIRNOTIFYACK; payload is one inode number.
+class MExportDirNotifyAck : public Message {
+  inodeno_t ino;  // the only field carried in the payload
+
+ public:
+  // The default constructor does not assign ino; decode_payload() does.
+  MExportDirNotifyAck() {}
+  MExportDirNotifyAck(inodeno_t ino) :
+    Message(MSG_MDS_EXPORTDIRNOTIFYACK), ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "ExNotA"; }
+
+  // Copy sizeof(ino) raw bytes out of s at off, then advance off past them.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirPrep.h b/branches/sage/cephmds2/messages/MExportDirPrep.h
new file mode 100644
index 0000000000000..6e814212ac98b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirPrep.h
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRPREP_H
+#define __MEXPORTDIRPREP_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// MSG_MDS_EXPORTDIRPREP: export-prep payload for the dir with inode `ino`.
+// The message deletes the CInodeDiscover/CDirDiscover objects it holds.
+class MExportDirPrep : public Message {
+  inodeno_t ino;
+
+  /* nested export discover payload.
+     not all inodes will have dirs; they may require a separate discover.
+     dentries are the links to each inode.
+     dirs map includes base dir (ino)
+  */
+  list<inodeno_t>                exports;
+
+  list<CInodeDiscover*>          inodes;
+  map<inodeno_t,inodeno_t>       inode_dirino;   // inode ino -> dir ino
+  map<inodeno_t,string>          inode_dentry;   // inode ino -> dentry name
+
+  map<inodeno_t,CDirDiscover*>   dirs;           // keyed by dir->get_ino()
+
+  bool b_did_assim;   // not part of the encoded payload
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  list<inodeno_t>& get_exports() { return exports; }
+  list<CInodeDiscover*>& get_inodes() { return inodes; }
+  inodeno_t get_containing_dirino(inodeno_t ino) {
+    return inode_dirino[ino];
+  }
+  string& get_dentry(inodeno_t ino) {
+    return inode_dentry[ino];
+  }
+  bool have_dir(inodeno_t ino) {
+    return dirs.count(ino);
+  }
+  CDirDiscover* get_dir(inodeno_t ino) {
+    // Returns NULL (0) on a miss.  find() rather than operator[]: a miss must
+    // not insert a NULL entry that encode_payload() would count and dereference.
+    map<inodeno_t,CDirDiscover*>::iterator p = dirs.find(ino);
+    return p == dirs.end() ? 0 : p->second;
+  }
+
+  bool did_assim() { return b_did_assim; }
+  void mark_assim() { b_did_assim = true; }
+
+  MExportDirPrep() {
+    b_did_assim = false;
+  }
+  MExportDirPrep(CInode *in) :
+    Message(MSG_MDS_EXPORTDIRPREP) {
+    ino = in->ino();
+    b_did_assim = false;
+  }
+  ~MExportDirPrep() {
+    for (list<CInodeDiscover*>::iterator iit = inodes.begin();
+         iit != inodes.end();
+         iit++)
+      delete *iit;
+    for (map<inodeno_t,CDirDiscover*>::iterator dit = dirs.begin();
+         dit != dirs.end();
+         dit++)
+      delete dit->second;
+  }
+
+  virtual char *get_type_name() { return "ExP"; }
+
+  void add_export(inodeno_t dirino) {
+    exports.push_back( dirino );
+  }
+  void add_inode(inodeno_t dirino, string& dentry, CInodeDiscover *in) {
+    inodes.push_back(in);   // deleted by the destructor
+    inode_dirino.insert(pair<inodeno_t, inodeno_t>(in->get_ino(), dirino));
+    inode_dentry.insert(pair<inodeno_t, string>(in->get_ino(), dentry));
+  }
+  void add_dir(CDirDiscover *dir) {
+    dirs.insert(pair<inodeno_t, CDirDiscover*>(dir->get_ino(), dir));
+  }
+
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    
+    // exports
+    int ne;
+    payload.copy(off, sizeof(int), (char*)&ne);
+    off += sizeof(int);
+    for (int i=0; i<ne; i++) {
+      inodeno_t ino;
+      payload.copy(off, sizeof(ino), (char*)&ino);
+      off += sizeof(ino);
+      exports.push_back(ino);
+    }
+
+    // inodes
+    int ni;
+    payload.copy(off, sizeof(int), (char*)&ni);
+    off += sizeof(int);
+    for (int i=0; i<ni; i++) {
+      // inode
+      CInodeDiscover *in = new CInodeDiscover;
+      in->_decode(payload, off);
+      inodes.push_back(in);
+      
+      // dentry
+      string d;
+      _decode(d, payload, off);
+      inode_dentry[in->get_ino()] = d;
+      
+      // dir ino
+      inodeno_t dino;
+      payload.copy(off, sizeof(dino), (char*)&dino);
+      off += sizeof(dino);
+      inode_dirino[in->get_ino()] = dino;
+    }
+
+    // dirs
+    int nd;
+    payload.copy(off, sizeof(int), (char*)&nd);
+    off += sizeof(int);
+    for (int i=0; i<nd; i++) {
+      CDirDiscover *dir = new CDirDiscover;
+      dir->_decode(payload, off);
+      dirs[dir->get_ino()] = dir;
+    }
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+
+    // exports
+    int ne = exports.size();
+    payload.append((char*)&ne, sizeof(int));
+    for (list<inodeno_t>::iterator it = exports.begin();
+         it != exports.end();
+         it++) {
+      inodeno_t ino = *it;
+      payload.append((char*)&ino, sizeof(ino));
+    }
+
+    // inodes
+    int ni = inodes.size();
+    payload.append((char*)&ni, sizeof(int));
+    for (list<CInodeDiscover*>::iterator iit = inodes.begin();
+         iit != inodes.end();
+         iit++) {
+      (*iit)->_encode(payload);
+      
+      // dentry
+      _encode(inode_dentry[(*iit)->get_ino()], payload);
+
+      // dir ino
+      inodeno_t ino = inode_dirino[(*iit)->get_ino()];
+      payload.append((char*)&ino, sizeof(ino));
+    }
+
+    // dirs
+    int nd = dirs.size();
+    payload.append((char*)&nd, sizeof(int));
+    for (map<inodeno_t,CDirDiscover*>::iterator dit = dirs.begin();
+         dit != dirs.end();
+         dit++)
+      dit->second->_encode(payload);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirPrepAck.h b/branches/sage/cephmds2/messages/MExportDirPrepAck.h
new file mode 100644
index 0000000000000..c32d7255c5074
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirPrepAck.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRPREPACK_H
+#define __MEXPORTDIRPREPACK_H
+
+#include "msg/Message.h"
+#include "include/types.h"
+
+// Message constructed with MSG_MDS_EXPORTDIRPREPACK; payload is one inode number.
+class MExportDirPrepAck : public Message {
+  inodeno_t ino;  // the only field carried in the payload
+
+ public:
+  // The default constructor does not assign ino; decode_payload() does.
+  MExportDirPrepAck() {}
+  MExportDirPrepAck(inodeno_t ino) :
+    Message(MSG_MDS_EXPORTDIRPREPACK), ino(ino) {}
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "ExPAck"; }
+
+  // Copy sizeof(ino) raw bytes out of s at off, then advance off past them.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MExportDirWarning.h b/branches/sage/cephmds2/messages/MExportDirWarning.h
new file mode 100644
index 0000000000000..6f2fdf55dde4f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MExportDirWarning.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MEXPORTDIRWARNING_H
+#define __MEXPORTDIRWARNING_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Message constructed with MSG_MDS_EXPORTDIRWARNING; payload is one inode number.
+class MExportDirWarning : public Message {
+  inodeno_t ino;  // the only field carried in the payload
+
+ public:
+  // The default constructor does not assign ino; decode_payload() does.
+  MExportDirWarning() {}
+  MExportDirWarning(inodeno_t ino) :
+    Message(MSG_MDS_EXPORTDIRWARNING), ino(ino) {}
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "ExW"; }
+
+  // Copy sizeof(ino) raw bytes out of s at off, then advance off past them.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MFailure.h b/branches/sage/cephmds2/messages/MFailure.h
new file mode 100644
index 0000000000000..1663565b692dd
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MFailure.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MFAILURE_H
+#define __MFAILURE_H
+
+#include "msg/Message.h"
+
+
+// Constructed with MSG_FAILURE: carries a msg_addr_t (failed) and an entity_inst_t (inst).
+class MFailure : public Message {
+ public:
+  msg_addr_t failed;
+  entity_inst_t inst;
+
+  // The default constructor assigns neither field explicitly.
+  MFailure() {}
+  MFailure(msg_addr_t f, entity_inst_t& i) :
+    Message(MSG_FAILURE), failed(f), inst(i) {}
+
+  msg_addr_t get_failed() { return failed; }
+  entity_inst_t& get_inst() { return inst; }
+
+  // Payload is the raw bytes of `failed` immediately followed by `inst`.
+  void decode_payload() {
+    const int inst_off = sizeof(failed);
+    payload.copy(0, sizeof(failed), (char*)&failed);
+    payload.copy(inst_off, sizeof(inst), (char*)&inst);
+  }
+  void encode_payload() {
+    payload.append((char*)&failed, sizeof(failed));
+    payload.append((char*)&inst, sizeof(inst));
+  }
+
+  virtual char *get_type_name() { return "fail"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MFailureAck.h b/branches/sage/cephmds2/messages/MFailureAck.h
new file mode 100644
index 0000000000000..ee9a0d04d0fd4
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MFailureAck.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MFAILUREACK_H
+#define __MFAILUREACK_H
+
+#include "MFailure.h"
+
+
+// Constructed with MSG_FAILURE_ACK: carries the `failed` address copied from an MFailure.
+class MFailureAck : public Message {
+ public:
+  msg_addr_t failed;
+  MFailureAck(MFailure *m) : Message(MSG_FAILURE_ACK) {
+    this->failed = m->get_failed();
+  }
+  MFailureAck() {}
+  msg_addr_t get_failed() { return failed; }
+
+  // Reads `failed` from s at the caller's offset and advances off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(failed), (char*)&failed);  // read at off, not at a fixed 0
+    off += sizeof(failed);
+  }
+  // Appends the raw bytes of `failed` to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&failed, sizeof(failed));
+  }
+  virtual char *get_type_name() { return "faila"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MGenericMessage.h b/branches/sage/cephmds2/messages/MGenericMessage.h
new file mode 100644
index 0000000000000..b2f39534e6e23
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MGenericMessage.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MGENERICMESSAGE_H
+#define __MGENERICMESSAGE_H
+
+#include "msg/Message.h"
+
+// Payload-less message for any type code; type name is "generic<type>".
+class MGenericMessage : public Message {
+  char tname[20];
+  //long pcid;
+
+ public:
+  MGenericMessage(int t) : Message(t) {
+    // Bounded write: never runs past tname, whatever get_type() returns.
+    snprintf(tname, sizeof(tname), "generic%d", get_type());
+  }
+  //void set_pcid(long pcid) { this->pcid = pcid; }
+  //long get_pcid() { return pcid; }
+  char *get_type_name() { return tname; }
+
+  virtual void decode_payload() {
+    //int off = 0;
+    //payload.copy(off, sizeof(pcid), (char*)&pcid);
+    //off += sizeof(pcid);
+  }
+  virtual void encode_payload() {
+    //payload.append((char*)&pcid, sizeof(pcid));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDir.h b/branches/sage/cephmds2/messages/MHashDir.h
new file mode 100644
index 0000000000000..ddf7e3ac2bbce
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDir.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHDIR_H
+#define __MHASHDIR_H
+
+#include "msg/Message.h"
+
+// Constructed with MSG_MDS_HASHDIR: carries an inode number, an int counter
+// (nden) and a bufferlist of state.
+class MHashDir : public Message {
+  inodeno_t ino;
+  bufferlist state;
+  int nden;
+
+ public:
+  // The default constructor assigns neither ino nor nden.
+  MHashDir() {}
+  MHashDir(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIR), ino(ino), nden(0) {}
+  virtual char *get_type_name() { return "Ha"; }
+
+  inodeno_t get_ino() { return ino; }
+  int       get_nden() { return nden; }
+  bufferlist& get_state() { return state; }
+  bufferlist* get_state_ptr() { return &state; }
+
+  void set_nden(int n) { nden = n; }
+  void inc_nden() { nden++; }
+
+  // Payload layout: ino, nden, a size_t byte count, then the state bytes.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(ino), (char*)&ino);
+    pos += sizeof(ino);
+    payload.copy(pos, sizeof(nden), (char*)&nden);
+    pos += sizeof(nden);
+    size_t state_len;
+    payload.copy(pos, sizeof(state_len), (char*)&state_len);
+    pos += sizeof(state_len);
+
+    state.substr_of(payload, pos, state_len);
+  }
+  void encode_payload() {
+    size_t state_len = state.length();
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&nden, sizeof(nden));
+    payload.append((char*)&state_len, sizeof(state_len));
+    payload.claim_append(state);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirAck.h b/branches/sage/cephmds2/messages/MHashDirAck.h
new file mode 100644
index 0000000000000..cd6d4da8cf34f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirAck.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHDIRACK_H
+#define __MHASHDIRACK_H
+
+#include "MHashDir.h"
+
+// Message constructed with MSG_MDS_HASHDIRACK; payload is one inode number.
+class MHashDirAck : public Message {
+  inodeno_t ino;  // the only field carried in the payload
+
+ public:
+  // The default constructor does not assign ino; decode_payload() does.
+  MHashDirAck() {}
+  MHashDirAck(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRACK), ino(ino) {}
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "HAck"; }
+
+  // ino is read from the first sizeof(ino) bytes of payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);
+  }
+  // Append the raw bytes of ino to payload.
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirDiscover.h b/branches/sage/cephmds2/messages/MHashDirDiscover.h
new file mode 100644
index 0000000000000..0ea1ff8b79990
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirDiscover.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHDIRDISCOVER_H
+#define __MHASHDIRDISCOVER_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Constructed with MSG_MDS_HASHDIRDISCOVER: carries an inode number and a
+// path string, both obtained from a CInode by the constructor.
+class MHashDirDiscover : public Message {
+  inodeno_t ino;
+  string path;
+
+ public:
+  MHashDirDiscover() {}
+  MHashDirDiscover(CInode *in) :
+    Message(MSG_MDS_HASHDIRDISCOVER) {
+    in->make_path(path);
+    ino = in->ino();
+  }
+
+  inodeno_t get_ino() { return ino; }
+  string& get_path() { return path; }
+  virtual char *get_type_name() { return "HDis"; }
+
+  // Payload layout: raw ino bytes, then path as written by _encode().
+  void decode_payload() {
+    int pos = sizeof(ino);  // path starts right after ino
+    payload.copy(0, sizeof(ino), (char*)&ino);
+    _decode(path, payload, pos);
+  }
+  void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    _encode(path, payload);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirDiscoverAck.h b/branches/sage/cephmds2/messages/MHashDirDiscoverAck.h
new file mode 100644
index 0000000000000..34734af0f97ad
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirDiscoverAck.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHDIRDISCOVERACK_H
+#define __MHASHDIRDISCOVERACK_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Constructed with MSG_MDS_HASHDIRDISCOVERACK: an inode number and a success flag.
+class MHashDirDiscoverAck : public Message {
+  inodeno_t ino;
+  bool success;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  bool is_success() { return success; }
+
+  MHashDirDiscoverAck() {}
+  MHashDirDiscoverAck(inodeno_t ino, bool success=true) :
+    Message(MSG_MDS_HASHDIRDISCOVERACK) {
+    this->ino = ino;
+    this->success = success;   // store the argument, not a hard-coded false
+  }
+  virtual char *get_type_name() { return "HDisA"; }
+
+  // Payload layout: raw ino bytes followed by the raw bytes of success.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    payload.copy(off, sizeof(success), (char*)&success);
+    off += sizeof(success);
+  }
+  void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&success, sizeof(success));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirNotify.h b/branches/sage/cephmds2/messages/MHashDirNotify.h
new file mode 100644
index 0000000000000..ececc3ec2cc65
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirNotify.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHDIRNOTIFY_H
+#define __MHASHDIRNOTIFY_H
+
+#include "msg/Message.h"
+
+// Constructed with MSG_MDS_HASHDIRNOTIFY: carries an inode number and an int `from`.
+class MHashDirNotify : public Message {
+  inodeno_t ino;
+  int from;
+
+ public:
+  // The default constructor assigns neither field; decode_payload() does.
+  MHashDirNotify() {}
+  MHashDirNotify(inodeno_t ino, int from) :
+    Message(MSG_MDS_HASHDIRNOTIFY), ino(ino), from(from) {}
+
+  inodeno_t get_ino() { return ino; }
+  int get_from() { return from; }
+
+  virtual char *get_type_name() { return "HN"; }
+
+  // Payload layout: raw ino bytes immediately followed by raw from bytes.
+  virtual void decode_payload() {
+    const int from_off = sizeof(ino);
+    payload.copy(0, sizeof(ino), (char*)&ino);
+    payload.copy(from_off, sizeof(from), (char*)&from);
+  }
+
+  // Appends ino, then from, to payload.
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&from, sizeof(from));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirPrep.h b/branches/sage/cephmds2/messages/MHashDirPrep.h
new file mode 100644
index 0000000000000..29a42217d6a4b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirPrep.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHDIRPREP_H
+#define __MHASHDIRPREP_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Constructed with MSG_MDS_HASHDIRPREP: an inode number plus a map from dentry
+// name to CInodeDiscover.  The message deletes the mapped objects on destruction.
+class MHashDirPrep : public Message {
+  inodeno_t ino;
+  bool assim;   // set by mark_assim(); never written to the payload
+
+  // subdir dentry names + inodes
+  map<string,CInodeDiscover*>    inodes;
+
+ public:
+  MHashDirPrep() : assim(false) { }
+  MHashDirPrep(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRPREP), ino(ino), assim(false) { }
+  ~MHashDirPrep() {
+    map<string,CInodeDiscover*>::iterator p = inodes.begin();
+    while (p != inodes.end()) {
+      delete p->second;
+      ++p;
+    }
+  }
+
+  inodeno_t get_ino() { return ino; }
+  map<string,CInodeDiscover*>& get_inodes() { return inodes; }
+
+  // mark_assim() asserts that the flag was not already set.
+  bool did_assim() { return assim; }
+  void mark_assim() { assert(!assim); assim = true; }
+
+  virtual char *get_type_name() { return "HP"; }
+
+  // Stores in under dentry; any pointer already stored there is replaced
+  // without being deleted.
+  void add_inode(const string& dentry, CInodeDiscover *in) {
+    inodes[dentry] = in;
+  }
+
+  // Payload layout: ino, an int count, then count (dentry, inode) pairs.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(ino), (char*)&ino);
+    pos += sizeof(ino);
+
+    int remaining;
+    payload.copy(pos, sizeof(int), (char*)&remaining);
+    pos += sizeof(int);
+
+    for (; remaining > 0; remaining--) {
+      string name;                                // dentry
+      _decode(name, payload, pos);
+
+      CInodeDiscover *disc = new CInodeDiscover;  // inode
+      disc->_decode(payload, pos);
+
+      inodes[name] = disc;
+    }
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+
+    int count = inodes.size();
+    payload.append((char*)&count, sizeof(int));
+
+    map<string,CInodeDiscover*>::iterator p;
+    for (p = inodes.begin(); p != inodes.end(); ++p) {
+      _encode(p->first, payload);    // dentry
+      p->second->_encode(payload);   // inode
+    }
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirPrepAck.h b/branches/sage/cephmds2/messages/MHashDirPrepAck.h
new file mode 100644
index 0000000000000..1d0db35c10f88
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirPrepAck.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHDIRPREPACK_H
+#define __MHASHDIRPREPACK_H
+
+#include "msg/Message.h"
+#include "include/types.h"
+
+// Message constructed with MSG_MDS_HASHDIRPREPACK; payload is one inode number.
+class MHashDirPrepAck : public Message {
+  inodeno_t ino;  // the only field carried in the payload
+
+ public:
+  // The default constructor does not assign ino; decode_payload() does.
+  MHashDirPrepAck() {}
+  MHashDirPrepAck(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRPREPACK), ino(ino) {}
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "HPAck"; }
+
+  // ino is read from the first sizeof(ino) bytes of payload.
+  void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);
+  }
+  // Append the raw bytes of ino to payload.
+  void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashReaddir.h b/branches/sage/cephmds2/messages/MHashReaddir.h
new file mode 100644
index 0000000000000..864cb6944aeda
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashReaddir.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHREADDIR_H
+#define __MHASHREADDIR_H
+
+#include "include/types.h"
+#include "msg/Message.h"
+
+// Message constructed with MSG_MDS_HASHREADDIR; payload is one inode number.
+class MHashReaddir : public Message {
+  inodeno_t ino;  // the only field carried in the payload
+
+ public:
+  // The default constructor does not assign ino; decode_payload() does.
+  MHashReaddir() { }
+  MHashReaddir(inodeno_t ino) :
+    Message(MSG_MDS_HASHREADDIR), ino(ino) { }
+
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "Hls"; }
+
+  // ino is read from the first sizeof(ino) bytes of payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);
+  }
+  // Append the raw bytes of ino to payload.
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashReaddirReply.h b/branches/sage/cephmds2/messages/MHashReaddirReply.h
new file mode 100644
index 0000000000000..d9d73d8528f00
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashReaddirReply.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHASHREADDIRREPLY_H
+#define __MHASHREADDIRREPLY_H
+
+#include "MClientReply.h"
+
+// MSG_MDS_HASHREADDIRREPLY: an inode number plus parallel lists of dentry
+// names (dir_dn) and InodeStat pointers (dir_in, deleted by the destructor).
+class MHashReaddirReply : public Message {
+  inodeno_t ino;
+
+  list<InodeStat*> dir_in;
+  list<string>     dir_dn;
+  int num;   // set only by the 4-arg constructor; not part of the payload
+
+ public:
+  MHashReaddirReply() { }
+  MHashReaddirReply(inodeno_t _ino, list<InodeStat*>& inls, list<string>& dnls, int n) :
+    Message(MSG_MDS_HASHREADDIRREPLY),
+    ino(_ino),
+    num(n) {
+    dir_in.swap(inls);   // takes the caller's list contents by swapping
+    dir_dn.swap(dnls);
+  }
+  ~MHashReaddirReply() {
+    for (list<InodeStat*>::iterator it = dir_in.begin(); it != dir_in.end(); it++) 
+      delete *it;
+  }
+
+  inodeno_t get_ino() { return ino; }
+  list<InodeStat*>& get_in() { return dir_in; }
+  list<string>& get_dn() { return dir_dn; }
+
+  virtual char *get_type_name() { return "Hls"; }
+
+  // Payload layout: ino, an int count, then count (dentry name, InodeStat) pairs.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    int n;
+    payload.copy(off, sizeof(n), (char*)&n);   // read the count at off, not at n
+    off += sizeof(n);
+    for (int i=0; i<n; i++) {
+      string dn;
+      ::_decode(dn, payload, off);
+      dir_dn.push_back(dn);
+      InodeStat *ci = new InodeStat;
+      ci->_decode(payload, off);
+      dir_in.push_back(ci);
+    }
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    int n = dir_in.size();                           // FIXME?
+    payload.append((char*)&n, sizeof(n));
+    list<string>::iterator pdn = dir_dn.begin();   // walked in lockstep with dir_in
+    for (list<InodeStat*>::iterator pin = dir_in.begin(); 
+         pin != dir_in.end(); 
+         ++pin, ++pdn) {
+      ::_encode(*pdn, payload);
+      (*pin)->_encode(payload);
+    }
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHeartbeat.h b/branches/sage/cephmds2/messages/MHeartbeat.h
new file mode 100644
index 0000000000000..55455f406ef18
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHeartbeat.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MHEARTBEAT_H
+#define __MHEARTBEAT_H
+
+#include "include/types.h"
+#include "msg/Message.h"
+
+class MHeartbeat : public Message {   // carries an mds_load_t, a beat number and an int->float import map
+  mds_load_t load;
+  int        beat;
+  map<int, float> import_map;   // encoded as an int count followed by (int key, float value) pairs
+
+ public:
+  mds_load_t& get_load() { return load; }
+  int get_beat() { return beat; }
+
+  map<int, float>& get_import_map() {   // mutable reference; the class has no separate setter
+    return import_map;
+  }
+
+  MHeartbeat() {}   // load/beat are not initialized here; decode_payload() fills them
+  MHeartbeat(mds_load_t& load, int beat) :
+    Message(MSG_MDS_HEARTBEAT) {
+    this->load = load;   // parameters shadow the members, hence this->
+    this->beat = beat;
+  }
+
+  virtual char *get_type_name() { return "HB"; }
+
+  virtual void decode_payload(crope& s, int& off) {   // reads from s at off and advances off past what was read
+    s.copy(off,sizeof(load), (char*)&load);   // load and beat are copied as raw in-memory bytes
+    off += sizeof(load);
+    s.copy(off, sizeof(beat), (char*)&beat);
+    off += sizeof(beat);
+
+    int n;   // number of import_map entries that follow
+    s.copy(off, sizeof(n), (char*)&n);
+    off += sizeof(n);
+    while (n-- > 0) {   // "> 0" so a zero or negative (corrupt) count reads no entries
+      int f;
+      s.copy(off, sizeof(f), (char*)&f);
+      off += sizeof(f);
+      float v;
+      s.copy(off, sizeof(v), (char*)&v);
+      off += sizeof(v);      
+      import_map[f] = v;
+    }
+  }
+  virtual void encode_payload(crope& s) {   // appends load, beat, entry count, then each (key, value)
+    s.append((char*)&load, sizeof(load));
+    s.append((char*)&beat, sizeof(beat));
+
+    int n = import_map.size();
+    s.append((char*)&n, sizeof(n));
+    for (map<int, float>::iterator it = import_map.begin();
+         it != import_map.end();
+         it++) {
+      int f = it->first;   // copied to locals so their raw bytes can be appended
+      s.append((char*)&f, sizeof(f));
+      float v = it->second;
+      s.append((char*)&v, sizeof(v));
+    }
+
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeExpire.h b/branches/sage/cephmds2/messages/MInodeExpire.h
new file mode 100644
index 0000000000000..637f378324022
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeExpire.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MINODEEXPIRE_H
+#define __MINODEEXPIRE_H
+
+typedef struct {   // payload of MInodeExpire; copied to/from the wire as one raw block
+  inodeno_t ino;
+  int nonce;
+  int from;
+} MInodeExpire_st;
+
+class MInodeExpire : public Message {   // message wrapping a single MInodeExpire_st
+  MInodeExpire_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+  int get_nonce() { return st.nonce; }
+
+  MInodeExpire() {}   // st is not initialized here; decode_payload() fills it
+  MInodeExpire(inodeno_t ino, int from, int nonce) :   // note: from precedes nonce here, unlike the struct order
+    Message(MSG_MDS_INODEEXPIRE) {
+    st.ino = ino;
+    st.from = from;
+    st.nonce = nonce;
+  }
+  virtual char *get_type_name() { return "InEx";}
+  
+  virtual void decode_payload(crope& s, int& off) {   // copies sizeof(st) bytes from s at off, then advances off
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {   // appends the struct's in-memory bytes as-is
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeFileCaps.h b/branches/sage/cephmds2/messages/MInodeFileCaps.h
new file mode 100644
index 0000000000000..5bd51be0e347b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeFileCaps.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MINODEFILECAPS_H
+#define __MINODEFILECAPS_H
+
+class MInodeFileCaps : public Message {   // carries (ino, from, caps)
+  inodeno_t ino;
+  int       from;
+  int       caps;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  int       get_from() { return from; }
+  int       get_caps() { return caps; }
+
+  MInodeFileCaps() {}   // fields are not initialized here; decode_payload() fills them
+  // from auth
+  MInodeFileCaps(inodeno_t ino, int from, int caps) :
+    Message(MSG_MDS_INODEFILECAPS) {
+
+    this->ino = ino;   // parameters shadow the members, hence this->
+    this->from = from;
+    this->caps = caps;
+  }
+
+  virtual char *get_type_name() { return "Icap";}
+  
+  virtual void decode_payload(crope& s, int& off) {   // wire order is from, ino, caps (not declaration order)
+    s.copy(off, sizeof(from), (char*)&from);
+    off += sizeof(from);
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(caps), (char*)&caps);
+    off += sizeof(caps);
+  }
+  virtual void encode_payload(crope& s) {   // must stay in the same order as decode_payload()
+    s.append((char*)&from, sizeof(from));
+    s.append((char*)&ino, sizeof(ino));
+    s.append((char*)&caps, sizeof(caps));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeLink.h b/branches/sage/cephmds2/messages/MInodeLink.h
new file mode 100644
index 0000000000000..feefc4ea21c7b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeLink.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MINODELINK_H
+#define __MINODELINK_H
+
+typedef struct {   // payload of MInodeLink; copied to/from the wire as one raw block
+  inodeno_t ino;
+  int from;
+} MInodeLink_st;
+
+class MInodeLink : public Message {   // message wrapping a single MInodeLink_st
+  MInodeLink_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+
+  MInodeLink() {}   // st is not initialized here; decode_payload() fills it
+  MInodeLink(inodeno_t ino, int from) :
+    Message(MSG_MDS_INODELINK) {
+    st.ino = ino;
+    st.from = from;
+  }
+  virtual char *get_type_name() { return "InL";}
+  
+  virtual void decode_payload(crope& s, int& off) {   // copies sizeof(st) bytes from s at off, then advances off
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {   // appends the struct's in-memory bytes as-is
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeLinkAck.h b/branches/sage/cephmds2/messages/MInodeLinkAck.h
new file mode 100644
index 0000000000000..987b70741edcb
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeLinkAck.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MINODELINKACK_H
+#define __MINODELINKACK_H
+
+typedef struct {   // payload of MInodeLinkAck; copied to/from the wire as one raw block
+  inodeno_t ino;
+  bool success;   // NOTE(review): any padding after this bool is sent too, since sizeof(st) bytes are appended
+} MInodeLinkAck_st;
+
+class MInodeLinkAck : public Message {   // message wrapping a single MInodeLinkAck_st
+  MInodeLinkAck_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  bool is_success() { return st.success; }
+
+  MInodeLinkAck() {}   // st is not initialized here; decode_payload() fills it
+  MInodeLinkAck(inodeno_t ino, bool success) :
+    Message(MSG_MDS_INODELINKACK) {
+    st.ino = ino;
+    st.success = success;
+  }
+  virtual char *get_type_name() { return "InLA";}
+  
+  virtual void decode_payload(crope& s, int& off) {   // copies sizeof(st) bytes from s at off, then advances off
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {   // appends the struct's in-memory bytes as-is
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeUnlink.h b/branches/sage/cephmds2/messages/MInodeUnlink.h
new file mode 100644
index 0000000000000..e1aa463153c26
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeUnlink.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MINODEUNLINK_H
+#define __MINODEUNLINK_H
+
+typedef struct {   // payload of MInodeUnlink; copied to/from the wire as one raw block
+  inodeno_t ino;
+  int from;
+} MInodeUnlink_st;
+
+class MInodeUnlink : public Message {   // message wrapping a single MInodeUnlink_st
+  MInodeUnlink_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+
+  MInodeUnlink() {}   // st is not initialized here; decode_payload() fills it
+  MInodeUnlink(inodeno_t ino, int from) :
+    Message(MSG_MDS_INODEUNLINK) {
+    st.ino = ino;
+    st.from = from;
+  }
+  virtual char *get_type_name() { return "InUl";}
+  
+  virtual void decode_payload(crope& s, int& off) {   // copies sizeof(st) bytes from s at off, then advances off
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {   // appends the struct's in-memory bytes as-is
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeUnlinkAck.h b/branches/sage/cephmds2/messages/MInodeUnlinkAck.h
new file mode 100644
index 0000000000000..283c016f2bec9
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeUnlinkAck.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MINODEUNLINKACK_H
+#define __MINODEUNLINKACK_H
+
+typedef struct {   // payload of MInodeUnlinkAck; copied to/from the wire as one raw block
+  inodeno_t ino;
+} MInodeUnlinkAck_st;
+
+class MInodeUnlinkAck : public Message {   // message wrapping a single MInodeUnlinkAck_st
+  MInodeUnlinkAck_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+
+  MInodeUnlinkAck() {}   // st is not initialized here; decode_payload() fills it
+  MInodeUnlinkAck(inodeno_t ino) :
+    Message(MSG_MDS_INODEUNLINKACK) {
+    st.ino = ino;
+  }
+  virtual char *get_type_name() { return "InUlA";}
+  
+  virtual void decode_payload(crope& s, int& off) {   // copies sizeof(st) bytes from s at off, then advances off
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {   // appends the struct's in-memory bytes as-is
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeUpdate.h b/branches/sage/cephmds2/messages/MInodeUpdate.h
new file mode 100644
index 0000000000000..bbab924089aa5
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeUpdate.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MINODEUPDATE_H
+#define __MINODEUPDATE_H
+
+#include "msg/Message.h"
+
+#include <set>
+using namespace std;
+
+class MInodeUpdate : public Message {   // carries a nonce plus an opaque encoded inode state blob
+  int nonce;
+  crope inode_basic_state;   // produced by CInode::encode_basic_state() in the constructor below
+
+ public:
+  inodeno_t get_ino() {   // reads the first sizeof(inodeno_t) bytes of the blob as the inode number
+    inodeno_t ino;
+    inode_basic_state.copy(0, sizeof(inodeno_t), (char*)&ino);   // presumably the blob starts with the ino -- verify against encode_basic_state()
+    return ino;
+  }
+  int get_nonce() { return nonce; }
+  
+  MInodeUpdate() {}   // nonce is not initialized here; decode_payload() fills it
+  MInodeUpdate(CInode *in, int nonce) :
+    Message(MSG_MDS_INODEUPDATE) {
+    inode_basic_state = in->encode_basic_state();
+    this->nonce = nonce;   // parameter shadows the member, hence this->
+  }
+  virtual char *get_type_name() { return "Iup"; }
+
+  virtual void decode_payload(crope& s, int& off) {   // wire layout: int nonce, size_t length, then length bytes of state
+    s.copy(off, sizeof(int), (char*)&nonce);
+    off += sizeof(int);
+    size_t len;   // NOTE(review): raw size_t on the wire; its width is platform-dependent
+    s.copy(off, sizeof(len), (char*)&len);
+    off += sizeof(len);
+    inode_basic_state = s.substr(off, len);
+    off += len;
+  }
+  virtual void encode_payload(crope& s) {   // mirror of decode_payload()
+    s.append((char*)&nonce, sizeof(int));
+    size_t len = inode_basic_state.length();
+    s.append((char*)&len, sizeof(len));
+    s.append(inode_basic_state);
+  }
+      
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MLock.h b/branches/sage/cephmds2/messages/MLock.h
new file mode 100644
index 0000000000000..1d22d297d79d4
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MLock.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MLOCK_H
+#define __MLOCK_H
+
+#include "msg/Message.h"
+
+#define LOCK_OTYPE_IHARD  1   // lock object types, stored in MLock::otype
+#define LOCK_OTYPE_IFILE  2
+#define LOCK_OTYPE_DIR    3
+#define LOCK_OTYPE_DN     4
+
+// for replicas
+#define LOCK_AC_SYNC          0
+#define LOCK_AC_MIXED         1
+#define LOCK_AC_LOCK          2
+
+#define LOCK_AC_REQXLOCKACK   9  // req dentry xlock
+#define LOCK_AC_REQXLOCKNAK  10  // req dentry xlock
+#define LOCK_AC_LOCKNAK      12  // for dentry xlock; NOTE(review): 12 satisfies LOCK_AC_FOR_AUTH (>= 11) although listed under "for replicas" -- confirm
+
+
+#define LOCK_AC_FOR_REPLICA(a)  ((a) <= 10)   // action codes 0..10 are classified as replica-bound
+#define LOCK_AC_FOR_AUTH(a)     ((a) >= 11)   // action codes 11 and up are classified as auth-bound
+
+// for auth
+
+#define LOCK_AC_SYNCACK      13
+#define LOCK_AC_MIXEDACK     14
+#define LOCK_AC_LOCKACK      15
+
+
+#define LOCK_AC_REQREAD      19
+#define LOCK_AC_REQWRITE     20
+
+#define LOCK_AC_REQXLOCK     21
+#define LOCK_AC_REQXLOCKC    22 // create if necessary
+#define LOCK_AC_UNXLOCK      23
+
+#define lock_ac_name(x)      // NOTE(review): expands to nothing; looks like an unfinished name-lookup helper
+
+
+class MLock : public Message {   // lock request/ack: action + asker + target object (ino, or dir ino + dentry name)
+  int       asker;  // who is initiating this request
+  int       action;  // action type
+
+  char      otype;  // lock object type
+  inodeno_t ino;    // ino ref, or possibly
+  string    dn;     // dentry name
+  bufferlist data;   // and possibly some data
+  string    path;   // possibly a path too (for dentry lock discovers)
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  string& get_dn() { return dn; }
+  bufferlist& get_data() { return data; }
+  int get_asker() { return asker; }
+  int get_action() { return action; }
+  int get_otype() { return otype; }   // widens the stored char to int
+  string& get_path() { return path; }
+
+  MLock() {}   // scalar fields are not initialized here; decode_payload() fills them
+  MLock(int action, int asker) :   // otype/ino stay unset until one of the set_*() calls below
+    Message(MSG_MDS_LOCK) {
+    this->action = action;
+    this->asker = asker;
+  }
+  virtual char *get_type_name() { return "ILock"; }
+  
+  void set_ino(inodeno_t ino, char ot) {   // target an inode lock; caller supplies the object type
+    otype = ot;
+    this->ino = ino;
+  }
+  void set_dirino(inodeno_t dirino) {   // target a directory lock
+    otype = LOCK_OTYPE_DIR;
+    this->ino = dirino;   // store the argument; assigning `ino` here would only self-assign the member
+  }
+  void set_dn(inodeno_t dirino, string& dn) {   // target a dentry lock: directory ino + dentry name
+    otype = LOCK_OTYPE_DN;
+    this->ino = dirino;
+    this->dn = dn;
+  }
+  void set_data(bufferlist& data) {
+    this->data.claim( data );   // hands the buffers to this message via bufferlist::claim()
+  }
+  void set_path(const string& p) {
+    path = p;
+  }
+  
+  void decode_payload() {   // wire order: action, asker, otype, ino, dn, path, data
+    int off = 0;
+    payload.copy(off,sizeof(action), (char*)&action);
+    off += sizeof(action);
+    payload.copy(off,sizeof(asker), (char*)&asker);
+    off += sizeof(asker);
+    payload.copy(off,sizeof(otype), (char*)&otype);
+    off += sizeof(otype);
+    payload.copy(off,sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    ::_decode(dn, payload, off);   // the _decode helpers advance off themselves
+    ::_decode(path, payload, off);
+    ::_decode(data, payload, off);
+  }
+  virtual void encode_payload() {   // must stay in the same order as decode_payload()
+    payload.append((char*)&action, sizeof(action));
+    payload.append((char*)&asker, sizeof(asker));
+    payload.append((char*)&otype, sizeof(otype));
+    payload.append((char*)&ino, sizeof(inodeno_t));
+    ::_encode(dn, payload);
+    ::_encode(path, payload);
+    ::_encode(data, payload);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMDSBoot.h b/branches/sage/cephmds2/messages/MMDSBoot.h
new file mode 100644
index 0000000000000..c0c554152cc87
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMDSBoot.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MMDSBOOT_H
+#define __MMDSBOOT_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMDSBoot : public Message {   // message of type MSG_MDS_BOOT with an empty payload
+ public:
+  MMDSBoot() : Message(MSG_MDS_BOOT) {
+  }
+
+  char *get_type_name() { return "mdsboot"; }
+  
+  void encode_payload() {   // intentionally writes nothing; the body is commented out
+    //payload.append((char*)&sb, sizeof(sb));
+  }
+  void decode_payload() {   // intentionally reads nothing; the body is commented out
+    //int off = 0;
+    //payload.copy(off, sizeof(sb), (char*)&sb);
+    //off += sizeof(sb);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMDSGetMap.h b/branches/sage/cephmds2/messages/MMDSGetMap.h
new file mode 100644
index 0000000000000..6bb6b92c00ccd
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMDSGetMap.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MMDSGETMAP_H
+#define __MMDSGETMAP_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMDSGetMap : public Message {   // message of type MSG_MDS_GETMAP with an empty payload
+ public:
+  MMDSGetMap() : Message(MSG_MDS_GETMAP) {
+  }
+
+  char *get_type_name() { return "mdsgetmap"; }
+  
+  void encode_payload() {   // intentionally writes nothing; the body is commented out
+    //payload.append((char*)&sb, sizeof(sb));
+  }
+  void decode_payload() {   // intentionally reads nothing; the body is commented out
+    //int off = 0;
+    //payload.copy(off, sizeof(sb), (char*)&sb);
+    //off += sizeof(sb);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMDSMap.h b/branches/sage/cephmds2/messages/MMDSMap.h
new file mode 100644
index 0000000000000..c8dd60abcb331
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMDSMap.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MMDSMAP_H
+#define __MMDSMAP_H
+
+#include "msg/Message.h"
+#include "mds/MDSMap.h"
+
+
+class MMDSMap : public Message {   // carries encoded MDS maps, keyed by epoch
+ public:
+  map<epoch_t, bufferlist> maps;               // epoch -> encoded map (filled via MDSMap::encode in the ctor below)
+  map<epoch_t, bufferlist> incremental_maps;   // epoch -> bufferlist; not populated anywhere in this class
+
+  epoch_t get_first() {   // smallest epoch present in either map, or 0 if both are empty
+    epoch_t e = 0;   // 0 doubles as the "nothing found yet" marker
+    map<epoch_t, bufferlist>::iterator i = maps.begin();
+    if (i != maps.end())  e = i->first;
+    i = incremental_maps.begin();    
+    if (i != incremental_maps.end() &&
+        (e == 0 || i->first < e)) e = i->first;
+    return e;
+  }
+  epoch_t get_last() {   // largest epoch present in either map, or 0 if both are empty
+    epoch_t e = 0;
+    map<epoch_t, bufferlist>::reverse_iterator i = maps.rbegin();
+    if (i != maps.rend())  e = i->first;
+    i = incremental_maps.rbegin();    
+    if (i != incremental_maps.rend() &&
+        (e == 0 || i->first > e)) e = i->first;
+    return e;
+  }
+
+
+  MMDSMap() : 
+    Message(MSG_MDS_MAP) {}
+  MMDSMap(MDSMap *mm) :   // encodes *mm into maps under its own epoch
+    Message(MSG_MDS_MAP) {
+    mm->encode(maps[mm->get_epoch()]);   // operator[] creates the empty bufferlist that encode() fills
+  }
+
+
+  // marshalling
+  virtual void decode_payload() {   // wire order: maps, then incremental_maps
+    int off = 0;
+    ::_decode(maps, payload, off);
+    ::_decode(incremental_maps, payload, off);
+  }
+  virtual void encode_payload() {   // must stay in the same order as decode_payload()
+    ::_encode(maps, payload);
+    ::_encode(incremental_maps, payload);
+  }
+
+  virtual char *get_type_name() { return "mdsmap"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonElectionAck.h b/branches/sage/cephmds2/messages/MMonElectionAck.h
new file mode 100644
index 0000000000000..dbfa30c9cb099
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonElectionAck.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MMONELECTIONACK_H
+#define __MMONELECTIONACK_H
+
+#include "msg/Message.h"
+
+
+class MMonElectionAck : public Message {   // carries two ints: q and refresh_num
+ public:
+  int q;
+  int refresh_num;
+
+  MMonElectionAck() {}   // fields are not initialized here; decode_payload() fills them
+  MMonElectionAck(int _q, int _n) :
+    Message(MSG_MON_ELECTION_ACK),
+    q(_q), refresh_num(_n) {}
+ 
+  void decode_payload() {   // wire order: q, refresh_num (raw ints)
+    int off = 0;
+    payload.copy(off, sizeof(q), (char*)&q);
+    off += sizeof(q);
+    payload.copy(off, sizeof(refresh_num), (char*)&refresh_num);
+    off += sizeof(refresh_num);
+  }
+  void encode_payload() {   // must stay in the same order as decode_payload()
+    payload.append((char*)&q, sizeof(q));
+    payload.append((char*)&refresh_num, sizeof(refresh_num));
+  }
+
+  virtual char *get_type_name() { return "MonElAck"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonElectionCollect.h b/branches/sage/cephmds2/messages/MMonElectionCollect.h
new file mode 100644
index 0000000000000..d91870dfce5c6
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonElectionCollect.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MMONELECTIONCOLLECT_H
+#define __MMONELECTIONCOLLECT_H
+
+#include "msg/Message.h"
+
+
+class MMonElectionCollect : public Message {   // carries a single int, read_num
+ public:
+  int read_num;
+
+  MMonElectionCollect() {}   // read_num is not initialized here; decode_payload() fills it
+  MMonElectionCollect(int n) :
+    Message(MSG_MON_ELECTION_COLLECT),
+    read_num(n) {}
+ 
+  void decode_payload() {   // payload is just the raw int
+    int off = 0;
+    payload.copy(off, sizeof(read_num), (char*)&read_num);
+    off += sizeof(read_num);
+  }
+  void encode_payload() {
+    payload.append((char*)&read_num, sizeof(read_num));
+  }
+
+  virtual char *get_type_name() { return "MonElCollect"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonElectionRefresh.h b/branches/sage/cephmds2/messages/MMonElectionRefresh.h
new file mode 100644
index 0000000000000..497276f06b12f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonElectionRefresh.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MMONELECTIONREFRESH_H
+#define __MMONELECTIONREFRESH_H
+
+#include "msg/Message.h"
+
+#include "mon/Elector.h"
+
+class MMonElectionRefresh : public Message {   // carries p, an Elector::State and refresh_num
+ public:
+  int p;
+  Elector::State state;   // copied to/from the wire as raw bytes; assumes State is plain data -- TODO confirm in mon/Elector.h
+  int refresh_num;
+
+  MMonElectionRefresh() {}   // fields are not initialized here; decode_payload() fills them
+  MMonElectionRefresh(int _p, Elector::State& s, int r) :
+    Message(MSG_MON_ELECTION_REFRESH),
+    p(_p), state(s), refresh_num(r) {}
+ 
+  void decode_payload() {   // wire order: p, state, refresh_num
+    int off = 0;
+    payload.copy(off, sizeof(p), (char*)&p);
+    off += sizeof(p);
+    payload.copy(off, sizeof(state), (char*)&state);
+    off += sizeof(state);
+    payload.copy(off, sizeof(refresh_num), (char*)&refresh_num);
+    off += sizeof(refresh_num);
+  }
+  void encode_payload() {   // must stay in the same order as decode_payload()
+    payload.append((char*)&p, sizeof(p));
+    payload.append((char*)&state, sizeof(state));
+    payload.append((char*)&refresh_num, sizeof(refresh_num));
+  }
+
+  virtual char *get_type_name() { return "MonElRefresh"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonElectionStatus.h b/branches/sage/cephmds2/messages/MMonElectionStatus.h
new file mode 100644
index 0000000000000..071d0fcc82e0a
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonElectionStatus.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MMONELECTIONSTATUS_H
+#define __MMONELECTIONSTATUS_H
+
+#include "msg/Message.h"
+
+#include "mon/Elector.h"
+
+class MMonElectionStatus : public Message {   // carries q, read_num and an int -> Elector::State registry
+ public:
+  int q;
+  int read_num;
+  map<int,Elector::State> registry;
+
+  MMonElectionStatus() {}   // q/read_num are not initialized here; decode_payload() fills them
+  MMonElectionStatus(int _q, int r, map<int,Elector::State> reg) :   // reg is taken by value, then copied into registry
+    Message(MSG_MON_ELECTION_STATUS),
+    q(_q), read_num(r), registry(reg) {}
+ 
+  void decode_payload() {   // wire order: q, read_num, registry
+    int off = 0;
+    payload.copy(off, sizeof(q), (char*)&q);
+    off += sizeof(q);
+    payload.copy(off, sizeof(read_num), (char*)&read_num);
+    off += sizeof(read_num);
+    ::_decode(registry, payload, off);
+  }
+  void encode_payload() {   // must stay in the same order as decode_payload()
+    payload.append((char*)&q, sizeof(q));
+    payload.append((char*)&read_num, sizeof(read_num));
+    ::_encode(registry, payload);
+  }
+
+  virtual char *get_type_name() { return "MonElStatus"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonOSDMapInfo.h b/branches/sage/cephmds2/messages/MMonOSDMapInfo.h
new file mode 100644
index 0000000000000..182b36f0a57cf
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonOSDMapInfo.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MMONOSDMAPINFO_H
+#define __MMONOSDMAPINFO_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMonOSDMapInfo : public Message {   // carries two epochs: epoch and mon_epoch
+ public:
+  epoch_t epoch;
+  epoch_t mon_epoch;
+
+  epoch_t get_epoch() { return epoch; }
+  epoch_t get_mon_epoch() { return mon_epoch; }
+  
+  MMonOSDMapInfo(epoch_t e, epoch_t me) :   // only constructor; there is no default constructor
+    Message(MSG_MON_OSDMAP_UPDATE_PREPARE),   // NOTE(review): constant is named for update-prepare, not for this message -- confirm it is intended
+    epoch(e), mon_epoch(me) {
+  }
+  
+  char *get_type_name() { return "omap_info"; }
+  
+  void encode_payload() {   // wire order: epoch, mon_epoch (raw bytes)
+    payload.append((char*)&epoch, sizeof(epoch));
+    payload.append((char*)&mon_epoch, sizeof(mon_epoch));
+  }
+  void decode_payload() {   // must stay in the same order as encode_payload()
+    int off = 0;
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    payload.copy(off, sizeof(mon_epoch), (char*)&mon_epoch);
+    off += sizeof(mon_epoch);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonOSDMapLease.h b/branches/sage/cephmds2/messages/MMonOSDMapLease.h
new file mode 100644
index 0000000000000..c6112bd898cae
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonOSDMapLease.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MMONOSDMAPLEASE_H
+#define __MMONOSDMAPLEASE_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMonOSDMapLease : public Message {   // carries an epoch and a lease expiry time
+  epoch_t epoch;
+  utime_t lease_expire;   // copied to/from the wire as raw bytes
+
+ public:
+  epoch_t get_epoch() { return epoch; }
+  const utime_t& get_lease_expire() { return lease_expire; }
+
+  MMonOSDMapLease(epoch_t e, utime_t le) :   // only constructor; there is no default constructor
+    Message(MSG_MON_OSDMAP_LEASE),
+    epoch(e), lease_expire(le) {
+  }
+  
+  char *get_type_name() { return "omap_lease"; }
+  
+  void encode_payload() {   // wire order: epoch, lease_expire
+    payload.append((char*)&epoch, sizeof(epoch));
+    payload.append((char*)&lease_expire, sizeof(lease_expire));
+  }
+  void decode_payload() {   // must stay in the same order as encode_payload()
+    int off = 0;
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    payload.copy(off, sizeof(lease_expire), (char*)&lease_expire);
+    off += sizeof(lease_expire);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonOSDMapLeaseAck.h b/branches/sage/cephmds2/messages/MMonOSDMapLeaseAck.h
new file mode 100644
index 0000000000000..85d5ea7c02809
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonOSDMapLeaseAck.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MMONOSDMAPLEASEACK_H
+#define __MMONOSDMAPLEASEACK_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMonOSDMapLeaseAck : public Message {   // carries a single epoch
+  epoch_t epoch;
+
+public:
+  epoch_t get_epoch() { return epoch; }
+  
+  MMonOSDMapLeaseAck(epoch_t e) :   // only constructor; there is no default constructor
+    Message(MSG_MON_OSDMAP_LEASE_ACK),
+    epoch(e) {
+  }
+  
+  char *get_type_name() { return "omap_lease_ack"; }
+  
+  void encode_payload() {   // payload is just the raw epoch bytes
+    payload.append((char*)&epoch, sizeof(epoch));
+  }
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonOSDMapUpdateAck.h b/branches/sage/cephmds2/messages/MMonOSDMapUpdateAck.h
new file mode 100644
index 0000000000000..8673788f0632f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonOSDMapUpdateAck.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MMONOSDMAPUPDATEACK_H
+#define __MMONOSDMAPUPDATEACK_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMonOSDMapUpdateAck : public Message {   // carries a single epoch
+public:
+  epoch_t epoch;
+  
+  MMonOSDMapUpdateAck(epoch_t e) :   // only constructor; there is no default constructor
+    Message(MSG_MON_OSDMAP_UPDATE_ACK),
+    epoch(e) {
+  }
+  
+  char *get_type_name() { return "omap_update_ack"; }
+  
+  void encode_payload() {   // payload is just the raw epoch bytes
+    payload.append((char*)&epoch, sizeof(epoch));
+  }
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonOSDMapUpdateCommit.h b/branches/sage/cephmds2/messages/MMonOSDMapUpdateCommit.h
new file mode 100644
index 0000000000000..6f12a8e3c784d
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonOSDMapUpdateCommit.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MMONOSDMAPUPDATECOMMIT_H
+#define __MMONOSDMAPUPDATECOMMIT_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMonOSDMapUpdateCommit : public Message {
+ public:
+  epoch_t epoch;
+
+  MMonOSDMapUpdateCommit(epoch_t e)
+    : Message(MSG_MON_OSDMAP_UPDATE_COMMIT), epoch(e) {}
+
+  char *get_type_name() { return "omap_update_commit"; }
+
+  // Serialized form is nothing but the raw bytes of `epoch`.
+  void encode_payload() {
+    payload.append((char*)&epoch, sizeof(epoch));
+  }
+
+  // Only one field to read, so it is fetched straight from offset 0
+  // without keeping a running offset.
+  void decode_payload() {
+    payload.copy(0, sizeof(epoch), (char*)&epoch);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonOSDMapUpdatePrepare.h b/branches/sage/cephmds2/messages/MMonOSDMapUpdatePrepare.h
new file mode 100644
index 0000000000000..bc962ea2b3eb2
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonOSDMapUpdatePrepare.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MMONOSDMAPUPDATEPREPARE_H
+#define __MMONOSDMAPUPDATEPREPARE_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMonOSDMapUpdatePrepare : public Message {
+ public:
+  epoch_t epoch;
+  bufferlist map_bl;      // second on the wire, after epoch
+  bufferlist inc_map_bl;  // last on the wire
+
+  MMonOSDMapUpdatePrepare(epoch_t e, bufferlist& mbl, bufferlist& incmbl)
+    : Message(MSG_MON_OSDMAP_UPDATE_PREPARE),
+      epoch(e),
+      map_bl(mbl),
+      inc_map_bl(incmbl) {}
+
+  epoch_t get_epoch() { return epoch; }
+
+  char *get_type_name() { return "omap_update_prepare"; }
+
+  // Layout: raw epoch bytes, then map_bl, then inc_map_bl.
+  void encode_payload() {
+    payload.append((char*)&epoch, sizeof(epoch));
+    ::_encode(map_bl, payload);
+    ::_encode(inc_map_bl, payload);
+  }
+  // Mirror of encode_payload(); `pos` is handed to each ::_decode in turn.
+  void decode_payload() {
+    payload.copy(0, sizeof(epoch), (char*)&epoch);
+    int pos = sizeof(epoch);
+    ::_decode(map_bl, payload, pos);
+    ::_decode(inc_map_bl, payload, pos);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MNSConnect.h b/branches/sage/cephmds2/messages/MNSConnect.h
new file mode 100644
index 0000000000000..28150f79d8476
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MNSConnect.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MNSCONNECT_H
+#define __MNSCONNECT_H
+
+#include "msg/Message.h"
+#include "msg/tcp.h"
+
+class MNSConnect : public Message {
+  tcpaddr_t tcpaddr;
+
+ public:
+  // The no-argument form does not assign tcpaddr; decode_payload() does.
+  MNSConnect() {}
+  MNSConnect(tcpaddr_t t) : Message(MSG_NS_CONNECT), tcpaddr(t) {}
+
+  tcpaddr_t& get_addr() { return tcpaddr; }
+
+  char *get_type_name() { return "NSCon"; }
+
+  // Appends the raw bytes of tcpaddr and nothing else.
+  void encode_payload() {
+    payload.append((char*)&tcpaddr, sizeof(tcpaddr));
+  }
+  // Overwrites tcpaddr with sizeof(tcpaddr) bytes from payload offset 0.
+  void decode_payload() {
+    payload.copy(0, sizeof(tcpaddr), (char*)&tcpaddr);
+  }
+};
+
+
+#endif
+
diff --git a/branches/sage/cephmds2/messages/MNSConnectAck.h b/branches/sage/cephmds2/messages/MNSConnectAck.h
new file mode 100644
index 0000000000000..696b13f2a41e6
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MNSConnectAck.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MNSCONNECTACK_H
+#define __MNSCONNECTACK_H
+
+#include "msg/Message.h"
+#include "msg/TCPMessenger.h"
+
+class MNSConnectAck : public Message {
+  int rank;
+  int inst;
+
+ public:
+  // No-argument form does not assign rank/inst; decode_payload() does.
+  MNSConnectAck() {}
+  MNSConnectAck(int r, int g=0)
+    : Message(MSG_NS_CONNECTACK), rank(r), inst(g) {}
+
+  int get_rank() { return rank; }
+  int get_inst() { return inst; }
+
+  char *get_type_name() { return "NSConA"; }
+
+  // Wire layout: rank then inst, each as a raw host-format int.
+  void encode_payload() {
+    payload.append((char*)&rank, sizeof(rank));
+    payload.append((char*)&inst, sizeof(inst));
+  }
+
+  // Reads the two ints back in the order they were appended.
+  void decode_payload() {
+    unsigned pos = 0;
+    payload.copy(pos, sizeof(rank), (char*)&rank);
+    pos += sizeof(rank);
+    payload.copy(pos, sizeof(inst), (char*)&inst);
+  }
+};
+
+
+#endif
+
diff --git a/branches/sage/cephmds2/messages/MNSFailure.h b/branches/sage/cephmds2/messages/MNSFailure.h
new file mode 100644
index 0000000000000..405bfcfd2dacb
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MNSFailure.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MNSFAILURE_H
+#define __MNSFAILURE_H
+
+#include "msg/Message.h"
+#include "msg/tcp.h"
+
+class MNSFailure : public Message {
+  //msg_addr_t    entity;   // disabled field, kept for reference
+  entity_inst_t inst;       // the only field currently put on the wire
+
+ public:
+  MNSFailure() {}
+  MNSFailure(entity_inst_t& i) :
+    Message(MSG_NS_FAILURE),
+    //entity(w), 
+    inst(i) {}
+  // Short type tag returned for this message.
+  char *get_type_name() { return "NSFail"; }
+
+  //msg_addr_t &get_entity() { return entity; }
+  entity_inst_t &get_inst() { return inst; }  // mutable reference to the stored inst
+
+  void encode_payload() {  // appends the raw bytes of inst
+    //payload.append((char*)&entity, sizeof(entity));
+    payload.append((char*)&inst, sizeof(inst));
+  }
+  void decode_payload() {  // reads inst back from offset 0
+    unsigned off = 0;
+    //payload.copy(off, sizeof(entity), (char*)&entity);
+    //off += sizeof(entity);
+    payload.copy(off, sizeof(inst), (char*)&inst);
+    off += sizeof(inst);  // not read again; keeps the running-offset pattern
+  }
+};
+
+
+#endif
+
diff --git a/branches/sage/cephmds2/messages/MNSLookup.h b/branches/sage/cephmds2/messages/MNSLookup.h
new file mode 100644
index 0000000000000..cbea43092908a
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MNSLookup.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MNSLOOKUP_H
+#define __MNSLOOKUP_H
+
+#include "msg/Message.h"
+
+class MNSLookup : public Message {
+  // Sole field; sent as raw bytes.
+  msg_addr_t entity;
+
+ public:
+  MNSLookup() {}
+  MNSLookup(msg_addr_t e) : Message(MSG_NS_LOOKUP), entity(e) {}
+
+  // Returns a copy of the stored entity.
+  msg_addr_t get_entity() { return entity; }
+
+  char *get_type_name() { return "NSLook"; }
+
+  // Appends just the raw bytes of `entity`.
+  void encode_payload() {
+    payload.append((char*)&entity, sizeof(entity));
+  }
+
+  // Restores `entity` from the first sizeof(entity) payload bytes.
+  void decode_payload() {
+    payload.copy(0, sizeof(entity), (char*)&entity);
+  }
+};
+
+
+#endif
+
diff --git a/branches/sage/cephmds2/messages/MNSLookupReply.h b/branches/sage/cephmds2/messages/MNSLookupReply.h
new file mode 100644
index 0000000000000..e87b48435c92a
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MNSLookupReply.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MNSLOOKUPREPLY_H
+#define __MNSLOOKUPREPLY_H
+
+#include "msg/Message.h"
+#include "msg/TCPMessenger.h"
+
+class MNSLookupReply : public Message {
+ public:
+  // Public so callers fill and read it directly; no constructor touches it.
+  map<msg_addr_t, entity_inst_t> entity_map;
+
+  MNSLookupReply() {}
+  // `m` is accepted but never read here.
+  MNSLookupReply(MNSLookup *m) : Message(MSG_NS_LOOKUPREPLY) {}
+
+  char *get_type_name() { return "NSLookR"; }
+
+  // The encoded entity_map is the only thing appended to the payload.
+  void encode_payload() {
+    ::_encode(entity_map, payload);
+  }
+  void decode_payload() {
+    int start = 0;
+    ::_decode(entity_map, payload, start);
+  }
+};
+
+
+#endif
+
diff --git a/branches/sage/cephmds2/messages/MNSRegister.h b/branches/sage/cephmds2/messages/MNSRegister.h
new file mode 100644
index 0000000000000..9af0dd15aa1dc
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MNSRegister.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MNSREGISTER_H
+#define __MNSREGISTER_H
+
+#include "msg/Message.h"
+#include "msg/TCPMessenger.h"
+
+class MNSRegister : public Message {
+  msg_addr_t addr;
+  int rank;
+  long tid;
+
+ public:
+  MNSRegister() {}
+  // `ti` is a long so it matches the `tid` field and get_tid(); the old
+  // int parameter truncated wider ids before they were ever stored.
+  MNSRegister(msg_addr_t a, int r, long ti)
+    : Message(MSG_NS_REGISTER), addr(a), rank(r), tid(ti) {}
+
+  char *get_type_name() { return "NSReg"; }
+
+  msg_addr_t get_entity() { return addr; }
+  int get_rank() { return rank; }
+  long get_tid() { return tid; }
+
+  // Wire layout: addr, rank, tid -- raw host-format bytes, in that order.
+  void encode_payload() {
+    payload.append((char*)&addr, sizeof(addr));
+    payload.append((char*)&rank, sizeof(rank));
+    payload.append((char*)&tid, sizeof(tid));
+  }
+  // Reads the three fields back in the order encode_payload() wrote them.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(addr), (char*)&addr);
+    off += sizeof(addr);
+    payload.copy(off, sizeof(rank), (char*)&rank);
+    off += sizeof(rank);
+    payload.copy(off, sizeof(tid), (char*)&tid);
+    // tid is the last field, so the offset is not advanced again.
+  }
+};
+
+
+#endif
+
diff --git a/branches/sage/cephmds2/messages/MNSRegisterAck.h b/branches/sage/cephmds2/messages/MNSRegisterAck.h
new file mode 100644
index 0000000000000..54e4b93db2118
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MNSRegisterAck.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MNSREGISTERACK_H
+#define __MNSREGISTERACK_H
+
+#include "msg/Message.h"
+#include "msg/TCPMessenger.h"
+
+class MNSRegisterAck : public Message {
+  msg_addr_t entity;
+  long tid;
+
+ public:
+  MNSRegisterAck() {}
+  // Note the argument order: tid first, then the entity.
+  MNSRegisterAck(long t, msg_addr_t e)
+    : Message(MSG_NS_REGISTERACK), entity(e), tid(t) {}
+
+  msg_addr_t get_entity() { return entity; }
+  long get_tid() { return tid; }
+
+  char *get_type_name() { return "NSRegA"; }
+
+  // On the wire the entity comes first and the tid second (raw bytes).
+  void encode_payload() {
+    payload.append((char*)&entity, sizeof(entity));
+    payload.append((char*)&tid, sizeof(tid));
+  }
+
+  // Walks the payload with a running cursor, mirroring encode_payload().
+  void decode_payload() {
+    int cursor = 0;
+    payload.copy(cursor, sizeof(entity), (char*)&entity);
+    cursor += sizeof(entity);
+    payload.copy(cursor, sizeof(tid), (char*)&tid);
+  }
+};
+
+
+#endif
+
diff --git a/branches/sage/cephmds2/messages/MOSDBoot.h b/branches/sage/cephmds2/messages/MOSDBoot.h
new file mode 100644
index 0000000000000..17604282b0635
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDBoot.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MOSDBOOT_H
+#define __MOSDBOOT_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MOSDBoot : public Message {
+ public:
+  OSDSuperblock sb;  // sent verbatim as sizeof(sb) raw bytes
+
+  MOSDBoot() {}
+
+  // Stores a copy of `s`.
+  MOSDBoot(OSDSuperblock& s) : Message(MSG_OSD_BOOT), sb(s) {}
+
+  char *get_type_name() { return "oboot"; }
+
+  // Appends the raw bytes of `sb` to the payload.
+  void encode_payload() {
+    payload.append((char*)&sb, sizeof(sb));
+  }
+
+  // Overwrites `sb` with the first sizeof(sb) bytes of the payload.
+  void decode_payload() {
+    payload.copy(0, sizeof(sb), (char*)&sb);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDFailure.h b/branches/sage/cephmds2/messages/MOSDFailure.h
new file mode 100644
index 0000000000000..7dd75758ff0d6
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDFailure.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDFAILURE_H
+#define __MOSDFAILURE_H
+
+#include "msg/Message.h"
+
+
+class MOSDFailure : public Message {
+ public:
+  msg_addr_t    failed;
+  entity_inst_t inst;
+  epoch_t       epoch;
+
+  MOSDFailure() {}
+  MOSDFailure(msg_addr_t f, const entity_inst_t& i, epoch_t e)
+    : Message(MSG_OSD_FAILURE), failed(f), inst(i), epoch(e) {}
+
+  virtual char *get_type_name() { return "osdfail"; }
+
+  msg_addr_t get_failed() { return failed; }
+  entity_inst_t& get_inst() { return inst; }
+  epoch_t get_epoch() { return epoch; }
+
+  // Appends failed, inst and epoch as raw bytes, in that order.
+  void encode_payload() {
+    payload.append((char*)&failed, sizeof(failed));
+    payload.append((char*)&inst, sizeof(inst));
+    payload.append((char*)&epoch, sizeof(epoch));
+  }
+  // Reads the same three fields back with a running offset.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(failed), (char*)&failed);
+    pos += sizeof(failed);
+    payload.copy(pos, sizeof(inst), (char*)&inst);
+    pos += sizeof(inst);
+    payload.copy(pos, sizeof(epoch), (char*)&epoch);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDGetMap.h b/branches/sage/cephmds2/messages/MOSDGetMap.h
new file mode 100644
index 0000000000000..58afd527bda93
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDGetMap.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MOSDGETMAP_H
+#define __MOSDGETMAP_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MOSDGetMap : public Message {
+ public:
+  // Also readable through get_since().
+  epoch_t since;
+
+  // Also serves as the default constructor, since `s` defaults to 0;
+  // a separate MOSDGetMap() form is not declared.
+  MOSDGetMap(epoch_t s=0) : Message(MSG_OSD_GETMAP), since(s) {}
+
+  epoch_t get_since() { return since; }
+
+  char *get_type_name() { return "getomap"; }
+
+  // Appends the raw bytes of `since`.
+  void encode_payload() {
+    payload.append((char*)&since, sizeof(since));
+  }
+
+  // Reads `since` back from payload offset 0.
+  void decode_payload() {
+    payload.copy(0, sizeof(since), (char*)&since);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDIn.h b/branches/sage/cephmds2/messages/MOSDIn.h
new file mode 100644
index 0000000000000..276a930d2e00b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDIn.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MOSDIN_H
+#define __MOSDIN_H
+
+#include "msg/Message.h"
+
+
+class MOSDIn : public Message {
+ public:
+  epoch_t map_epoch;
+
+  // The no-argument form does not initialize map_epoch.
+  MOSDIn() {}
+  MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) {}
+
+  virtual char *get_type_name() { return "oin"; }
+
+  // Appends the raw bytes of map_epoch.
+  virtual void encode_payload() {
+    payload.append((char*)&map_epoch, sizeof(map_epoch));
+  }
+
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(map_epoch), (char*)&map_epoch);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDMap.h b/branches/sage/cephmds2/messages/MOSDMap.h
new file mode 100644
index 0000000000000..dd231a831d63d
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDMap.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDGETMAPACK_H
+#define __MOSDGETMAPACK_H
+
+#include "msg/Message.h"
+#include "osd/OSDMap.h"
+
+
+class MOSDMap : public Message {
+ public:
+  map<epoch_t, bufferlist> maps;
+  map<epoch_t, bufferlist> incremental_maps;
+
+  // Lowest epoch key found in either map, or 0 when both are empty.
+  // A key of 0 taken from `maps` is treated like "nothing found yet".
+  epoch_t get_first() {
+    epoch_t lowest = 0;
+    if (!maps.empty()) lowest = maps.begin()->first;
+    if (!incremental_maps.empty()) {
+      epoch_t inc = incremental_maps.begin()->first;
+      if (lowest == 0 || inc < lowest) lowest = inc;
+    }
+    return lowest;
+  }
+  // Highest epoch key found in either map, or 0 when both are empty.
+  epoch_t get_last() {
+    epoch_t highest = 0;
+    if (!maps.empty()) highest = maps.rbegin()->first;
+    if (!incremental_maps.empty()) {
+      epoch_t inc = incremental_maps.rbegin()->first;
+      if (highest == 0 || inc > highest) highest = inc;
+    }
+    return highest;
+  }
+
+  MOSDMap() : Message(MSG_OSD_MAP) {}
+  // Encodes *oc into maps[] under the epoch oc reports for itself.
+  MOSDMap(OSDMap *oc) : Message(MSG_OSD_MAP) {
+    oc->encode(maps[oc->get_epoch()]);
+  }
+
+  // marshalling: `maps` first, then `incremental_maps`
+  virtual void encode_payload() {
+    ::_encode(maps, payload);
+    ::_encode(incremental_maps, payload);
+  }
+  virtual void decode_payload() {
+    int pos = 0;
+    ::_decode(maps, payload, pos);
+    ::_decode(incremental_maps, payload, pos);
+  }
+
+  virtual char *get_type_name() { return "omap"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDOp.h b/branches/sage/cephmds2/messages/MOSDOp.h
new file mode 100644
index 0000000000000..1297c764402d2
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDOp.h
@@ -0,0 +1,214 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDOP_H
+#define __MOSDOP_H
+
+#include "msg/Message.h"
+
+/*
+ * OSD op
+ *
+ * oid - object id
+ * op  - OSD_OP_DELETE, etc.
+ *
+ */
+
+//#define OSD_OP_MKFS       20   (disabled; 20 is now OSD_OP_WRLOCK below)
+
+// client ops
+#define OSD_OP_READ       1
+#define OSD_OP_STAT       2
+// 10-14: wrnoop / write / delete / truncate / zero
+#define OSD_OP_WRNOOP     10
+#define OSD_OP_WRITE      11
+#define OSD_OP_DELETE     12
+#define OSD_OP_TRUNCATE   13
+#define OSD_OP_ZERO       14
+// 20-25: lock and unlock codes
+#define OSD_OP_WRLOCK     20
+#define OSD_OP_WRUNLOCK   21
+#define OSD_OP_RDLOCK     22
+#define OSD_OP_RDUNLOCK   23
+#define OSD_OP_UPLOCK     24
+#define OSD_OP_DNLOCK     25
+// 30-31: pull / push
+#define OSD_OP_PULL       30
+#define OSD_OP_PUSH       31
+
+
+typedef struct {   // MOSDOp header; MOSDOp appends it to the payload as raw bytes
+  long pcid;       // procedure call id, used to match request and reply
+
+  // who's asking?
+  tid_t tid;
+  msg_addr_t client;
+  entity_inst_t client_inst;
+
+  // for replication
+  tid_t rep_tid;
+
+  object_t oid;
+  objectrev_t rev;
+  pg_t pg;
+
+  epoch_t map_epoch;
+
+  eversion_t pg_trim_to;   // primary->replica: trim to here
+
+  int op;                  // one of the OSD_OP_* codes
+  size_t length, offset;
+  eversion_t version;
+  eversion_t old_version;  // has a setter in MOSDOp; its getter is commented out
+
+  bool   want_ack;         // both flags are set to true by the
+  bool   want_commit;      // six-argument MOSDOp constructor
+} MOSDOp_st;
+
+class MOSDOp : public Message {
+public:
+  // Printable name for an OSD_OP_* code.  An unrecognized code still trips
+  // the assert, but when asserts are compiled out the result is "???"
+  // rather than a null pointer that operator<< would then try to stream.
+  static const char* get_opname(int op) {
+    switch (op) {
+    case OSD_OP_READ: return "read";
+    case OSD_OP_STAT: return "stat";
+
+    case OSD_OP_WRNOOP: return "wrnoop";
+    case OSD_OP_WRITE: return "write";
+    case OSD_OP_ZERO: return "zero";
+    case OSD_OP_DELETE: return "delete";
+    case OSD_OP_TRUNCATE: return "truncate";
+    case OSD_OP_WRLOCK: return "wrlock";
+    case OSD_OP_WRUNLOCK: return "wrunlock";
+    case OSD_OP_RDLOCK: return "rdlock";
+    case OSD_OP_RDUNLOCK: return "rdunlock";
+    case OSD_OP_UPLOCK: return "uplock";
+    case OSD_OP_DNLOCK: return "dnlock";
+
+    case OSD_OP_PULL: return "pull";
+    case OSD_OP_PUSH: return "push";
+    default: assert(0);
+    }
+    return "???";
+  }
+
+private:
+  MOSDOp_st st;                   // fixed-size header, sent as raw bytes
+  bufferlist data;
+  map<string,bufferptr> attrset;
+
+  friend class MOSDOpReply;       // its constructor copies fields out of st
+
+ public:
+  const tid_t       get_tid() { return st.tid; }
+  const msg_addr_t& get_client() { return st.client; }
+  const entity_inst_t& get_client_inst() { return st.client_inst; }
+  void set_client_inst(const entity_inst_t& i) { st.client_inst = i; }
+
+  const tid_t       get_rep_tid() { return st.rep_tid; }
+  void set_rep_tid(tid_t t) { st.rep_tid = t; }
+
+  const object_t   get_oid() { return st.oid; }
+  const pg_t get_pg() { return st.pg; }
+  const epoch_t  get_map_epoch() { return st.map_epoch; }
+
+  //const int        get_pg_role() { return st.pg_role; }  // who am i asking for?
+  const eversion_t  get_version() { return st.version; }
+  //const eversion_t  get_old_version() { return st.old_version; }
+
+  void set_rev(objectrev_t r) { st.rev = r; }
+  objectrev_t get_rev() { return st.rev; }
+
+  const eversion_t get_pg_trim_to() { return st.pg_trim_to; }
+  void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; }
+
+  const int    get_op() { return st.op; }
+  void set_op(int o) { st.op = o; }
+
+  const size_t get_length() { return st.length; }
+  const size_t get_offset() { return st.offset; }
+
+  map<string,bufferptr>& get_attrset() { return attrset; }
+  void set_attrset(map<string,bufferptr> &as) { attrset = as; }
+
+  const bool wants_ack() { return st.want_ack; }
+  const bool wants_commit() { return st.want_commit; }
+
+  // NOTE(review): claim() presumably takes over d's buffers -- confirm.
+  void set_data(bufferlist &d) {
+    data.claim(d);
+  }
+  bufferlist& get_data() { return data; }
+  size_t get_data_len() { return data.length(); }
+
+  // keep a pcid (procedure call id) to match up request+reply
+  void set_pcid(long pcid) { this->st.pcid = pcid; }
+  long get_pcid() { return st.pcid; }
+
+  // Zeroes the whole header, then fills in the request fields.  `tid` is
+  // taken as tid_t (the field's own type) instead of long.
+  MOSDOp(tid_t tid, msg_addr_t asker,
+         object_t oid, pg_t pg, epoch_t mapepoch, int op) :
+    Message(MSG_OSD_OP) {
+    memset(&st, 0, sizeof(st));
+    this->st.client = asker;
+    this->st.tid = tid;
+    this->st.rep_tid = 0;
+    this->st.oid = oid;
+    this->st.pg = pg;
+    this->st.map_epoch = mapepoch;
+    this->st.op = op;
+    this->st.want_ack = true;      // both acknowledgement kinds are on
+    this->st.want_commit = true;   // until a setter turns them off
+  }
+  MOSDOp() { memset(&st, 0, sizeof(st)); }  // header starts zeroed here too
+
+  //void set_pg_role(int r) { st.pg_role = r; }
+  //void set_rg_nrep(int n) { st.rg_nrep = n; }
+
+  void set_length(size_t l) { st.length = l; }
+  void set_offset(size_t o) { st.offset = o; }
+  void set_version(eversion_t v) { st.version = v; }
+  void set_old_version(eversion_t ov) { st.old_version = ov; }
+
+  void set_want_ack(bool b) { st.want_ack = b; }
+  void set_want_commit(bool b) { st.want_commit = b; }
+
+  // marshalling
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+    ::_decode(attrset, payload, off);
+    ::_decode(data, payload, off);
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    ::_encode(attrset, payload);
+    ::_encode(data, payload);
+  }
+
+  virtual char *get_type_name() { return "oop"; }
+};
+
+inline ostream& operator<<(ostream& out, MOSDOp& op) {
+  // Format: MOSDOp(<client>.<tid> op <name> oid <hex oid> <address of op>)
+  out << "MOSDOp(" << MSG_ADDR_NICE(op.get_client()) << "." << op.get_tid();
+  out << " op " << MOSDOp::get_opname(op.get_op());
+  return out << " oid " << hex << op.get_oid() << dec << " " << &op << ")";
+}
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDOpReply.h b/branches/sage/cephmds2/messages/MOSDOpReply.h
new file mode 100644
index 0000000000000..35c6ad5898b0b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDOpReply.h
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDOPREPLY_H
+#define __MOSDOPREPLY_H
+
+#include "msg/Message.h"
+
+#include "MOSDOp.h"
+#include "osd/ObjectStore.h"
+
+/*
+ * OSD op reply
+ *
+ * oid - object id
+ * op  - OSD_OP_DELETE, etc.
+ *
+ */
+
+
+typedef struct {   // MOSDOpReply header; appended to the payload as raw bytes
+  // req: these are copied from the request by the MOSDOpReply constructor
+  long pcid;
+  tid_t tid;
+  tid_t rep_tid;
+
+  object_t oid;
+  pg_t pg;
+
+  int op;
+  
+  // reply
+  int    result;
+  bool   commit;
+  size_t length, offset;   // pre-filled from the request; setters can overwrite
+  size_t object_size;
+  eversion_t version;      // likewise pre-filled from the request
+
+  eversion_t pg_complete_thru;
+
+  epoch_t map_epoch;       // the constructor's `e` argument
+} MOSDOpReply_st;
+
+
+class MOSDOpReply : public Message {
+  MOSDOpReply_st st;
+  bufferlist data;
+  map<string,bufferptr> attrset;
+
+ public:
+  long     get_tid() { return st.tid; }          // NOTE(review): long, though st.tid is tid_t
+  long     get_rep_tid() { return st.rep_tid; }  // NOTE(review): same mismatch -- confirm widths
+  object_t get_oid() { return st.oid; }
+  pg_t     get_pg() { return st.pg; }
+  int      get_op()  { return st.op; }
+  bool     get_commit() { return st.commit; }
+
+  int    get_result() { return st.result; }
+  size_t get_length() { return st.length; }
+  size_t get_offset() { return st.offset; }
+  size_t get_object_size() { return st.object_size; }
+  eversion_t get_version() { return st.version; }
+  map<string,bufferptr>& get_attrset() { return attrset; }
+
+  eversion_t get_pg_complete_thru() { return st.pg_complete_thru; }
+  void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; }
+
+  void set_result(int r) { st.result = r; }
+  void set_length(size_t s) { st.length = s; }
+  void set_offset(size_t o) { st.offset = o; }
+  void set_object_size(size_t s) { st.object_size = s; }
+  void set_version(eversion_t v) { st.version = v; }
+  void set_attrset(map<string,bufferptr> &as) { attrset = as; }
+
+  void set_op(int op) { st.op = op; }
+  void set_tid(tid_t t) { st.tid = t; }
+  void set_rep_tid(tid_t t) { st.rep_tid = t; }
+
+  // data payload
+  void set_data(bufferlist &d) {
+    data.claim(d);
+  }
+  bufferlist& get_data() {
+    return data;
+  }
+
+  // osdmap
+  epoch_t get_map_epoch() { return st.map_epoch; }
+
+  // keep a pcid (procedure call id) to match up request+reply
+  void set_pcid(long pcid) { this->st.pcid = pcid; }
+  long get_pcid()          { return st.pcid; }
+
+  MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) :
+    Message(MSG_OSD_OPREPLY) {
+    memset(&st, 0, sizeof(st));
+    this->st.pcid = req->st.pcid;
+
+    this->st.op = req->st.op;
+    this->st.tid = req->st.tid;
+    this->st.rep_tid = req->st.rep_tid;
+
+    this->st.oid = req->st.oid;
+    this->st.pg = req->st.pg;
+    this->st.result = result;
+    this->st.commit = commit;
+
+    this->st.length = req->st.length;   // speculative... OSD should ensure these are correct
+    this->st.offset = req->st.offset;
+    this->st.version = req->st.version;
+
+    this->st.map_epoch = e;
+  }
+  MOSDOpReply() {}
+
+  // marshalling
+  // decode leaves `payload` intact (same running-offset scheme as MOSDOp)
+  // instead of splicing the header off the front of it.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+    ::_decode(attrset, payload, off);
+    ::_decode(data, payload, off);
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    ::_encode(attrset, payload);
+    ::_encode(data, payload);
+  }
+
+  virtual char *get_type_name() { return "oopr"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDOut.h b/branches/sage/cephmds2/messages/MOSDOut.h
new file mode 100644
index 0000000000000..61a594de3294a
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDOut.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MOSDOUT_H
+#define __MOSDOUT_H
+
+#include "msg/Message.h"
+
+
+class MOSDOut : public Message {
+ public:
+  epoch_t map_epoch;
+
+  // The no-argument form does not initialize map_epoch.
+  MOSDOut() {}
+  MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) {}
+
+  virtual char *get_type_name() { return "oout"; }
+
+  // Appends the raw bytes of map_epoch.
+  virtual void encode_payload() {
+    payload.append((char*)&map_epoch, sizeof(map_epoch));
+  }
+
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(map_epoch), (char*)&map_epoch);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGLog.h b/branches/sage/cephmds2/messages/MOSDPGLog.h
new file mode 100644
index 0000000000000..e4731c6037107
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGLog.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDPGLOG_H
+#define __MOSDPGLOG_H
+
+#include "msg/Message.h"
+
+class MOSDPGLog : public Message {
+  epoch_t epoch;
+  pg_t    pgid;
+
+public:
+  PG::Info info;        // appended to the payload as raw bytes
+  PG::Log log;          // serialized through its own _encode/_decode
+  PG::Missing missing;  // likewise
+
+  epoch_t get_epoch() { return epoch; }
+  pg_t get_pgid() { return pgid; }
+
+  MOSDPGLog() {}
+  // `mv` is declared as epoch_t -- the type of the field it initializes and
+  // of get_epoch() -- rather than version_t as before.
+  MOSDPGLog(epoch_t mv, pg_t pgid)
+    : Message(MSG_OSD_PG_LOG), epoch(mv), pgid(pgid) {}
+
+  char *get_type_name() { return "PGlog"; }
+
+  // Wire layout: epoch, pgid, info as raw bytes, then log, then missing.
+  void encode_payload() {
+    payload.append((char*)&epoch, sizeof(epoch));
+    payload.append((char*)&pgid, sizeof(pgid));
+    payload.append((char*)&info, sizeof(info));
+    log._encode(payload);
+    missing._encode(payload);
+  }
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    payload.copy(off, sizeof(pgid), (char*)&pgid);
+    off += sizeof(pgid);
+    payload.copy(off, sizeof(info), (char*)&info);
+    off += sizeof(info);
+    log._decode(payload, off);
+    missing._decode(payload, off);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGNotify.h b/branches/sage/cephmds2/messages/MOSDPGNotify.h
new file mode 100644
index 0000000000000..f6fe8ee88c170
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGNotify.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MOSDPGPEERNOTIFY_H
+#define __MOSDPGPEERNOTIFY_H
+
+#include "msg/Message.h"
+
+#include "osd/PG.h"
+
+/*
+ * PGNotify - notify primary of my PGs and versions.
+ */
+
+class MOSDPGNotify : public Message {  // MSG_OSD_PG_NOTIFY message: an epoch plus a list of PG::Info
+  epoch_t      epoch;
+  list<PG::Info> pg_list;   // PG::Info entries carried by this message
+
+ public:
+  version_t get_epoch() { return epoch; }  // NOTE(review): returns version_t although epoch is declared epoch_t
+  list<PG::Info>& get_pg_list() { return pg_list; }
+
+  MOSDPGNotify() {}  // assigns no fields; presumably followed by decode_payload()
+  MOSDPGNotify(epoch_t e, list<PG::Info>& l) :
+    Message(MSG_OSD_PG_NOTIFY) {
+    this->epoch = e;
+    pg_list.splice(pg_list.begin(),l);  // takes the entries out of l rather than copying (assuming std::list semantics)
+  }
+  
+  char *get_type_name() { return "PGnot"; }
+
+  void encode_payload() {  // appends epoch, then pg_list via _encode
+    payload.append((char*)&epoch, sizeof(epoch));
+    _encode(pg_list, payload);
+  }
+  void decode_payload() {  // reads epoch, then pg_list via _decode, mirroring encode_payload()
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    _decode(pg_list, payload, off);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGPeer.h b/branches/sage/cephmds2/messages/MOSDPGPeer.h
new file mode 100644
index 0000000000000..ebe1cda485c4c
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGPeer.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDPGPEER_H
+#define __MOSDPGPEER_H
+
+#include "msg/Message.h"
+
+
+class MOSDPGPeer : public Message {  // MSG_OSD_PG_PEER message: map version, a list of pg ids and a 'complete' flag
+  __uint64_t       map_version;
+  list<pg_t> pg_list;
+
+  bool complete;
+
+ public:
+  __uint64_t get_version() { return map_version; }
+  list<pg_t>& get_pg_list() { return pg_list; }
+  bool get_complete() { return complete; }
+
+  MOSDPGPeer() {}  // assigns no fields; presumably followed by decode_payload()
+  MOSDPGPeer(__uint64_t v, list<pg_t>& l, bool c=false) :
+    Message(MSG_OSD_PG_PEER) {
+    this->map_version = v;
+    this->complete = c;
+    pg_list.splice(pg_list.begin(), l);  // takes the entries out of l rather than copying (assuming std::list semantics)
+  }
+  
+  char *get_type_name() { return "PGPeer"; }
+
+  void encode_payload() {  // appends, in order: map_version, complete, pg_list
+    payload.append((char*)&map_version, sizeof(map_version));
+    payload.append((char*)&complete, sizeof(complete));
+    _encode(pg_list, payload);
+  }
+  void decode_payload() {  // reads the fields back in the order encode_payload() appended them
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(map_version), (char*)&map_version);
+    off += sizeof(map_version);
+    payload.copy(off, sizeof(complete), (char*)&complete);
+    off += sizeof(complete);
+    _decode(pg_list, payload, off);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGPeerAck.h b/branches/sage/cephmds2/messages/MOSDPGPeerAck.h
new file mode 100644
index 0000000000000..e21a2607bb573
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGPeerAck.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDPGPEERACK_H
+#define __MOSDPGPEERACK_H
+
+#include "msg/Message.h"
+#include "osd/OSD.h"
+
+class MOSDPGPeerAck : public Message {  // MSG_OSD_PG_PEERACK message: map version, list of pg ids (pg_dne) and per-pg PGReplicaInfo
+  __uint64_t       map_version;
+
+ public:
+  list<pg_t>                pg_dne;   // pg dne
+  map<pg_t, PGReplicaInfo > pg_state; // state, lists, etc.
+
+  __uint64_t get_version() { return map_version; }
+
+  MOSDPGPeerAck() {}  // assigns no fields; presumably followed by decode_payload()
+  MOSDPGPeerAck(__uint64_t v) :
+    Message(MSG_OSD_PG_PEERACK) {
+    this->map_version = v;
+  }
+  
+  char *get_type_name() { return "PGPeerAck"; }  // distinct from MOSDPGPeer's "PGPeer"
+
+  void encode_payload() {  // appends map_version, pg_dne, an int entry count, then (pgid, PGReplicaInfo) pairs
+    payload.append((char*)&map_version, sizeof(map_version));
+    _encode(pg_dne, payload);
+    
+    int n = pg_state.size();  // entry count precedes the entries so the decoder knows how many to read
+    payload.append((char*)&n, sizeof(n));
+    for (map<pg_t, PGReplicaInfo >::iterator it = pg_state.begin();
+         it != pg_state.end();
+         ++it) {
+      payload.append((char*)&it->first, sizeof(it->first));
+      it->second._encode(payload);
+    }
+  }
+  void decode_payload() {  // mirrors encode_payload(); decoded entries are inserted into pg_state
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(map_version), (char*)&map_version);
+    off += sizeof(map_version);
+    _decode(pg_dne, payload, off);
+
+    int n;  // NOTE(review): n comes straight from the payload and is not range-checked
+    payload.copy(off, sizeof(n), (char*)&n);
+    off += sizeof(n);
+    for (int i=0; i<n; i++) {
+      pg_t pgid;
+      payload.copy(off, sizeof(pgid), (char*)&pgid);
+      off += sizeof(pgid);
+      pg_state[pgid]._decode(payload, off);  // operator[] creates the entry, which is then filled in place
+    }
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGPeerRequest.h b/branches/sage/cephmds2/messages/MOSDPGPeerRequest.h
new file mode 100644
index 0000000000000..a65d2ccc2168f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGPeerRequest.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDPEERREQUEST_H
+#define __MOSDPEERREQUEST_H
+
+#include "msg/Message.h"
+
+
+class MOSDPGPeerRequest : public Message {  // MSG_OSD_PG_PEERREQUEST message: map version plus a list of repgroup_t
+  __uint64_t       map_version;
+  list<repgroup_t> pg_list;
+
+ public:
+  __uint64_t get_version() { return map_version; }
+  list<repgroup_t>& get_pg_list() { return pg_list; }
+
+  MOSDPGPeerRequest() {}  // assigns no fields; presumably followed by decode_payload()
+  MOSDPGPeerRequest(__uint64_t v, list<repgroup_t>& l) :
+    Message(MSG_OSD_PG_PEERREQUEST) {
+    this->map_version = v;
+    pg_list.splice(pg_list.begin(), l);  // takes the entries out of l rather than copying (assuming std::list semantics)
+  }
+  
+  char *get_type_name() { return "PGPR"; }
+
+  void encode_payload() {  // appends map_version, then pg_list via _encode
+    payload.append((char*)&map_version, sizeof(map_version));
+    _encode(pg_list, payload);
+  }
+  void decode_payload() {  // reads map_version, then pg_list via _decode, mirroring encode_payload()
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(map_version), (char*)&map_version);
+    off += sizeof(map_version);
+    _decode(pg_list, payload, off);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGQuery.h b/branches/sage/cephmds2/messages/MOSDPGQuery.h
new file mode 100644
index 0000000000000..926acce81349d
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGQuery.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDPGQUERY_H
+#define __MOSDPGQUERY_H
+
+#include "msg/Message.h"
+
+/*
+ * PGQuery - query another OSD as to the contents of their PGs
+ */
+
+class MOSDPGQuery : public Message {  // MSG_OSD_PG_QUERY message: an epoch plus a map of pg_t to PG::Query
+  version_t       epoch;  // NOTE(review): declared version_t, but the constructor takes an epoch_t
+
+ public:
+  version_t get_epoch() { return epoch; }
+  map<pg_t,PG::Query>  pg_list;
+
+  MOSDPGQuery() {}  // assigns no fields; presumably followed by decode_payload()
+  MOSDPGQuery(epoch_t e, map<pg_t,PG::Query>& ls) :
+    Message(MSG_OSD_PG_QUERY),
+    epoch(e), pg_list(ls) {  // pg_list is copy-constructed from ls; the caller's map is not modified
+  }
+  
+  char *get_type_name() { return "PGq"; }
+
+  void encode_payload() {  // appends epoch, then pg_list via the global _encode
+    payload.append((char*)&epoch, sizeof(epoch));
+    ::_encode(pg_list, payload);
+  }
+  void decode_payload() {  // reads epoch, then pg_list via the global _decode, mirroring encode_payload()
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    ::_decode(pg_list, payload, off);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGRemove.h b/branches/sage/cephmds2/messages/MOSDPGRemove.h
new file mode 100644
index 0000000000000..9629a3782764b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGRemove.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDPGREMOVE_H
+#define __MOSDPGREMOVE_H
+
+#include "msg/Message.h"
+
+
+class MOSDPGRemove : public Message {  // MSG_OSD_PG_REMOVE message: an epoch plus a set of pg ids
+  epoch_t epoch;
+
+ public:
+  set<pg_t> pg_list;
+
+  epoch_t get_epoch() { return epoch; }
+
+  MOSDPGRemove() {}  // assigns no fields; presumably followed by decode_payload()
+  MOSDPGRemove(epoch_t e, set<pg_t>& l) :
+    Message(MSG_OSD_PG_REMOVE) {
+    this->epoch = e;
+    pg_list = l;  // copies l; the caller's set is not modified
+  }
+  
+  char *get_type_name() { return "PGrm"; }
+
+  void encode_payload() {  // appends epoch, then pg_list via _encode
+    payload.append((char*)&epoch, sizeof(epoch));
+    _encode(pg_list, payload);
+  }
+  void decode_payload() {  // reads epoch, then pg_list via _decode, mirroring encode_payload()
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    _decode(pg_list, payload, off);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGSummary.h b/branches/sage/cephmds2/messages/MOSDPGSummary.h
new file mode 100644
index 0000000000000..dc4af837209bb
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGSummary.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDPGQUERYREPLY_H
+#define __MOSDPGQUERYREPLY_H
+
+#include "msg/Message.h"
+
+class MOSDPGSummary : public Message {  // MSG_OSD_PG_SUMMARY message: epoch, pgid, a PG::PGInfo and an encoded PG::PGSummary
+  epoch_t epoch;
+  pg_t pgid;
+
+public:
+  PG::PGInfo info;  // NOTE(review): not assigned by either constructor, yet encode_payload() sends it
+  bufferlist    sumbl;  // holds the summary in encoded form
+
+  epoch_t get_epoch() { return epoch; }
+
+  MOSDPGSummary() {}  // assigns no fields; presumably followed by decode_payload()
+  MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) :  // mv is stored as the epoch
+    Message(MSG_OSD_PG_SUMMARY) {
+    this->epoch = mv;
+    this->pgid = pgid;
+    summary._encode(sumbl);  // the summary is encoded once, at construction time
+  }
+
+  pg_t get_pgid() { return pgid; }
+  bufferlist& get_summary_bl() {
+    return sumbl;
+  }
+  
+  char *get_type_name() { return "PGsum"; }
+
+  void encode_payload() {  // appends, in order: epoch, pgid, raw bytes of info, then the contents of sumbl
+    payload.append((char*)&epoch, sizeof(epoch));
+    payload.append((char*)&pgid, sizeof(pgid));
+    payload.append((char*)&info, sizeof(info));
+    payload.claim_append(sumbl);  // presumably moves sumbl's buffers into payload (TODO confirm)
+  }
+  void decode_payload() {  // reads the fixed fields, then hands the rest of the payload to sumbl
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    payload.copy(off, sizeof(pgid), (char*)&pgid);
+    off += sizeof(pgid);
+    payload.copy(off, sizeof(info), (char*)&info);
+    off += sizeof(info);
+
+    payload.splice(0, off);  // presumably drops the first off bytes so only the summary remains (TODO confirm)
+    sumbl.claim(payload);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPGUpdate.h b/branches/sage/cephmds2/messages/MOSDPGUpdate.h
new file mode 100644
index 0000000000000..93809d6820d21
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPGUpdate.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MOSDPGUPDATE_H
+#define __MOSDPGUPDATE_H
+
+#include "msg/Message.h"
+
+class MOSDPGUpdate : public Message {  // MSG_OSD_PG_UPDATE message: map version, pgid, 'complete' flag and last_any_complete version
+  version_t   map_version;
+  pg_t        pgid;
+  //pginfo_t    info;
+  bool        complete;
+  version_t   last_any_complete;
+
+ public:
+  version_t get_version() { return map_version; }
+  pg_t get_pgid() { return pgid; }
+  //pginfo_t& get_pginfo() { return info; }
+  bool is_complete() { return complete; }
+  version_t get_last_any_complete() { return last_any_complete; }
+
+  MOSDPGUpdate() {}  // assigns no fields; presumably followed by decode_payload()
+  MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) :
+    Message(MSG_OSD_PG_UPDATE) {
+    this->map_version = mv;
+    this->pgid = pgid;
+    this->complete = complete;
+    this->last_any_complete = last_any_complete;
+  }
+  
+  char *get_type_name() { return "PGUp"; }
+
+  void encode_payload() {  // appends, in order: map_version, pgid, complete, last_any_complete
+    payload.append((char*)&map_version, sizeof(map_version));
+    payload.append((char*)&pgid, sizeof(pgid));
+    payload.append((char*)&complete, sizeof(complete));
+    payload.append((char*)&last_any_complete, sizeof(last_any_complete));
+  }
+  void decode_payload() {  // reads the fields back in the order encode_payload() appended them
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(map_version), (char*)&map_version);
+    off += sizeof(map_version);
+    payload.copy(off, sizeof(pgid), (char*)&pgid);
+    off += sizeof(pgid);
+    payload.copy(off, sizeof(complete), (char*)&complete);
+    off += sizeof(complete);
+    payload.copy(off, sizeof(last_any_complete), (char*)&last_any_complete);
+    off += sizeof(last_any_complete);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MOSDPing.h b/branches/sage/cephmds2/messages/MOSDPing.h
new file mode 100644
index 0000000000000..fae80edd91cfc
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MOSDPing.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MOSDPING_H
+#define __MOSDPING_H
+
+#include "msg/Message.h"
+
+
+class MOSDPing : public Message {  // MSG_OSD_PING message: map epoch, an ack flag and a float avg_qlen
+ public:
+  epoch_t map_epoch;
+  bool ack;
+  float avg_qlen;
+
+  MOSDPing(epoch_t e, 
+	   float aq,
+	   bool a=false) : Message(MSG_OSD_PING), map_epoch(e), ack(a), avg_qlen(aq) {
+  }
+  MOSDPing() {}  // assigns no fields; presumably followed by decode_payload()
+
+  virtual void decode_payload() {  // reads, in order: map_epoch, ack, avg_qlen
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(map_epoch), (char*)&map_epoch);
+    off += sizeof(map_epoch);
+    payload.copy(off, sizeof(ack), (char*)&ack);
+    off += sizeof(ack);
+    payload.copy(off, sizeof(avg_qlen), (char*)&avg_qlen);
+    off += sizeof(avg_qlen);
+  }
+  virtual void encode_payload() {  // appends the raw bytes of map_epoch, ack, avg_qlen in that order
+    payload.append((char*)&map_epoch, sizeof(map_epoch));
+    payload.append((char*)&ack, sizeof(ack));
+    payload.append((char*)&avg_qlen, sizeof(avg_qlen));
+  }
+
+  virtual char *get_type_name() { return "oping"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MPing.h b/branches/sage/cephmds2/messages/MPing.h
new file mode 100644
index 0000000000000..65b65a738cd66
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MPing.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MPING_H
+#define __MPING_H
+
+#include "msg/Message.h"
+
+
+class MPing : public Message {  // MSG_PING message carrying a single integer sequence number
+ public:
+  int seq;
+  MPing(int s) : Message(MSG_PING) {
+    seq = s;
+  }
+  MPing() : Message(MSG_PING) {}  // seq is not assigned here; decode_payload() sets it
+
+  virtual void decode_payload(crope& s, int& off) {  // reads seq from s at offset off, then advances off
+    s.copy(off, sizeof(seq), (char*)&seq);  // read at the caller's offset rather than always from 0
+    off += sizeof(seq);
+  }
+  virtual void encode_payload(crope& s) {  // appends the raw bytes of seq to s
+    s.append((char*)&seq, sizeof(seq));
+  }
+
+  virtual char *get_type_name() { return "ping"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MPingAck.h b/branches/sage/cephmds2/messages/MPingAck.h
new file mode 100644
index 0000000000000..0ee385b7a2b80
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MPingAck.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MPINGACK_H
+#define __MPINGACK_H
+
+#include "MPing.h"
+
+
+class MPingAck : public Message {  // MSG_PING_ACK message echoing the sequence number of an MPing
+ public:
+  int seq;
+  MPingAck() {}  // NOTE(review): passes no message type to Message, unlike the MPing* constructor below
+  MPingAck(MPing *p) : Message(MSG_PING_ACK) {
+    this->seq = p->seq;  // copy the sequence number of the ping being acknowledged
+  }
+
+  virtual void decode_payload(crope& s, int& off) {  // reads seq from s at offset off, then advances off
+    s.copy(off, sizeof(seq), (char*)&seq);  // read at the caller's offset rather than always from 0
+    off += sizeof(seq);
+  }
+  virtual void encode_payload(crope& s) {  // appends the raw bytes of seq to s
+    s.append((char*)&seq, sizeof(seq));
+  }
+
+  virtual char *get_type_name() { return "pinga"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MRename.h b/branches/sage/cephmds2/messages/MRename.h
new file mode 100644
index 0000000000000..e648f3e652fc7
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MRename.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MRENAME_H
+#define __MRENAME_H
+
+class MRename : public Message {  // MSG_MDS_RENAME message: source/destination dir inode + name, initiator, and an inode_state buffer
+  inodeno_t srcdirino;
+  string srcname;
+  inodeno_t destdirino;
+  string destname;
+  int initiator;
+
+  bufferlist inode_state;
+
+ public:
+  int get_initiator() { return initiator; }
+  inodeno_t get_srcdirino() { return srcdirino; }
+  string& get_srcname() { return srcname; }
+  inodeno_t get_destdirino() { return destdirino; }
+  string& get_destname() { return destname; }
+  bufferlist& get_inode_state() { return inode_state; }
+
+  MRename() {}  // assigns no fields; presumably followed by decode_payload()
+  MRename(int initiator,
+          inodeno_t srcdirino,
+          const string& srcname,
+          inodeno_t destdirino,
+          const string& destname,
+          bufferlist& inode_state) :
+    Message(MSG_MDS_RENAME) {
+    this->initiator = initiator;
+    this->srcdirino = srcdirino;
+    this->srcname = srcname;
+    this->destdirino = destdirino;
+    this->destname = destname;
+    this->inode_state.claim( inode_state );  // presumably takes over the caller's buffers instead of copying (TODO confirm)
+  }
+  virtual char *get_type_name() { return "Rn";}
+  
+  virtual void decode_payload() {  // reads: initiator, srcdirino, destdirino, srcname, destname, length, inode_state
+    int off = 0;  // running byte offset into payload
+    payload.copy(off, sizeof(initiator), (char*)&initiator);
+    off += sizeof(initiator);
+    payload.copy(off, sizeof(srcdirino), (char*)&srcdirino);
+    off += sizeof(srcdirino);
+    payload.copy(off, sizeof(destdirino), (char*)&destdirino);
+    off += sizeof(destdirino);
+    _decode(srcname, payload, off);
+    _decode(destname, payload, off);
+    size_t len;  // NOTE(review): size_t width is platform dependent, so this length prefix is too
+    payload.copy(off, sizeof(len), (char*)&len);
+    off += sizeof(len);
+    inode_state.substr_of(payload, off, len);  // inode_state becomes the len bytes of payload starting at off
+    off += len;
+  }
+  virtual void encode_payload() {  // appends the fields in the order decode_payload() reads them
+    payload.append((char*)&initiator,sizeof(initiator));
+    payload.append((char*)&srcdirino,sizeof(srcdirino));
+    payload.append((char*)&destdirino,sizeof(destdirino));
+    _encode(srcname, payload);
+    _encode(destname, payload);
+    size_t len = inode_state.length();  // length prefix lets the decoder know how many bytes belong to inode_state
+    payload.append((char*)&len, sizeof(len));
+    payload.claim_append(inode_state);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MRenameAck.h b/branches/sage/cephmds2/messages/MRenameAck.h
new file mode 100644
index 0000000000000..14843cef5f616
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MRenameAck.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MRENAMEACK_H
+#define __MRENAMEACK_H
+
+/* FIXME: relative to dn, not inode */
+
+class MRenameAck : public Message {  // MSG_MDS_RENAMEACK message carrying a single inode number
+  inodeno_t ino;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+
+  MRenameAck() {}  // assigns no fields; presumably followed by decode_payload()
+  MRenameAck(inodeno_t ino) :
+    Message(MSG_MDS_RENAMEACK) {
+    this->ino = ino;
+  }
+  virtual char *get_type_name() { return "RnAck";}
+
+  virtual void decode_payload(crope& s, int& off) {  // reads ino from s at offset off, then advances off
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+  virtual void encode_payload(crope& s) {  // appends the raw bytes of ino to s
+    s.append((char*)&ino,sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MRenameNotify.h b/branches/sage/cephmds2/messages/MRenameNotify.h
new file mode 100644
index 0000000000000..bc32300b82e3a
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MRenameNotify.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MRENAMENOTIFY_H
+#define __MRENAMENOTIFY_H
+
+class MRenameNotify : public Message {  // MSG_MDS_RENAMENOTIFY message: inode, source/destination dir inode + name, destination dir path, srcauth
+  inodeno_t ino;
+  inodeno_t srcdirino;
+  string srcname;
+  inodeno_t destdirino;
+  string destname;
+  string destdirpath;
+  int srcauth;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  inodeno_t get_srcdirino() { return srcdirino; }
+  string& get_srcname() { return srcname; }
+  inodeno_t get_destdirino() { return destdirino; }
+  string& get_destname() { return destname; }
+  string& get_destdirpath() { return destdirpath; }
+  int get_srcauth() { return srcauth; }
+
+  MRenameNotify() {}  // assigns no fields; presumably followed by decode_payload()
+  MRenameNotify(inodeno_t ino,
+                inodeno_t srcdirino,
+                const string& srcname,
+                inodeno_t destdirino,
+                const string& destdirpath,  // note: destdirpath precedes destname here, unlike the member/encode order
+                const string& destname,
+                int srcauth
+                ) :
+    Message(MSG_MDS_RENAMENOTIFY) {
+    this->ino = ino;
+    this->srcdirino = srcdirino;
+    this->srcname = srcname;
+    this->destdirino = destdirino;
+    this->destname = destname;
+    this->destdirpath = destdirpath;
+    this->srcauth = srcauth;
+  }
+  virtual char *get_type_name() { return "Rnot";}
+  
+  virtual void decode_payload(crope& s, int& off) {  // reads: ino, srcdirino, destdirino, srcname, destname, destdirpath, srcauth
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(srcdirino), (char*)&srcdirino);
+    off += sizeof(srcdirino);
+    s.copy(off, sizeof(destdirino), (char*)&destdirino);
+    off += sizeof(destdirino);
+    _unrope(srcname, s, off);  // helper not visible here; presumably reads a string at off and advances off
+    _unrope(destname, s, off);
+    _unrope(destdirpath, s, off);
+    s.copy(off, sizeof(srcauth), (char*)&srcauth);
+    off += sizeof(srcauth);
+  }
+  virtual void encode_payload(crope& s) {  // appends the fields in the order decode_payload() reads them
+    s.append((char*)&ino,sizeof(ino));
+    s.append((char*)&srcdirino,sizeof(srcdirino));
+    s.append((char*)&destdirino,sizeof(destdirino));
+    _rope(srcname, s);
+    _rope(destname, s);
+    _rope(destdirpath, s);
+    s.append((char*)&srcauth, sizeof(srcauth));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MRenameNotifyAck.h b/branches/sage/cephmds2/messages/MRenameNotifyAck.h
new file mode 100644
index 0000000000000..d1a01339cd97a
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MRenameNotifyAck.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MRENAMENOTIFYACK_H
+#define __MRENAMENOTIFYACK_H
+
+class MRenameNotifyAck : public Message {  // MSG_MDS_RENAMENOTIFYACK message carrying a single inode number
+  inodeno_t ino;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+
+  MRenameNotifyAck() {}  // assigns no fields; presumably followed by decode_payload()
+  MRenameNotifyAck(inodeno_t ino) :
+    Message(MSG_MDS_RENAMENOTIFYACK) {
+    this->ino = ino;
+  }
+  virtual char *get_type_name() { return "RnotA";}
+  
+  virtual void decode_payload(crope& s, int& off) {  // reads ino from s at offset off, then advances off
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+  virtual void encode_payload(crope& s) {  // appends the raw bytes of ino to s
+    s.append((char*)&ino,sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MRenamePrep.h b/branches/sage/cephmds2/messages/MRenamePrep.h
new file mode 100644
index 0000000000000..1af798c674489
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MRenamePrep.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MRENAMEPREP_H
+#define __MRENAMEPREP_H
+
+class MRenamePrep : public Message {  // MSG_MDS_RENAMEPREP message: source/destination dir inode, name and path, plus initiator and srcauth
+  inodeno_t srcdirino;
+  string srcname;
+  string srcpath;
+  inodeno_t destdirino;
+  string destname;
+  string destpath;
+  int initiator;
+  int srcauth;
+
+ public:
+  int get_initiator() { return initiator; }
+  inodeno_t get_srcdirino() { return srcdirino; }
+  string& get_srcname() { return srcname; }
+  string& get_srcpath() { return srcpath; }
+  int get_srcauth() { return srcauth; }
+  inodeno_t get_destdirino() { return destdirino; }
+  string& get_destname() { return destname; }
+  string& get_destpath() { return destpath; }
+
+  MRenamePrep() {}  // assigns no fields; presumably followed by decode_payload()
+  MRenamePrep(int initiator,
+              inodeno_t srcdirino,
+              const string& srcname,
+              const string& srcpath,
+              inodeno_t destdirino,
+              const string& destname,
+              const string& destpath,
+              int srcauth) :
+    Message(MSG_MDS_RENAMEPREP) {
+    this->initiator = initiator;
+    this->srcdirino = srcdirino;
+    this->srcname = srcname;
+    this->srcpath = srcpath;
+    this->destdirino = destdirino;
+    this->destname = destname;
+    this->destpath = destpath;
+    this->srcauth = srcauth;
+  }
+  virtual char *get_type_name() { return "RnP";}
+  
+  virtual void decode_payload(crope& s, int& off) {  // reads: initiator, srcdirino, destdirino, srcname, srcpath, destname, destpath, srcauth
+    s.copy(off, sizeof(initiator), (char*)&initiator);
+    off += sizeof(initiator);
+    s.copy(off, sizeof(srcdirino), (char*)&srcdirino);
+    off += sizeof(srcdirino);
+    s.copy(off, sizeof(destdirino), (char*)&destdirino);
+    off += sizeof(destdirino);
+    _unrope(srcname, s, off);  // helper not visible here; presumably reads a string at off and advances off
+    _unrope(srcpath, s, off);
+    _unrope(destname, s, off);
+    _unrope(destpath, s, off);
+    s.copy(off, sizeof(srcauth), (char*)&srcauth);
+    off += sizeof(srcauth);
+  }
+  virtual void encode_payload(crope& s) {  // appends the fields in the order decode_payload() reads them
+    s.append((char*)&initiator,sizeof(initiator));
+    s.append((char*)&srcdirino,sizeof(srcdirino));
+    s.append((char*)&destdirino,sizeof(destdirino));
+    _rope(srcname, s);
+    _rope(srcpath, s);
+    _rope(destname, s);
+    _rope(destpath, s);
+    s.append((char*)&srcauth, sizeof(srcauth));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MRenameReq.h b/branches/sage/cephmds2/messages/MRenameReq.h
new file mode 100644
index 0000000000000..b70e96a38203b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MRenameReq.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MRENAMEREQ_H
+#define __MRENAMEREQ_H
+
+class MRenameReq : public Message {  // MSG_MDS_RENAMEREQ message: initiator, source/destination dir inode + name, destination path, destauth
+  int initiator;
+  inodeno_t srcdirino;
+  string srcname;
+  inodeno_t destdirino;
+  string destname;
+  string destpath;
+  int destauth;
+
+ public:
+  int get_initiator() { return initiator; }
+  inodeno_t get_srcdirino() { return srcdirino; }
+  string& get_srcname() { return srcname; }
+  inodeno_t get_destdirino() { return destdirino; }
+  string& get_destname() { return destname; }
+  string& get_destpath() { return destpath; }
+  int get_destauth() { return destauth; }
+
+  MRenameReq() {}  // assigns no fields; presumably followed by decode_payload()
+  MRenameReq(int initiator,
+             inodeno_t srcdirino,
+             const string& srcname,
+             inodeno_t destdirino,
+             const string& destname,
+             const string& destpath, 
+             int destauth) :
+    Message(MSG_MDS_RENAMEREQ) {
+    this->initiator = initiator;
+    this->srcdirino = srcdirino;
+    this->srcname = srcname;
+    this->destdirino = destdirino;
+    this->destname = destname;
+    this->destpath = destpath;
+    this->destauth = destauth;
+  }
+  virtual char *get_type_name() { return "RnReq";}
+
+  virtual void decode_payload(crope& s, int& off) {  // reads: initiator, srcdirino, destdirino, srcname, destname, destpath, destauth
+    s.copy(off, sizeof(initiator), (char*)&initiator);
+    off += sizeof(initiator);
+    s.copy(off, sizeof(srcdirino), (char*)&srcdirino);
+    off += sizeof(srcdirino);
+    s.copy(off, sizeof(destdirino), (char*)&destdirino);
+    off += sizeof(destdirino);
+    _unrope(srcname, s, off);  // helper not visible here; presumably reads a string at off and advances off
+    _unrope(destname, s, off);
+    _unrope(destpath, s, off);
+    s.copy(off, sizeof(destauth), (char*)&destauth);
+    off += sizeof(destauth);
+  }
+  virtual void encode_payload(crope& s) {  // appends the fields in the order decode_payload() reads them
+    s.append((char*)&initiator,sizeof(initiator));
+    s.append((char*)&srcdirino,sizeof(srcdirino));
+    s.append((char*)&destdirino,sizeof(destdirino));
+    _rope(srcname, s);
+    _rope(destname, s);
+    _rope(destpath, s);
+    s.append((char*)&destauth, sizeof(destauth));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MRenameWarning.h b/branches/sage/cephmds2/messages/MRenameWarning.h
new file mode 100644
index 0000000000000..85463dfd2c179
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MRenameWarning.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MRENAMEWARNING_H
+#define __MRENAMEWARNING_H
+
+class MRenameWarning : public Message {  // MSG_MDS_RENAMEWARNING message carrying a single inode number
+  inodeno_t ino;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+
+  MRenameWarning() {}  // assigns no fields; presumably followed by decode_payload()
+  MRenameWarning(inodeno_t ino) :
+    Message(MSG_MDS_RENAMEWARNING) {
+    this->ino = ino;
+  }
+  virtual char *get_type_name() { return "RnW";}
+  
+  virtual void decode_payload(crope& s, int& off) {  // reads ino from s at offset off, then advances off
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+  virtual void encode_payload(crope& s) {  // appends the raw bytes of ino to s
+    s.append((char*)&ino,sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MUnhashDir.h b/branches/sage/cephmds2/messages/MUnhashDir.h
new file mode 100644
index 0000000000000..911a14d9c9592
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MUnhashDir.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MUNHASHDIR_H
+#define __MUNHASHDIR_H
+
+#include "msg/Message.h"
+
+class MUnhashDir : public Message {
+  inodeno_t ino;   // the only payload field
+
+ public:
+  // Empty message; ino stays unset until decode_payload() runs.
+  MUnhashDir() {}
+  // Message of type MSG_MDS_UNHASHDIR carrying `ino`.
+  MUnhashDir(inodeno_t ino) : Message(MSG_MDS_UNHASHDIR) {
+    this->ino = ino;
+  }
+
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "UH"; }
+
+  // Payload is just the raw bytes of ino at offset 0.
+  virtual void decode_payload() {
+    char *dst = (char*)&ino;
+    payload.copy(0, sizeof(ino), dst);
+  }
+  virtual void encode_payload() { payload.append((char*)&ino, sizeof(ino)); }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MUnhashDirAck.h b/branches/sage/cephmds2/messages/MUnhashDirAck.h
new file mode 100644
index 0000000000000..e052683e736c3
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MUnhashDirAck.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MUNHASHDIRACK_H
+#define __MUNHASHDIRACK_H
+
+#include "msg/Message.h"
+
+class MUnhashDirAck : public Message {
+  inodeno_t ino;      // inode the ack refers to
+  bufferlist state;   // state blob shipped with the ack
+  int nden;           // count supplied by the sender alongside the state
+  
+ public:  
+  MUnhashDirAck() {}
+  // state.claim(bl) is used rather than a copy -- presumably bl's contents move into state.
+  MUnhashDirAck(inodeno_t ino, bufferlist& bl, int nden) : Message(MSG_MDS_UNHASHDIRACK) {
+    this->ino = ino;
+    this->nden = nden;
+    state.claim(bl);
+  }
+  virtual char *get_type_name() { return "UHaA"; }
+
+  inodeno_t   get_ino()       { return ino; }
+  int         get_nden()      { return nden; }
+  bufferlist& get_state()     { return state; }
+  bufferlist* get_state_ptr() { return &state; }
+  
+  //void set_nden(int n) { nden = n; }
+  //void inc_nden() { nden++; }
+
+  // Payload layout: ino | nden | size_t length | <length> bytes of state.
+  // NOTE(review): the length travels as a raw size_t, so both ends presumably must agree on its width.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(ino), (char*)&ino);    pos += sizeof(ino);
+    payload.copy(pos, sizeof(nden), (char*)&nden);  pos += sizeof(nden);
+
+    size_t statelen;
+    payload.copy(pos, sizeof(statelen), (char*)&statelen);
+    pos += sizeof(statelen);
+    state.substr_of(payload, pos, statelen);
+  }
+  void encode_payload() {
+    size_t statelen = state.length();
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&nden, sizeof(nden));
+    payload.append((char*)&statelen, sizeof(statelen));
+    payload.claim_append(state);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MUnhashDirNotify.h b/branches/sage/cephmds2/messages/MUnhashDirNotify.h
new file mode 100644
index 0000000000000..a9d6707a3aa25
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MUnhashDirNotify.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MUNHASHDIRNOTIFY_H
+#define __MUNHASHDIRNOTIFY_H
+
+#include "msg/Message.h"
+
+class MUnhashDirNotify : public Message {
+  inodeno_t ino;   // sole payload field
+  //int peer;
+
+ public:
+  MUnhashDirNotify() {}
+  // Tags the message MSG_MDS_UNHASHDIRNOTIFY and records ino.
+  MUnhashDirNotify(inodeno_t ino/*, int peer*/) : Message(MSG_MDS_UNHASHDIRNOTIFY) {
+    this->ino = ino;
+    //this->peer = peer;
+  }  
+
+  inodeno_t get_ino() { return ino; }
+  //int get_peer() { return peer; }
+  virtual char *get_type_name() { return "UHN"; }
+  
+  // ino is read from / written at the very start of the payload.
+  virtual void decode_payload() {
+    const int start = 0;
+    payload.copy(start, sizeof(ino), (char*)&ino);
+    //payload.copy(start + sizeof(ino), sizeof(peer), (char*)&peer);
+  }
+  virtual void encode_payload() {
+    char *raw = (char*)&ino;
+    payload.append(raw, sizeof(ino));
+    //payload.append((char*)&peer, sizeof(peer));
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MUnhashDirNotifyAck.h b/branches/sage/cephmds2/messages/MUnhashDirNotifyAck.h
new file mode 100644
index 0000000000000..ad4843676f0fb
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MUnhashDirNotifyAck.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MUNHASHDIRNOTIFYACK_H
+#define __MUNHASHDIRNOTIFYACK_H
+
+#include "msg/Message.h"
+
+class MUnhashDirNotifyAck : public Message {
+  inodeno_t ino;   // the message's single payload field
+
+ public:
+  MUnhashDirNotifyAck() {}   // ino is left for decode_payload() to fill
+  // Creates a MSG_MDS_UNHASHDIRNOTIFYACK message for `ino`.
+  MUnhashDirNotifyAck(inodeno_t ino) : Message(MSG_MDS_UNHASHDIRNOTIFYACK) {
+    this->ino = ino;
+  }
+
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "UHNa"; }
+
+  // Decoding copies sizeof(ino) bytes from the front of the payload into ino.
+  virtual void decode_payload() {
+    const int start = 0;
+    payload.copy(start, sizeof(ino), (char*)&ino);
+  }
+  // Encoding appends those same raw bytes.
+  virtual void encode_payload() { payload.append((char*)&ino, sizeof(ino)); }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MUnhashDirPrep.h b/branches/sage/cephmds2/messages/MUnhashDirPrep.h
new file mode 100644
index 0000000000000..c4dc2ea422cd9
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MUnhashDirPrep.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MUNHASHDIRPREP_H
+#define __MUNHASHDIRPREP_H
+
+#include "msg/Message.h"
+
+class MUnhashDirPrep : public Message {
+  inodeno_t ino;   // payload: one inode number
+
+ public:
+  // No-argument form: ino is not set here, only by decode_payload().
+  MUnhashDirPrep() {}
+  // Builds a MSG_MDS_UNHASHDIRPREP message for `ino`.
+  MUnhashDirPrep(inodeno_t ino) : Message(MSG_MDS_UNHASHDIRPREP) {
+    this->ino = ino;
+  }
+
+  inodeno_t get_ino() { return ino; }
+  virtual char *get_type_name() { return "UHP"; }
+
+  // ino occupies the first sizeof(ino) bytes of the payload.
+  virtual void decode_payload() { payload.copy(0, sizeof(ino), (char*)&ino); }
+  virtual void encode_payload() {
+    char *raw = (char*)&ino;
+    payload.append(raw, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MUnhashDirPrepAck.h b/branches/sage/cephmds2/messages/MUnhashDirPrepAck.h
new file mode 100644
index 0000000000000..bd7e93981964b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MUnhashDirPrepAck.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MUNHASHDIRPREPACK_H
+#define __MUNHASHDIRPREPACK_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+class MUnhashDirPrepAck : public Message {
+  inodeno_t ino;
+  bool assim;      // local flag only; it is not part of the encoded payload
+
+  // subdir dentry names + inodes (the pointers are owned: freed in the destructor)
+  map<string,CInodeDiscover*>    inodes;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  map<string,CInodeDiscover*>& get_inodes() { return inodes; }
+
+  bool did_assim() { return assim; }
+  void mark_assim() { assert(!assim); assim = true; }   // may only be marked once
+
+  MUnhashDirPrepAck() : assim(false) { }
+  MUnhashDirPrepAck(inodeno_t ino) :
+    Message(MSG_MDS_UNHASHDIRPREPACK),
+    assim(false) {
+    this->ino = ino;
+  }
+  ~MUnhashDirPrepAck() {
+    for (map<string,CInodeDiscover*>::iterator it = inodes.begin();
+         it != inodes.end();
+         ++it) 
+      delete it->second;
+  }
+
+  // NOTE(review): "HP" does not follow the "UH*" names of the sibling unhash messages -- confirm it is intended.
+  virtual char *get_type_name() { return "HP"; }
+
+  // Takes ownership of `in`.  A different entry already stored under the same
+  // dentry name is freed first, so replacing it no longer leaks.
+  void add_inode(const string& dentry, CInodeDiscover *in) {
+    CInodeDiscover*& slot = inodes[dentry];   // default-inserted as null when new
+    if (slot != in) delete slot;
+    slot = in;
+  }
+
+  // Payload layout: ino | int count | count x (dentry name, CInodeDiscover).
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+
+    int ni;
+    payload.copy(off, sizeof(int), (char*)&ni);
+    off += sizeof(int);
+    for (int i=0; i<ni; i++) {
+      string dname;
+      _decode(dname, payload, off);            // dentry
+      CInodeDiscover *in = new CInodeDiscover;
+      in->_decode(payload, off);               // inode
+      add_inode(dname, in);                    // frees any duplicate entry
+    }
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+
+    // inodes
+    int ni = inodes.size();
+    payload.append((char*)&ni, sizeof(int));
+    for (map<string,CInodeDiscover*>::iterator iit = inodes.begin();
+         iit != inodes.end();
+         ++iit) {
+      _encode(iit->first, payload);   // dentry
+      iit->second->_encode(payload);  // inode
+    }
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mon/Elector.cc b/branches/sage/cephmds2/mon/Elector.cc
new file mode 100644
index 0000000000000..a08d0bd7f87df
--- /dev/null
+++ b/branches/sage/cephmds2/mon/Elector.cc
@@ -0,0 +1,227 @@
+
+#include "Elector.h"
+#include "Monitor.h"
+
+#include "common/Timer.h"
+
+#include "messages/MMonElectionRefresh.h"
+#include "messages/MMonElectionStatus.h"
+#include "messages/MMonElectionAck.h"
+#include "messages/MMonElectionCollect.h"
+
+#include "config.h"
+#undef dout
+#define  dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " "
+#define  derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " "
+
+
+
+// Timer callback: when fired, runs the elector's read_timer().
+class C_Elect_ReadTimer : public Context {
+  Elector *elector;   // target of the callback; never deleted here
+public:
+  C_Elect_ReadTimer(Elector *e) : elector(e) {}
+  // r is ignored.
+  void finish(int r) { elector->read_timer(); }
+};
+
+// Begin a new read round: bump read_num, snapshot the views and send every
+// process in the group an MMonElectionCollect carrying the new read_num.
+void Elector::read_timer()
+{
+  lock.Lock();
+  ++read_num;
+  status_msg_count = 0;
+  old_views = views;   // TODO deep copy
+  for (vector<int>::iterator p = processes.begin(); p != processes.end(); ++p) {
+    MMonElectionCollect *collect = new MMonElectionCollect(read_num);
+    mon->messenger->send_message(collect, MSG_ADDR_MON(*p));
+  }
+  lock.Unlock();
+};
+
+// Timer callback: when fired, runs the elector's trip_timer().
+class C_Elect_TripTimer : public Context {
+  Elector *elector;   // target of the callback; never deleted here
+public:
+  C_Elect_TripTimer(Elector *e) : elector(e) {}
+  // r is ignored.
+  void finish(int r) { elector->trip_timer(); }
+};
+
+// Trip timer fired: mark our own view expired and bump our epoch's s_num.
+void Elector::trip_timer()
+{
+  lock.Lock();
+  views[whoami].expired = true;
+  State& mine = registry[whoami];
+  mine.epoch.s_num++;
+  dout(1) << "Process " << whoami
+          <<  " timed out (" << ack_msg_count << "/" << (f + 1)
+          << ") ... increasing epoch. Now epoch is "
+          << mine.epoch.s_num
+          << endl;
+  lock.Unlock();
+};
+
+
+
+// Timer callback: when fired, runs the elector's refresh_timer().
+class C_Elect_RefreshTimer : public Context {
+  Elector *elector;   // target of the callback; never deleted here
+public:
+  C_Elect_RefreshTimer(Elector *e) : elector(e) {}
+  // r is ignored.
+  void finish(int r) { elector->refresh_timer(); }
+};
+
+// Start a refresh round: reset the ack count, advertise our registry entry, arm the trip timer.
+void Elector::refresh_timer()
+{
+  lock.Lock();
+  {
+    ack_msg_count = 0;
+    refresh_num++;
+    // One message per destination, as read_timer() does: a single object handed to send_message()
+    // for several peers was shared between sends, and it leaked when `processes` was empty.
+    for (unsigned i=0; i<processes.size(); i++)
+      mon->messenger->send_message(new MMonElectionRefresh(whoami, registry[whoami], refresh_num),
+                                   MSG_ADDR_MON(processes[i]));
+    // Start the trip timer
+    g_timer.add_event_after(trip_delta, new C_Elect_TripTimer(this));
+  }
+  lock.Unlock();
+};
+
+
+
+//////////////////////////
+
+
+// Smallest epoch among the views that have not expired (an expired views[0] used to seed the
+// minimum and could win); if every view is expired, falls back to the first view's epoch.
+Elector::Epoch Elector::get_min_epoch()
+{
+  assert(!views.empty());
+  const Epoch *min = 0;
+  for (map<int,View>::iterator p = views.begin(); p != views.end(); ++p)   // no operator[]: it would insert
+    if (!p->second.expired && (!min || p->second.state.epoch < *min))
+      min = &p->second.state.epoch;
+  return min ? *min : (views.empty() ? Epoch() : views.begin()->second.state.epoch);
+}
+
+
+// Route an election message to its handler while holding the elector lock.
+// Any other message type trips the assert.
+void Elector::dispatch(Message *m)
+{
+  lock.Lock();
+  switch (m->get_type()) {
+  case MSG_MON_ELECTION_ACK:
+    handle_ack((MMonElectionAck*)m);
+    break;
+
+  case MSG_MON_ELECTION_STATUS:
+    handle_status((MMonElectionStatus*)m);
+    break;
+
+  case MSG_MON_ELECTION_COLLECT:
+    handle_collect((MMonElectionCollect*)m);
+    break;
+
+  case MSG_MON_ELECTION_REFRESH:
+    handle_refresh((MMonElectionRefresh*)m);
+    break;
+
+  default:
+    assert(0);
+  }
+  lock.Unlock();
+}
+
+// Count an ack for the current refresh round.  msg is freed on every path.
+void Elector::handle_ack(MMonElectionAck* msg)
+{
+  assert(refresh_num >= msg->refresh_num);
+  if (refresh_num > msg->refresh_num) {
+    // we got the message too late... discard it
+    delete msg;   // this early return used to leak the message
+    return;
+  }
+  ack_msg_count++;
+  if (ack_msg_count >= f + 1) {   // NOTE(review): also true for each later ack of the round -- confirm intended
+    dout(5) << "Received _f+1 acks, increase freshness" << endl;
+    //g_timer.cancel_event(round_trip_task);
+    //round_trip_timer->cancel();
+    registry[whoami].freshness++;         
+  }
+  delete msg;
+}
+
+// Reply to a collect request with our whole registry, echoing the request's read_num.
+void Elector::handle_collect(MMonElectionCollect* msg)
+{
+  MMonElectionStatus *reply =
+    new MMonElectionStatus(msg->get_source().num(), msg->read_num, registry);
+  mon->messenger->send_message(reply, msg->get_source());
+  delete msg;
+}
+
+// If the sender's state for process msg->p compares greater than ours, adopt it
+// and ack; otherwise the message is dropped without a reply.  msg is freed.
+void Elector::handle_refresh(MMonElectionRefresh* msg)
+{
+  State& known = registry[msg->p];
+  const bool newer = known < msg->state;
+  if (newer) {
+    known = msg->state;   // update local data
+    MMonElectionAck *ack = new MMonElectionAck(msg->p, msg->refresh_num);
+    mon->messenger->send_message(ack, msg->get_source());   // reply to msg
+  }
+
+  delete msg;
+}
+
+
+// Fold a status reply into our views; at quorum, refresh expiry flags and the leader.  Frees msg.
+void Elector::handle_status(MMonElectionStatus* msg)
+{
+  if (read_num != msg->read_num) {
+    dout(1) << "handle_status "
+            << ":DISCARDED B/C OF READNUM(" << read_num << ":"
+            << msg->read_num << ")" << endl;
+    delete msg;   // stale reply: free it instead of leaking it
+    return;
+  }
+  for (unsigned i=0; i<processes.size(); i++) {
+    int r = processes[i];
+    // Put in the view the max value between the new state and the stored one
+    if ( msg->registry[r] > views[r].state ) {
+      views[r].state = msg->registry[r];
+    }
+  }
+  status_msg_count++;
+  if (status_msg_count >= (int)processes.size() - f) { // Responses from quorum collected
+    for (unsigned i=0; i<processes.size(); i++) {
+      int r = processes[i];
+      // Check if r has refreshed its epoch number
+      if (!( views[r].state > old_views[r].state )) {
+        dout(5) << ":Other process (" << r << ") has expired" << endl;
+        views[r].expired = true;
+      }
+      if (views[r].state.epoch > old_views[r].state.epoch) {
+        views[r].expired = false;
+      }
+    }
+    Epoch leader_epoch = get_min_epoch();
+    leader_id = leader_epoch.p_id;
+    dout(1) << " thinks leader has ID: " << leader_id << endl;
+    // Restarts the timer for the next iteration
+    g_timer.add_event_after(main_delta + trip_delta, new C_Elect_ReadTimer(this));
+  }
+  delete msg;   // the reply was never freed before
+}
+
+
+
+
diff --git a/branches/sage/cephmds2/mon/Elector.h b/branches/sage/cephmds2/mon/Elector.h
new file mode 100644
index 0000000000000..7ec3a40a59130
--- /dev/null
+++ b/branches/sage/cephmds2/mon/Elector.h
@@ -0,0 +1,163 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MON_ELECTOR_H
+#define __MON_ELECTOR_H
+
+#include <map>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Message.h"
+
+
+class Monitor;
+
+
+class Elector {
+ public:
+  // Monitor leader election: peers exchange refresh/ack and collect/status messages (see Elector.cc).
+  //// sub-classes
+
+  // Epoch
+  class Epoch {
+  public:
+    int p_id;    // process id; the leader is the p_id of the minimum epoch
+    int s_num;   // sequence number; the comparison operators look at it before p_id
+    
+    Epoch(int p_id=0, int s_num=0) {
+      this->p_id = p_id;
+      this->s_num = s_num;
+    }
+  };    
+
+
+  // State
+  class State {
+  public:
+    Epoch epoch;
+    int freshness;
+
+    State() : freshness(0) {};
+    State(const Epoch& e, int f) :   // const&: also accepts temporaries; existing callers are unaffected
+      epoch(e), freshness(f) {}
+  };
+
+
+  class View {
+  public:
+    State state;
+    bool expired;
+    View() : expired(false) {}
+    View(const State& s, bool e) : state(s), expired(e) {}
+  };
+
+
+  ///////////////
+ private:
+  Monitor *mon;
+  int whoami;
+  Mutex lock;
+
+  // used during refresh phase
+  int ack_msg_count;
+  int refresh_num;
+  
+  // used during read phase
+  int read_num;
+  int status_msg_count;
+  
+  // the leader process id
+  int leader_id;
+  // f-accessible
+  int f;
+  
+  // the processes that compose the group
+  vector<int> processes;
+  // parameters for the process
+  int main_delta;
+  int trip_delta;
+  
+  // state variables
+  map<int, State> registry;
+  map<int, View>  views;
+  map<int, View>  old_views;
+
+  // get the minimum epoch in the view map
+  Epoch get_min_epoch();
+  
+  // handlers for election messages
+  void handle_ack(class MMonElectionAck *m);
+  void handle_collect(class MMonElectionCollect *m);
+  void handle_refresh(class MMonElectionRefresh *m);
+  void handle_status(class MMonElectionStatus *m);
+
+ public:  
+  Elector(Monitor *m, int w) : mon(m), whoami(w),   // every counter now starts from a defined value
+    ack_msg_count(0), refresh_num(0), read_num(0), status_msg_count(0),
+    leader_id(-1), f(0), main_delta(0), trip_delta(0) {   // TODO: real f / delta values and the process list
+  }
+
+  // timer methods
+  void read_timer();
+  void trip_timer();
+  void refresh_timer();
+  
+  void dispatch(Message *m);
+
+};
+
+
+inline bool operator>(const Elector::Epoch& l, const Elector::Epoch& r) {
+  // s_num decides first; p_id only breaks ties
+  if (l.s_num != r.s_num)
+    return l.s_num > r.s_num;
+  return l.p_id > r.p_id;
+}
+
+inline bool operator<(const Elector::Epoch& l, const Elector::Epoch& r) {
+  // s_num decides first; p_id only breaks ties
+  if (l.s_num != r.s_num)
+    return l.s_num < r.s_num;
+  return l.p_id < r.p_id;
+}
+
+// Epochs are equal when both s_num and p_id match (p_id was compared with `>`, so equal epochs never matched).
+inline bool operator==(const Elector::Epoch& l, const Elector::Epoch& r) {
+  return ((l.s_num == r.s_num) && (l.p_id == r.p_id));
+}
+  
+// States order by epoch; freshness decides only when the epochs compare equal.
+inline bool operator>(const Elector::State& l, const Elector::State& r) 
+{
+  const bool same_epoch = (l.epoch == r.epoch);
+  if (!same_epoch) return l.epoch > r.epoch;
+  return l.freshness > r.freshness;
+}
+ 
+// States order by epoch; freshness decides only when the epochs compare equal.
+inline bool operator<(const Elector::State& l, const Elector::State& r) 
+{
+  const bool same_epoch = (l.epoch == r.epoch);
+  if (!same_epoch) return l.epoch < r.epoch;
+  return l.freshness < r.freshness;
+}
+ 
+// Two states are equal only when both their epochs and their freshness compare equal.
+inline bool operator==(const Elector::State& l, const Elector::State& r) 
+{
+  return (l.freshness == r.freshness) && (l.epoch == r.epoch);
+}
+
+#endif
diff --git a/branches/sage/cephmds2/mon/MDSMonitor.cc b/branches/sage/cephmds2/mon/MDSMonitor.cc
new file mode 100644
index 0000000000000..e2e2553670fe7
--- /dev/null
+++ b/branches/sage/cephmds2/mon/MDSMonitor.cc
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "MDSMonitor.h"
+#include "Monitor.h"
+
+#include "messages/MMDSBoot.h"
+#include "messages/MMDSMap.h"
+#include "messages/MMDSGetMap.h"
+//#include "messages/MMDSFailure.h"
+
+#include "common/Timer.h"
+
+#include "config.h"
+#undef dout
+#define  dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " "
+#define  derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " "
+
+
+
+/********* MDS map **************/
+
+// Seed the map at epoch 0 with MDS ids 0..g_conf.num_mds-1, each also entered in down_mds.
+void MDSMonitor::create_initial()
+{
+  mdsmap.ctime = g_clock.now();
+  mdsmap.epoch = 0;  // until everyone boots
+  for (int id = 0; id < g_conf.num_mds; ++id) {
+    mdsmap.down_mds.insert(id);  mdsmap.all_mds.insert(id);
+  }
+}
+
+// Route an MDS-related message to its handler; any other type trips the assert.
+void MDSMonitor::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+  case MSG_MDS_BOOT:
+    handle_mds_boot((MMDSBoot*)m);
+    return;
+
+  case MSG_MDS_GETMAP:
+    handle_mds_getmap((MMDSGetMap*)m);
+    return;
+
+    /*
+      case MSG_MDS_FAILURE:
+      handle_mds_failure((MMDSFailure*)m);
+      break;
+    */
+
+  case MSG_SHUTDOWN:
+    handle_mds_shutdown(m);
+    return;
+
+  default:
+    assert(0);
+  }
+}
+
+// Record a booting MDS; when the last one has booted, move to epoch 1 and publish the map.  Frees m.
+void MDSMonitor::handle_mds_boot(MMDSBoot *m)
+{
+  dout(7) << "mds_boot from " << m->get_source() << " at " << m->get_source_inst() << endl;
+  assert(m->get_source().is_mds());
+  int from = m->get_source().num();
+  
+  if (mdsmap.get_epoch() == 0) {
+    // waiting for boot!
+    mdsmap.mds_inst[from] = m->get_source_inst();
+    mdsmap.down_mds.erase(from);
+    
+    if ((int)mdsmap.mds_inst.size() == mdsmap.get_num_mds()) {
+      mdsmap.inc_epoch();
+      dout(-7) << "mds_boot all MDSs booted." << endl;
+      mdsmap.encode(maps[mdsmap.get_epoch()]); // 1
+      bcast_latest_mds();
+      send_current();
+    } else {
+      dout(7) << "mds_boot waiting for " 
+              << (mdsmap.get_num_mds() - mdsmap.mds_inst.size())
+              << " mdss to boot" << endl;
+    }
+  } else {
+    dout(0) << "mds_boot everyone already booted, so who is this?  write me." << endl;
+    assert(0);
+  }
+  delete m;   // the message used to leak; handle_mds_shutdown() already frees its message the same way
+}
+
+// An MDS sent MSG_SHUTDOWN: erase it from mds_inst and all_mds, log what is
+// left, then free the message.
+void MDSMonitor::handle_mds_shutdown(Message *m)
+{
+  assert(m->get_source().is_mds());
+  const int who = m->get_source().num();
+  mdsmap.all_mds.erase(who);
+  mdsmap.mds_inst.erase(who);
+
+  dout(7) << "mds_shutdown from " << m->get_source() 
+          << ", still have " << mdsmap.all_mds << endl;
+  
+  // tell someone?
+  // fixme
+  
+  delete m;
+}
+
+
+// Reply at once when a map exists (epoch > 0), else queue the requester for send_current().
+void MDSMonitor::handle_mds_getmap(MMDSGetMap *m)
+{
+  dout(7) << "mds_getmap from " << m->get_source() << " " << m->get_source_inst() << endl;
+  if (mdsmap.get_epoch() > 0) send_full(m->get_source(), m->get_source_inst());
+  else awaiting_map[m->get_source()] = m->get_source_inst();
+  delete m;   // used to leak; awaiting_map keeps its own copies of the address and instance
+}
+
+
+// Push the current map to every MDS that is not marked down.
+void MDSMonitor::bcast_latest_mds()
+{
+  dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl;
+  set<int>::iterator p = mdsmap.get_mds().begin();
+  while (p != mdsmap.get_mds().end()) {
+    const int mds = *p;
+    ++p;
+    if (!mdsmap.is_down(mds))
+      send_full(MSG_ADDR_MDS(mds), mdsmap.get_inst(mds));
+  }
+}
+
+void MDSMonitor::send_full(msg_addr_t dest, const entity_inst_t& inst)
+{
+  dout(11) << "send_full to " << dest << " inst " << inst << endl;
+  messenger->send_message(new MMDSMap(&mdsmap), dest, inst);   // a fresh MMDSMap built from the current mdsmap
+}
+
+// Flush the wait list: every queued requester gets the full map, then the list is emptied.
+void MDSMonitor::send_current()
+{
+  dout(10) << "mds_send_current " << mdsmap.get_epoch() << endl;
+  typedef map<msg_addr_t,entity_inst_t>::iterator waiter_iter;
+  for (waiter_iter w = awaiting_map.begin(); w != awaiting_map.end(); ++w)
+    send_full(w->first, w->second);
+  awaiting_map.clear();
+}
+
diff --git a/branches/sage/cephmds2/mon/MDSMonitor.h b/branches/sage/cephmds2/mon/MDSMonitor.h
new file mode 100644
index 0000000000000..66e28451e1de4
--- /dev/null
+++ b/branches/sage/cephmds2/mon/MDSMonitor.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MDSMONITOR_H
+#define __MDSMONITOR_H
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "mds/MDSMap.h"
+
+class Monitor;
+
+class MDSMonitor : public Dispatcher {
+  Monitor *mon;           // the Monitor that created us (passed to the constructor)
+  Messenger *messenger;   // used by send_full() to send MMDSMap messages
+  Mutex &lock;            // reference to a mutex supplied by the creator; stored only
+
+  // mds maps
+ public:
+  MDSMap mdsmap;          // the current map; public, so other classes can read it directly
+
+ private:
+  map<epoch_t, bufferlist> maps;   // encoded map per epoch (filled in handle_mds_boot)
+
+  //map<epoch_t, bufferlist> inc_maps;
+  //MDSMap::Incremental pending_inc;
+  
+  map<msg_addr_t,entity_inst_t> awaiting_map;   // requesters that asked while the epoch was still 0
+  
+
+  // maps
+  void create_initial();       // epoch-0 map with every configured MDS marked down
+  void send_current();         // send current map to waiters.
+  void send_full(msg_addr_t dest, const entity_inst_t& inst);
+  void bcast_latest_mds();     // send_full() to every MDS that is not down
+
+  //void accept_pending();   // accept pending, new map.
+  //void send_incremental(epoch_t since, msg_addr_t dest);
+
+  void handle_mds_boot(class MMDSBoot *m);
+  void handle_mds_failure(class MMDSFailure *m);   // NOTE(review): declared only; no definition in MDSMonitor.cc
+  void handle_mds_getmap(class MMDSGetMap *m);
+  void handle_mds_shutdown(Message *m);
+
+ public:
+  MDSMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l) {
+    create_initial();   // start from the initial (epoch 0) map
+  }
+
+  void dispatch(Message *m);   // entry point: routes MDS boot/getmap/shutdown messages
+  void tick();  // check state, take actions (NOTE(review): no definition in MDSMonitor.cc)
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mon/MonMap.h b/branches/sage/cephmds2/mon/MonMap.h
new file mode 100644
index 0000000000000..e72946d76cf06
--- /dev/null
+++ b/branches/sage/cephmds2/mon/MonMap.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __MONMAP_H
+#define __MONMAP_H
+
+#include "msg/Message.h"
+#include "include/types.h"
+
+class MonMap {
+ public:
+  epoch_t   epoch;       // what epoch of the monitor cluster descriptor is this
+  int       num_mon;     // number of monitors; get_inst() only accepts indices below this
+  vector<entity_inst_t> mon_inst;   // one instance per monitor, indexed by monitor number
+
+  int       last_mon;    // last mon i talked to
+
+  MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {}
+
+  // pick a mon.  
+  // choice should be stable, unless we explicitly ask for a new one.
+  int pick_mon(bool newmon=false) { 
+    if (newmon || (last_mon < 0)) {
+      last_mon = 0;  //last_mon = rand() % num_mon;
+    }
+    return last_mon;    
+  }
+
+  const entity_inst_t get_inst(int m) {
+    assert(m >= 0 && m < num_mon && m < (int)mon_inst.size());   // num_mon and mon_inst are set separately: check both
+    return mon_inst[m];
+  }
+
+  // Layout: epoch | num_mon | mon_inst (via _encode).
+  void encode(bufferlist& blist) {
+    blist.append((char*)&epoch, sizeof(epoch));
+    blist.append((char*)&num_mon, sizeof(num_mon));
+    _encode(mon_inst, blist);
+  }
+  
+  void decode(bufferlist& blist) {   // reads the fields back in the order encode() wrote them
+    int off = 0;
+    blist.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    blist.copy(off, sizeof(num_mon), (char*)&num_mon);
+    off += sizeof(num_mon);
+
+    _decode(mon_inst, blist, off);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mon/Monitor.cc b/branches/sage/cephmds2/mon/Monitor.cc
new file mode 100644
index 0000000000000..e0462534553d6
--- /dev/null
+++ b/branches/sage/cephmds2/mon/Monitor.cc
@@ -0,0 +1,260 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer
+
+#include "Monitor.h"
+
+#include "osd/OSDMap.h"
+
+#include "ebofs/Ebofs.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MGenericMessage.h"
+
+#include "common/Timer.h"
+#include "common/Clock.h"
+
+#include "OSDMonitor.h"
+#include "MDSMonitor.h"
+
+#include "config.h"
+#undef dout
+#define  dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " "
+#define  derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " "
+
+
+
+// Bring the monitor up: open (optionally mkfs) the local store, create the OSD
+// and MDS monitors, register as the messenger's dispatcher and start ticking.
+void Monitor::init()
+{
+  dout(1) << "init" << endl;
+  
+  // local store lives at dev/mon<whoami>
+  char path[80];
+  sprintf(path, "dev/mon%d", whoami);
+  store = new Ebofs(path);
+
+  if (g_conf.mkfs) {
+    store->mkfs();
+  }
+  const int mounted = store->mount();
+  assert(mounted >= 0);
+
+  // services
+  osdmon = new OSDMonitor(this, messenger, lock);
+  mdsmon = new MDSMonitor(this, messenger, lock);
+
+  messenger->set_dispatcher(this);   // i'm ready!
+  reset_tick();                      // start ticker
+}
+
+// Stop the tick, close the store, tell live OSDs and the other monitors to shut down, then free our services.
+void Monitor::shutdown()
+{
+  dout(1) << "shutdown" << endl;
+  cancel_tick();
+
+  if (store) {
+    store->umount();
+    delete store;  store = 0;   // freed pointers are cleared so nothing dangles
+  }
+  // stop osds.  (osdmon is now checked before it is used, not after)
+  if (osdmon) {
+    for (set<int>::iterator it = osdmon->osdmap.get_osds().begin();
+         it != osdmon->osdmap.get_osds().end();
+         ++it) {
+      if (osdmon->osdmap.is_down(*it)) continue;
+      dout(10) << "sending shutdown to osd" << *it << endl;
+      messenger->send_message(new MGenericMessage(MSG_SHUTDOWN),
+                              MSG_ADDR_OSD(*it), osdmon->osdmap.get_inst(*it));
+    }
+  }
+  // monitors too.  (likewise guarded against a null monmap)
+  for (int i=0; monmap && i<monmap->num_mon; i++)
+    if (i != whoami)
+      messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), 
+                              MSG_ADDR_MON(i), monmap->get_inst(i));
+
+  // clean up (deleting a null pointer is a no-op, so no checks are needed)
+  delete monmap;  monmap = 0;
+  delete osdmon;  osdmon = 0;
+  delete mdsmon;  mdsmon = 0;
+
+  // die.
+  messenger->shutdown();
+  delete messenger;  messenger = 0;
+}
+
+
+void Monitor::call_election()
+{
+  dout(10) << "call_election" << endl;
+  state = STATE_STARTING;   // is_starting() reports true from here on
+
+  osdmon->election_starting();   // only the OSD monitor is told; the MDS call below is disabled
+  //mdsmon->election_starting();
+}
+
+
+
+
+
+// Entry point for every incoming message: under `lock`, hand it to the service
+// (OSD monitor, MDS monitor, elector) or local handler responsible for its type.
+void Monitor::dispatch(Message *m)
+{
+  lock.Lock();
+  {
+    switch (m->get_type()) {
+      // misc
+    case MSG_PING_ACK:
+      handle_ping_ack((MPingAck*)m);
+      break;
+
+    case MSG_SHUTDOWN:
+      if (m->get_source().is_mds()) {
+        mdsmon->dispatch(m);
+        if (mdsmon->mdsmap.get_num_mds() == 0) 
+          shutdown();
+      }
+      else if (m->get_source().is_osd()) {
+        osdmon->dispatch(m);
+      }
+      else {
+        handle_shutdown(m);   // e.g. the MSG_SHUTDOWN shutdown() sends to peer monitors; it used to be dropped and leaked
+      }
+      break;
+
+      // OSDs
+    case MSG_OSD_GETMAP:
+    case MSG_OSD_FAILURE:
+    case MSG_OSD_BOOT:
+    case MSG_OSD_IN:
+    case MSG_OSD_OUT:
+      osdmon->dispatch(m);
+      break;
+
+      // MDSs
+    case MSG_MDS_BOOT:
+    case MSG_MDS_GETMAP:
+      mdsmon->dispatch(m);
+      break;
+
+      // elector messages
+    case MSG_MON_ELECTION_ACK:
+    case MSG_MON_ELECTION_STATUS:
+    case MSG_MON_ELECTION_COLLECT:
+    case MSG_MON_ELECTION_REFRESH:
+      elector.dispatch(m);
+      break;
+
+    default:
+      dout(0) << "unknown message " << *m << endl;
+      assert(0);
+    }
+  }
+  lock.Unlock();
+}
+
+
+void Monitor::handle_shutdown(Message *m)
+{
+  dout(1) << "shutdown from " << m->get_source() << endl;
+  // Shut the whole monitor down in response, then free the request.
+  shutdown();
+  delete m;
+}
+
+void Monitor::handle_ping_ack(MPingAck *m)
+{
+  // ... (placeholder: nothing is done with the ack yet, it is only freed)
+  
+  delete m;
+}
+
+
+
+
+/************ TIMER ***************/
+
+// Timer event that calls Monitor::tick(), passing itself so tick() can tell
+// whether it is still the current tick_timer.
+class C_Mon_Tick : public Context {
+  Monitor *monitor;
+public:
+  C_Mon_Tick(Monitor *m) : monitor(m) {}
+  void finish(int r) { monitor->tick(this); }
+};
+
+
+// Cancel the pending tick, if any.  If it was already dispatched, wait on tick_timer_cond (with `lock`) until tick() clears tick_timer.
+void Monitor::cancel_tick()
+{
+  if (!tick_timer) return;
+  if (g_timer.cancel_event(tick_timer)) {
+    dout(10) << "cancel_tick canceled" << endl;
+    tick_timer = 0;   // nothing is pending now; a stale pointer could leave a later cancel_tick() waiting for a tick that never comes
+  } else {   // already dispatched!
+    dout(10) << "cancel_tick timer dispatched, waiting to cancel" << endl;
+    tick_timer = (Context*)1;  // hackish.
+    while (tick_timer)
+      tick_timer_cond.Wait(lock);    
+  }
+}
+
+void Monitor::reset_tick()
+{
+  if (tick_timer)                       // a tick is still pending: cancel it before scheduling another
+    cancel_tick();
+  tick_timer = new C_Mon_Tick(this);    // remembered so cancel_tick()/tick() can recognise it
+  g_timer.add_event_after(g_conf.mon_tick_interval, tick_timer);   // due after the configured tick interval
+}
+
+
+void Monitor::tick(Context *timer)   // called by C_Mon_Tick::finish(); timer is the event that fired
+{
+  lock.Lock();
+  {
+    if (tick_timer != timer) {   // we are no longer the current timer (cancel_tick() swapped in its marker)
+      dout(10) << "tick - canceled" << endl;
+      tick_timer = 0;
+      tick_timer_cond.Signal();   // wake the cancel_tick() loop that waits for tick_timer to clear
+      lock.Unlock();
+      return;
+    }
+
+    tick_timer = 0;   // this event has fired; reset_tick() below installs the next one
+
+    // ok go.
+    dout(10) << "tick" << endl;
+
+    osdmon->tick();   // only the OSD monitor is ticked here
+
+    // next tick!
+    reset_tick();
+  }
+  lock.Unlock();
+}
+
+
+
+
+
+
+
diff --git a/branches/sage/cephmds2/mon/Monitor.h b/branches/sage/cephmds2/mon/Monitor.h
new file mode 100644
index 0000000000000..0b8890fcbae3b
--- /dev/null
+++ b/branches/sage/cephmds2/mon/Monitor.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MONITOR_H
+#define __MONITOR_H
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "MonMap.h"
+#include "Elector.h"
+
+class ObjectStore;
+class OSDMonitor;
+class MDSMonitor;
+
+class Monitor : public Dispatcher {   // routes incoming messages to its services (osdmon, mdsmon, elector) and drives a periodic tick
+protected:
+  // me: my monitor id, the messenger i talk through, and the mutex taken around dispatch() and tick()
+  int whoami;
+  Messenger *messenger;
+  Mutex lock;
+
+  MonMap *monmap;
+
+  // timer.
+  Context *tick_timer;        // the scheduled tick; 0 when none, (Context*)1 while cancel_tick() waits for a dispatched one
+  Cond     tick_timer_cond;   // signalled by tick() once a cancelled-but-dispatched tick has run
+  void cancel_tick();
+  void reset_tick();
+  friend class C_Mon_Tick;
+
+  // my local store
+  ObjectStore *store;
+  // first component of the object_t names under which state is kept in the store (see OSDMonitor::get_map_bl)
+  const static int INO_ELECTOR = 1;
+  const static int INO_MON_MAP = 2;
+  const static int INO_OSD_MAP = 10;
+  const static int INO_OSD_INC_MAP = 11;
+  const static int INO_MDS_MAP = 20;
+
+  // elector
+  Elector elector;
+  friend class Elector;
+
+  epoch_t  mon_epoch;    // monitor epoch (election instance)
+  set<int> quorum;       // current active set of monitors (if !starting)
+
+  void call_election();
+
+  // monitor state (the value of `state` is always one of these three)
+  const static int STATE_STARTING = 0;
+  const static int STATE_LEADER = 1;
+  const static int STATE_PEON =   2;
+  int state;
+
+  int leader;                    // current leader (to best of knowledge)
+  utime_t last_called_election;  // [starting] last time i called an election
+
+  bool is_starting() { return state == STATE_STARTING; }
+  bool is_leader() { return state == STATE_LEADER; }
+  bool is_peon() { return state == STATE_PEON; }
+
+  // my public services (both start out null; they are not created by the constructor)
+  OSDMonitor *osdmon;
+  MDSMonitor *mdsmon;
+
+  // messages
+  void handle_shutdown(Message *m);
+  void handle_ping_ack(class MPingAck *m);
+  // the services reach into the protected members above (whoami, store, quorum, ...)
+  friend class OSDMonitor;
+  friend class MDSMonitor;
+
+ public:
+  Monitor(int w, Messenger *m, MonMap *mm) : 
+    whoami(w), 
+    messenger(m),
+    monmap(mm),
+    tick_timer(0),
+    store(0),
+    elector(this, w),
+    mon_epoch(0), 
+    state(STATE_STARTING),
+    leader(0),
+    osdmon(0),
+    mdsmon(0)
+  {
+    // hack leader, until election works: monitor 0 is the leader, everyone else a peon.
+    if (whoami == 0)
+      state = STATE_LEADER;
+    else
+      state = STATE_PEON;
+  }
+
+  void init();
+  void shutdown();
+  void dispatch(Message *m);
+  void tick(Context *timer);   // called by C_Mon_Tick with itself as `timer`
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mon/OSDMonitor.cc b/branches/sage/cephmds2/mon/OSDMonitor.cc
new file mode 100644
index 0000000000000..7fafbff48b2f1
--- /dev/null
+++ b/branches/sage/cephmds2/mon/OSDMonitor.cc
@@ -0,0 +1,869 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "OSDMonitor.h"
+#include "Monitor.h"
+#include "MDSMonitor.h"
+
+#include "osd/ObjectStore.h"
+
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+
+#include "messages/MMonOSDMapInfo.h"
+#include "messages/MMonOSDMapLease.h"
+#include "messages/MMonOSDMapLeaseAck.h"
+#include "messages/MMonOSDMapUpdatePrepare.h"
+#include "messages/MMonOSDMapUpdateAck.h"
+#include "messages/MMonOSDMapUpdateCommit.h"
+
+#include "common/Timer.h"
+
+#include "config.h"
+#undef dout
+#define  dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? (const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " "
+#define  derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? (const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " "
+
+
+// Timer context that injects a simulated failure for one osd when it fires.
+// `mark_down` selects the DOWN case; otherwise the osd is marked OUT.
+class C_Mon_FakeOSDFailure : public Context {
+  OSDMonitor *osdmon;
+  int         target;
+  bool        mark_down;
+public:
+  C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d) : osdmon(m), target(o), mark_down(d) {}
+  void finish(int r) { osdmon->fake_osd_failure(target, mark_down); }
+};
+
+
+// Testing aid: commit whatever is pending as a new epoch and push the
+// resulting change to one randomly chosen osd.
+void OSDMonitor::fake_osdmap_update()
+{
+  dout(1) << "fake_osdmap_update" << endl;
+  accept_pending();
+  const int victim = rand() % g_conf.num_osd;
+  const epoch_t prev = osdmap.get_epoch() - 1;   // ick! FIXME: only the newest epoch is sent
+  send_incremental(prev, MSG_ADDR_OSD(victim), osdmap.get_inst(victim));
+}
+
+
+// Testing aid: pick a random osd, flip it between IN and OUT, commit that as a
+// new epoch and send the osd the resulting incremental.
+void OSDMonitor::fake_reorg() 
+{
+  const int osd = rand() % g_conf.num_osd;
+  const bool was_out = osdmap.is_out(osd);
+  dout(1) << "fake_reorg marking osd" << osd << (was_out ? " in" : " out") << endl;
+  if (was_out)
+    pending_inc.new_in.push_back(osd);
+  else
+    pending_inc.new_out.push_back(osd);
+
+  accept_pending();
+
+  // let the affected osd know
+  send_incremental(osdmap.get_epoch()-1, MSG_ADDR_OSD(osd), osdmap.get_inst(osd));
+
+  // Rescheduling is disabled; the old sketch is kept for reference:
+  /*
+  if (g_conf.num_osd - d > 4 &&
+      g_conf.num_osd - d > g_conf.num_osd/2)
+    g_timer.add_event_after(g_conf.fake_osdmap_expand,
+                            new C_Mon_Faker(this));
+  */
+}
+
+
+
+// Bring up the osd map: restore the latest map saved in the store if there is
+// one, otherwise (leader only) build the initial map and issue leases.
+void OSDMonitor::init()
+{
+  bufferlist stored;
+  if (!get_map_bl(0, stored)) {  // FIXME
+    // Nothing saved yet.
+    // FIXME. when elections work!
+    if (mon->is_leader()) {
+      create_initial();
+      issue_leases();
+    }
+    return;
+  }
+
+  // Epoch slot 0 holds a copy of the most recently saved map.
+  osdmap.decode(stored);
+  dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << endl;
+
+  // the next change will become the following epoch
+  pending_inc.epoch = osdmap.get_epoch()+1;
+}
+
+
+
+
+/************ MAPS ****************/
+
+
+// Build the initial OSDMap from g_conf (a stop-gap, see the HACK markers): pick the
+// pg bit count, create the CRUSH buckets and rules, register the osd ids, and
+// schedule any fake osd failures requested through g_fake_osd_down/g_fake_osd_out.
+void OSDMonitor::create_initial()
+{
+  dout(1) << "create_initial generating osdmap from g_conf" << endl;
+
+  // <HACK set up OSDMap from g_conf>
+  osdmap.mon_epoch = mon->mon_epoch;
+  osdmap.ctime = g_clock.now();
+
+  if (g_conf.osd_pg_bits) {
+    osdmap.set_pg_bits(g_conf.osd_pg_bits);
+  } else {
+    // one more than the number of significant bits in num_osd ...
+    int osdbits = 1;
+    for (int n = g_conf.num_osd; n; n = n >> 1)
+      osdbits++;
+    // ... plus 2 bits per osd.
+    osdmap.set_pg_bits(osdbits + 2);
+  }
+
+  // start at epoch 0 until all osds boot
+  //osdmap.inc_epoch();  // = 1
+  //assert(osdmap.get_epoch() == 1);
+
+  if (g_conf.num_osd >= 12) {
+    // one uniform bucket per failure domain, osd_max_rep domains in total
+    int ndom = g_conf.osd_max_rep;
+    UniformBucket *domain[ndom];
+    int domid[ndom];
+    for (int i=0; i<ndom; i++) {
+      domain[i] = new UniformBucket(1, 0);
+      domid[i] = osdmap.crush.add_bucket(domain[i]);
+    }
+
+    // add osds: fill the domains in order, at most nper osds each
+    int nper = ((g_conf.num_osd - 1) / ndom) + 1;
+    cerr << ndom << " failure domains, " << nper << " osds each" << endl;
+    int i = 0;
+    // Both loops stop at num_osd.  Breaking out of only the inner loop let any
+    // remaining domains keep registering osd ids >= num_osd.
+    for (int dom=0; dom<ndom && i<g_conf.num_osd; dom++) {
+      for (int j=0; j<nper && i<g_conf.num_osd; j++, i++) {
+        osdmap.osds.insert(i);
+        domain[dom]->add_item(i, 1.0);
+      }
+    }
+
+    // root: a list bucket holding every domain at that domain's weight
+    Bucket *root = new ListBucket(2);
+    for (int i=0; i<ndom; i++) {
+      //cerr << "dom " << i << " w " << domain[i]->get_weight() << endl;
+      root->add_item(domid[i], domain[i]->get_weight());
+    }
+    int nroot = osdmap.crush.add_bucket(root);
+
+    // rules 1..ndom, all starting at the root.  NOTE(review): presumably "choose i domains, then one osd in each" -- confirm RuleStep arguments
+    for (int i=1; i<=ndom; i++) {
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+    }
+
+    // test
+    //vector<int> out;
+    //osdmap.pg_to_osds(0x40200000110ULL, out);
+
+  } else {
+    // one bucket holding every osd
+    Bucket *b = new UniformBucket(1, 0);
+    int root = osdmap.crush.add_bucket(b);
+    for (int i=0; i<g_conf.num_osd; i++) {
+      osdmap.osds.insert(i);
+      b->add_item(i, 1.0);
+    }
+
+    for (int i=1; i<=g_conf.osd_max_rep; i++) {
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+    }
+  }
+
+  if (g_conf.mds_local_osd) {
+    // add mds osds (ids 10000 + mds number), but don't put them in the crush mapping func
+    for (int i=0; i<g_conf.num_mds; i++) 
+      osdmap.osds.insert(i+10000);
+  }
+
+  // </HACK>
+
+  // fake osd failures: one timer event per requested osd, after the configured delay
+  for (map<int,float>::iterator i = g_fake_osd_down.begin();
+       i != g_fake_osd_down.end();
+       i++) {
+    dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl;
+    g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1));
+  }
+  for (map<int,float>::iterator i = g_fake_osd_out.begin();
+       i != g_fake_osd_out.end();
+       i++) {
+    dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl;
+    g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0));
+  }
+}
+
+
+// Load the encoded full map stored for `epoch` into `bl`; false if the store has none.
+bool OSDMonitor::get_map_bl(epoch_t epoch, bufferlist& bl)
+{
+  object_t name(Monitor::INO_OSD_MAP, epoch);
+  if (!mon->store->exists(name)) return false;
+  int got = mon->store->read(name, 0, 0, bl);
+  assert(got > 0);  // an existing map object must yield data
+  return true;
+}
+
+// Load the encoded incremental stored for `epoch` into `bl`; false if the store has none.
+bool OSDMonitor::get_inc_map_bl(epoch_t epoch, bufferlist& bl)
+{
+  object_t name(Monitor::INO_OSD_INC_MAP, epoch);
+  if (!mon->store->exists(name)) return false;
+  int got = mon->store->read(name, 0, 0, bl);
+  assert(got > 0);  // an existing incremental object must yield data
+  return true;
+}
+
+
+// Persist the current full map twice in one transaction: in epoch slot 0 (read back by init()) and under its own epoch.
+void OSDMonitor::save_map()
+{
+  bufferlist encoded;
+  osdmap.encode(encoded);
+  ObjectStore::Transaction txn;
+  txn.write(object_t(Monitor::INO_OSD_MAP,0), 0, encoded.length(), encoded);
+  txn.write(object_t(Monitor::INO_OSD_MAP,osdmap.get_epoch()), 0, encoded.length(), encoded);
+  mon->store->apply_transaction(txn);
+  mon->store->sync();
+}
+
+// Persist the current full map (slot 0 and its own epoch) together with the
+// incremental `inc`, which is stored under the current map's epoch.
+void OSDMonitor::save_inc_map(OSDMap::Incremental &inc)
+{
+  bufferlist full_bl, inc_bl;
+  osdmap.encode(full_bl);
+  inc.encode(inc_bl);
+  const epoch_t cur = osdmap.get_epoch();
+  ObjectStore::Transaction txn;
+  txn.write(object_t(Monitor::INO_OSD_MAP,0), 0, full_bl.length(), full_bl);
+  txn.write(object_t(Monitor::INO_OSD_MAP,cur), 0, full_bl.length(), full_bl);         // not strictly needed??
+  txn.write(object_t(Monitor::INO_OSD_INC_MAP,cur), 0, inc_bl.length(), inc_bl);
+  mon->store->apply_transaction(txn);
+  mon->store->sync();
+}
+
+
+
+void OSDMonitor::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+    // Route by message type; each handler gets the message downcast to its concrete class.
+    // services
+  case MSG_OSD_GETMAP:
+    handle_osd_getmap((MOSDGetMap*)m);
+    break;
+  case MSG_OSD_FAILURE:
+    handle_osd_failure((MOSDFailure*)m);
+    break;
+  case MSG_OSD_BOOT:
+    handle_osd_boot((MOSDBoot*)m);
+    break;
+  case MSG_OSD_IN:
+    handle_osd_in((MOSDIn*)m);
+    break;
+  case MSG_OSD_OUT:
+    handle_osd_out((MOSDOut*)m);
+    break;
+    // (osd service requests above; monitor-to-monitor map traffic below)
+    // replication
+  case MSG_MON_OSDMAP_INFO:
+    handle_info((MMonOSDMapInfo*)m);
+    break;
+  case MSG_MON_OSDMAP_LEASE:
+    handle_lease((MMonOSDMapLease*)m);
+    break;
+  case MSG_MON_OSDMAP_LEASE_ACK:
+    handle_lease_ack((MMonOSDMapLeaseAck*)m);
+    break;
+  case MSG_MON_OSDMAP_UPDATE_PREPARE:
+    handle_update_prepare((MMonOSDMapUpdatePrepare*)m);
+    break;
+  case MSG_MON_OSDMAP_UPDATE_ACK:
+    handle_update_ack((MMonOSDMapUpdateAck*)m);
+    break;
+  case MSG_MON_OSDMAP_UPDATE_COMMIT:
+    handle_update_commit((MMonOSDMapUpdateCommit*)m);
+    break;
+    // any other type reaching this dispatcher is treated as a programming error
+  default:
+    assert(0);
+  }
+}
+
+
+
+// A peer reports that osd `m->get_failed()` is down.
+// FIXME: the report is trusted as-is.  If the target is currently up and the
+// reported instance matches the one on record (or none is recorded), mark it
+// down in a new epoch and spread the word.
+void OSDMonitor::handle_osd_failure(MOSDFailure *m)
+{
+  dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl;
+
+  const int failed = m->get_failed().num();
+  bool believe = false;
+  if (osdmap.is_up(failed))
+    believe = osdmap.osd_inst.count(failed) == 0 || osdmap.osd_inst[failed] == m->get_inst();
+
+  if (believe) {
+    pending_inc.new_down[failed] = m->get_inst();
+    if (osdmap.is_in(failed))
+      down_pending_out[failed] = g_clock.now();   // starts the down -> out countdown checked by tick()
+    //awaiting_maps[pending_inc.epoch][m->get_source()] = 
+
+    accept_pending();
+    send_incremental(m->get_epoch(), m->get_source(), m->get_source_inst());  // answer the reporter
+    send_waiting();
+    bcast_latest_mds();
+  }
+
+  delete m;
+}
+
+
+// Testing aid, run from a timer (C_Mon_FakeOSDFailure): pretend `osd` went
+// DOWN (down == true) or OUT (down == false), commit, and broadcast the map.
+void OSDMonitor::fake_osd_failure(int osd, bool down) 
+{
+  lock.Lock();
+  if (!down) {
+    dout(1) << "fake_osd_failure OUT osd" << osd << endl;
+    pending_inc.new_out.push_back(osd);
+  } else {
+    dout(1) << "fake_osd_failure DOWN osd" << osd << endl;
+    pending_inc.new_down[osd] = osdmap.osd_inst[osd];
+  }
+  accept_pending();
+  bcast_latest_osd();
+  bcast_latest_mds();
+  lock.Unlock();
+}
+
+
+// An osd announces that it has booted.
+//  - While the map is still at epoch 0, just record the osd's instance; once every
+//    known osd has checked in, bump the epoch and publish the map to all osds and mds.
+//  - Afterwards, (re)mark the osd up -- and in, if it was out -- in a new epoch.
+// The message is freed on every path.
+void OSDMonitor::handle_osd_boot(MOSDBoot *m)
+{
+  dout(7) << "osd_boot from " << m->get_source() << endl;
+  assert(m->get_source().is_osd());
+  int from = m->get_source().num();
+
+  if (osdmap.get_epoch() == 0) {
+    // waiting for boot!
+    osdmap.osd_inst[from] = m->get_source_inst();
+
+    if (osdmap.osd_inst.size() == osdmap.osds.size()) {
+      dout(-7) << "osd_boot all osds booted." << endl;
+      osdmap.inc_epoch();
+      save_map();
+      pending_inc.epoch = osdmap.get_epoch()+1;     // 2
+      bcast_latest_osd();
+      bcast_latest_mds();
+    } else {
+      dout(7) << "osd_boot waiting for " 
+              << (osdmap.osds.size() - osdmap.osd_inst.size())
+              << " osds to boot" << endl;
+    }
+    delete m;  // this early exit used to leak the message
+    return;
+  }
+
+  // already up?  mark down first?
+  if (osdmap.is_up(from)) {
+    pending_inc.new_down[from] = osdmap.osd_inst[from];
+    accept_pending();
+  }
+
+  // mark up.
+  down_pending_out.erase(from);
+  assert(osdmap.is_down(from));
+  pending_inc.new_up[from] = m->get_source_inst();
+
+  // mark in?
+  if (osdmap.out_osds.count(from)) 
+    pending_inc.new_in.push_back(from);
+  accept_pending();
+
+  // the booting osd will spread word
+  send_incremental(m->sb.current_epoch, m->get_source(), m->get_source_inst());
+  delete m;
+  bcast_latest_mds();  // tell mds
+}
+
+// An osd asks to be marked IN; the sender is always answered with the maps after m->map_epoch.
+void OSDMonitor::handle_osd_in(MOSDIn *m)
+{
+  dout(7) << "osd_in from " << m->get_source() << endl;
+  int from = m->get_source().num();
+  if (osdmap.is_out(from)) pending_inc.new_in.push_back(from);
+  accept_pending();  // NOTE(review): runs even when nothing was queued, unlike handle_osd_out -- intended?
+  send_incremental(m->map_epoch, m->get_source(), m->get_source_inst());
+  delete m;  // this handler owns the message; it was never freed before
+}
+
+void OSDMonitor::handle_osd_out(MOSDOut *m)   // an osd asks to be marked OUT
+{
+  dout(7) << "osd_out from " << m->get_source() << endl;
+  int from = m->get_source().num();
+  if (osdmap.is_in(from)) {   // only acted on, and answered, while the osd is still IN
+    pending_inc.new_out.push_back(from);
+    accept_pending();
+    send_incremental(m->map_epoch, m->get_source(), m->get_source_inst());
+  }
+  delete m;  // this handler owns the message; it was never freed before
+}
+// A peer asks for the maps newer than m->get_since(); while we are still at
+// epoch 0 the request is parked in awaiting_map instead of being answered.
+void OSDMonitor::handle_osd_getmap(MOSDGetMap *m)
+{
+  dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl;
+  if (osdmap.get_epoch() != 0) {
+    //if (m->get_since())
+    send_incremental(m->get_since(), m->get_source(), m->get_source_inst());
+    //else
+    //send_full(m->get_source(), m->get_source_inst());
+  } else {
+    awaiting_map[m->get_source()] = pair<entity_inst_t,epoch_t>(m->get_source_inst(), m->get_since());
+  }
+  delete m;
+}
+
+
+
+// Turn pending_inc into the next epoch: stamp it, apply it to osdmap, persist
+// both, announce every change (telling the messenger about osds that went up
+// or down), and start an empty pending incremental for the following epoch.
+void OSDMonitor::accept_pending()
+{
+  dout(-10) << "accept_pending " << osdmap.get_epoch() << " -> " << pending_inc.epoch << endl;
+
+  // stamp, advance, save
+  pending_inc.ctime = g_clock.now();
+  pending_inc.mon_epoch = mon->mon_epoch;
+  osdmap.apply_incremental(pending_inc);
+  save_inc_map( pending_inc );
+
+  // tell me about it
+  typedef map<int,entity_inst_t>::iterator inst_iter;
+  typedef list<int>::iterator              osd_iter;
+
+  for (inst_iter u = pending_inc.new_up.begin();
+       u != pending_inc.new_up.end(); u++) {
+    dout(0) << "osd" << u->first << " UP " << u->second << endl;
+    derr(0) << "osd" << u->first << " UP " << u->second << endl;
+    messenger->mark_up(MSG_ADDR_OSD(u->first), u->second);
+  }
+
+  for (inst_iter d = pending_inc.new_down.begin();
+       d != pending_inc.new_down.end(); d++) {
+    dout(0) << "osd" << d->first << " DOWN " << d->second << endl;
+    derr(0) << "osd" << d->first << " DOWN " << d->second << endl;
+    messenger->mark_down(MSG_ADDR_OSD(d->first), d->second);
+  }
+
+  for (osd_iter n = pending_inc.new_in.begin();
+       n != pending_inc.new_in.end(); n++) {
+    dout(0) << "osd" << *n << " IN" << endl;
+    derr(0) << "osd" << *n << " IN" << endl;
+  }
+
+  for (osd_iter o = pending_inc.new_out.begin();
+       o != pending_inc.new_out.end(); o++) {
+    dout(0) << "osd" << *o << " OUT" << endl;
+    derr(0) << "osd" << *o << " OUT" << endl;
+  }
+
+  // clear new pending
+  pending_inc = OSDMap::Incremental(osdmap.get_epoch() + 1);
+}
+
+// Answer every request parked in awaiting_map with the maps newer than the epoch it asked from.
+// NOTE(review): entries are not removed afterwards, so they are re-sent on the next call -- intended?
+void OSDMonitor::send_waiting()
+{
+  dout(10) << "send_waiting " << osdmap.get_epoch() << endl;
+  map<msg_addr_t,pair<entity_inst_t,epoch_t> >::iterator w = awaiting_map.begin();
+  for ( ; w != awaiting_map.end(); w++)
+    send_incremental(w->second.second, w->first, w->second.first);
+}
+
+
+void OSDMonitor::send_full(msg_addr_t who, const entity_inst_t& inst)
+{
+  messenger->send_message(new MOSDMap(&osdmap), who, inst);  // an MOSDMap built from the current osdmap, sent to `who` at `inst`
+}
+
+// Send `dest` every map in (since, current epoch]: the stored incremental for
+// an epoch when there is one, else the stored full map.  Missing both is fatal.
+void OSDMonitor::send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst)
+{
+  dout(5) << "osd_send_incremental " << since << " -> " << osdmap.get_epoch()
+          << " to " << dest << endl;
+
+  MOSDMap *reply = new MOSDMap;
+  epoch_t e = osdmap.get_epoch();
+  while (e > since) {
+    bufferlist bl;
+    if (get_inc_map_bl(e, bl)) {
+      dout(10) << "osd_send_incremental    inc " << e << endl;
+      reply->incremental_maps[e] = bl;
+    } else if (get_map_bl(e, bl)) {
+      dout(10) << "osd_send_incremental   full " << e << endl;
+      reply->maps[e] = bl;
+    } else {
+      assert(0);  // we should have all maps.
+    }
+    e--;
+  }
+
+  // an empty message still goes out when nothing is newer than `since`
+  messenger->send_message(reply, dest, inst);
+}
+
+
+
+// Send every mds that is not marked down the newest osd map epoch
+// (as an incremental from the previous epoch).
+void OSDMonitor::bcast_latest_mds()
+{
+  epoch_t e = osdmap.get_epoch();
+  dout(1) << "bcast_latest_mds epoch " << e << endl;
+
+  const set<int>& all_mds = mon->mdsmon->mdsmap.get_mds();
+  for (set<int>::const_iterator p = all_mds.begin(); p != all_mds.end(); p++) {
+    if (!mon->mdsmon->mdsmap.is_down(*p))
+      send_incremental(osdmap.get_epoch()-1, MSG_ADDR_MDS(*p), mon->mdsmon->mdsmap.get_inst(*p));
+  }
+}
+
+// Send every osd that is not marked down the newest osd map epoch
+// (as an incremental from the previous epoch).
+void OSDMonitor::bcast_latest_osd()
+{
+  epoch_t e = osdmap.get_epoch();
+  dout(1) << "bcast_latest_osd epoch " << e << endl;
+
+  // snapshot the osd ids first, then walk the copy
+  set<int> everyone;
+  osdmap.get_all_osds(everyone);
+  for (set<int>::iterator cur = everyone.begin(); cur != everyone.end(); cur++) {
+    const int id = *cur;
+    if (!osdmap.is_down(id))
+      send_incremental(osdmap.get_epoch()-1, MSG_ADDR_OSD(id), osdmap.get_inst(id));
+  }
+}
+
+
+
+// Periodic housekeeping: any osd that has been down for at least
+// g_conf.mon_osd_down_out_interval seconds is marked OUT in a new epoch.
+void OSDMonitor::tick()
+{
+  const utime_t now = g_clock.now();
+
+  // pass 1: collect the expired entries (nothing is erased while iterating)
+  list<int> expired;
+  map<int,utime_t>::iterator p = down_pending_out.begin();
+  for ( ; p != down_pending_out.end(); p++) {
+    utime_t elapsed = now;
+    elapsed -= p->second;
+    if (elapsed.sec() >= g_conf.mon_osd_down_out_interval) {
+      dout(10) << "tick marking osd" << p->first << " OUT after " << elapsed << " sec" << endl;
+      expired.push_back(p->first);
+    }
+  }
+  if (expired.empty())
+    return;
+
+  // pass 2: move them from the countdown table into the pending incremental
+  for (list<int>::iterator q = expired.begin(); q != expired.end(); q++) {
+    down_pending_out.erase(*q);
+    pending_inc.new_out.push_back( *q );
+  }
+
+  accept_pending();
+  bcast_latest_osd();   // hrmpf.  bcast map for now.  FIXME FIXME.
+}
+
+void OSDMonitor::election_starting()
+{
+  dout(10) << "election_starting" << endl;  // nothing is aborted yet; the event is only logged
+}
+
+// An election has completed: restart the map-replication state machine.
+// A lone leader is immediately in sync; a peon reports its map epoch (and the
+// monitor epoch recorded in that map) to the leader.
+void OSDMonitor::election_finished()
+{
+  dout(10) << "election_finished" << endl;  // used to log "election_starting" (copy/paste slip)
+  state = STATE_INIT;
+
+  if (mon->is_leader()) {
+    // leader.
+    if (mon->monmap->num_mon == 1) {
+      // hmm, it's just me!
+      state = STATE_SYNC;
+    }
+  } else if (mon->is_peon()) {
+    // peon. send info
+    messenger->send_message(new MMonOSDMapInfo(osdmap.epoch, osdmap.mon_epoch),
+                            MSG_ADDR_MON(mon->leader), mon->monmap->get_inst(mon->leader));
+  }
+}
+
+
+
+// A peon reports which osd map epoch it holds and the monitor epoch that map
+// was made in.  If our map for that epoch came from a different monitor epoch
+// the peon's copy has diverged, so resend from one epoch earlier; then send
+// whatever the peon is missing.
+void OSDMonitor::handle_info(MMonOSDMapInfo *m)
+{
+  dout(10) << "handle_info from " << m->get_source()
+           << " epoch " << m->get_epoch() << " in mon_epoch " << m->get_mon_epoch()
+           << endl;
+
+  epoch_t have = m->get_epoch();
+  // did they have anything?
+  if (have > 0) {
+    // find the mon_epoch of OUR map for that epoch
+    epoch_t ours;
+    if (have == osdmap.get_epoch()) {
+      ours = osdmap.mon_epoch;
+    } else {
+      // NOTE(review): the lookup result is not checked; if no map is stored
+      // for that epoch an empty buffer is decoded -- confirm that cannot happen.
+      bufferlist bl;
+      get_map_bl(have, bl);
+      OSDMap old;
+      old.decode(bl);
+      ours = old.mon_epoch;
+    }
+    if (ours != m->get_mon_epoch()) {
+      dout(10) << "handle_info had divergent epoch " << m->get_epoch() 
+               << ", mon_epoch " << m->get_mon_epoch() << " != " << ours << endl;
+      have--;
+    }
+  }
+
+  // bring up to date
+  if (have < osdmap.get_epoch()) 
+    send_incremental(have, m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+
+// [leader] Grant every other quorum member a lease on the current map epoch,
+// valid for g_conf.mon_lease from now, and remember whose ack is outstanding.
+void OSDMonitor::issue_leases()
+{
+  dout(10) << "issue_leases" << endl;
+  assert(mon->is_leader());
+
+  // set lease endpoint
+  lease_expire = g_clock.now();
+  lease_expire += g_conf.mon_lease;
+
+  pending_ack.clear();
+  for (set<int>::iterator p = mon->quorum.begin(); p != mon->quorum.end(); p++) {
+    const int peer = *p;
+    if (peer == mon->whoami) continue;   // no lease to myself
+    messenger->send_message(new MMonOSDMapLease(osdmap.get_epoch(), lease_expire),
+                            MSG_ADDR_MON(peer), mon->monmap->get_inst(peer));
+    pending_ack.insert(peer);
+  }
+}
+
+// A lease on the osd map arrives: adopt the expiry time it carries.
+// NOTE(review): expects the lease epoch to be ours + 1, but issue_leases() sends the current epoch -- confirm.
+void OSDMonitor::handle_lease(MMonOSDMapLease *m)
+{
+  if (m->get_epoch() != osdmap.get_epoch() + 1) {
+    dout(10) << "map_lease from " << m->get_source() 
+             << " on epoch " << m->get_epoch() << ", but i am " << osdmap.get_epoch() << endl;
+    assert(0);
+    delete m;  // only reached when asserts are compiled out
+    return;
+  }
+  lease_expire = m->get_lease_expire();  // adopt first, so the log line shows the new expiry, not the old one
+  dout(10) << "map_lease from " << m->get_source() << " expires " << lease_expire << endl;
+  delete m;
+}
+
+// [leader] A peon acknowledges the lease it was given.  An ack for a different
+// epoch is dropped; an ack that arrives once the lease has already run out
+// makes the monitor call a new election; otherwise the sender is crossed off
+// pending_ack.  The message is freed on every path.
+void OSDMonitor::handle_lease_ack(MMonOSDMapLeaseAck *m)
+{
+  if (m->get_epoch() != osdmap.get_epoch()) {
+    // wrong epoch
+    dout(10) << "map_lease_ack from " << m->get_source() 
+             << " on old epoch " << m->get_epoch() << ", dropping" << endl;
+  } else if (g_clock.now() >= lease_expire) {
+    // too late
+    dout(10) << "map_lease_ack from " << m->get_source() 
+             << ", but lease expired, calling election" << endl;
+    mon->call_election();
+  } else {
+    // a timely ack for the current epoch
+    assert(m->get_source().is_mon());
+    int from = m->get_source().num();
+
+    assert(pending_ack.count(from));
+    pending_ack.erase(from);
+
+    if (pending_ack.empty()) {
+      dout(10) << "map_lease_ack from " << m->get_source() 
+               << ", last one" << endl;
+    } else {
+      dout(10) << "map_lease_ack from " << m->get_source() 
+               << ", still waiting on " << pending_ack << endl;
+    }
+  }
+
+  // single exit
+  delete m;
+}
+
+
+// [leader] Start replicating the current map epoch: enter STATE_UPDATING,
+// extend the lease, and send every other quorum member a prepare carrying the
+// epoch's incremental (or, failing that, the full map).  Acks are then awaited.
+void OSDMonitor::update_map()
+{
+  // lock map
+  state = STATE_UPDATING;
+  pending_ack.clear();
+
+  // set lease endpoint
+  lease_expire += g_conf.mon_lease;
+
+  // payload: prefer the incremental, fall back to the full map
+  const epoch_t cur = osdmap.get_epoch();
+  bufferlist full_bl, inc_bl;
+  const bool have_inc = get_inc_map_bl(cur, inc_bl);
+  if (!have_inc) get_map_bl(cur, full_bl);
+
+  for (set<int>::iterator peer = mon->quorum.begin(); peer != mon->quorum.end(); peer++) {
+    if (*peer == mon->whoami) continue;
+    messenger->send_message(new MMonOSDMapUpdatePrepare(cur, full_bl, inc_bl),
+                            MSG_ADDR_MON(*peer), mon->monmap->get_inst(*peer));
+    pending_ack.insert(*peer);
+  }
+}
+
+
+
+// [peon] The leader proposes the next epoch.  Apply it -- via the incremental
+// when one is attached, else by decoding the full map -- enter STATE_LOCK and
+// acknowledge with our resulting epoch.
+void OSDMonitor::handle_update_prepare(MMonOSDMapUpdatePrepare *m)
+{
+  dout(10) << "map_update_prepare from " << m->get_source() << " epoch " << m->get_epoch() << endl;
+  // accept map: it must be exactly one epoch ahead of ours
+  assert(m->get_epoch() == osdmap.get_epoch() + 1);
+
+  if (m->inc_map_bl.length() == 0) {
+    osdmap.decode(m->map_bl);
+  } else {
+    int pos = 0;
+    pending_inc.decode(m->inc_map_bl, pos);
+    accept_pending();
+  }
+
+  state = STATE_LOCK;
+  //lease_expire = m->lease_expire;
+  messenger->send_message(new MMonOSDMapUpdateAck(osdmap.get_epoch()),   // ack
+                          m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+// Ack for an update prepare.  Not implemented yet: the intended logic is only
+// sketched in the disabled block below, so for now the ack is just freed.
+void OSDMonitor::handle_update_ack(MMonOSDMapUpdateAck *m)
+{
+  /*
+  // right epoch?
+  if (m->get_epoch() != osdmap.get_epoch()) {
+        dout(10) << "map_update_ack from " << m->get_source() 
+                         << " on old epoch " << m->get_epoch() << ", dropping" << endl;
+        delete m;
+        return;
+  }
+  // within time limit?
+  if (g_clock.now() >= lease_expire) {
+        dout(10) << "map_update_ack from " << m->get_source() 
+                         << ", but lease expired, calling election" << endl;
+        state = STATE_SYNC;
+        mon->call_election();
+        return;
+  }
+  assert(m->get_source().is_mon());
+  int from = m->get_source().num();
+  assert(pending_lease_ack.count(from));
+  pending_lease_ack.erase(from);
+
+  if (pending_lease_ack.empty()) {
+        dout(10) << "map_update_ack from " << m->get_source() 
+                         << ", last one" << endl;
+        state = STATE_SYNC;
+        
+        // send lease commit
+        for (map<int>::iterator i = mon->quorum.begin();
+                 i != mon->quorum.end();
+                 i++) {
+          if (i == mon->whoami) continue;
+          messenger->send_message(new MMonOSDMapLeaseCommit(osdmap),
+                                                          MSG_ADDR_MON(*i), mon->monmap->get_inst(*i));
+        }
+  } else {
+        dout(10) << "map_update_ack from " << m->get_source() 
+                         << ", still waiting on " << pending_lease_ack << endl;
+  }
+*/
+  delete m;  // handlers free their message; without this every ack leaked
+}
+
+// Not implemented yet: nothing is done with the commit, but the message must still be freed (it used to leak).
+void OSDMonitor::handle_update_commit(MMonOSDMapUpdateCommit *m)
+{ delete m; }
diff --git a/branches/sage/cephmds2/mon/OSDMonitor.h b/branches/sage/cephmds2/mon/OSDMonitor.h
new file mode 100644
index 0000000000000..9936ecc1ff70e
--- /dev/null
+++ b/branches/sage/cephmds2/mon/OSDMonitor.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __OSDMONITOR_H
+#define __OSDMONITOR_H
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "osd/OSDMap.h"
+
+class Monitor;
+
+class OSDMonitor : public Dispatcher {   // the monitor's osd map service: owns the OSDMap, applies changes, hands maps out
+  Monitor *mon;
+  Messenger *messenger;
+  Mutex &lock;     // a reference to a mutex owned elsewhere (passed to the constructor); taken by fake_osd_failure()
+
+  // osd maps
+public:
+  OSDMap osdmap;
+
+private:
+  map<msg_addr_t, pair<entity_inst_t, epoch_t> > awaiting_map;   // getmap requests parked while the map is at epoch 0: who -> (instance, epoch asked from)
+  
+  void create_initial();
+  bool get_map_bl(epoch_t epoch, bufferlist &bl);
+  bool get_inc_map_bl(epoch_t epoch, bufferlist &bl);
+
+  void save_map();
+  void save_inc_map(OSDMap::Incremental &inc);
+
+  // [leader]
+  OSDMap::Incremental pending_inc;       // changes queued for the next epoch; applied by accept_pending()
+  map<int,utime_t>    down_pending_out;  // osd down -> out
+
+  set<int>            pending_ack; 
+
+  // we are distributed
+  const static int STATE_INIT = 0;     // startup
+  const static int STATE_SYNC = 1;     // sync map copy (readonly)
+  const static int STATE_LOCK = 2;     // [peon] map locked
+  const static int STATE_UPDATING = 3; // [leader] map locked, waiting for peon ack
+
+  int state;
+  utime_t lease_expire;     // when lease expires
+  
+  void init();
+
+  // maps
+  void accept_pending();   // accept pending, new map.
+  void send_waiting();     // send current map to waiters.
+  void send_full(msg_addr_t dest, const entity_inst_t& inst);
+  void send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst);
+  void bcast_latest_mds();
+  void bcast_latest_osd();
+  
+  void update_map();
+  // handlers for requests coming from osds; dispatch() routes to them by message type
+  void handle_osd_boot(class MOSDBoot *m);
+  void handle_osd_in(class MOSDIn *m);
+  void handle_osd_out(class MOSDOut *m);
+  void handle_osd_failure(class MOSDFailure *m);
+  void handle_osd_getmap(class MOSDGetMap *m);
+  // handlers for monitor-to-monitor map replication messages
+  void handle_info(class MMonOSDMapInfo*);
+  void handle_lease(class MMonOSDMapLease*);
+  void handle_lease_ack(class MMonOSDMapLeaseAck*);
+  void handle_update_prepare(class MMonOSDMapUpdatePrepare*);
+  void handle_update_ack(class MMonOSDMapUpdateAck*);
+  void handle_update_commit(class MMonOSDMapUpdateCommit*);
+
+ public:
+  OSDMonitor(Monitor *mn, Messenger *m, Mutex& l) : 
+    mon(mn), messenger(m), lock(l),
+    state(STATE_SYNC) {
+    init();   // runs from the constructor and reads mon->store, so `mn` must already have its store set up
+  }
+
+  void dispatch(Message *m);
+  void tick();  // check state, take actions
+
+  void election_starting();  // abort whatever.
+  void election_finished();  // reinitialize whatever.
+
+  void issue_leases();
+  // testing aids that inject artificial map changes
+  void fake_osd_failure(int osd, bool down);
+  void fake_osdmap_update();
+  void fake_reorg();
+};
+
+#endif
diff --git a/branches/sage/cephmds2/msg/Dispatcher.cc b/branches/sage/cephmds2/msg/Dispatcher.cc
new file mode 100644
index 0000000000000..edee54a2c631f
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Dispatcher.cc
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "Dispatcher.h"
+#include "Messenger.h"
+
+#include "mds/MDS.h"
+
+/*
+int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port)
+{
+  assert(0);
+  //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER);  // on my port!
+}
+*/
diff --git a/branches/sage/cephmds2/msg/Dispatcher.h b/branches/sage/cephmds2/msg/Dispatcher.h
new file mode 100644
index 0000000000000..e6fe8d8da47ce
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Dispatcher.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __DISPATCHER_H
+#define __DISPATCHER_H
+
+#include "Message.h"
+
+class Messenger;
+
+class Dispatcher {   // interface implemented by anything that is handed incoming messages
+ public:
+  virtual ~Dispatcher() { }
+
+  // how i receive messages
+  virtual void dispatch(Message *m) = 0;
+
+  // (the two hooks below have default bodies, so overriding them is optional)
+  // how i deal with transmission failures.  default: just free the message.
+  virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) { delete m; }
+
+  // lookups.  default: asserts, so a dispatcher that can be asked must override this.
+  virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst) { assert(0); return 0; }
+
+  // this is how i send messages (currently disabled)
+  //int send_message(Message *m, msg_addr_t dest, int dest_port);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/msg/FakeMessenger.cc b/branches/sage/cephmds2/msg/FakeMessenger.cc
new file mode 100644
index 0000000000000..01f6301c2618e
--- /dev/null
+++ b/branches/sage/cephmds2/msg/FakeMessenger.cc
@@ -0,0 +1,379 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "Message.h"
+#include "FakeMessenger.h"
+#include "mds/MDS.h"
+
+#include "common/Timer.h"
+
+#include "common/LogType.h"
+#include "common/Logger.h"
+
+#include "config.h"
+
+#undef dout
+#define dout(x) if ((x) <= g_conf.debug_ms) cout << g_clock.now() << " "
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <map>
+#include <cassert>
+#include <iostream>
+
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include <pthread.h>
+
+
+// File-level state shared by every FakeMessenger and by the dispatch thread.
+
+map<msg_addr_t, FakeMessenger*>      directory;   // address -> messenger; filled by the constructor, pruned in fakemessenger_do_loop_2
+hash_map<int, Logger*>        loggers;            // in this file only used under LOG_MESSAGES and in commented-out code
+LogType fakemsg_logtype;
+
+set<msg_addr_t>           shutdown_set;           // addresses queued by shutdown() for deferred removal from directory
+
+Mutex lock;   // taken around accesses to directory and shutdown_set
+Cond  cond;   // signalled to wake fakemessenger_thread
+
+bool pending_timer = false;   // set by the kick callbacks; its only reader in this file is commented out
+
+bool      awake = false;        // true while fakemessenger_thread is not sitting in cond.Wait
+bool      fm_shutdown = false;  // asks fakemessenger_thread to exit
+pthread_t thread_id;            // thread created by fakemessenger_startthread
+
+
+class C_FakeKicker : public Context {   // callback context: flags a pending timer and wakes the dispatch thread
+  void finish(int r) {                  // r is unused
+    dout(18) << "timer kick" << endl;
+    lock.Lock();
+    pending_timer = true;   // written under lock, like the rest of the state shared with the dispatch thread
+    cond.Signal();  // why not
+    lock.Unlock();
+  }
+};
+
+void FakeMessenger::callback_kick()   // flag a pending timer and wake the dispatch thread
+{
+  lock.Lock();
+  pending_timer = true;   // written under lock, like the rest of the state shared with the dispatch thread
+  cond.Signal();  // why not
+  lock.Unlock();
+}
+
+void *fakemessenger_thread(void *ptr)   // pthread entry point: wait on cond, then deliver queued messages; ptr is unused
+{
+  //dout(1) << "thread start, setting timer kicker" << endl;
+  //g_timer.set_messenger_kicker(new C_FakeKicker());
+  //msgr_callback_kicker = new C_FakeKicker();
+  // NOTE(review): nothing is drained before the first Wait, so a Signal sent before this point looks lost -- confirm.
+  lock.Lock();
+  while (1) {
+    dout(20) << "thread waiting" << endl;
+    if (fm_shutdown) break;   // checked before sleeping so a stop request already set is honoured
+    awake = false;            // send_message() only signals while this is false
+    cond.Wait(lock);
+    awake = true;
+    dout(20) << "thread woke up" << endl;
+    if (fm_shutdown) break;
+    // deliver everything that is queued; called with lock held
+    fakemessenger_do_loop_2();
+    // stop once no messenger is registered any more
+    if (directory.empty()) break;
+  }
+  lock.Unlock();
+
+  //cout << "unsetting messenger" << endl;
+  //g_timer.unset_messenger_kicker();
+  //g_timer.unset_messenger();
+  //msgr_callback_kicker = 0;
+
+  dout(1) << "thread finish (i woke up but no messages, bye)" << endl;
+  return 0;
+}
+
+
+void fakemessenger_startthread() {   // spawn fakemessenger_thread into thread_id; pthread_create's result is not checked
+  pthread_create(&thread_id, NULL, fakemessenger_thread, 0);
+}
+
+void fakemessenger_stopthread() {   // set the stop flag, wake the dispatch thread, then join it
+  cout << "fakemessenger_stopthread setting stop flag" << endl;
+  lock.Lock();  
+  fm_shutdown = true;
+  lock.Unlock();
+  cond.Signal();
+  // block until fakemessenger_thread has returned
+  fakemessenger_wait();
+}
+
+void fakemessenger_wait()   // join the thread stored in thread_id; its return value is discarded
+{
+  cout << "fakemessenger_wait waiting" << endl;
+  void *ptr;
+  pthread_join(thread_id, &ptr);
+}
+
+
+
+
+// Synchronous driver: run the delivery loop on the calling thread, then shut the global timer down.
+
+int fakemessenger_do_loop()
+{
+  lock.Lock();
+  fakemessenger_do_loop_2();   // expects lock to be held on entry
+  lock.Unlock();
+
+  g_timer.shutdown();
+  return 0;                    // always 0
+}
+
+
+int fakemessenger_do_loop_2()   // deliver queued messages until a whole pass over directory dispatches none; returns 0
+{
+  // Called with lock held (both callers in this file take it first); it is dropped around callbacks and dispatch.
+  dout(18) << "do_loop begin." << endl;
+
+  while (1) {
+    bool didone = false;   // becomes true once any message is dispatched in this pass
+    
+    dout(18) << "do_loop top" << endl;
+
+    /*// timer?
+    if (pending_timer) {
+      pending_timer = false;
+      dout(5) << "pending timer" << endl;
+      g_timer.execute_pending();
+    }
+    */
+
+    // callbacks (run with the lock released)
+    lock.Unlock();
+    Messenger::do_callbacks();
+    lock.Lock();
+
+    // messages: take at most one message from each registered messenger per pass
+    map<msg_addr_t, FakeMessenger*>::iterator it = directory.begin();
+    while (it != directory.end()) {
+
+      dout(18) << "messenger " << it->second << " at " << MSG_ADDR_NICE(it->first) << " has " << it->second->num_incoming() << " queued" << endl;
+
+      FakeMessenger *mgr = it->second;
+
+      if (!mgr->is_ready()) {
+        dout(18) << "messenger " << it->second << " at " << MSG_ADDR_NICE(it->first) << " has no dispatcher, skipping" << endl;
+        it++;
+        continue;
+      }
+
+      Message *m = mgr->get_message();
+      it++;   // step past this entry now; the lock is released below while dispatching
+      
+      if (m) {
+        //dout(18) << "got " << m << endl;
+        dout(1) << "---- '" << m->get_type_name() 
+                << "' from " << MSG_ADDR_NICE(m->get_source()) // << ':' << m->get_source_port() 
+                << " to " << MSG_ADDR_NICE(m->get_dest()) //<< ':' << m->get_dest_port() 
+                << " ---- " << m 
+                << endl;
+        
+        if (g_conf.fakemessenger_serialize) {
+          // encode: replace m by a copy rebuilt from its envelope + payload
+          if (m->empty_payload()) 
+            m->encode_payload();
+          msg_envelope_t env = m->get_envelope();
+          bufferlist bl;
+          bl.claim( m->get_payload() );
+          //bl.c_str();   // condense into 1 buffer
+
+          delete m;
+          
+          // decode
+          m = decode_message(env, bl);
+          assert(m);
+        } 
+        
+        didone = true;
+
+        lock.Unlock();
+        mgr->dispatch(m);
+        lock.Lock();
+      }
+    }
+    
+    // deal with shutdowns.. delayed to avoid concurrent directory modification
+    if (!shutdown_set.empty()) {
+      for (set<msg_addr_t>::iterator it = shutdown_set.begin();
+           it != shutdown_set.end();
+           it++) {
+        dout(7) << "fakemessenger: removing " << *it << " from directory" << endl;
+        assert(directory.count(*it));
+        directory.erase(*it);
+        if (directory.empty()) {
+          dout(1) << "fakemessenger: last shutdown" << endl;
+          ::fm_shutdown = true;   // the last messenger is gone: tell fakemessenger_thread to exit
+        }
+      }
+      shutdown_set.clear();
+    }
+    
+    if (!didone)
+      break;
+  }
+
+
+  dout(18) << "do_loop end (no more messages)." << endl;
+  //lock.Unlock();
+  return 0;
+}
+
+
+FakeMessenger::FakeMessenger(msg_addr_t me)  : Messenger(me)   // register this messenger under address me
+{
+  myaddr = me;
+  qlen = 0;     // initialise before publishing: fakemessenger_do_loop_2 reads it via num_incoming()
+  logger = 0;   // no Logger is created; the setup below is commented out
+  lock.Lock();
+  directory[ myaddr ] = this;   // from here on other threads can find (and queue to) this object
+  lock.Unlock();
+
+  cout << "fakemessenger " << myaddr << " messenger is " << this << endl;
+
+  //g_timer.set_messenger(this);
+
+  /*
+  string name;
+  name = "m.";
+  name += MSG_ADDR_TYPE(myaddr);
+  int w = MSG_ADDR_NUM(myaddr);
+  if (w >= 1000) name += ('0' + ((w/1000)%10));
+  if (w >= 100) name += ('0' + ((w/100)%10));
+  if (w >= 10) name += ('0' + ((w/10)%10));
+  name += ('0' + ((w/1)%10));
+
+  loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype);
+  */
+}
+
+FakeMessenger::~FakeMessenger()
+{
+  // nothing is released here; removal from directory is requested through shutdown()
+}
+
+
+int FakeMessenger::shutdown()   // queue this address for removal; the erase itself happens in fakemessenger_do_loop_2
+{
+  //cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl;
+  lock.Lock();
+  assert(directory.count(myaddr) == 1);   // must still be registered
+  shutdown_set.insert(myaddr);
+  
+  /*
+  directory.erase(myaddr);
+  if (directory.empty()) {
+    dout(1) << "fakemessenger: last shutdown" << endl;
+    ::fm_shutdown = true;
+    cond.Signal();  // why not
+  } 
+  */
+
+  /*
+  if (loggers[myaddr]) {
+    delete loggers[myaddr];
+    loggers.erase(myaddr);
+  }
+  */
+
+  lock.Unlock();
+  return 0;   // always 0
+}
+
+/*
+void FakeMessenger::trigger_timer(Timer *t) 
+{
+  // note timer to call
+  pending_timer = t;
+
+  // wake up thread?
+  cond.Signal();  // why not
+}
+*/
+
+int FakeMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)   // stamp m and queue it on dest's messenger; returns 0
+{
+  m->set_source(myaddr, fromport);
+  m->set_dest(dest, port);
+  //m->set_lamport_send_stamp( get_lamport() );
+
+  entity_inst_t blank;
+  m->set_source_inst(blank);
+
+  lock.Lock();
+
+  // deliver
+  try {
+#ifdef LOG_MESSAGES
+    // stats
+    loggers[myaddr]->inc("+send",1);
+    loggers[dest]->inc("-recv",1);
+
+    char s[20];
+    sprintf(s,"+%s", m->get_type_name());
+    loggers[myaddr]->inc(s);
+    sprintf(s,"-%s", m->get_type_name());
+    loggers[dest]->inc(s);
+#endif
+
+    // queue -- guarded lookup: a bare directory[dest] would insert a NULL entry for an unknown dest
+    FakeMessenger *dm = directory.count(dest) ? directory[dest] : 0;
+    if (!dm) {
+      dout(1) << "** destination " << MSG_ADDR_NICE(dest) << " (" << dest << ") dne" << endl;
+      assert(dm);
+    }
+    dm->queue_incoming(m);
+
+    dout(1) << "--> " << myaddr << " sending " << m << " '" << m->get_type_name() << "'"
+            << " to " << MSG_ADDR_NICE(dest) 
+            << endl;//" m " << dm << " has " << dm->num_incoming() << " queued" << endl;
+    
+  }
+  catch (...) {
+    cout << "no destination " << dest << endl;
+    assert(0);
+  }
+
+
+  // wake up loop? (only needed while the dispatch thread is waiting, i.e. awake == false)
+  if (!awake) {
+    dout(10) << "waking up fakemessenger thread" << endl; 
+    cond.Signal();
+    lock.Unlock();
+  } else
+    lock.Unlock();
+  
+  return 0;
+}
+
+
diff --git a/branches/sage/cephmds2/msg/FakeMessenger.h b/branches/sage/cephmds2/msg/FakeMessenger.h
new file mode 100644
index 0000000000000..51bec779c4366
--- /dev/null
+++ b/branches/sage/cephmds2/msg/FakeMessenger.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __FAKEMESSENGER_H
+#define __FAKEMESSENGER_H
+
+#include "Messenger.h"
+#include "Dispatcher.h"
+
+#include <list>
+#include <map>
+
+class Timer;
+
+class FakeMessenger : public Messenger {   // Messenger whose messages are queued directly on the destination object
+ protected:
+  msg_addr_t myaddr;      // address passed to the constructor
+
+  class Logger *logger;   // not assigned by any inline code in this header
+
+  int    qlen;            // number of entries in incoming, maintained by hand
+  list<Message*>       incoming;        // incoming queue
+
+ public:
+  FakeMessenger(msg_addr_t me);
+  ~FakeMessenger();
+
+  virtual int shutdown();
+
+  // msg interface
+  virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+  
+  // events
+  //virtual void trigger_timer(Timer *t);
+
+  int get_dispatch_queue_len() { return qlen; }
+
+  void callback_kick();
+
+  // -- incoming queue --
+  // (used by send_message() and fakemessenger_do_loop_2() in FakeMessenger.cc)
+  Message *get_message() {   // pop the oldest queued message, or NULL when the queue is empty
+    if (!incoming.empty()) {
+      Message *m = incoming.front();
+      incoming.pop_front();
+      qlen--;
+      return m;
+    }
+    return NULL;
+  }
+  bool queue_incoming(Message *m) {   // append m to the queue; always returns true
+    incoming.push_back(m);
+    qlen++;
+    return true;
+  }
+  int num_incoming() {   // queue length taken from the qlen counter rather than incoming.size()
+    //return incoming.size();
+    return qlen;
+  }
+
+};
+
+int fakemessenger_do_loop();        // lock, run the delivery loop once, then shut the timer down
+int fakemessenger_do_loop_2();      // delivery loop proper
+void fakemessenger_startthread();   // start the background dispatch thread
+void fakemessenger_stopthread();    // flag shutdown, wake the thread, and join it
+void fakemessenger_wait();          // join the dispatch thread
+
+#endif
diff --git a/branches/sage/cephmds2/msg/HostMonitor.cc b/branches/sage/cephmds2/msg/HostMonitor.cc
new file mode 100644
index 0000000000000..33bef09565df2
--- /dev/null
+++ b/branches/sage/cephmds2/msg/HostMonitor.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "HostMonitor.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MFailure.h"
+#include "messages/MFailureAck.h"
+
+#include "common/Timer.h"
+#include "common/Clock.h"
+
+#define DBL  10
+
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug) cout << whoami << " hostmon: "
+
+
+// timer contexts
+
+class C_HM_InitiateHeartbeat : public Context {   // timer event that starts a heartbeat round on hm
+  HostMonitor *hm;
+public:
+  C_HM_InitiateHeartbeat(HostMonitor *hm) {
+     this->hm = hm;
+  }
+  void finish(int r) {   // r is unused
+    //cout << "HEARTBEAT" << endl;
+    hm->pending_events.erase(this);   // no longer pending, so HostMonitor::shutdown() will not touch it
+    hm->initiate_heartbeat();
+  }
+};
+
+class C_HM_CheckHeartbeat : public Context {   // timer event that evaluates the pings still in flight on hm
+  HostMonitor *hm;
+public:
+  C_HM_CheckHeartbeat(HostMonitor *hm) {
+    this->hm = hm;
+  }
+  void finish(int r) {   // r is unused
+    //cout << "CHECK" << endl;
+    hm->pending_events.erase(this);   // no longer pending, so HostMonitor::shutdown() will not touch it
+    hm->check_heartbeat();
+  }
+};
+
+
+
+// startup/shutdown
+
+void HostMonitor::init()   // set the hard-coded tuning values and schedule the first heartbeat
+{
+  dout(DBL) << "init" << endl;
+
+  // hack params for now (intervals are passed to g_timer.add_event_after; presumably seconds -- TODO confirm)
+  heartbeat_interval = 10;
+  max_ping_time = 2;
+  max_heartbeat_misses = 3;
+  notify_retry_interval = 10;
+  
+  // schedule first hb
+  schedule_heartbeat();
+}
+
+
+void HostMonitor::shutdown()   // cancel and free every timer event that is still pending
+{
+  // cancel any events
+  for (set<Context*>::iterator it = pending_events.begin();
+       it != pending_events.end();
+       it++) {
+    g_timer.cancel_event(*it);
+    delete *it;   // NOTE(review): assumes cancel_event() does not free the Context itself -- confirm
+  }
+  pending_events.clear();
+}
+
+
+// Schedule the next heartbeat: arm a timer event heartbeat_interval from now and track it as pending.
+
+void HostMonitor::schedule_heartbeat()
+{
+  dout(DBL) << "schedule_heartbeat" << endl;
+  Context *e = new C_HM_InitiateHeartbeat(this);
+  pending_events.insert(e);   // tracked so shutdown() can cancel it
+  g_timer.add_event_after(heartbeat_interval, e);
+}
+
+
+// Take note of a live host: stamp last_heard_from with the current time; unmonitored hosts are ignored.
+
+void HostMonitor::host_is_alive(msg_addr_t host)
+{
+  if (hosts.count(host))
+    status[host].last_heard_from = g_clock.gettime();
+}
+
+
+// Do one heartbeat round: ping quiet hosts, arm the result check, and schedule the next round.
+
+void HostMonitor::initiate_heartbeat()
+{
+  time_t now = g_clock.gettime();
+  
+  // send out pings (pings still recorded from an earlier round are forgotten first)
+  inflight_pings.clear();
+  for (set<msg_addr_t>::iterator it = hosts.begin();
+       it != hosts.end();
+       it++) {
+    // have i heard from them recently?
+    if (now - status[*it].last_heard_from < heartbeat_interval) {
+      dout(DBL) << "skipping " << *it << ", i heard from them recently" << endl;
+    } else {
+      dout(DBL) << "pinging " << *it << endl;
+      status[*it].last_pinged = now;
+      inflight_pings.insert(*it);   // removed again by handle_ping_ack()
+
+      messenger->send_message(new MPing(1), *it, 0);
+    }
+  }
+  
+  // set timer to check results
+  Context *e = new C_HM_CheckHeartbeat(this);
+  pending_events.insert(e);
+  g_timer.add_event_after(max_ping_time, e);
+  dout(10) << "scheduled check " << e << endl;
+
+  schedule_heartbeat();  // schedule next heartbeat
+}
+
+
+// Check results: every ping still in flight counts as a miss; enough misses mark the host as failed.
+
+void HostMonitor::check_heartbeat()
+{
+  dout(DBL) << "check_heartbeat()" << endl;
+
+  // check inflight pings
+  for (set<msg_addr_t>::iterator it = inflight_pings.begin();
+       it != inflight_pings.end();
+       it++) {
+    status[*it].num_heartbeats_missed++;
+
+    dout(DBL) << "no response from " << *it << " for " << status[*it].num_heartbeats_missed << " beats" << endl;
+    
+    if (status[*it].num_heartbeats_missed >= max_heartbeat_misses) {
+      if (acked_failures.count(*it)) {
+        dout(DBL) << *it << " is already failed" << endl;
+      } else {
+        if (unacked_failures.count(*it)) {
+          dout(DBL) << *it << " is already failed, but unacked, sending another failure message" << endl;
+        } else {
+          dout(DBL) << "failing " << *it << endl;
+          unacked_failures.insert(*it);
+        }
+        
+        /*if (false)   // do this in NewMessenger for now!  FIXME
+        for (set<msg_addr_t>::iterator nit = notify.begin();
+             nit != notify.end();
+             nit++) {
+          messenger->send_message(new MFailure(*it, messenger->get_inst(*it)),
+                                  *nit, notify_port, 0);
+        }
+        */
+      }
+    }
+  }
+ 
+  // forget about the pings.
+  inflight_pings.clear();
+}
+
+
+// Incoming messages: route by type. Any type other than the two cases below is left untouched (not deleted).
+
+void HostMonitor::proc_message(Message *m)
+{
+  switch (m->get_type()) {
+
+  case MSG_PING_ACK:
+    handle_ping_ack((MPingAck*)m);
+    break;
+
+  case MSG_FAILURE_ACK:
+    handle_failure_ack((MFailureAck*)m);
+    break;
+
+  }
+}
+
+void HostMonitor::handle_ping_ack(MPingAck *m)   // a ping was answered: reset the sender's miss counter; deletes m
+{
+  msg_addr_t from = m->get_source();
+
+  dout(DBL) << "ping ack from " << from << endl;
+  status[from].last_pinged = g_clock.gettime();   // NOTE(review): refreshes last_pinged, not last_heard_from -- confirm intended
+  status[from].num_heartbeats_missed = 0;
+  inflight_pings.erase(from);                     // so check_heartbeat() will not count a miss
+
+  delete m;
+}
+
+void HostMonitor::handle_failure_ack(MFailureAck *m)   // move the acknowledged host from unacked to acked; deletes m
+{
+
+  // FIXME: this doesn't handle failed -> alive transitions gracefully at all..
+
+  // the higher-up's acknowledged our failure notification, we can stop resending it.
+  msg_addr_t failed = m->get_failed();
+  dout(DBL) << "handle_failure_ack " << failed << endl;
+  unacked_failures.erase(failed);
+  acked_failures.insert(failed);
+
+  delete m;
+}
+
+
diff --git a/branches/sage/cephmds2/msg/HostMonitor.h b/branches/sage/cephmds2/msg/HostMonitor.h
new file mode 100644
index 0000000000000..20ef24eff8daf
--- /dev/null
+++ b/branches/sage/cephmds2/msg/HostMonitor.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __HOSTMONITOR_H
+#define __HOSTMONITOR_H
+
+#include <time.h>
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/Context.h"
+#include "msg/Message.h"
+
+class Message;
+class Messenger;
+
+typedef struct {                 // per-host bookkeeping kept in HostMonitor::status
+  time_t last_heard_from;        // stamped by host_is_alive()
+  time_t last_pinged;            // stamped when a ping is sent and again when its ack arrives
+  int    num_heartbeats_missed;  // incremented by check_heartbeat(), reset by handle_ping_ack()
+} monitor_rec_t;
+
+class HostMonitor {   // pings a set of hosts on a timer and tracks which ones stopped answering
+  Messenger *messenger;   // used to send the pings
+  string whoami;          // prefix printed by this file's dout macro
+
+  // hosts i monitor
+  set<msg_addr_t>  hosts;
+
+  // who i tell when they fail
+  set<msg_addr_t>  notify;
+  int              notify_port;
+
+  // their status
+  map<msg_addr_t,monitor_rec_t>  status;
+
+  set<msg_addr_t>  inflight_pings;    // pings we sent that haven't replied yet
+
+  set<msg_addr_t>  unacked_failures;  // failed hosts that haven't been acked yet.
+  set<msg_addr_t>  acked_failures;    // these failures have been acked.
+
+  float heartbeat_interval;    // how often to do a heartbeat
+  float max_ping_time;         // how long before it's a miss
+  int   max_heartbeat_misses;  // how many misses before i tell
+  float notify_retry_interval; // how often to retry failure notification
+
+ public:
+  set<Context*>  pending_events;   // timer events not yet fired; public because the timer contexts erase themselves
+
+ private:
+  void schedule_heartbeat();
+
+ public:
+  HostMonitor(Messenger *m, string& whoami) {   // the four tuning members above are only set later, in init()
+    this->messenger = m;
+    this->whoami = whoami;
+    notify_port = 0;
+  }
+  set<msg_addr_t>& get_hosts() { return hosts; }     // mutable reference: callers edit the set directly
+  set<msg_addr_t>& get_notify() { return notify; }   // mutable reference: callers edit the set directly
+  void set_notify_port(int p) { notify_port = p; }
+
+  void remove_host(msg_addr_t h) {   // forget h everywhere except inflight_pings
+    hosts.erase(h);
+    status.erase(h);
+    unacked_failures.erase(h);
+    acked_failures.erase(h);
+  }
+
+  void init();
+  void shutdown();
+  
+  void host_is_alive(msg_addr_t who);
+
+  void proc_message(Message *m);
+  void handle_ping_ack(class MPingAck *m);
+  void handle_failure_ack(class MFailureAck *m);
+
+  void initiate_heartbeat();
+  void check_heartbeat();
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/msg/MPIMessenger.cc b/branches/sage/cephmds2/msg/MPIMessenger.cc
new file mode 100644
index 0000000000000..3dfcd3224a4b9
--- /dev/null
+++ b/branches/sage/cephmds2/msg/MPIMessenger.cc
@@ -0,0 +1,608 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "config.h"
+#include "include/error.h"
+
+#include "common/Timer.h"
+#include "common/Mutex.h"
+
+#include "MPIMessenger.h"
+#include "Message.h"
+
+#include <iostream>
+#include <cassert>
+using namespace std;
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include <unistd.h>
+#include <mpi.h>
+
+/*
+ * We make a directory, so that we can have multiple Messengers in the
+ * same process (rank).  This is useful for benchmarking and creating lots of 
+ * simulated clients, e.g.
+ */
+
+hash_map<int, MPIMessenger*>  directory;                 // messengers living in this process, keyed by address
+list<Message*>                outgoing, incoming;        // outgoing: waiting for the loop thread; incoming: local deliveries
+list<MPI_Request*>            unfinished_sends;          // heap-allocated requests of Isends not yet reaped
+map<MPI_Request*, Message*>   unfinished_send_message;   // last request of a message -> message to delete once it completes
+
+/* this process */
+int mpi_world;             // communicator size, set in mpimessenger_init
+int mpi_rank;              // this process's rank, set in mpimessenger_init
+bool mpi_done = false;     // set this flag to stop the event loop
+
+
+#define FUNNEL_MPI         // if we want to funnel mpi through a single thread
+#define TAG_UNSOLICITED 0
+#define DBLVL 18
+
+// the key used to fetch the tag for the current thread (only read by get_thread_tag, which is compiled out).
+pthread_key_t tag_key;
+pthread_t thread_id = 0;   // thread id of the event loop.  init value == nobody
+
+Mutex sender_lock;      // taken by mpi_reap_sends and mpimessenger_kick_loop around unfinished_sends
+Mutex out_queue_lock;   // taken around the outgoing list
+
+bool pending_timer;     // read by mpimessenger_loop; nothing in this file ever assigns it
+
+
+// our lock for any common data; it's okay to have only the one global mutex
+// because our common data isn't a whole lot.
+//static pthread_mutex_t mutex;
+
+// the number of distinct threads we've seen so far; used to generate
+// a unique tag for each thread.
+//static int nthreads = 10;
+
+//#define TAG_UNSOLICITED 0
+
+// debug
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug) cout << "[MPI " << mpi_rank << "/" << mpi_world << " " << getpid() << "." << pthread_self() << "] "
+
+
+
+/*****
+ * MPI global methods for process-wide startup, shutdown.
+ */
+
+int mpimessenger_init(int& argc, char**& argv)   // initialise MPI, record world size and rank; returns this rank
+{
+  MPI_Init(&argc, &argv);
+  
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+  char hostname[100] = {0};                   // zero-filled: gethostname need not terminate a truncated name
+  gethostname(hostname, sizeof(hostname)-1);  // last byte stays 0, so the name is always a valid C string
+  int pid = getpid();
+
+  dout(12) << "init: i am " << hostname << " pid " << pid << endl;
+  
+  assert(mpi_world > g_conf.num_osd+g_conf.num_mds);   // at least one rank beyond the osd and mds count is required
+
+  return mpi_rank;
+}
+
+int mpimessenger_shutdown()   // barrier across MPI_COMM_WORLD, then MPI_Finalize; returns 0
+{
+  dout(5) << "mpimessenger_shutdown barrier waiting for all to finish" << endl;
+  MPI_Barrier (MPI_COMM_WORLD);
+  dout(1) << "mpimessenger_shutdown all done, MPI_Finalize()" << endl;
+  MPI_Finalize();
+  return 0;
+}
+
+int mpimessenger_world()   // world size recorded by mpimessenger_init
+{
+  return mpi_world;
+}
+
+
+
+/***
+ * internal send/recv
+ */
+
+
+/*
+ * get fresh MPI_Request* (on heap) for a new async MPI_Isend
+ */
+
+MPI_Request *mpi_prep_send_req() {   // allocate a request and append it to unfinished_sends; no lock is taken here
+  MPI_Request *req = new MPI_Request;
+  unfinished_sends.push_back(req);   // freed later by mpi_reap_sends
+  dout(DBLVL) << "prep_send_req " << req << endl;
+  return req;
+}
+
+
+/*
+ * clean up MPI_Request*'s for Isends that have completed.
+ * also, hose any associated Message*'s for Messages that are completely sent.
+ *
+ * if wait=true, block and wait for sends to finish.
+ */
+
+void mpi_reap_sends(bool wait=false) {   // free completed send requests front-to-back, plus any message attached to them
+  sender_lock.Lock();
+
+  list<MPI_Request*>::iterator it = unfinished_sends.begin();
+  while (it != unfinished_sends.end()) {
+    MPI_Status status;
+    int flag;
+    
+    if (wait) {
+      MPI_Wait(*it, &status);
+    } else {
+      MPI_Test(*it, &flag, &status);
+      if (!flag) break;   // not finished yet
+    }
+
+    dout(DBLVL) << "send " << *it << " completed" << endl;
+
+    if (unfinished_send_message.count(*it)) {   // this was the last request of a message: the message can go too
+      dout(DBLVL) << "send message " << unfinished_send_message[*it] << " completed" << endl;
+      delete unfinished_send_message[*it];
+      unfinished_send_message.erase(*it);
+    }
+
+    delete *it;
+    it++;                           // advance first, then drop the element we just handled
+    unfinished_sends.pop_front();
+  }
+
+  dout(DBLVL) << "reap has " << unfinished_sends.size() << " Isends outstanding, " << unfinished_send_message.size() << " messages" << endl;
+
+  sender_lock.Unlock();
+}
+
+
+void mpi_finish_sends() {   // wait for every outstanding send and free it
+  mpi_reap_sends(true);
+}
+
+
+/*
+ * recv a Message*
+ */
+Message *mpi_recv(int tag)   // receive one envelope plus its payload chunks; returns 0 for a type-0 "kick" envelope
+{
+  // envelope
+  dout(DBLVL) << "mpi_recv waiting for message tag " << tag  << endl;
+
+  MPI_Status status;
+  msg_envelope_t env;
+  // NOTE(review): the receive is an argument of ASSERT(); confirm ASSERT always evaluates its argument.
+  ASSERT(MPI_Recv((void*)&env,
+                  sizeof(env),
+                  MPI_CHAR, 
+                  MPI_ANY_SOURCE,// status.MPI_SOURCE,//MPI_ANY_SOURCE,
+                  tag,
+                  MPI_COMM_WORLD,
+                  &status/*,
+                           &recv_env_req*/) == MPI_SUCCESS);
+  assert(status.count == MSG_ENVELOPE_LEN);   // NOTE(review): .count is not among the standard's public MPI_Status fields -- confirm
+
+  if (env.type == 0) {
+    dout(DBLVL) << "mpi_recv got type 0 message, kicked!" << endl;
+    return 0;
+  }
+
+  dout(DBLVL) << "mpi_recv got envelope " << status.count << ", type=" << env.type << " src " << env.source << " dst " << env.dest << " nchunks=" << env.nchunks << " from " << status.MPI_SOURCE << endl;
+
+  // payload: nchunks fragments, all taken from the rank that sent the envelope
+  bufferlist blist;
+  for (int i=0; i<env.nchunks; i++) {
+    MPI_Status fragstatus;
+    ASSERT(MPI_Probe(status.MPI_SOURCE,
+                     tag,
+                     MPI_COMM_WORLD,
+                     &fragstatus) == MPI_SUCCESS);
+
+    bufferptr bp = new buffer(fragstatus.count);   // sized from the probe result
+    
+    ASSERT(MPI_Recv(bp.c_str(),
+                    fragstatus.count,
+                    MPI_CHAR, 
+                    status.MPI_SOURCE,
+                    tag,
+                    MPI_COMM_WORLD,
+                    &fragstatus) == MPI_SUCCESS);
+
+    blist.push_back(bp);
+
+    dout(DBLVL) << "mpi_recv got frag " << i << " of " << env.nchunks << " len " << fragstatus.count << endl;
+  }
+  
+  dout(DBLVL) << "mpi_recv got " << blist.length() << " byte message tag " << status.MPI_TAG << endl;
+
+  // unmarshall message
+  Message *m = decode_message(env, blist);
+  return m;
+}
+
+
+/*
+ * send a Message* over the wire.  ** do not block **.
+ */
+int mpi_send(Message *m, int tag)   // start async sends of m's envelope and payload chunks; m is freed later by mpi_reap_sends; returns 0
+{
+  int rank = MPI_DEST_TO_RANK(m->get_dest(), mpi_world);
+
+  // local?
+  if (rank == mpi_rank) {      
+    dout(DBLVL) << "queuing local delivery" << endl;
+    incoming.push_back(m);
+    return 0;
+  } 
+
+  // marshall
+  if (m->empty_payload())
+    m->encode_payload();
+  msg_envelope_t *env = &m->get_envelope();
+  env->nchunks = m->get_payload().buffers().size();
+
+  dout(7) << "sending " << *m << " to " << MSG_ADDR_NICE(env->dest) << " (rank " << rank << ")" << endl;
+
+  // Hold sender_lock even when FUNNEL_MPI is defined: mpimessenger_kick_loop() appends to unfinished_sends
+  // from other threads, and back() below has to be the last request that belongs to this message.
+  sender_lock.Lock();
+
+  // send envelope
+  ASSERT(MPI_Isend((void*)env,
+                   sizeof(*env),
+                   MPI_CHAR,
+                   rank,
+                   tag,
+                   MPI_COMM_WORLD,
+                   mpi_prep_send_req()) == MPI_SUCCESS);
+
+  // payload
+  int i = 0;
+  for (list<bufferptr>::iterator it = m->get_payload().buffers().begin();
+       it != m->get_payload().buffers().end();
+       it++) {
+    dout(DBLVL) << "mpi_sending frag " << i << " len " << (*it).length() << endl;
+    //MPI_Request *req = new MPI_Request;
+    ASSERT(MPI_Isend((void*)(*it).c_str(),
+                     (*it).length(),
+                     MPI_CHAR,
+                     rank,
+                     tag,
+                     MPI_COMM_WORLD,
+                     mpi_prep_send_req()) == MPI_SUCCESS);
+    i++;
+  }
+
+  // attach message to last send, so we can free it later
+  MPI_Request *req = unfinished_sends.back();
+  unfinished_send_message[req] = m;
+  
+  dout(DBLVL) << "mpi_send done, attached message to Isend " << req << endl;
+
+  sender_lock.Unlock();
+
+  // 0 == queued for local delivery or handed to MPI
+  return 0;
+}
+
+
+
+// get the tag for this thread
+
+#ifndef FUNNEL_MPI
+static int get_thread_tag()   // per-thread MPI tag, assigned on first use; compiled out because FUNNEL_MPI is defined above
+{
+  int tag = (int)pthread_getspecific(tag_key);
+  
+  if (tag == 0) {
+    // first time this thread has performed MPI messaging
+    // (mutex and nthreads used below are only declared in commented-out lines earlier in this file)
+    if (pthread_mutex_lock(&mutex) < 0)
+      SYSERROR();
+    
+    tag = ++nthreads;
+    
+    if (pthread_mutex_unlock(&mutex) < 0)
+      SYSERROR();
+    
+    if (pthread_setspecific(tag_key, (void*)tag) < 0)
+      SYSERROR();
+  }
+  
+  return tag;
+}
+#endif
+
+
+
+// Event loop thread body: reap finished sends, flush the outgoing queue, then receive and dispatch one message.
+
+void* mpimessenger_loop(void*)
+{
+  dout(5) << "mpimessenger_loop start pid " << getpid() << endl;
+
+  while (1) {
+
+    // outgoing: free requests (and messages) of sends that have completed
+    mpi_reap_sends();
+    
+#ifdef FUNNEL_MPI
+    // check outgoing queue
+    out_queue_lock.Lock();
+    if (outgoing.size()) {
+      dout(10) << outgoing.size() << " outgoing messages" << endl;
+      for (list<Message*>::iterator it = outgoing.begin();
+           it != outgoing.end();
+           it++) {
+        mpi_send(*it, TAG_UNSOLICITED);
+      }
+    }
+    outgoing.clear();
+    out_queue_lock.Unlock();
+#endif
+
+
+    // timer events? (pending_timer is not cleared here)
+    if (pending_timer) {
+      dout(DBLVL) << "pending timer" << endl;
+      g_timer.execute_pending();
+    }
+
+    // done? (outgoing is read here without out_queue_lock)
+    if (mpi_done &&
+        incoming.empty() &&
+        outgoing.empty() &&
+        !pending_timer) break;
+
+
+    // incoming: local deliveries first, otherwise receive from MPI
+    Message *m = 0;
+
+    if (incoming.size()) {
+      dout(12) << "loop pulling message off incoming" << endl;
+      m = incoming.front();
+      incoming.pop_front();
+    } 
+    else {
+      // check mpi
+      dout(12) << "loop waiting for incoming messages" << endl;
+
+      // get message (0 means a kick envelope arrived: just go around the loop again)
+      m = mpi_recv(TAG_UNSOLICITED);
+    }
+
+    // dispatch?
+    if (m) {
+      int dest = m->get_dest();
+      if (directory.count(dest)) {
+        Messenger *who = directory[ dest ];
+        
+        dout(4) << "---- '" << m->get_type_name() << 
+          "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() <<
+          " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " 
+                << m 
+                << endl;
+        
+        who->dispatch(m);
+      } else {
+        dout (1) << "---- i don't know who " << dest << " is." << endl;
+        assert(0);
+        break;
+      }
+    }
+
+  }
+
+  dout(5) << "finishing async sends" << endl;
+  mpi_finish_sends();
+
+  g_timer.shutdown();
+
+  dout(5) << "mpimessenger_loop exiting loop" << endl;
+  return 0;
+}
+
+
+// start/stop mpi receiver thread (for unsolicited messages)
+int mpimessenger_start()   // spawn mpimessenger_loop into thread_id; always returns 0
+{
+  dout(5) << "starting thread" << endl;
+  
+  // start a thread (pthread_create's result is not checked)
+  pthread_create(&thread_id, 
+                 NULL, 
+                 mpimessenger_loop, 
+                 0);
+  return 0;
+}
+
+
+/*
+ * kick and wake up _loop (to pick up new outgoing message, or quit)
+ */
+
+MPI_Request    kick_req;   // not referenced anywhere else in this file
+msg_envelope_t kick_env;   // envelope reused for every kick; only .type is ever assigned
+
+void mpimessenger_kick_loop()   // wake mpimessenger_loop by sending this rank a type-0 envelope
+{
+  // if we're same thread as the loop, no kicking necessary
+  if (pthread_self() == thread_id) return;   
+
+  kick_env.type = 0;   // mpi_recv() returns 0 for type 0 instead of decoding a message
+
+  sender_lock.Lock();
+  ASSERT(MPI_Isend(&kick_env,               // non-blocking send; its request is queued for mpi_reap_sends()
+                   sizeof(kick_env),
+                   MPI_CHAR,
+                   mpi_rank,
+                   TAG_UNSOLICITED,
+                   MPI_COMM_WORLD,
+                   mpi_prep_send_req()) == MPI_SUCCESS);
+  sender_lock.Unlock();
+}
+
+
+// stop thread
+
+void mpimessenger_stop()   // ask the event loop to finish, kick it, and join the thread; must be called only once
+{
+  dout(5) << "mpimessenger_stop stopping thread" << endl;
+
+  if (mpi_done) {
+    dout(1) << "mpimessenger_stop called, but already done!" << endl;
+    assert(!mpi_done);   // always fails here: a second stop is treated as a programming error
+  }
+
+  // set finish flag
+  mpi_done = true;
+  mpimessenger_kick_loop();
+  
+  // wait for thread to stop
+  mpimessenger_wait();
+}
+
+
+// wait for thread to finish
+
+void mpimessenger_wait()   // join the event loop thread; its return value is discarded
+{
+  void *returnval;
+  dout(10) << "mpimessenger_wait waiting for thread to finished." << endl;
+  pthread_join(thread_id, &returnval);
+  dout(10) << "mpimessenger_wait thread finished." << endl;
+}
+
+
+
+
+/***********
+ * MPIMessenger class implementation
+ */
+
+class C_MPIKicker : public Context {   // callback registered with g_timer that kicks the event loop
+  void finish(int r) {                 // r is unused
+    dout(DBLVL) << "timer kick" << endl;
+    mpimessenger_kick_loop();
+  }
+};
+
+MPIMessenger::MPIMessenger(msg_addr_t myaddr) : Messenger(myaddr)   // register under myaddr and install a timer kicker
+{
+  // my address
+  this->myaddr = myaddr;
+
+  // register myself in the messenger directory (no lock is taken around directory here)
+  directory[myaddr] = this;
+
+  // register to execute timer events (a fresh kicker is handed to g_timer by every instance)
+  g_timer.set_messenger_kicker(new C_MPIKicker());
+
+  // logger
+  /*
+  string name;
+  name = "m.";
+  name += MSG_ADDR_TYPE(whoami);
+  int w = MSG_ADDR_NUM(whoami);
+  if (w >= 1000) name += ('0' + ((w/1000)%10));
+  if (w >= 100) name += ('0' + ((w/100)%10));
+  if (w >= 10) name += ('0' + ((w/10)%10));
+  name += ('0' + ((w/1)%10));
+
+  logger = new Logger(name, (LogType*)&mpimsg_logtype);
+  loggers[ whoami ] = logger;
+  */
+}
+
+MPIMessenger::~MPIMessenger()   // nothing to release; directory removal happens in shutdown()
+{
+  //delete logger;
+}
+
+
+int MPIMessenger::shutdown()   // deregister; when this was the last messenger, also stop the event loop; returns 0
+{
+  // remove me from the directory
+  directory.erase(myaddr);
+
+  // no more timer events (done on every shutdown, even when other messengers remain registered)
+  g_timer.unset_messenger_kicker();
+
+  // last one?
+  if (directory.empty()) {
+    dout(10) << "shutdown last mpimessenger on rank " << mpi_rank << " shut down" << endl;
+    pthread_t whoami = pthread_self();
+
+    dout(15) << "whoami = " << whoami << ", thread = " << thread_id << endl;
+    if (whoami == thread_id) {
+      // i am the event loop thread, just set flag!
+      dout(15) << "  set mpi_done=true" << endl;
+      mpi_done = true;
+    } else {
+      // i am a different thread, tell the event loop to stop.
+      dout(15) << "  calling mpimessenger_stop()" << endl;
+      mpimessenger_stop();   // kicks the loop and joins it, so this call blocks until the loop exits
+    }
+  } else {
+    dout(10) << "shutdown still " << directory.size() << " other messengers on rank " << mpi_rank << endl;
+  }
+  return 0;
+}
+
+
+
+
+/***
+ * public messaging interface
+ */
+
+
+/* note: send_message _MUST_ be non-blocking */
+int MPIMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)   // stamp m and hand it to the sender; returns 0
+{
+  // set envelope
+  m->set_source(myaddr, fromport);
+  m->set_dest(dest, port);
+
+#ifdef FUNNEL_MPI
+
+  // queue up: the event loop thread performs the actual mpi_send()
+  out_queue_lock.Lock();
+  dout(DBLVL) << "queuing outgoing message " << *m << endl;
+  outgoing.push_back(m);
+  out_queue_lock.Unlock();
+
+  mpimessenger_kick_loop();   // wake the loop so it notices the new entry
+  
+#else
+
+  // send in this thread
+  mpi_send(m, m->get_pcid());
+
+#endif
+  return 0;
+}
+
+
+
+
+
+
diff --git a/branches/sage/cephmds2/msg/MPIMessenger.h b/branches/sage/cephmds2/msg/MPIMessenger.h
new file mode 100644
index 0000000000000..d050f5bf49470
--- /dev/null
+++ b/branches/sage/cephmds2/msg/MPIMessenger.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MPIMESSENGER_H
+#define __MPIMESSENGER_H
+
+#include "Messenger.h"
+#include "Dispatcher.h"
+
+// Shorthands for the num_mds / num_osd values held in g_conf.
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+// Map a destination number onto a rank, given `world` ranks in total:
+// destinations below NUMMDS+NUMOSD map to the rank of the same number; all
+// others wrap (modulo) over the ranks numbered NUMMDS+NUMOSD and up.
+// NOTE(review): the modulus is zero when world == NUMMDS+NUMOSD -- confirm
+// no caller evaluates the second branch in that configuration.
+#define MPI_DEST_TO_RANK(dest,world)    ((dest)<(NUMMDS+NUMOSD) ? \
+                                         (dest) : \
+                                         ((NUMMDS+NUMOSD)+(((dest)-NUMMDS-NUMOSD) % ((world)-NUMMDS-NUMOSD))))
+
+class Timer;
+
+// Messenger implementation for one addressable entity, built on the
+// per-process mpimessenger_* functions declared below.
+class MPIMessenger : public Messenger {
+ protected:
+  msg_addr_t myaddr;     // my address
+  //class Logger *logger;  // for logging (currently disabled)
+  
+ public:
+  MPIMessenger(msg_addr_t myaddr);
+  ~MPIMessenger();
+
+  // unregister this messenger; returns 0.
+  virtual int shutdown();
+
+  // message interface: address m from myaddr/fromport to dest/port and send
+  // it; returns 0.
+  virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+};
+
+/**
+ * these are all ONE per process.
+ */
+extern int mpimessenger_world();   // get world size
+extern int mpimessenger_init(int& argc, char**& argv);   // init mpi
+extern int mpimessenger_start();   // start thread
+extern void mpimessenger_stop();    // stop thread.
+extern void mpimessenger_wait();    // wait for thread to finish.
+extern int mpimessenger_shutdown();   // finalize MPI
+
+
+#endif
diff --git a/branches/sage/cephmds2/msg/MTMessenger.cc b/branches/sage/cephmds2/msg/MTMessenger.cc
new file mode 100644
index 0000000000000..301915a336ea5
--- /dev/null
+++ b/branches/sage/cephmds2/msg/MTMessenger.cc
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include <pthread.h>
+#include "mpi.h"
+
+#include "include/config.h"
+#include "include/error.h"
+#include "Messenger.h"
+#include "MTMessenger.h"
+
+// This module uses MPI to implement a blocking sendrecv function that
+// feels more like a procedure call and less like event processing.
+//
+// Threads are not independently addressable in MPI, only processes
+// are.  However, MPI does include a user defined tag in the message
+// envelope, and a reader may selectively read only messages with a
+// matching tag.  This module assigns an integer to each thread to use
+// as the tag.
+//
+
+// our lock for any common data; it's okay to have only the one global mutex
+// because our common data isn't a whole lot.
+static pthread_mutex_t mutex;
+
+// the key used to fetch the tag for the current thread.
+pthread_key_t tag_key;
+
+// the number of distinct threads we've seen so far; used to generate
+// a unique tag for each thread.
+static int nthreads;
+
+// the MPI identity of this process
+static int mpi_rank;
+
+
+// get the tag for this thread
+//
+// The tag is kept in thread-specific storage under tag_key.  A thread that
+// has no tag yet (stored value 0) is given the next value of the nthreads
+// counter, so valid tags start at 1.
+static int get_tag()
+{
+    // go through long so the pointer -> int conversion is well formed on
+    // platforms where a pointer is wider than an int
+    int tag = (int)(long)pthread_getspecific(tag_key);
+
+    if (tag == 0) {
+        // first time this thread has performed MPI messaging
+
+        // pthread calls report failure by returning a non-zero error
+        // number; they never return a negative value
+        if (pthread_mutex_lock(&mutex) != 0)
+            SYSERROR();
+
+        tag = ++nthreads;
+
+        if (pthread_mutex_unlock(&mutex) != 0)
+            SYSERROR();
+
+        if (pthread_setspecific(tag_key, (void*)(long)tag) != 0)
+            SYSERROR();
+    }
+
+    return tag;
+}
+
+
+// marshall a message and send it over MPI
+//   m    - message to encode
+//   rank - destination MPI rank
+//   tag  - MPI tag to attach to the message
+static void send(Message *m, int rank, int tag)
+{
+    // marshall the message
+    crope r;
+    m->encode(r);
+    int size = r.length();
+    char *buf = (char*)r.c_str();
+
+    // make the call outside of ASSERT so the send still happens should
+    // ASSERT ever be compiled to nothing
+    int rc = MPI_Send(buf, size, MPI_CHAR, rank, tag, MPI_COMM_WORLD);
+    ASSERT(rc == MPI_SUCCESS);
+}
+
+// read a message from MPI and unmarshall it
+//   tag - MPI tag to wait for (MPI_ANY_TAG matches every tag)
+// returns the message produced by decode_message()
+//
+// NOTE(review): the probe and the receive are separate calls; if two threads
+// can match the same message (e.g. both using MPI_ANY_TAG) another thread may
+// consume it in between -- confirm only one thread receives per tag.
+static Message *receive(int tag)
+{
+    MPI_Status status;
+    int rc;
+    int count = 0;
+
+    // wait for a matching message and get its size.  The element count is
+    // not a public field of MPI_Status; MPI_Get_count is the portable way
+    // to read it.
+    rc = MPI_Probe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &status);
+    ASSERT(rc == MPI_SUCCESS);
+    rc = MPI_Get_count(&status, MPI_CHAR, &count);
+    ASSERT(rc == MPI_SUCCESS);
+
+    // get message; there may be multiple messages on the queue, we
+    // need to be sure to read the one which corresponds to size
+    // obtained above, so receive from the exact source and tag probed.
+    char *buf = new char[count];
+    rc = MPI_Recv(buf, count, MPI_CHAR,
+                  status.MPI_SOURCE, status.MPI_TAG,
+                  MPI_COMM_WORLD, &status);
+    ASSERT(rc == MPI_SUCCESS);
+
+    // number of bytes actually received
+    rc = MPI_Get_count(&status, MPI_CHAR, &count);
+    ASSERT(rc == MPI_SUCCESS);
+
+    // unmarshall message
+    crope r(buf, count);
+    delete[] buf;
+    Message *m = decode_message(r);
+
+    return m;
+}
+
+// Initialise MPI (requesting MPI_THREAD_MULTIPLE), record this process's
+// rank, and set up the mutex, thread-specific key and thread counter used
+// to hand out per-thread tags.
+MTMessenger::MTMessenger(int& argc, char**& argv)
+{
+    // setup MPI; MPI errors will probably invoke the default MPI error
+    // handler, which aborts the program with a friendly message rather
+    // than returning from a function; just in case, we abort the
+    // program if we get an MPI error.
+    //
+    // The calls are made outside of ASSERT so they still run should ASSERT
+    // ever be compiled to nothing.
+
+    // NOTE(review): `provided` is never compared against
+    // MPI_THREAD_MULTIPLE -- confirm the MPI library really grants it.
+    int provided;
+    int rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+    ASSERT(rc == MPI_SUCCESS);
+
+    rc = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+    ASSERT(rc == MPI_SUCCESS);
+
+    // pthread calls report failure by returning a non-zero error number;
+    // they never return a negative value
+    if (pthread_mutex_init(&mutex, NULL) != 0)
+        SYSERROR();
+
+    if (pthread_key_create(&tag_key, NULL) != 0)
+        SYSERROR();
+
+    nthreads = 0;
+}
+
+// Release the mutex and the thread-specific key, then finalize MPI.
+// Errors during teardown are deliberately ignored.
+MTMessenger::~MTMessenger()
+{
+    pthread_mutex_destroy(&mutex);
+    pthread_key_delete(tag_key);
+
+    MPI_Finalize();
+}
+
+// send a request, then block until a message tagged for this thread arrives
+Message *MTMessenger::sendrecv(Message *m, msg_addr_t dest)
+{
+    const int reply_tag = get_tag();   // this thread's own tag
+    const int server_tag = 0;          // servers listen for any tag
+
+    // fill in our envelope (not to be confused with the MPI envelope)
+    m->set_source(mpi_rank, reply_tag);
+    m->set_dest(dest, server_tag);
+
+    send(m, dest, server_tag);
+
+    Message *reply = receive(reply_tag);
+    return reply;
+}
+
+// block until a message with any tag arrives, and return it
+Message *MTMessenger::recvreq()
+{
+    Message *req = receive(MPI_ANY_TAG);
+    return req;
+}
+
+// forward request, masquerading as original source: only the destination in
+// the envelope is rewritten, the source is left as it was
+void MTMessenger::fwdreq(Message *req, int dest)
+{
+    const int server_tag = 0;        // servers listen for any tag
+
+    // update our envelope (not to be confused with the MPI envelope)
+    req->set_dest(dest, server_tag);
+
+    send(req, dest, server_tag);
+}
+
+// send a response to the originator of the request: the request's source
+// and source port give the rank and tag to reply to
+void MTMessenger::sendresp(Message *req, Message *resp)
+{
+    const int reply_rank = req->get_source();
+    const int reply_tag = req->get_source_port();
+    const int sender_tag = get_tag();
+
+    // fill in our envelope (not to be confused with the MPI envelope)
+    resp->set_source(mpi_rank, sender_tag);
+    resp->set_dest(reply_rank, reply_tag);
+
+    send(resp, reply_rank, reply_tag);
+}
diff --git a/branches/sage/cephmds2/msg/MTMessenger.h b/branches/sage/cephmds2/msg/MTMessenger.h
new file mode 100644
index 0000000000000..6489de407ba2f
--- /dev/null
+++ b/branches/sage/cephmds2/msg/MTMessenger.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __MTMESSENGER_H
+#define __MTMESSENGER_H
+
+#include "Message.h"
+#include "SerialMessenger.h"
+
+// Marshall and unmarshall OBFS messages, send and receive them over
+// MPI.
+
+// Blocking, procedure-call style messaging over MPI.
+class MTMessenger
+{
+public:
+    // initializes MPI and the per-thread tag bookkeeping; the MPI
+    // initialization will scan argc/argv for MPI specific flags and
+    // remove them from argc/argv.
+    MTMessenger(int &argc, char **&argv);
+
+    // tears it all down.  Virtual because the class has virtual methods,
+    // so deleting a subclass through an MTMessenger* must be well defined.
+    virtual ~MTMessenger();
+
+    // send a request to a server and wait (block) for the response;
+    virtual Message *sendrecv(Message *m, msg_addr_t dest);
+
+    // wait (block) for a request from anyone
+    Message *recvreq();
+
+    // forward request, masquerading as original source
+    void fwdreq(Message *req, int dest);
+
+    // send the response to the originator of the request
+    virtual void sendresp(Message *req, Message *resp);
+
+
+}; // class MTMessenger
+
+#endif // __MTMESSENGER_H
diff --git a/branches/sage/cephmds2/msg/Message.cc b/branches/sage/cephmds2/msg/Message.cc
new file mode 100644
index 0000000000000..b37c4d2cb421d
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Message.cc
@@ -0,0 +1,442 @@
+
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+#include "include/types.h"
+
+#include "Message.h"
+
+#include "messages/MGenericMessage.h"
+
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+#include "messages/MNSFailure.h"
+
+#include "messages/MMonElectionAck.h"
+#include "messages/MMonElectionCollect.h"
+#include "messages/MMonElectionRefresh.h"
+#include "messages/MMonElectionStatus.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MFailure.h"
+#include "messages/MFailureAck.h"
+
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+
+#include "messages/MClientMount.h"
+#include "messages/MClientMountAck.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientReply.h"
+#include "messages/MClientFileCaps.h"
+
+#include "messages/MMDSGetMap.h"
+#include "messages/MMDSMap.h"
+#include "messages/MMDSBoot.h"
+
+#include "messages/MDirUpdate.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MExportDirWarning.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirFinish.h"
+
+#include "messages/MHashReaddir.h"
+#include "messages/MHashReaddirReply.h"
+
+#include "messages/MHashDirDiscover.h"
+#include "messages/MHashDirDiscoverAck.h"
+#include "messages/MHashDirPrep.h"
+#include "messages/MHashDirPrepAck.h"
+#include "messages/MHashDir.h"
+#include "messages/MHashDirAck.h"
+#include "messages/MHashDirNotify.h"
+
+#include "messages/MUnhashDirPrep.h"
+#include "messages/MUnhashDirPrepAck.h"
+#include "messages/MUnhashDir.h"
+#include "messages/MUnhashDirAck.h"
+#include "messages/MUnhashDirNotify.h"
+#include "messages/MUnhashDirNotifyAck.h"
+
+#include "messages/MRenameWarning.h"
+#include "messages/MRenameNotify.h"
+#include "messages/MRenameNotifyAck.h"
+#include "messages/MRename.h"
+#include "messages/MRenamePrep.h"
+#include "messages/MRenameReq.h"
+#include "messages/MRenameAck.h"
+#include "messages/MDentryUnlink.h"
+
+#include "messages/MHeartbeat.h"
+
+#include "messages/MAnchorRequest.h"
+#include "messages/MAnchorReply.h"
+#include "messages/MInodeLink.h"
+#include "messages/MInodeLinkAck.h"
+
+//#include "messages/MInodeUpdate.h"
+#include "messages/MInodeExpire.h"
+#include "messages/MDirExpire.h"
+#include "messages/MCacheExpire.h"
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MLock.h"
+
+#include "config.h"
+#undef  dout
+#define dout(l)    if (l<=g_conf.debug) cout << "messenger: "
+#define DEBUGLVL  10    // debug level of output
+
+
+
+
+
+
+
+/**
+ * Build a Message of the concrete class selected by env.type, attach the
+ * envelope and payload to it, and decode the payload.
+ *
+ * @param env      envelope of the incoming message; env.type picks the class
+ * @param payload  encoded payload; claimed by the new message
+ * @return the decoded message, or 0 when env.type is unknown and asserts
+ *         are compiled out (with asserts enabled an unknown type aborts)
+ */
+Message *
+decode_message(msg_envelope_t& env, bufferlist& payload)
+{
+  // make message
+  Message *m = 0;
+  switch(env.type) {
+
+    // -- with payload --
+
+    // name service
+  case MSG_NS_CONNECT:        m = new MNSConnect(); break;
+  case MSG_NS_CONNECTACK:     m = new MNSConnectAck(); break;
+  case MSG_NS_REGISTER:       m = new MNSRegister(); break;
+  case MSG_NS_REGISTERACK:    m = new MNSRegisterAck(); break;
+  case MSG_NS_LOOKUP:         m = new MNSLookup(); break;
+  case MSG_NS_LOOKUPREPLY:    m = new MNSLookupReply(); break;
+  case MSG_NS_FAILURE:        m = new MNSFailure(); break;
+
+    // monitor election
+  case MSG_MON_ELECTION_ACK:      m = new MMonElectionAck(); break;
+  case MSG_MON_ELECTION_COLLECT:  m = new MMonElectionCollect(); break;
+  case MSG_MON_ELECTION_REFRESH:  m = new MMonElectionRefresh(); break;
+  case MSG_MON_ELECTION_STATUS:   m = new MMonElectionStatus(); break;
+
+    // ping / failure
+  case MSG_PING:         m = new MPing(); break;
+  case MSG_PING_ACK:     m = new MPingAck(); break;
+  case MSG_FAILURE:      m = new MFailure(); break;
+  case MSG_FAILURE_ACK:  m = new MFailureAck(); break;
+
+    // osd
+  case MSG_OSD_BOOT:       m = new MOSDBoot(); break;
+  case MSG_OSD_IN:         m = new MOSDIn(); break;
+  case MSG_OSD_OUT:        m = new MOSDOut(); break;
+  case MSG_OSD_FAILURE:    m = new MOSDFailure(); break;
+  case MSG_OSD_PING:       m = new MOSDPing(); break;
+  case MSG_OSD_OP:         m = new MOSDOp(); break;
+  case MSG_OSD_OPREPLY:    m = new MOSDOpReply(); break;
+
+  case MSG_OSD_MAP:        m = new MOSDMap(); break;
+  case MSG_OSD_GETMAP:     m = new MOSDGetMap(); break;
+
+  case MSG_OSD_PG_NOTIFY:  m = new MOSDPGNotify(); break;
+  case MSG_OSD_PG_QUERY:   m = new MOSDPGQuery(); break;
+  case MSG_OSD_PG_LOG:     m = new MOSDPGLog(); break;
+  case MSG_OSD_PG_REMOVE:  m = new MOSDPGRemove(); break;
+
+    // clients
+  case MSG_CLIENT_MOUNT:     m = new MClientMount(); break;
+  case MSG_CLIENT_MOUNTACK:  m = new MClientMountAck(); break;
+  case MSG_CLIENT_REQUEST:   m = new MClientRequest(); break;
+  case MSG_CLIENT_REPLY:     m = new MClientReply(); break;
+  case MSG_CLIENT_FILECAPS:  m = new MClientFileCaps(); break;
+
+    // mds
+  case MSG_MDS_GETMAP:  m = new MMDSGetMap(); break;
+  case MSG_MDS_MAP:     m = new MMDSMap(); break;
+  case MSG_MDS_BOOT:    m = new MMDSBoot(); break;
+
+  case MSG_MDS_DIRUPDATE:  m = new MDirUpdate(); break;
+
+  case MSG_MDS_DISCOVER:       m = new MDiscover(); break;
+  case MSG_MDS_DISCOVERREPLY:  m = new MDiscoverReply(); break;
+
+  case MSG_MDS_EXPORTDIRDISCOVER:     m = new MExportDirDiscover(); break;
+  case MSG_MDS_EXPORTDIRDISCOVERACK:  m = new MExportDirDiscoverAck(); break;
+  case MSG_MDS_EXPORTDIR:             m = new MExportDir(); break;
+  case MSG_MDS_EXPORTDIRFINISH:       m = new MExportDirFinish(); break;
+  case MSG_MDS_EXPORTDIRNOTIFY:       m = new MExportDirNotify(); break;
+  case MSG_MDS_EXPORTDIRNOTIFYACK:    m = new MExportDirNotifyAck(); break;
+  case MSG_MDS_EXPORTDIRPREP:         m = new MExportDirPrep(); break;
+  case MSG_MDS_EXPORTDIRPREPACK:      m = new MExportDirPrepAck(); break;
+  case MSG_MDS_EXPORTDIRWARNING:      m = new MExportDirWarning(); break;
+
+  case MSG_MDS_HASHREADDIR:       m = new MHashReaddir(); break;
+  case MSG_MDS_HASHREADDIRREPLY:  m = new MHashReaddirReply(); break;
+
+  case MSG_MDS_HASHDIRDISCOVER:     m = new MHashDirDiscover(); break;
+  case MSG_MDS_HASHDIRDISCOVERACK:  m = new MHashDirDiscoverAck(); break;
+  case MSG_MDS_HASHDIRPREP:         m = new MHashDirPrep(); break;
+  case MSG_MDS_HASHDIRPREPACK:      m = new MHashDirPrepAck(); break;
+  case MSG_MDS_HASHDIR:             m = new MHashDir(); break;
+  case MSG_MDS_HASHDIRACK:          m = new MHashDirAck(); break;
+  case MSG_MDS_HASHDIRNOTIFY:       m = new MHashDirNotify(); break;
+
+  case MSG_MDS_UNHASHDIRPREP:       m = new MUnhashDirPrep(); break;
+  case MSG_MDS_UNHASHDIRPREPACK:    m = new MUnhashDirPrepAck(); break;
+  case MSG_MDS_UNHASHDIR:           m = new MUnhashDir(); break;
+  case MSG_MDS_UNHASHDIRACK:        m = new MUnhashDirAck(); break;
+  case MSG_MDS_UNHASHDIRNOTIFY:     m = new MUnhashDirNotify(); break;
+  case MSG_MDS_UNHASHDIRNOTIFYACK:  m = new MUnhashDirNotifyAck(); break;
+
+  case MSG_MDS_RENAMEWARNING:    m = new MRenameWarning(); break;
+  case MSG_MDS_RENAMENOTIFY:     m = new MRenameNotify(); break;
+  case MSG_MDS_RENAMENOTIFYACK:  m = new MRenameNotifyAck(); break;
+  case MSG_MDS_RENAME:           m = new MRename(); break;
+  case MSG_MDS_RENAMEPREP:       m = new MRenamePrep(); break;
+  case MSG_MDS_RENAMEREQ:        m = new MRenameReq(); break;
+  case MSG_MDS_RENAMEACK:        m = new MRenameAck(); break;
+
+  case MSG_MDS_DENTRYUNLINK:  m = new MDentryUnlink(); break;
+
+  case MSG_MDS_HEARTBEAT:  m = new MHeartbeat(); break;
+
+  case MSG_MDS_CACHEEXPIRE:  m = new MCacheExpire(); break;
+
+  case MSG_MDS_ANCHORREQUEST:  m = new MAnchorRequest(); break;
+  case MSG_MDS_ANCHORREPLY:    m = new MAnchorReply(); break;
+
+  case MSG_MDS_INODELINK:     m = new MInodeLink(); break;
+  case MSG_MDS_INODELINKACK:  m = new MInodeLinkAck(); break;
+
+    /*  case MSG_MDS_INODEUPDATE:
+    m = new MInodeUpdate();
+    break;
+    */
+
+  case MSG_MDS_INODEEXPIRE:    m = new MInodeExpire(); break;
+  case MSG_MDS_INODEFILECAPS:  m = new MInodeFileCaps(); break;
+  case MSG_MDS_DIREXPIRE:      m = new MDirExpire(); break;
+
+  case MSG_MDS_LOCK:  m = new MLock(); break;
+
+
+    // -- simple messages without payload --
+
+  case MSG_CLOSE:
+  case MSG_NS_STARTED:
+  case MSG_NS_UNREGISTER:
+  case MSG_SHUTDOWN:
+  case MSG_MDS_SHUTDOWNSTART:
+  case MSG_MDS_SHUTDOWNFINISH:
+  case MSG_CLIENT_UNMOUNT:
+  case MSG_OSD_MKFS_ACK:
+    m = new MGenericMessage(env.type);
+    break;
+
+  default:
+    dout(1) << "can't decode unknown message type " << env.type << endl;
+    assert(0);
+    // with asserts compiled out, do not fall through and dereference the
+    // null m below
+    return 0;
+  }
+
+  // env
+  m->set_envelope(env);
+
+  // decode
+  m->set_payload(payload);
+  m->decode_payload();
+
+  // done!
+  return m;
+}
+
+
diff --git a/branches/sage/cephmds2/msg/Message.h b/branches/sage/cephmds2/msg/Message.h
new file mode 100644
index 0000000000000..afe1ae6941844
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Message.h
@@ -0,0 +1,463 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MESSAGE_H
+#define __MESSAGE_H
+ 
+#define MSG_CLOSE 0
+
+#define MSG_NS_CONNECT     1
+#define MSG_NS_CONNECTACK  2
+#define MSG_NS_REGISTER    3
+#define MSG_NS_REGISTERACK 4
+#define MSG_NS_STARTED     5
+#define MSG_NS_UNREGISTER  6
+#define MSG_NS_LOOKUP      7
+#define MSG_NS_LOOKUPREPLY 8
+#define MSG_NS_FAILURE     9
+
+
+#define MSG_PING        10
+#define MSG_PING_ACK    11
+
+#define MSG_FAILURE     12
+#define MSG_FAILURE_ACK 13
+
+#define MSG_SHUTDOWN    99999
+
+
+#define MSG_MON_ELECTION_ACK       15
+#define MSG_MON_ELECTION_COLLECT   16
+#define MSG_MON_ELECTION_REFRESH   17
+#define MSG_MON_ELECTION_STATUS    18
+
+#define MSG_MON_OSDMAP_INFO            20
+#define MSG_MON_OSDMAP_LEASE           21
+#define MSG_MON_OSDMAP_LEASE_ACK       22
+#define MSG_MON_OSDMAP_UPDATE_PREPARE  23
+#define MSG_MON_OSDMAP_UPDATE_ACK      24
+#define MSG_MON_OSDMAP_UPDATE_COMMIT   25
+
+#define MSG_OSD_OP           40    // delete, etc.
+#define MSG_OSD_OPREPLY      41    // delete, etc.
+#define MSG_OSD_PING         42
+
+#define MSG_OSD_GETMAP       43
+#define MSG_OSD_MAP          44
+
+#define MSG_OSD_BOOT         45
+#define MSG_OSD_MKFS_ACK     46
+
+#define MSG_OSD_FAILURE      47
+
+#define MSG_OSD_IN           48
+#define MSG_OSD_OUT          49
+
+
+
+#define MSG_OSD_PG_NOTIFY      50
+#define MSG_OSD_PG_QUERY       51
+#define MSG_OSD_PG_SUMMARY     52
+#define MSG_OSD_PG_LOG         53
+#define MSG_OSD_PG_REMOVE      54
+
+#define MSG_CLIENT_REQUEST         60
+#define MSG_CLIENT_REPLY           61
+//#define MSG_CLIENT_DONE            62
+#define MSG_CLIENT_FILECAPS        63
+#define MSG_CLIENT_INODEAUTHUPDATE 64
+
+#define MSG_CLIENT_MOUNT           70
+#define MSG_CLIENT_MOUNTACK        71
+#define MSG_CLIENT_UNMOUNT         72
+
+
+// *** MDS ***
+
+#define MSG_MDS_BOOT               100
+#define MSG_MDS_GETMAP             101
+#define MSG_MDS_MAP                102
+#define MSG_MDS_HEARTBEAT          103
+
+#define MSG_MDS_DISCOVER           110
+#define MSG_MDS_DISCOVERREPLY      111
+
+#define MSG_MDS_INODEGETREPLICA    112
+#define MSG_MDS_INODEGETREPLICAACK 113
+
+#define MSG_MDS_INODEFILECAPS      115
+
+#define MSG_MDS_INODEUPDATE  120
+#define MSG_MDS_DIRUPDATE    121
+#define MSG_MDS_INODEEXPIRE  122
+#define MSG_MDS_DIREXPIRE    123
+
+#define MSG_MDS_DIREXPIREREQ 124
+
+#define MSG_MDS_CACHEEXPIRE  125
+
+#define MSG_MDS_ANCHORREQUEST 130
+#define MSG_MDS_ANCHORREPLY   131
+
+#define MSG_MDS_INODELINK       140
+#define MSG_MDS_INODELINKACK    141
+#define MSG_MDS_INODEUNLINK     142
+#define MSG_MDS_INODEUNLINKACK  143
+
+#define MSG_MDS_EXPORTDIRDISCOVER      150
+#define MSG_MDS_EXPORTDIRDISCOVERACK   151
+#define MSG_MDS_EXPORTDIRPREP      152
+#define MSG_MDS_EXPORTDIRPREPACK   153
+#define MSG_MDS_EXPORTDIRWARNING   154
+#define MSG_MDS_EXPORTDIR          155
+#define MSG_MDS_EXPORTDIRNOTIFY    156
+#define MSG_MDS_EXPORTDIRNOTIFYACK 157
+#define MSG_MDS_EXPORTDIRFINISH    158
+
+
+#define MSG_MDS_HASHDIRDISCOVER    160
+#define MSG_MDS_HASHDIRDISCOVERACK 161
+#define MSG_MDS_HASHDIRPREP        162
+#define MSG_MDS_HASHDIRPREPACK     163
+#define MSG_MDS_HASHDIR            164
+#define MSG_MDS_HASHDIRACK         165
+#define MSG_MDS_HASHDIRNOTIFY      166
+
+#define MSG_MDS_HASHREADDIR        168
+#define MSG_MDS_HASHREADDIRREPLY   169
+
+#define MSG_MDS_UNHASHDIRPREP      170
+#define MSG_MDS_UNHASHDIRPREPACK   171
+#define MSG_MDS_UNHASHDIR          172
+#define MSG_MDS_UNHASHDIRACK       173
+#define MSG_MDS_UNHASHDIRNOTIFY    174
+#define MSG_MDS_UNHASHDIRNOTIFYACK 175
+
+#define MSG_MDS_DENTRYUNLINK      200
+
+#define MSG_MDS_RENAMEWARNING    300   // sent from src to bystanders
+#define MSG_MDS_RENAMENOTIFY     301   // sent from dest to bystanders
+#define MSG_MDS_RENAMENOTIFYACK  302   // sent back to src
+#define MSG_MDS_RENAMEACK        303   // sent from src to initiator, to xlock_finish
+
+#define MSG_MDS_RENAMEPREP       304   // sent from initiator to dest auth (if dir)
+#define MSG_MDS_RENAMEREQ        305   // sent from initiator (or dest if dir) to src auth
+#define MSG_MDS_RENAME           306   // sent from src to dest, includes inode
+
+#define MSG_MDS_LOCK             500
+
+#define MSG_MDS_SHUTDOWNSTART  900
+#define MSG_MDS_SHUTDOWNFINISH 901
+
+
+#include <stdlib.h>
+#include <cassert>
+
+#include <iostream>
+#include <list>
+using std::list;
+
+#include <ext/hash_map>
+#include <ext/rope>
+
+using __gnu_cxx::crope;
+
+#include "include/buffer.h"
+
+#include "tcp.h"
+
+
+
+
+// use fixed offsets and static entity -> logical addr mapping!
+#define MSG_ADDR_NAMER_BASE   0
+#define MSG_ADDR_RANK_BASE    0x10000000    // per-rank messenger services
+#define MSG_ADDR_MDS_BASE     0x20000000
+#define MSG_ADDR_OSD_BASE     0x30000000
+#define MSG_ADDR_MON_BASE     0x40000000
+#define MSG_ADDR_CLIENT_BASE  0x50000000
+
+#define MSG_ADDR_TYPE_MASK    0xf0000000
+#define MSG_ADDR_NUM_MASK     0x0fffffff
+
+#define MSG_ADDR_NEW          0x0fffffff
+#define MSG_ADDR_UNDEF_BASE   0xffffffff
+
+
+/* old int way, which lacked type safety...
+typedef int  msg_addr_t;
+
+#define MSG_ADDR_RANK(x)    (MSG_ADDR_RANK_BASE + (x))
+#define MSG_ADDR_MDS(x)     (MSG_ADDR_MDS_BASE + (x))
+#define MSG_ADDR_OSD(x)     (MSG_ADDR_OSD_BASE + (x))
+#define MSG_ADDR_CLIENT(x)  (MSG_ADDR_CLIENT_BASE + (x))
+
+#define MSG_ADDR_DIRECTORY   0
+#define MSG_ADDR_RANK_NEW    MSG_ADDR_RANK(MSG_ADDR_NEW)
+#define MSG_ADDR_MDS_NEW     MSG_ADDR_MDS(MSG_ADDR_NEW)
+#define MSG_ADDR_OSD_NEW     MSG_ADDR_OSD(MSG_ADDR_NEW)
+#define MSG_ADDR_CLIENT_NEW  MSG_ADDR_CLIENT(MSG_ADDR_NEW)
+
+#define MSG_ADDR_ISCLIENT(x)  ((x) >= MSG_ADDR_CLIENT_BASE)
+#define MSG_ADDR_TYPE(x)    (((x) & MSG_ADDR_TYPE_MASK) == MSG_ADDR_RANK_BASE ? "rank": \
+                             (((x) & MSG_ADDR_TYPE_MASK) == MSG_ADDR_CLIENT_BASE ? "client": \
+                              (((x) & MSG_ADDR_TYPE_MASK) == MSG_ADDR_OSD_BASE ? "osd": \
+                               (((x) & MSG_ADDR_TYPE_MASK) == MSG_ADDR_MDS_BASE ? "mds": \
+                                ((x) == MSG_ADDR_DIRECTORY ? "namer":"unknown")))))
+#define MSG_ADDR_NUM(x)    ((x) & MSG_ADDR_NUM_MASK)
+#define MSG_ADDR_NICE(x)   MSG_ADDR_TYPE(x) << MSG_ADDR_NUM(x)
+*/
+
+// Typed entity address: the bits under MSG_ADDR_TYPE_MASK say what kind of
+// entity it is, the bits under MSG_ADDR_NUM_MASK say which one.
+class msg_addr_t {
+public:
+  int _addr;
+
+  // undefined address by default
+  msg_addr_t() : _addr(MSG_ADDR_UNDEF_BASE) {}
+  // t is one of the MSG_ADDR_*_BASE values, n the entity number
+  msg_addr_t(int t, int n) : _addr(t | n) {}
+
+  int num() const { return _addr & MSG_ADDR_NUM_MASK; }
+  int type() const { return _addr & MSG_ADDR_TYPE_MASK; }
+
+  // short printable name for type(), "unknown" if it matches no base
+  const char *type_str() const {
+    const int t = type();
+    if (t == MSG_ADDR_RANK_BASE)   return "rank";
+    if (t == MSG_ADDR_MDS_BASE)    return "mds";
+    if (t == MSG_ADDR_OSD_BASE)    return "osd";
+    if (t == MSG_ADDR_MON_BASE)    return "mon";
+    if (t == MSG_ADDR_CLIENT_BASE) return "client";
+    if (t == MSG_ADDR_NAMER_BASE)  return "namer";
+    return "unknown";
+  }
+
+  // true when the number part is the MSG_ADDR_NEW placeholder
+  bool is_new() const { return MSG_ADDR_NEW == num(); }
+
+  bool is_client() const { return MSG_ADDR_CLIENT_BASE == type(); }
+  bool is_mds() const    { return MSG_ADDR_MDS_BASE == type(); }
+  bool is_osd() const    { return MSG_ADDR_OSD_BASE == type(); }
+  bool is_mon() const    { return MSG_ADDR_MON_BASE == type(); }
+  bool is_namer() const  { return MSG_ADDR_NAMER_BASE == type(); }
+};
+
+// msg_addr_t values compare and order by their raw _addr integer
+inline bool operator== (const msg_addr_t& l, const msg_addr_t& r) { return r._addr == l._addr; }
+inline bool operator!= (const msg_addr_t& l, const msg_addr_t& r) { return !(l._addr == r._addr); }
+inline bool operator< (const msg_addr_t& l, const msg_addr_t& r) { return r._addr > l._addr; }
+
+//typedef struct msg_addr msg_addr_t;
+
+// print an address as its type name immediately followed by its number
+inline std::ostream& operator<<(std::ostream& out, const msg_addr_t& addr) {
+  //if (addr.is_namer()) return out << "namer";
+  out << addr.type_str();
+  out << addr.num();
+  return out;
+}
+
+
+// let msg_addr_t be used as a hash_map key: hash the raw _addr integer
+namespace __gnu_cxx {
+  template<> struct hash< msg_addr_t >
+  {
+    size_t operator()( const msg_addr_t m ) const
+    {
+      return hash<int>()(m._addr);
+    }
+  };
+}
+
+#define MSG_ADDR_RANK(x)    msg_addr_t(MSG_ADDR_RANK_BASE,x)
+#define MSG_ADDR_MDS(x)     msg_addr_t(MSG_ADDR_MDS_BASE,x)
+#define MSG_ADDR_OSD(x)     msg_addr_t(MSG_ADDR_OSD_BASE,x)
+#define MSG_ADDR_MON(x)     msg_addr_t(MSG_ADDR_MON_BASE,x)
+#define MSG_ADDR_CLIENT(x)  msg_addr_t(MSG_ADDR_CLIENT_BASE,x)
+#define MSG_ADDR_NAMER(x)   msg_addr_t(MSG_ADDR_NAMER_BASE,x)
+
+#define MSG_ADDR_UNDEF       msg_addr_t()
+#define MSG_ADDR_DIRECTORY   MSG_ADDR_NAMER(0)
+
+#define MSG_ADDR_RANK_NEW    MSG_ADDR_RANK(MSG_ADDR_NEW)
+#define MSG_ADDR_MDS_NEW     MSG_ADDR_MDS(MSG_ADDR_NEW)
+#define MSG_ADDR_OSD_NEW     MSG_ADDR_OSD(MSG_ADDR_NEW)
+#define MSG_ADDR_CLIENT_NEW  MSG_ADDR_CLIENT(MSG_ADDR_NEW)
+#define MSG_ADDR_NAMER_NEW   MSG_ADDR_NAMER(MSG_ADDR_NEW)
+
+// Function-style accessors kept from the old int-based addresses.  The
+// argument is parenthesized so that an expression argument still binds to
+// the member call as a whole.
+#define MSG_ADDR_ISCLIENT(x)  (x).is_client()
+#define MSG_ADDR_TYPE(x)      (x).type_str()
+#define MSG_ADDR_NUM(x)       (x).num()
+#define MSG_ADDR_NICE(x)      (x).type_str() << (x).num()
+
+
+
+
+// A tcp address paired with a rank number.
+class entity_inst_t {
+ public:
+  tcpaddr_t addr;
+  int       rank;
+
+  // unset instance: all-zero address, rank -1
+  entity_inst_t() : rank(-1) {
+    memset(&addr, 0, sizeof(addr));
+  }
+
+  // Zero the storage first and only then copy the caller's address in.
+  // (Copying first and zeroing afterwards, as this used to do, throws the
+  // address away and leaves every instance with a blank addr.)
+  entity_inst_t(const tcpaddr_t& a, int r) : rank(r) {
+    memset(&addr, 0, sizeof(addr));
+    addr = a;
+  }
+};
+
+// equality looks at both rank and address ...
+inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) {
+  if (a.rank != b.rank) return false;
+  return a.addr == b.addr;
+}
+inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return !(a == b); }
+// ... while ordering is by rank alone
+inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return b.rank < a.rank; }
+inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return b.rank <= a.rank; }
+inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return b.rank > a.rank; }
+inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return b.rank >= a.rank; }
+
+// print as "rank<rank>_<addr>".  std:: is spelled out (as for the
+// msg_addr_t printer above) so this header does not depend on the including
+// file having pulled ostream into scope.
+inline std::ostream& operator<<(std::ostream& out, const entity_inst_t &i)
+{
+  return out << "rank" << i.rank << "_" << i.addr;
+}
+
+
+// abstract Message class
+
+
+
+// Per-message envelope carried alongside the payload.
+typedef struct {
+  int type;                       // one of the MSG_* type codes
+  msg_addr_t source, dest;        // logical sender / receiver
+  entity_inst_t source_inst;      // sender instance (address + rank)
+  int source_port, dest_port;
+  int nchunks;
+  __uint64_t lamport_send_stamp;
+  __uint64_t lamport_recv_stamp;
+} msg_envelope_t;
+
+// size in bytes of the envelope struct
+#define MSG_ENVELOPE_LEN  sizeof(msg_envelope_t)
+
+
+// Abstract base class of every message: an envelope (type, addressing,
+// lamport stamps) plus a payload held in a bufferlist.
+class Message {
+ protected:
+  msg_envelope_t  env;    // envelope
+  bufferlist      payload;        // payload
+
+  friend class Messenger;
+
+ public:
+  // Type is -1 ("not set") until set_type() or set_envelope() is called;
+  // it used to be left uninitialized here.
+  Message() {
+    env.type = -1;
+    env.source_port = env.dest_port = -1;
+    env.source = env.dest = MSG_ADDR_UNDEF;
+    env.nchunks = 0;
+    env.lamport_send_stamp = 0;
+    env.lamport_recv_stamp = 0;
+  }
+  // t is one of the MSG_* type codes
+  Message(int t) {
+    env.type = t;
+    env.source_port = env.dest_port = -1;
+    env.source = env.dest = MSG_ADDR_UNDEF;
+    env.nchunks = 0;
+    env.lamport_send_stamp = 0;
+    env.lamport_recv_stamp = 0;
+  }
+  virtual ~Message() {
+  }
+
+  void set_lamport_send_stamp(__uint64_t t) { env.lamport_send_stamp = t; }
+  void set_lamport_recv_stamp(__uint64_t t) { env.lamport_recv_stamp = t; }
+  __uint64_t get_lamport_send_stamp() { return env.lamport_send_stamp; }
+  __uint64_t get_lamport_recv_stamp() { return env.lamport_recv_stamp; }
+
+
+  // for rpc-type procedural messages (pcid = procedure call id)
+  virtual long get_pcid() { return 0; }
+  virtual void set_pcid(long t) { assert(0); }  // overload me
+
+  void clear_payload() { payload.clear(); }
+  bool empty_payload() { return payload.length() == 0; }
+  bufferlist& get_payload() {
+    return payload;
+  }
+  // takes over the contents of bl via bufferlist::claim()
+  void set_payload(bufferlist& bl) {
+    payload.claim(bl);
+  }
+  msg_envelope_t& get_envelope() {
+    return env;
+  }
+  void set_envelope(msg_envelope_t& env) {
+    this->env = env;
+  }
+
+
+  // ENVELOPE ----
+
+  // type
+  int get_type() { return env.type; }
+  void set_type(int t) { env.type = t; }
+  virtual char *get_type_name() = 0;
+
+  // source/dest
+  msg_addr_t& get_dest() { return env.dest; }
+  void set_dest(msg_addr_t a, int p) { env.dest = a; env.dest_port = p; }
+  int get_dest_port() { return env.dest_port; }
+
+  msg_addr_t& get_source() { return env.source; }
+  void set_source(msg_addr_t a, int p) { env.source = a; env.source_port = p; }
+  int get_source_port() { return env.source_port; }
+
+  entity_inst_t& get_source_inst() { return env.source_inst; }
+  void set_source_inst(entity_inst_t &i) { env.source_inst = i; }
+
+  // PAYLOAD ----
+  void reset_payload() {
+    payload.clear();
+  }
+
+  // overload either the rope versions (easier!)
+  virtual void encode_payload(crope& s)           { assert(0); }
+  virtual void decode_payload(crope& s, int& off) { assert(0); }
+
+  // or the bufferlist versions (faster!)
+  virtual void decode_payload() {
+    // use a crope for convenience, small messages, etc.  FIXME someday.
+    // Flatten every buffer of the payload into one rope ...
+    crope ser;
+    for (list<bufferptr>::const_iterator it = payload.buffers().begin();
+         it != payload.buffers().end();
+         ++it)
+      ser.append((*it).c_str(), (*it).length());
+
+    // ... and let the rope decoder consume it; it must use all of it.
+    int off = 0;
+    decode_payload(ser, off);
+    assert((unsigned)off == payload.length());
+  }
+  virtual void encode_payload() {
+    assert(payload.length() == 0);  // caller should reset payload
+
+    // use crope for convenience, small messages. FIXME someday.
+    crope r;
+    encode_payload(r);
+
+    // copy payload
+    payload.push_back( buffer::copy(r.c_str(), r.length()) );
+  }
+
+  // writes "message(type=N)"; subclasses may override
+  virtual void print(std::ostream& out) {
+    out << "message(type=" << get_type() << ")";
+  }
+
+};
+
+// build and decode a message from its envelope and payload (Message.cc)
+extern Message *decode_message(msg_envelope_t &env, bufferlist& bl);
+
+// stream a message through its virtual print().  std:: is spelled out so
+// this header does not depend on the including file having pulled ostream
+// into scope.
+inline std::ostream& operator<<(std::ostream& out, Message& m) {
+  m.print(out);
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/msg/Messenger.cc b/branches/sage/cephmds2/msg/Messenger.cc
new file mode 100644
index 0000000000000..b033bbfc08638
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Messenger.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <ext/rope>
+#include "include/types.h"
+
+#include "Message.h"
+#include "Messenger.h"
+#include "messages/MGenericMessage.h"
+
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug) cout << "messenger: "
+#define DEBUGLVL  10    // debug level of output
+
+
+
+// -------- 
+// callbacks
+
+// File-scope callback queue, appended to by queue_callback(s)() and drained
+// by do_callbacks(); msgr_callback_lock is taken around every access.
+Mutex                msgr_callback_lock;
+list<Context*>       msgr_callback_queue;
+//Context*             msgr_callback_kicker = 0;
+
+// Append one callback to the global callback queue (under the queue lock),
+// then call callback_kick() with the lock released.
+void Messenger::queue_callback(Context *c) {
+  msgr_callback_lock.Lock();
+  msgr_callback_queue.push_back(c);
+  msgr_callback_lock.Unlock();
+
+  callback_kick();
+}
+// Move every callback in 'ls' onto the end of the global callback queue
+// (splice leaves 'ls' empty), then call callback_kick() with the lock released.
+void Messenger::queue_callbacks(list<Context*>& ls) {
+  msgr_callback_lock.Lock();
+  msgr_callback_queue.splice(msgr_callback_queue.end(), ls);
+  msgr_callback_lock.Unlock();
+
+  callback_kick();
+}
+
+// Run all queued callbacks.  The queue is emptied into a private list while
+// the lock is held; the callbacks themselves run (finish(0)) and are deleted
+// with the lock released.
+void Messenger::do_callbacks() {
+  // steal the whole queue
+  list<Context*> pending;
+  msgr_callback_lock.Lock();
+  pending.swap(msgr_callback_queue);
+  msgr_callback_lock.Unlock();
+
+  // run them in order
+  while (!pending.empty()) {
+    Context *cb = pending.front();
+    pending.pop_front();
+    dout(10) << "--- doing callback " << cb << endl;
+    cb->finish(0);
+    delete cb;
+  }
+}
+
+// ---------
+// incoming messages
+
+// Hand an incoming message to the registered dispatcher.
+// Asserts that a dispatcher has been set (see set_dispatcher()).
+void Messenger::dispatch(Message *m) 
+{
+  assert(dispatcher);
+  dispatcher->dispatch(m);
+}
+
+
+
diff --git a/branches/sage/cephmds2/msg/Messenger.h b/branches/sage/cephmds2/msg/Messenger.h
new file mode 100644
index 0000000000000..4ec3349a2a096
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Messenger.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef __MESSENGER_H
+#define __MESSENGER_H
+
+#include <map>
+using namespace std;
+
+#include "Message.h"
+#include "Dispatcher.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "include/Context.h"
+
+
+typedef __uint64_t lamport_t;
+
+
+class MDS;
+class Timer;
+
+// Abstract base for message transports.  A Messenger has its own address
+// (_myaddr) and forwards incoming messages to a single Dispatcher.
+// Subclasses must implement shutdown(), callback_kick() and the basic
+// send_message() overload.
+class Messenger {
+ private:
+  Dispatcher          *dispatcher;   // receives incoming messages; 0 until set_dispatcher()
+  msg_addr_t           _myaddr;      // this messenger's own address
+
+
+ public:
+  Messenger(msg_addr_t w) : dispatcher(0), _myaddr(w) { }
+  virtual ~Messenger() { }
+  
+  void       set_myaddr(msg_addr_t m) { _myaddr = m; }
+  msg_addr_t get_myaddr() { return _myaddr; }
+
+
+  virtual int shutdown() = 0;
+  
+  // callbacks
+  // do_callbacks() runs everything queued via queue_callback(s)();
+  // callback_kick() is invoked after each enqueue and is subclass-defined.
+  static void do_callbacks();
+
+  void queue_callback(Context *c);
+  void queue_callbacks(list<Context*>& ls);
+  virtual void callback_kick() = 0;
+
+  // base implementation reports an empty dispatch queue
+  virtual int get_dispatch_queue_len() { return 0; };
+
+  // setup
+  // set_dispatcher() stores the dispatcher and then calls the ready() hook.
+  void set_dispatcher(Dispatcher *d) { dispatcher = d; ready(); }
+  Dispatcher *get_dispatcher() { return dispatcher; }
+  virtual void ready() { }
+  // "ready" means a dispatcher has been set.
+  bool is_ready() { return dispatcher != 0; }
+
+  // dispatch incoming messages
+  virtual void dispatch(Message *m);
+
+  // send message
+  virtual void prepare_dest(const entity_inst_t& inst) {}
+  virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0;
+  // Variant that also carries the destination instance; the base version
+  // ignores 'inst' and falls back to the address-only overload.
+  virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst,
+			   int port=0, int fromport=0) {
+    return send_message(m, dest, port, fromport);   // overload me!
+  }
+
+
+  // make a procedure call
+  //virtual Message* sendrecv(Message *m, msg_addr_t dest, int port=0);
+
+
+  // hooks for marking a peer down/up; no-ops in the base class
+  virtual void mark_down(msg_addr_t a, entity_inst_t& i) {}
+  virtual void mark_up(msg_addr_t a, entity_inst_t& i) {}
+  //virtual void reset(msg_addr_t a) { mark_down(a); mark_up(a); }
+
+};
+
+
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/msg/NewMessenger.cc b/branches/sage/cephmds2/msg/NewMessenger.cc
new file mode 100644
index 0000000000000..6cd5d291b60c3
--- /dev/null
+++ b/branches/sage/cephmds2/msg/NewMessenger.cc
@@ -0,0 +1,1714 @@
+
+#include "NewMessenger.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "config.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+#include "messages/MNSFailure.h"
+
+//#include "messages/MFailure.h"
+
+#include <netdb.h>
+
+
+#undef dout
+#define dout(l)  if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " "
+#define derr(l)  if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " "
+
+
+
+#include "tcp.cc"
+
+
+Rank rank;
+
+
+/********************************************
+ * Namer
+ */
+
+// Construct the namer on top of the given messenger and register this object
+// as its dispatcher.  Asserts it is only created on rank 0.  Rank numbering
+// for newly connecting ranks starts at g_conf.num_mon.
+Rank::Namer::Namer(EntityMessenger *msgr) :
+  messenger(msgr),
+  nrank(0), nclient(0), nmds(0), nosd(0), nmon(0)
+{
+  assert(rank.my_rank == 0);
+  nrank = g_conf.num_mon;
+  
+  // announce myself
+  // (disabled: used to publish the listen address to stdout and to a
+  //  .ceph_ns file)
+  /*
+  cerr << "ceph ns is " << rank.accepter.listen_addr << endl;
+  cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl;
+  int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT);
+  ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t));
+  ::fchmod(fd, 0755);
+  ::close(fd);
+  */
+
+  // ok
+  messenger->set_dispatcher(this);
+}
+
+// Nothing to release; the .ceph_ns cleanup is disabled along with its creation.
+Rank::Namer::~Namer()
+{
+  //::unlink(".ceph_ns");
+}
+
+
+// Entry point for messages addressed to the namer.  Takes rank.lock for the
+// duration of the handler and routes on message type; unknown types assert.
+// The handlers below run with rank.lock held.
+void Rank::Namer::dispatch(Message *m)
+{
+  rank.lock.Lock();
+  int type = m->get_type();
+  switch (type) {
+  case MSG_NS_CONNECT:
+    handle_connect((class MNSConnect*)m);
+    break;
+  case MSG_NS_REGISTER:
+    handle_register((class MNSRegister*)m);
+    break;
+  case MSG_NS_STARTED:
+    handle_started(m);
+    break;
+  case MSG_NS_UNREGISTER:
+    handle_unregister(m);
+    break;
+  case MSG_NS_LOOKUP:
+    handle_lookup((class MNSLookup*)m);
+    break;
+  case MSG_NS_FAILURE:
+    handle_failure((class MNSFailure*)m);
+    break;
+    
+  case MSG_FAILURE_ACK:
+    // nothing to do for an ack; just free it
+    delete m;
+    break;
+
+  default:
+    assert(0);
+  }
+  rank.lock.Unlock();
+}
+
+// A new rank is connecting: assign it the next rank number, record its
+// address in the directory as not-yet-started, and ack with the new number.
+// Consumes (deletes) m.
+void Rank::Namer::handle_connect(MNSConnect *m)
+{
+  const int assigned = nrank;
+  nrank++;
+  dout(2) << "namer.handle_connect from new rank" << assigned << " " << m->get_addr() << endl;
+
+  // directory entry for the new rank
+  msg_addr_t who = MSG_ADDR_RANK(assigned);
+  entity_inst_t &slot = rank.entity_map[who];
+  slot.addr = m->get_addr();
+  slot.rank = assigned;
+  rank.entity_unstarted.insert(who);
+
+  // tell the new rank which number it got
+  messenger->send_message(new MNSConnectAck(assigned), who, slot);
+  delete m;
+}
+
+// Insert (or overwrite) the directory entry for the rank named by inst.rank.
+// NOTE(review): takes no lock itself; presumably the caller holds rank.lock
+// or runs before threads start -- confirm.
+void Rank::Namer::manual_insert_inst(const entity_inst_t &inst)
+{
+  rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst;
+}
+
+// Register an entity with the namer.  If the requested address is "new", a
+// fresh mds/osd/client id is allocated from the matching counter; otherwise
+// the requested address is used as-is.  The entity is mapped to the sender's
+// instance, marked unstarted, and the chosen address is sent back in an
+// MNSRegisterAck carrying the request's tid.  Consumes (deletes) m.
+void Rank::Namer::handle_register(MNSRegister *m)
+{
+  dout(10) << "namer.handle_register from rank " << m->get_rank()
+          << " addr " << m->get_entity() << endl;
+  
+  // pick id
+  msg_addr_t entity = m->get_entity();
+
+  if (entity.is_new()) {
+    // make up a new address!
+    switch (entity.type()) {
+    case MSG_ADDR_MDS_BASE:
+      entity = MSG_ADDR_MDS(nmds++);
+      break;
+      
+    case MSG_ADDR_OSD_BASE:
+      entity = MSG_ADDR_OSD(nosd++);
+      break;
+      
+    case MSG_ADDR_CLIENT_BASE:
+      entity = MSG_ADDR_CLIENT(nclient++);
+      break;
+      
+    default:
+      // only mds/osd/client addresses can be allocated here
+      assert(0);
+    }
+  } else {
+    // specific address!
+  }
+
+
+  // register
+  // (a re-registration simply overwrites the previous instance)
+  if (rank.entity_map.count(entity)) {
+    dout(1) << "namer.handle_register re-registering " << entity
+            << " inst " << m->get_source_inst()
+            << " (was " << rank.entity_map[entity] << ")"
+            << endl;
+  } else {
+    dout(1) << "namer.handle_register registering " << entity
+            << " inst " << m->get_source_inst()
+            << endl;
+  }
+  rank.entity_map[entity] = m->get_source_inst();
+  rank.entity_unstarted.insert(entity);
+  
+  // reply w/ new id
+  messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), 
+                          m->get_source(), rank.entity_map[entity]);
+  
+  delete m;
+}
+
+// An entity reports that it has started: clear its "unstarted" mark and
+// re-dispatch any lookups that were parked waiting for it.
+// Consumes (deletes) m, like every other namer handler.
+void Rank::Namer::handle_started(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(10) << "namer.handle_started from entity " << who << endl;
+
+  assert(rank.entity_unstarted.count(who));
+  rank.entity_unstarted.erase(who);
+  
+  // anybody waiting?
+  if (waiting.count(who)) {
+    // detach the waiter list first; re-dispatch may park messages again
+    list<Message*> ls;
+    ls.swap(waiting[who]);
+    waiting.erase(who);
+    
+    dout(10) << "doing waiters on " << who << endl;
+    // NOTE(review): dispatch() takes rank.lock, which the outer dispatch()
+    // already holds -- this relies on Mutex being recursive; confirm.
+    for (list<Message*>::iterator it = ls.begin();
+         it != ls.end();
+         it++) 
+      dispatch(*it);
+  }
+
+  // done with the notification itself (previously leaked)
+  delete m;
+}
+
+// Remove the sending entity from the directory.  When only two directory
+// entries remain, the namer shuts down and deletes its own messenger,
+// dropping rank.lock around the shutdown.  Consumes (deletes) m.
+void Rank::Namer::handle_unregister(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(1) << "namer.handle_unregister entity " << who << endl;
+
+  rank.show_dir();
+  
+  assert(rank.entity_map.count(who));
+  rank.entity_map.erase(who);
+
+  rank.show_dir();
+
+  // shut myself down?  kick watcher.
+  // NOTE(review): the "2" presumably counts the namer and rank0 entries
+  // that never unregister -- confirm.
+  if (rank.entity_map.size() == 2) {
+    dout(10) << "namer.handle_unregister stopping namer" << endl;
+    rank.lock.Unlock();
+    messenger->shutdown();
+    // NOTE(review): 'messenger' is left dangling after this delete.
+    delete messenger;
+    rank.lock.Lock();
+  }
+
+  delete m;
+}
+
+
+// Answer a directory lookup.  If the entity is unknown or has not reported
+// itself started yet, the request is parked on the waiting list (and kept
+// alive); otherwise a reply carrying the entity's instance is sent to the
+// requester and the request is deleted.
+void Rank::Namer::handle_lookup(MNSLookup *m) 
+{
+  // not registered at all?  park the request.
+  if (!rank.entity_map.count(m->get_entity())) {
+    dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl;
+    waiting[m->get_entity()].push_back(m);
+    return;
+  }
+
+  // registered but not started yet?  park the request too.
+  if (rank.entity_unstarted.count(m->get_entity()) != 0) {
+    dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl;
+    waiting[m->get_entity()].push_back(m);
+    return;
+  }
+
+  // known and started: build the answer
+  MNSLookupReply *answer = new MNSLookupReply(m);
+  answer->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()];
+
+  dout(10) << "namer " << m->get_source()
+           << " lookup '" << m->get_entity() 
+           << "' -> " << rank.entity_map[m->get_entity()] << endl;
+
+  messenger->send_message(answer, m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+// An instance has been reported failed: drop every directory entry that maps
+// to that instance (from both entity_map and entity_unstarted).
+// Consumes (deletes) m.
+void Rank::Namer::handle_failure(MNSFailure *m)
+{
+  dout(10) << "namer.handle_failure inst " << m->get_inst()
+           << endl;
+
+  // pass 1: collect the entities living on the failed instance
+  // (collected first so the map is not modified while iterating it)
+  list<msg_addr_t> doomed;
+  hash_map<msg_addr_t,entity_inst_t>::iterator ent = rank.entity_map.begin();
+  while (ent != rank.entity_map.end()) {
+    if (!(ent->second != m->get_inst()))
+      doomed.push_back(ent->first);
+    ent++;
+  }
+
+  // pass 2: remove them
+  list<msg_addr_t>::iterator victim = doomed.begin();
+  while (victim != doomed.end()) {
+    dout(10) << "namer.handle_failure inst " << m->get_inst()
+             << ", removing " << *victim << endl;
+
+    rank.entity_map.erase(*victim);
+    rank.entity_unstarted.erase(*victim);
+
+    /*
+    if ((*victim).is_osd()) {
+      // tell the monitor
+      messenger->send_message(new MFailure(*victim, m->get_inst()), MSG_ADDR_MON(0));
+    }
+    */
+    victim++;
+  }
+
+  delete m;
+}
+
+
+
+/********************************************
+ * Accepter
+ */
+
+// Create the listening socket on an ephemeral port, work out the address
+// other ranks should use to reach us (our hostname's first address plus the
+// bound port), store it in listen_addr, and start the accept thread.
+// Returns 0 on success, -1 if our own hostname cannot be resolved to an
+// address that fits a sockaddr_in.
+int Rank::Accepter::start()
+{
+  // bind to a socket
+  dout(10) << "accepter.start binding to listen " << endl;
+  
+  /* socket creation */
+  listen_sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(listen_sd > 0);
+  
+  /* bind to port */
+  memset((char*)&listen_addr, 0, sizeof(listen_addr));
+  listen_addr.sin_family = AF_INET;
+  listen_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  listen_addr.sin_port = 0;   // let the kernel pick a port
+  
+  int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr));
+  assert(rc >= 0);
+
+  // find out which port we were given
+  socklen_t llen = sizeof(listen_addr);
+  rc = getsockname(listen_sd, (sockaddr*)&listen_addr, &llen);
+  assert(rc >= 0);
+  
+  int myport = listen_addr.sin_port;   // already in network byte order
+
+  // listen!
+  rc = ::listen(listen_sd, 1000);
+  assert(rc >= 0);
+
+  //dout(10) << "accepter.start listening on " << myport << endl;
+  
+  // my address is...
+  char host[100];
+  bzero(host, 100);
+  // leave the last byte untouched so the name is always NUL-terminated,
+  // even if gethostname() truncates
+  gethostname(host, sizeof(host) - 1);
+  //dout(10) << "accepter.start my hostname is " << host << endl;
+
+  struct sockaddr_in my_addr;  
+  memset(&my_addr, 0, sizeof(my_addr));
+
+  // gethostbyname() returns NULL on failure; also refuse an address that
+  // would overflow sin_addr
+  struct hostent *myhostname = gethostbyname( host ); 
+  if (!myhostname ||
+      !myhostname->h_addr_list[0] ||
+      myhostname->h_length > (int)sizeof(my_addr.sin_addr.s_addr)) {
+    derr(0) << "accepter.start unable to resolve my hostname '" << host << "'" << endl;
+    ::close(listen_sd);
+    listen_sd = -1;
+    return -1;
+  }
+
+  my_addr.sin_family = myhostname->h_addrtype;
+  memcpy((char *) &my_addr.sin_addr.s_addr, 
+         myhostname->h_addr_list[0], 
+         myhostname->h_length);
+  my_addr.sin_port = myport;
+  
+  // advertise the resolved address rather than INADDR_ANY
+  listen_addr = my_addr;
+  
+  dout(10) << "accepter.start listen addr is " << listen_addr << endl;
+
+  // start thread
+  create();
+
+  return 0;
+}
+
+// Accept-thread body: accept incoming connections until 'done' is set or
+// accept() fails, creating one Receiver thread per connection.
+void *Rank::Accepter::entry()
+{
+  dout(10) << "accepter starting" << endl;
+
+  while (!done) {
+    // accept
+    struct sockaddr_in addr;
+    socklen_t slen = sizeof(addr);
+    int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen);
+    if (sd > 0) {
+      dout(10) << "accepted incoming on sd " << sd << endl;
+      
+      Receiver *r = new Receiver(sd);
+
+      // Start the thread and record it under rank.lock.  A receiver that
+      // exits immediately queues itself for reaping under the same lock, so
+      // it cannot be reaped (and deleted) before it is in rank.receivers.
+      rank.lock.Lock();
+      r->create();
+      rank.receivers.insert(r);
+      rank.lock.Unlock();
+    } else {
+      dout(10) << "no incoming connection?" << endl;
+      break;
+    }
+  }
+
+  return 0;
+}
+
+
+/**************************************
+ * Receiver
+ */
+
+// Receiver-thread body: read messages off the socket until the connection
+// closes (read_message() returns 0) or 'done' is set, route each one, and
+// finally queue this receiver for reaping.
+//
+// Routing, decided under rank.lock:
+//  - destination marked down: drop the message and (re)start a lookup
+//  - source instance older than the one we know: drop
+//  - destination is a rank address: rank.dispatch()
+//  - single-dispatch mode: hand to the single dispatch queue
+//  - local entity: queue on that entity (after releasing the lock)
+//  - otherwise: park in waiting_for_lookup
+void *Rank::Receiver::entry()
+{
+  while (!done) {
+    Message *m = read_message();
+    if (!m) {
+      ::close(sd);
+      break;
+    }
+    
+    dout(10) << "receiver.entry got message for " << m->get_dest() << endl;
+
+    EntityMessenger *entity = 0;
+
+    rank.lock.Lock();
+    {
+      if (rank.down.count(m->get_dest())) {
+        dout(0) << "receiver.entry dest " << m->get_dest() << " down, dropping " << *m << endl;
+
+        // copy the destination out before freeing the message; it is still
+        // needed for the lookup below
+        msg_addr_t dest = m->get_dest();
+        delete m;
+
+        if (rank.looking_up.count(dest) == 0) 
+          rank.lookup(dest);
+      }
+      else if (rank.entity_map.count(m->get_source()) &&
+               rank.entity_map[m->get_source()] > m->get_source_inst()) {
+        // sender is an older instance than the one in our directory
+        derr(0) << "receiver.entry source " << m->get_source() 
+                << " inst " << m->get_source_inst() 
+                << " < " << rank.entity_map[m->get_source()] 
+                << ", dropping " << *m << endl;
+        delete m;
+      }
+      else {
+        // sender is a newer instance than the one in our directory: adopt it.
+        // (this test used to repeat the "older" condition above and so could
+        // never be true)
+        if (rank.entity_map.count(m->get_source()) &&
+            m->get_source_inst() > rank.entity_map[m->get_source()]) {
+          derr(0) << "receiver.entry source " << m->get_source() 
+                  << " inst " << m->get_source_inst() 
+                  << " > " << rank.entity_map[m->get_source()] 
+                  << ", WATCH OUT " << *m << endl;
+          rank.entity_map[m->get_source()] = m->get_source_inst();
+        }
+
+        if (m->get_dest().type() == MSG_ADDR_RANK_BASE) {
+          // ours.
+          rank.dispatch(m);
+        } else {
+          if (g_conf.ms_single_dispatch) {
+            // submit to single dispatch queue
+            rank._submit_single_dispatch(m);
+          } else {
+            if (rank.local.count(m->get_dest())) {
+              // find entity
+              entity = rank.local[m->get_dest()];
+            } else {
+              derr(0) << "got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl;
+              rank.waiting_for_lookup[m->get_dest()].push_back(m);
+            }
+          }
+        }
+      }
+    }
+    rank.lock.Unlock();
+      
+    // deliver to the local entity outside rank.lock
+    if (entity) 
+      entity->queue_message(m);        // queue
+  }
+
+  // add to reap queue
+  rank.lock.Lock();
+  rank.receiver_reap_queue.push_back(this);
+  rank.wait_cond.Signal();
+  rank.lock.Unlock();
+  
+  return 0;
+}
+
+// Read one message from the socket: a fixed-size envelope followed by
+// env.nchunks payload chunks, each sent as an int length and then that many
+// bytes.  Returns the decoded message, or 0 if any read fails, the envelope
+// is a dummy (type 0), or a chunk length is invalid.
+Message *Rank::Receiver::read_message()
+{
+  // envelope
+  //dout(10) << "receiver.read_message from sd " << sd  << endl;
+  
+  msg_envelope_t env;
+  if (!tcp_read( sd, (char*)&env, sizeof(env) ))
+    return 0;
+  
+  if (env.type == 0) {
+    dout(10) << "receiver got dummy env, bailing" << endl;
+    return 0;
+  }
+
+  dout(20) << "receiver got envelope type=" << env.type 
+               << " src " << env.source << " dst " << env.dest
+               << " nchunks=" << env.nchunks
+               << endl;
+  
+  // payload
+  bufferlist blist;
+  for (int i=0; i<env.nchunks; i++) {
+    // chunk length; a failed read must not be used to size the buffer
+    int size = 0;
+    if (!tcp_read( sd, (char*)&size, sizeof(size) )) return 0;
+    if (size < 0) {
+      derr(0) << "receiver got bad frag len " << size << ", bailing" << endl;
+      return 0;
+    }
+    
+    bufferptr bp = new buffer(size);
+    
+    if (!tcp_read( sd, bp.c_str(), size )) return 0;
+    
+    blist.push_back(bp);
+    
+    dout(20) << "receiver got frag " << i << " of " << env.nchunks 
+                 << " len " << bp.length() << endl;
+  }
+  
+  // unmarshall message
+  // (length is captured first, in case decoding consumes the list)
+  size_t s = blist.length();
+  Message *m = decode_message(env, blist);
+  
+  // NOTE(review): decode_message() is assumed to normally return non-null;
+  // the guard only protects the debug output.
+  if (m) {
+    dout(20) << "receiver got " << s << " byte message from " 
+                 << m->get_source() << endl;
+  }
+  
+  return m;
+}
+
+
+/**************************************
+ * Sender
+ */
+
+// Create a TCP socket (stored in sd), bind it to any local port, and connect
+// it to inst.addr.  Returns 0 on success, or the negative result of
+// ::connect() on failure.  Socket creation and bind failures assert.
+int Rank::Sender::connect()
+{
+  dout(10) << "sender(" << inst << ").connect" << endl;
+
+  // create socket?
+  sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(sd > 0);
+  
+  // bind any port
+  // (zero the whole struct first so no uninitialized bytes reach bind())
+  struct sockaddr_in myAddr;
+  memset(&myAddr, 0, sizeof(myAddr));
+  myAddr.sin_family = AF_INET;
+  myAddr.sin_addr.s_addr = htonl(INADDR_ANY);
+  myAddr.sin_port = htons( 0 );    
+  
+  int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr));
+  assert(rc>=0);
+
+  // connect!
+  // NOTE(review): on failure the socket stays open in sd; whoever owns the
+  // Sender's lifetime is presumably responsible for closing it -- confirm.
+  int r = ::connect(sd, (sockaddr*)&inst.addr, sizeof(myAddr));
+  if (r < 0) return r;
+
+  // identify myself
+  // FIXME
+  
+  return 0;
+}
+
+
+// Called as the sender thread exits: put this sender on rank's reap queue
+// and signal wait_cond so it gets joined and deleted by Rank::reaper().
+void Rank::Sender::finish()
+{
+  dout(10) << "sender(" << inst << ").finish" << endl;
+
+  // make sure i get reaped.
+  rank.lock.Lock();
+  rank.sender_reap_queue.push_back(this);
+  rank.wait_cond.Signal();
+  rank.lock.Unlock();
+}
+
+// The connection to 'inst' failed.  Take back every unsent message ('out'
+// plus whatever is still queued), mark their destinations down if they still
+// map to this instance, and either resubmit them
+// (g_conf.ms_requeue_on_sender_fail) or report them as lost to a local
+// dispatcher via ms_handle_failure().  Finally deregister this sender from
+// rank_sender and stop its loop.  'out' is left empty.
+void Rank::Sender::fail_and_requeue(list<Message*>& out)
+{
+  dout(10) << "sender(" << inst << ").fail" << endl;// and requeue" << endl;
+
+  // tell namer
+  if (!rank.messenger) {
+    derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl;
+    assert(0);
+  }
+
+  // old and unnecessary?
+  if (0)
+    rank.messenger->send_message(new MNSFailure(inst),
+                                 MSG_ADDR_NAMER(0));
+
+
+  // FIXME: possible race before i reclaim lock here?
+  
+  Dispatcher *dis = 0;     // dispatcher of the first local sender found
+  msg_addr_t dis_dest;     // destination of that same message
+  
+  list<Message*> lost;
+
+  // requeue my messages
+  // (lock order: rank.lock, then this sender's lock)
+  rank.lock.Lock();
+  lock.Lock();
+  {
+    // include out at front of queue
+    q.splice(q.begin(), out);  
+    dout(10) << "sender(" << inst << ").fail " 
+             << q.size() << " messages" << endl;
+    
+    if (0) {
+      lost.swap(q);
+    } else {
+
+      while (!q.empty()) {
+        // don't keep reconnecting..
+        if (rank.entity_map.count(q.front()->get_dest()) &&
+            rank.entity_map[q.front()->get_dest()] == inst)
+          rank.down.insert(q.front()->get_dest());
+        //rank.entity_map.erase(q.front()->get_dest());
+        
+        if (!dis &&
+            rank.local.count(q.front()->get_source())) {
+          dis_dest = q.front()->get_dest();
+          dis = rank.local[q.front()->get_source()]->get_dispatcher();
+        }
+        
+        if (g_conf.ms_requeue_on_sender_fail)
+          rank.submit_message( q.front() );
+        else
+          lost.push_back( q.front() );
+        q.pop_front();
+      }
+    }
+
+    // deactivate myself
+    if (rank.rank_sender.count(inst.rank) &&
+        rank.rank_sender[inst.rank] == this)
+      rank.rank_sender.erase(inst.rank);
+
+    // stop sender loop
+    done = true;
+  }
+  lock.Unlock();
+
+
+  // send special failure msg?
+  // NOTE(review): every lost message is reported with dis_dest, the
+  // destination of the first message that had a local source.
+  if (dis) {
+    for (list<Message*>::iterator p = lost.begin();
+         p != lost.end();
+         p++)
+      dis->ms_handle_failure(*p, dis_dest, inst);
+  } else {
+    // nobody to hand the lost messages to; free them instead of leaking
+    for (list<Message*>::iterator p = lost.begin();
+         p != lost.end();
+         p++)
+      delete *p;
+  }
+  
+  rank.lock.Unlock();
+}
+
+// Sender-thread body.  Connects if no socket is set yet (sd == 0), then
+// loops: take the whole outgoing queue, send each message with the sender
+// lock released, and sleep on 'cond' when the queue is empty.  The loop ends
+// once 'done' is set and the queue is drained.  On a connect or write
+// failure the unsent messages are handed to fail_and_requeue().
+void *Rank::Sender::entry()
+{
+  // connect
+  if (sd == 0) {
+	int rc = connect();
+	if (rc < 0) {
+	  // nothing in flight yet; fail_and_requeue still drains q
+	  list<Message*> out;
+	  derr(0) << "error connecting to " << inst << endl;
+	  fail_and_requeue(out);
+	  finish();
+	  return 0;
+	}
+  }
+
+  lock.Lock();
+  while (!q.empty() || !done) {
+    
+    if (!q.empty()) {
+      dout(20) << "sender(" << inst << ") grabbing message(s)" << endl;
+      
+      // grab outgoing list
+      list<Message*> out;
+      out.swap(q);
+      
+      // drop lock while i send these
+      lock.Unlock();
+      
+      while (!out.empty()) {
+        Message *m = out.front();
+        out.pop_front();
+
+        dout(20) << "sender(" << inst << ") sending " << *m << endl;
+
+        // stamp.
+        m->set_source_inst(rank.my_inst);
+        
+        // marshall
+        if (m->empty_payload())
+          m->encode_payload();
+        
+        if (write_message(m) < 0) {
+          // failed!
+          // put m back at the front so it is requeued with the rest;
+          // fail_and_requeue() sets 'done'
+          derr(0) << "error sending to " << m->get_dest() << " on " << inst << endl;
+          out.push_front(m);
+          fail_and_requeue(out);
+          break;
+        }
+      }
+
+      lock.Lock();
+      continue;
+    }
+    
+    // wait
+    dout(20) << "sender(" << inst << ") sleeping" << endl;
+    cond.Wait(lock);
+  }
+  lock.Unlock(); 
+  
+  finish();
+  return 0;
+}
+
+
+// Write one message to the socket: the envelope, then the payload.
+// With TCP_KEEP_CHUNKS each payload buffer is sent as its own length-prefixed
+// chunk; otherwise the payload is sent as a single chunk (one total length,
+// then every buffer back to back).
+// Returns 0 on success, in which case m has been deleted; returns -1 on any
+// write error, in which case m is NOT deleted.
+// NOTE(review): the payload is claimed out of m before writing, so on failure
+// m's payload has presumably been emptied -- confirm claim() semantics.
+int Rank::Sender::write_message(Message *m)
+{
+  // get envelope, buffers
+  msg_envelope_t *env = &m->get_envelope();
+  bufferlist blist;
+  blist.claim( m->get_payload() );
+  
+#ifdef TCP_KEEP_CHUNKS
+  env->nchunks = blist.buffers().size();
+#else
+  env->nchunks = 1;
+#endif
+
+  dout(20)// << g_clock.now() 
+            << " sending " << m << " " << *m 
+            << " to " << m->get_dest()
+            << endl;
+  
+  // send envelope
+  int r = tcp_write( sd, (char*)env, sizeof(*env) );
+  if (r < 0) { 
+    derr(20) << "error sending envelope for " << *m
+             << " to " << m->get_dest() << endl; 
+    return -1;
+  }
+
+  // payload
+#ifdef TCP_KEEP_CHUNKS
+  // send chunk-wise
+  int i = 0;
+  for (list<bufferptr>::iterator it = blist.buffers().begin();
+       it != blist.buffers().end();
+       it++) {
+    dout(10) << "tcp_sending frag " << i << " len " << (*it).length() << endl;
+    int size = (*it).length();
+    r = tcp_write( sd, (char*)&size, sizeof(size) );
+    if (r < 0) { 
+      derr(20) << "error sending chunk len for " << *m << " to " << m->get_dest() << endl; 
+      return -1;
+    }
+    r = tcp_write( sd, (*it).c_str(), size );
+    if (r < 0) { 
+      derr(20) << "error sending data chunk for " << *m << " to " << m->get_dest() << endl; 
+      return -1;
+    }
+    i++;
+  }
+#else
+  // one big chunk
+  // (a single length prefix covering all buffers, matching nchunks = 1)
+  int size = blist.length();
+  r = tcp_write( sd, (char*)&size, sizeof(size) );
+  if (r < 0) { 
+    derr(20) << "error sending data len for " << *m << " to " << m->get_dest() << endl; 
+    return -1;
+  }
+  for (list<bufferptr>::iterator it = blist.buffers().begin();
+       it != blist.buffers().end();
+       it++) {
+    r = tcp_write( sd, (*it).c_str(), (*it).length() );
+    if (r < 0) { 
+      derr(20) << "error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; 
+      return -1;
+    }
+  }
+#endif
+  
+  // delete message
+  delete m;
+  return 0;
+}
+
+
+
+/********************************************
+ * Rank
+ */
+
+// Construct a rank with the given rank number; no namer is attached yet
+// (see start_namer()).  A negative my_rank makes start_rank() ask the namer
+// for a number.
+Rank::Rank(int r) : 
+  single_dispatcher(this),
+  my_rank(r),
+  namer(0) {
+}
+// Frees the namer if one was created by start_namer(); nothing else is
+// cleaned up here (see FIXME).
+Rank::~Rank()
+{
+  //FIXME
+  if (namer) delete namer;
+}
+
+
+// Queue m for the single dispatcher thread if its destination is a local
+// entity that already has a dispatcher; otherwise park it in
+// waiting_for_ready.  Caller must hold 'lock'.
+void Rank::_submit_single_dispatch(Message *m)
+{
+  assert(lock.is_locked());
+
+  if (local.count(m->get_dest()) &&
+      local[m->get_dest()]->is_ready()) {
+    rank.single_dispatch_queue.push_back(m);
+    rank.single_dispatch_cond.Signal();
+  } else {
+    waiting_for_ready[m->get_dest()].push_back(m);
+  }
+}
+
+
+// Body of the single dispatcher thread.  Repeatedly takes the whole dispatch
+// queue under 'lock' and delivers the messages with the lock released:
+// rank-addressed messages go to rank.dispatch(), everything else to the
+// local entity's dispatch().  Sleeps on single_dispatch_cond when idle and
+// exits once single_dispatch_stop is set and the queue is empty.
+void Rank::single_dispatcher_entry()
+{
+  lock.Lock();
+  while (!single_dispatch_stop || !single_dispatch_queue.empty()) {
+    if (!single_dispatch_queue.empty()) {
+      list<Message*> ls;
+      ls.swap(single_dispatch_queue);
+
+      lock.Unlock();
+      {
+        while (!ls.empty()) {
+          Message *m = ls.front();
+          ls.pop_front();
+          
+          dout(1) //<< g_clock.now() 
+                  << "---- " 
+                  << m->get_source() << ':' << m->get_source_port() 
+                  << " to " << m->get_dest() << ':' << m->get_dest_port()
+                  << " ---- " << m->get_type_name() 
+                  << " ---- " << m 
+                  << endl;
+          
+          if (m->get_dest().type() == MSG_ADDR_RANK_BASE)
+            rank.dispatch(m);
+          else {
+            // NOTE(review): 'local' is read here without 'lock' held.
+            assert(local.count(m->get_dest()));
+            local[m->get_dest()]->dispatch(m);
+          }
+        }
+      }
+      lock.Lock();
+      continue;
+    }
+    single_dispatch_cond.Wait(lock);
+  }
+  lock.Unlock();
+}
+
+
+/*
+ * note: assumes lock is held
+ */
+// Join and delete every receiver and sender thread that has queued itself
+// for reaping.  A reaped sender is also removed from rank_sender if it is
+// still the registered sender for its rank.  Caller must hold 'lock'.
+void Rank::reaper()
+{
+  assert(lock.is_locked());
+
+  // finished receivers
+  while (!receiver_reap_queue.empty()) {
+    Receiver *dead_rx = receiver_reap_queue.front();
+    receiver_reap_queue.pop_front();
+    //dout(10) << "reaper reaping receiver sd " << dead_rx->sd << endl;
+    receivers.erase(dead_rx);
+    dead_rx->join();
+    dout(10) << "reaper reaped receiver sd " << dead_rx->sd << endl;
+    delete dead_rx;
+  }
+
+  // finished senders
+  while (!sender_reap_queue.empty()) {
+    Sender *dead_tx = sender_reap_queue.front();
+    sender_reap_queue.pop_front();
+    //dout(10) << "reaper reaping sender rank " << dead_tx->dest_rank << " at " << dead_tx->tcpaddr << endl;
+
+    // only deregister if it has not already been replaced by a newer sender
+    if (rank_sender.count(dead_tx->inst.rank)) {
+      if (rank_sender[dead_tx->inst.rank] == dead_tx)
+        rank_sender.erase(dead_tx->inst.rank);
+    }
+    dead_tx->join();
+    dout(10) << "reaper reaped sender " << dead_tx->inst << endl;
+    delete dead_tx;
+  }
+}
+
+
+// Bring this rank up: start the accepter, optionally start the single
+// dispatcher thread, then create the rank's own EntityMessenger.
+// If my_rank is negative, a rank number is first requested from namer0
+// (whose address must already be in entity_map, see set_namer()) and this
+// call blocks until my_rank has been assigned.
+// Returns 0 on success, -1 if the accepter could not be started.
+int Rank::start_rank()
+{
+  dout(10) << "start_rank" << endl;
+
+  // bind to a socket
+  if (accepter.start() < 0) 
+    return -1;
+
+  // start single thread dispatcher?
+  if (g_conf.ms_single_dispatch) {
+    single_dispatch_stop = false;
+    single_dispatcher.create();
+  }    
+
+  lock.Lock();
+
+  if (my_rank < 0) {
+    dout(10) << "start_rank connecting to namer0" << endl;
+    
+    // connect to namer
+    assert(entity_map.count(MSG_ADDR_NAMER(0)));
+    Sender *sender = connect_rank(entity_map[MSG_ADDR_NAMER(0)]);
+    
+    // send
+    Message *m = new MNSConnect(accepter.listen_addr);
+    m->set_dest(MSG_ADDR_NAMER(0), 0);
+    sender->send(m);
+    
+    // wait
+    // (my_rank is presumably set by the connect-ack handler, which is not
+    //  shown here)
+    while (my_rank < 0) 
+      waiting_for_rank.Wait(lock);
+    assert(my_rank >= 0);    
+    
+    dout(10) << "start_rank got rank " << my_rank << endl;
+    
+    // create rank entity
+    entity_map[MSG_ADDR_RANK(my_rank)] = my_inst;
+    local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank));
+    messenger->set_dispatcher(this);
+  } else {
+    // my_inst
+    my_inst.addr = accepter.listen_addr;
+    my_inst.rank = my_rank;
+
+    // create my rank
+    msg_addr_t raddr = MSG_ADDR_RANK(my_rank);
+    entity_map[raddr] = my_inst;
+    entity_unstarted.insert(raddr);
+    local[raddr] = messenger = new EntityMessenger(raddr);
+    messenger->set_dispatcher(this);
+    
+    dout(1) << "start_rank " << my_rank << " at " << my_inst << endl;
+  } 
+
+  lock.Unlock();
+  return 0;
+}
+
+// Create the namer on this rank: register namer0 in the directory as living
+// on my_inst, give it a local EntityMessenger, and construct the Namer on it.
+// NOTE(review): takes no lock; presumably called during single-threaded
+// startup -- confirm.
+void Rank::start_namer()
+{
+  // create namer0
+  msg_addr_t naddr = MSG_ADDR_NAMER(0);
+  entity_map[naddr] = my_inst;
+  local[naddr] = new EntityMessenger(naddr);
+  namer = new Namer(local[naddr]);
+}
+
+// Record where namer0 lives: the given tcp address, on rank 0.
+void Rank::set_namer(const tcpaddr_t& ns)
+{
+  entity_inst_t &namer_inst = entity_map[MSG_ADDR_NAMER(0)];
+  namer_inst.addr = ns;
+  namer_inst.rank = 0;
+}
+
+/* connect_rank
+ * NOTE: assumes rank.lock held.
+ */
+// Create a Sender for the given remote instance, start its thread and
+// register it in rank_sender under inst.rank.  The sender thread performs
+// the actual connect.  Asserts that the target is not ourselves and that no
+// sender is registered for that rank yet.  Caller must hold rank.lock.
+Rank::Sender *Rank::connect_rank(const entity_inst_t& inst)
+{
+  assert(rank.lock.is_locked());
+  assert(inst != rank.my_inst);
+  
+  dout(10) << "connect_rank to " << inst << endl;
+  
+  // create sender
+  Sender *sender = new Sender(inst);
+  //int rc = sender->connect();
+  //assert(rc >= 0);
+
+  // start thread.
+  sender->create();
+
+  // old sender?
+  assert(rank.rank_sender.count(inst.rank) == 0);
+  //if (rank.rank_sender.count(r))
+  //rank.rank_sender[r]->stop();  
+
+  // ok!
+  rank.rank_sender[inst.rank] = sender;
+  return sender;
+}
+
+
+
+
+
+// Debug helper: log every directory entry (entity -> instance) at level 10,
+// tagging the entities that are local to this rank.
+void Rank::show_dir()
+{
+  dout(10) << "show_dir ---" << endl;
+
+  hash_map<msg_addr_t, entity_inst_t>::iterator ent = entity_map.begin();
+  while (ent != entity_map.end()) {
+    dout(10) << "show_dir entity_map " << ent->first << " -> " << ent->second
+             << (local.count(ent->first) ? " local " : "") << endl;
+    ent++;
+  }
+}
+
+
+/* lookup
+ * Ask the directory (MSG_ADDR_DIRECTORY) where 'addr' lives by sending an
+ * MNSLookup, and remember that the lookup is in flight.
+ * NOTE: assumes 'lock' is held, and that no lookup for addr is already
+ * pending (both are asserted).
+ */
+void Rank::lookup(msg_addr_t addr)
+{
+  dout(10) << "lookup " << addr << endl;
+  assert(lock.is_locked());
+
+  assert(looking_up.count(addr) == 0);
+  looking_up.insert(addr);
+
+  MNSLookup *r = new MNSLookup(addr);
+  messenger->send_message(r, MSG_ADDR_DIRECTORY);
+}
+
+
+
+/* register_entity 
+ */
+// Register a new entity with the namer and create its local messenger.
+// Sends an MNSRegister for 'addr' and blocks until a result for this attempt
+// appears in waiting_for_register_result; the address returned there (which
+// may differ from the one requested) is the one used.  Messages parked in
+// waiting_for_lookup for that address are then resubmitted.
+// Returns the new EntityMessenger.
+Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr)
+{
+  dout(10) << "register_entity " << addr << endl;
+  lock.Lock();
+  
+  // register with namer
+  // (id ties the reply back to this attempt; incremented under 'lock')
+  static long reg_attempt = 0;
+  long id = ++reg_attempt;
+  
+  Message *reg = new MNSRegister(addr, my_rank, id);
+  reg->set_source(MSG_ADDR_RANK(my_rank), 0);
+  reg->set_source_inst(my_inst);
+  reg->set_dest(MSG_ADDR_DIRECTORY, 0);
+  
+  // prepare cond
+  // (stack-allocated; its map entry is erased again before returning)
+  Cond cond;
+  waiting_for_register_cond[id] = &cond;
+  
+  // send request
+  lock.Unlock();
+  submit_message(reg);
+  lock.Lock();
+  
+  // wait
+  while (!waiting_for_register_result.count(id))
+    cond.Wait(lock);
+  
+  // grab result
+  addr = waiting_for_register_result[id];
+  dout(10) << "register_entity got " << addr << endl;
+  
+  // clean up
+  waiting_for_register_cond.erase(id);
+  waiting_for_register_result.erase(id);
+  
+  // create messenger
+  EntityMessenger *msgr = new EntityMessenger(addr);
+
+  // add to directory
+  entity_map[addr] = my_inst;
+  local[addr] = msgr;
+
+  // was anyone waiting?
+  if (waiting_for_lookup.count(addr)) {
+    submit_messages(waiting_for_lookup[addr]);
+    waiting_for_lookup.erase(addr);
+  }
+
+  lock.Unlock();
+  return msgr;
+}
+
+// Remove a local entity: drop it from 'local' (and from entity_map when
+// my_rank > 0), tell namer0 about it unless the entity is namer0 or rank0
+// itself, and signal wait_cond once at most two local entities remain.
+void Rank::unregister_entity(EntityMessenger *msgr)
+{
+  lock.Lock();
+  dout(10) << "unregister_entity " << msgr->get_myaddr() << endl;
+  
+  // remove from local directory.
+  assert(local.count(msgr->get_myaddr()));
+  local.erase(msgr->get_myaddr());
+
+  if (my_rank > 0) {
+    assert(entity_map.count(msgr->get_myaddr()));
+    entity_map.erase(msgr->get_myaddr());
+  } // else namer will do it.
+
+  // tell namer.
+  if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) &&
+      msgr->get_myaddr() != MSG_ADDR_RANK(0))
+    msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER),
+                       MSG_ADDR_NAMER(0));
+  
+  // kick wait()?
+  if (local.size() <= 2)
+    wait_cond.Signal();   
+
+  lock.Unlock();
+}
+
+
+// Submit every message in 'ls' via submit_message(), then empty the list.
+// NOTE(review): 'ls' is iterated in place; this assumes submit_message()
+// does not append to the same list while it is being walked -- confirm.
+void Rank::submit_messages(list<Message*>& ls)
+{
+  for (list<Message*>::iterator i = ls.begin(); i != ls.end(); i++)
+    submit_message(*i);
+  ls.clear();
+}
+
+
+// Warm up the path to 'dest' without sending anything: if the destination's
+// instance is known, remote, and has no matching sender yet, start a
+// connection; if the destination is unknown and no lookup is in flight,
+// start one.  Nothing is done for a local destination or an existing sender.
+void Rank::prepare_dest(msg_addr_t dest)
+{
+  lock.Lock();
+
+  if (entity_map.count( dest )) {
+    // remote, known rank addr.
+    entity_inst_t inst = entity_map[dest];
+    
+    if (inst == my_inst) {
+      // local: nothing to prepare
+      //dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl;
+      //waiting_for_lookup[dest].push_back(m);
+    }
+    else if (rank_sender.count( inst.rank ) &&
+             rank_sender[inst.rank]->inst == inst) {
+      //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl;
+      // connected.
+      //sender = rank_sender[ inst.rank ];
+    } else {
+      //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl;
+      // not connected.
+      // NOTE(review): connect_rank() asserts that no sender exists for this
+      // rank; a sender for the same rank but a different instance would trip
+      // that assert.
+      connect_rank( inst );
+    }
+  } else {
+    // unknown dest rank or rank addr.
+    if (looking_up.count(dest) == 0) {
+      //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl;
+      lookup(dest);
+    } else {
+      //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl;
+    }
+  }
+
+  lock.Unlock();
+}
+
+/*
+ * Route a message whose destination instance is already known to the caller.
+ *
+ * The target (a local EntityMessenger or a Sender for the remote rank) is
+ * chosen under the rank lock; the actual queue/send happens after the lock
+ * has been released.  With g_conf.ms_single_dispatch set, local messages go
+ * through _submit_single_dispatch() instead.
+ */
+void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)
+{
+  const msg_addr_t dest = m->get_dest();
+
+  // at most one of these ends up set.
+  EntityMessenger *local_target = 0;
+  Sender *remote_target = 0;
+
+  lock.Lock();
+  if (dest_inst.rank != my_inst.rank) {
+    // remote.
+    if (rank_sender.count( dest_inst.rank )) {
+      // connected.  (the sender's instance is not compared against dest_inst.)
+      dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connected." << endl;
+      remote_target = rank_sender[ dest_inst.rank ];
+    } else {
+      // not connected.
+      dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl;
+      remote_target = connect_rank( dest_inst );
+    }
+  } else if (!local.count(dest)) {
+    // on this rank but not in the local directory: mid-register.
+    // this path is treated as a bug (assert); the push only runs if asserts are compiled out.
+    dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl;
+    assert(0);
+    waiting_for_lookup[dest].push_back(m);
+  } else {
+    // local
+    dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+    if (g_conf.ms_single_dispatch) {
+      _submit_single_dispatch(m);
+    } else {
+      local_target = local[dest];
+    }
+  }
+  lock.Unlock();
+
+  // do it, outside the rank lock.
+  if (local_target) {
+    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+    local_target->queue_message(m);
+  } else if (remote_target) {
+    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+    remote_target->send(m);
+  }
+}
+
+
+/*
+ * Route a message using only the destination address in its envelope.
+ *
+ * Under the rank lock the destination is classified as one of:
+ *  - marked down: the message is deleted (and a lookup is started if none
+ *    is in flight);
+ *  - local: handed to the single-dispatch path, or to the entity's queue;
+ *  - in entity_map: sent via an existing or newly connected Sender, or
+ *    parked on waiting_for_lookup if the instance is our own;
+ *  - unknown: parked on waiting_for_lookup, starting a lookup if needed.
+ * The queue_message()/send() call itself is made after the lock is dropped.
+ */
+void Rank::submit_message(Message *m)
+{
+  const msg_addr_t dest = m->get_dest();
+
+  // lookup
+  // at most one of these is set below; both stay null when the message was
+  // dropped, parked, or handed to the single dispatcher.
+  EntityMessenger *entity = 0;
+  Sender *sender = 0;
+
+  lock.Lock();
+  {
+    if (down.count(dest)) {
+      // black hole.
+      dout(0) << "submit_message " << *m << " dest " << dest << " down, dropping" << endl;
+      // m is gone after this; entity and sender remain null so it is not touched again.
+      delete m;
+
+      if (looking_up.count(dest) == 0) 
+        lookup(dest);
+
+    } else if (local.count(dest)) {
+      dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+
+      // local
+      if (g_conf.ms_single_dispatch) {
+        _submit_single_dispatch(m);
+      } else {
+        entity = local[dest];
+      }
+    } else if (entity_map.count( dest )) {
+      // remote, known rank addr.
+      entity_inst_t inst = entity_map[dest];
+
+      if (inst == my_inst) {
+        // mapped to this rank but not (yet) in the local directory.
+        dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl;
+        waiting_for_lookup[dest].push_back(m);
+      }
+      else if (rank_sender.count( inst.rank ) &&
+          rank_sender[inst.rank]->inst == inst) {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl;
+        // connected.
+        sender = rank_sender[ inst.rank ];
+      } else {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl;
+        // not connected.
+        sender = connect_rank( inst );
+      }
+    } else {
+      // unknown dest rank or rank addr.
+      if (looking_up.count(dest) == 0) {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl;
+        lookup(dest);
+      } else {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl;
+      }
+      // either way the message waits for the lookup result.
+      waiting_for_lookup[dest].push_back(m);
+    }
+  }
+  lock.Unlock();
+  
+  // do it
+  if (entity) {  
+    // local!
+    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+    entity->queue_message(m);
+  } 
+  else if (sender) {
+    // remote!
+    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+    sender->send(m);
+  } 
+}
+
+
+
+
+/*
+ * Dispatcher entry point for messages addressed to the rank itself.
+ *
+ * Holds the rank lock while the matching handler runs.  Only the three
+ * name-service replies are expected; any other type trips an assert.
+ */
+void Rank::dispatch(Message *m) 
+{
+  lock.Lock();
+
+  dout(10) << "dispatching " << *m << endl;
+
+  const int type = m->get_type();
+  if (type == MSG_NS_CONNECTACK) {
+    handle_connect_ack((MNSConnectAck*)m);
+  } else if (type == MSG_NS_REGISTERACK) {
+    handle_register_ack((MNSRegisterAck*)m);
+  } else if (type == MSG_NS_LOOKUPREPLY) {
+    handle_lookup_reply((MNSLookupReply*)m);
+  } else {
+    assert(0);
+  }
+
+  lock.Unlock();
+}
+
+/*
+ * Handle the connect ack: adopt the rank carried in the message, fill in
+ * my_inst (listen address + rank), wake everyone waiting on
+ * waiting_for_rank, and free the message.
+ */
+void Rank::handle_connect_ack(MNSConnectAck *m) 
+{
+  // logged before my_rank changes.
+  dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl;
+
+  my_rank = m->get_rank();
+  my_inst.rank = my_rank;
+  my_inst.addr = accepter.listen_addr;
+
+  // done with the ack; let the waiters go.
+  delete m;
+  waiting_for_rank.SignalAll();
+
+  // logger setup, currently disabled:
+  /*dout(10) << "logger" << endl;
+  char names[100];
+  sprintf(names, "rank%d", my_rank);
+  string name = names;
+  
+  if (g_conf.tcp_log) {
+    logger = new Logger(name, (LogType*)&rank_logtype);
+    rank_logtype.add_set("num");
+    rank_logtype.add_inc("in");
+    rank_logtype.add_inc("inb");
+    rank_logtype.add_inc("dis");
+    rank_logtype.add_set("inq");
+    rank_logtype.add_set("inqb");
+    rank_logtype.add_set("outq");
+    rank_logtype.add_set("outqb");
+  }
+  */
+}
+
+
+/*
+ * Handle a register ack: record the entity address it carries under the
+ * ack's tid and signal the condition stored for that tid.
+ *
+ * NOTE(review): assumes a Cond was stored in waiting_for_register_cond[tid]
+ * before the ack arrives; a missing entry would be dereferenced as null.
+ */
+void Rank::handle_register_ack(MNSRegisterAck *m) 
+{
+  dout(10) << "handle_register_ack " << m->get_entity() << endl;
+
+  const long tid = m->get_tid();
+
+  // publish the result first, then pick up whoever is waiting on it.
+  waiting_for_register_result[tid] = m->get_entity();
+  Cond *waiter = waiting_for_register_cond[tid];
+
+  delete m;
+  waiter->Signal();
+}
+
+/*
+ * Handle a lookup reply carrying one or more (address -> instance) results.
+ *
+ * For each result: skip it if the address is marked down or if entity_map
+ * already holds an instance comparing greater; otherwise record it and
+ * flush any messages parked in waiting_for_lookup for that address, either
+ * to the local entity or to a Sender for the remote rank.  The reply is
+ * deleted at the end.  Called from dispatch(), which holds the rank lock.
+ */
+void Rank::handle_lookup_reply(MNSLookupReply *m) 
+{
+  list<Message*> waiting;
+  dout(10) << "got lookup reply" << endl;
+  
+  for (map<msg_addr_t,entity_inst_t>::iterator it = m->entity_map.begin();
+       it != m->entity_map.end();
+       it++) {
+    dout(10) << "lookup got " << it->first << " at " << it->second << endl;
+    msg_addr_t addr = it->first;
+    entity_inst_t inst = it->second;
+
+    if (down.count(addr)) {
+      // ignore
+      dout(10) << "ignoring lookup results for " << addr << ", who is down" << endl;
+      //assert(entity_map.count(addr) == 0);
+      continue;
+    }
+
+    // keep what we have if it compares greater than the reported instance.
+    if (entity_map.count(addr) &&
+        entity_map[addr] > inst) {
+      dout(10) << "ignoring lookup results for " << addr << ", " \
+               << entity_map[addr] << " > " << inst << endl;
+      continue;
+    }
+
+    // update map.
+    entity_map[addr] = inst;
+
+    if (inst.rank == my_rank) {
+      // local
+      dout(10) << "delivering lookup results locally" << endl;
+      if (local.count(addr)) {
+        if (g_conf.ms_single_dispatch) {
+          // NOTE(review): single_dispatch_cond is not signalled here, unlike
+          // in EntityMessenger::ready() -- confirm the dispatcher still wakes up.
+          single_dispatch_queue.splice(single_dispatch_queue.end(),
+                                       waiting_for_lookup[addr]);
+        } else {
+          // queue_messages() takes the list by value, so the map entry is
+          // copied here and then discarded by the erase below.
+          local[addr]->queue_messages(waiting_for_lookup[addr]);
+        }
+        waiting_for_lookup.erase(addr);
+      } else
+        lookup(addr);  // try again!
+
+    } else {
+      // remote
+      if (rank_sender.count(inst.rank) == 0) 
+        connect_rank(inst);
+      else if (rank_sender[inst.rank]->inst != inst) {
+        dout(0) << "lookup got rank addr change, WATCH OUT" << endl;
+        // FIXME BUG possible message loss weirdness?
+        // stop the sender for the old instance and connect to the new one.
+        rank_sender[inst.rank]->stop();
+        rank_sender.erase(inst.rank);
+        connect_rank(inst);
+      }
+      
+      // take waiters
+      // presumably connect_rank() has stored the new sender in rank_sender -- the assert checks that.
+      Sender *sender = rank_sender[inst.rank];
+      assert(sender);
+      
+      if (waiting_for_lookup.count(addr)) {
+        sender->send(waiting_for_lookup[addr]);
+        waiting_for_lookup.erase(addr);
+      }
+    }
+  }
+
+  delete m;
+}
+
+
+/*
+ * Block until every local entity has gone away, then tear the rank down.
+ *
+ * Phase 1 loops on wait_cond: when only one local entity is left and the
+ * rank's own messenger has not stopped, that messenger is shut down and
+ * deleted; the loop ends once `local` is empty.  Phase 2 stops the single
+ * dispatch thread (if configured), stops all senders and waits until
+ * rank_sender is empty.  Receivers are deliberately left alone.
+ */
+void Rank::wait()
+{
+  lock.Lock();
+  while (1) {
+    // reap dead senders, receivers.
+    reaper();
+
+    if (local.size() == 0) {
+      dout(10) << "wait: everything stopped" << endl;
+      break;   // everything stopped.
+    }
+
+    if (local.size() == 1 &&
+        !messenger->is_stopped()) {
+      dout(10) << "wait: stopping rank" << endl;
+      // shutdown() calls back into unregister_entity(), which takes the rank
+      // lock itself, so release it around the call.
+      lock.Unlock();
+      messenger->shutdown();
+      // NOTE(review): the pointer is not cleared after the delete; this relies
+      // on `local` being empty on the next pass so is_stopped() is not called again.
+      delete messenger;
+      lock.Lock();
+      continue;
+    }
+
+    wait_cond.Wait(lock);
+  }
+  lock.Unlock();
+
+  // done!  clean up.
+
+  // stop dispatch thread
+  if (g_conf.ms_single_dispatch) {
+    dout(10) << "wait: stopping dispatch thread" << endl;
+    lock.Lock();
+    single_dispatch_stop = true;
+    single_dispatch_cond.Signal();
+    lock.Unlock();
+    // join without holding the rank lock.
+    single_dispatcher.join();
+  }
+
+  // reap senders and receivers
+  lock.Lock();
+  {
+    dout(10) << "wait: stopping senders" << endl;
+    for (hash_map<int,Sender*>::iterator i = rank_sender.begin();
+         i != rank_sender.end();
+         i++)
+      i->second->stop();
+    // presumably reaper() is what removes finished senders from rank_sender -- TODO confirm.
+    while (!rank_sender.empty()) {
+      wait_cond.Wait(lock);
+      reaper();
+    }
+
+    if (0) {  // stop() no worky on receivers!  we leak, but who cares.
+      dout(10) << "wait: stopping receivers" << endl;
+      for (set<Receiver*>::iterator i = receivers.begin();
+           i != receivers.end();
+           i++) 
+        (*i)->stop();
+      while (!receivers.empty()) {
+        wait_cond.Wait(lock);
+        reaper();
+      }      
+    }
+
+  }
+  lock.Unlock();
+
+  dout(10) << "wait: done." << endl;
+}
+
+
+
+/*
+ * Locate the name server address.
+ *
+ * Tries, in order:
+ *  1. the file ".ceph_ns" in the current directory, read as a raw tcpaddr_t;
+ *  2. the CEPH_NAMESERVER environment variable, resolved via tcp_hostlookup().
+ *
+ * nsa is filled in on success.  Returns 0 on success, -1 if neither source
+ * yields an address.
+ */
+int Rank::find_ns_addr(tcpaddr_t &nsa)
+{
+  // file?
+  // 0 is a valid descriptor; only a negative return means open() failed.
+  int fd = ::open(".ceph_ns",O_RDONLY);
+  if (fd >= 0) {
+    // only trust the file if it holds a complete address record.
+    int got = ::read(fd, (void*)&nsa, sizeof(nsa));
+    ::close(fd);
+    if (got == (int)sizeof(nsa)) {
+      cout << "ceph ns is " << nsa << endl;
+      return 0;
+    }
+    cerr << "short or failed read of .ceph_ns, ignoring it" << endl;
+  }
+
+  // env var?
+  char *nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER");    
+  if (nsaddr) {
+    // getenv() returns only the value, so there may be no '=' at all.  still
+    // skip past the first '=' if one is present (as before), but never scan
+    // beyond the terminating NUL.
+    for (char *p = nsaddr; *p; p++) {
+      if (*p == '=') {
+        nsaddr = p + 1;
+        break;
+      }
+    }
+
+    if (tcp_hostlookup(nsaddr, nsa) < 0) {
+      cout << "can't resolve " << nsaddr << endl;
+      return -1;
+    }
+
+    cout << "ceph ns is " << nsa << endl;
+    return 0;
+  }
+
+  cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl;
+  return -1;
+}
+
+
+
+/**********************************
+ * EntityMessenger
+ */
+
+// Construct a messenger for the entity at myaddr.  The dispatch thread
+// object is bound to this messenger here but is not started; ready()
+// starts it.
+Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) :
+  Messenger(myaddr),
+  stop(false),
+  dispatch_thread(this)
+{
+}
+// Nothing to release here; stopping the dispatch thread is done by shutdown().
+Rank::EntityMessenger::~EntityMessenger()
+{
+}
+
+/*
+ * Body of the per-entity dispatch thread.
+ *
+ * Until `stop` is set: sleep on `cond` while the queue is empty; otherwise
+ * take the whole queue in one swap, drop the lock, and dispatch each message
+ * in order.  `stop` is re-checked only between batches.
+ */
+void Rank::EntityMessenger::dispatch_entry()
+{
+  lock.Lock();
+  while (!stop) {
+    if (dispatch_queue.empty()) {
+      // nothing to do; sleep until queue_message()/shutdown() signals.
+      cond.Wait(lock);
+      continue;
+    }
+
+    // grab everything queued so far.
+    list<Message*> batch;
+    batch.swap(dispatch_queue);
+
+    // deliver without holding the queue lock.
+    lock.Unlock();
+    for ( ; !batch.empty(); ) {
+      Message *m = batch.front();
+      batch.pop_front();
+      dout(1) //<< g_clock.now()
+              << "---- " 
+              << m->get_source() << ':' << m->get_source_port() 
+              << " to " << m->get_dest() << ':' << m->get_dest_port()
+              << " ---- " << m->get_type_name() 
+              << " ---- " << m->get_source_inst()
+              << " ---- " << m 
+              << endl;
+      dispatch(m);
+    }
+    lock.Lock();
+  }
+  lock.Unlock();
+}
+
+/*
+ * Mark this entity ready to receive messages.
+ *
+ * In single-dispatch mode, any messages parked for this address in
+ * rank.waiting_for_ready are moved onto the shared dispatch queue and the
+ * dispatcher is signalled; otherwise this entity's own dispatch thread is
+ * started.  Finally the namer is sent MSG_NS_STARTED (unless we are namer0).
+ */
+void Rank::EntityMessenger::ready()
+{
+  dout(10) << "ready " << get_myaddr() << endl;
+
+  if (!g_conf.ms_single_dispatch) {
+    // start my dispatch thread
+    dispatch_thread.create();
+  } else {
+    rank.lock.Lock();
+    map<msg_addr_t, list<Message*> >::iterator parked =
+      rank.waiting_for_ready.find(get_myaddr());
+    if (parked != rank.waiting_for_ready.end()) {
+      // release whatever queued up while we were not ready.
+      rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(),
+                                        parked->second);
+      rank.waiting_for_ready.erase(parked);
+      rank.single_dispatch_cond.Signal();
+    }
+    rank.lock.Unlock();
+  }
+
+  // tell namer
+  if (get_myaddr() != MSG_ADDR_NAMER(0)) {
+    send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0));
+  }
+}
+
+
+/*
+ * Unregister this entity from the rank and stop its dispatch thread.
+ *
+ * When called from the dispatch thread itself only the stop flag is set
+ * (the thread cannot join itself); otherwise the flag is set under the
+ * lock, the thread is woken, and the caller joins it.  Always returns 0.
+ */
+int Rank::EntityMessenger::shutdown()
+{
+  dout(10) << "shutdown " << get_myaddr() << endl;
+
+  // deregister
+  rank.unregister_entity(this);
+
+  // stop my dispatch thread
+  if (!dispatch_thread.am_self()) {
+    dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl;
+    lock.Lock();
+    stop = true;
+    cond.Signal();
+    lock.Unlock();
+    dispatch_thread.join();
+  } else {
+    dout(1) << "shutdown i am dispatch, setting stop flag" << endl;
+    stop = true;
+  }
+
+  return 0;
+}
+
+
+// Forward to Rank::prepare_dest(), which starts a lookup or a connection
+// for dest if one is needed.
+void Rank::EntityMessenger::prepare_send_message(msg_addr_t dest)
+{
+  rank.prepare_dest(dest);
+}
+
+// Send m to dest at an instance the caller already knows.
+// Stamps the envelope (source = this entity, dest, both ports 0, source
+// instance = this rank's instance) and hands the message to
+// Rank::submit_message(m, inst).  Always returns 0.
+int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+{
+  // set envelope
+  m->set_source(get_myaddr(), 0);
+  m->set_dest(dest, 0);
+
+  m->set_source_inst(rank.my_inst);
+
+  dout(1) << "--> " 
+          << m->get_source() //<< ':' << m->get_source_port() 
+          << " to " << m->get_dest() //<< ':' << m->get_dest_port()
+          << " ---- " << m->get_type_name() 
+          << " ---- " << rank.my_inst << " --> " << inst
+          << " ---- " << m 
+          << endl;
+
+  rank.submit_message(m, inst);
+
+  return 0;
+}
+
+
+// Send m to dest, letting the rank work out where dest lives.
+// Stamps the envelope (source = this entity/fromport, dest/port, source
+// instance = this rank's instance) and hands the message to
+// Rank::submit_message(m).  Always returns 0.
+int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+{
+  // set envelope
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+
+  m->set_source_inst(rank.my_inst);
+
+  // the destination instance is not known at this point, hence the "?".
+  dout(1) << "--> " 
+          << m->get_source() //<< ':' << m->get_source_port() 
+          << " to " << m->get_dest() //<< ':' << m->get_dest_port()
+          << " ---- " << m->get_type_name() 
+          << " ---- " << rank.my_inst << " --> ?"
+          << " ---- " << m 
+          << endl;
+
+  rank.submit_message(m);
+
+  return 0;
+}
+
+
+// Forward to Rank::mark_down().  An entity must not mark itself down.
+void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i)
+{
+  assert(a != get_myaddr());
+  rank.mark_down(a,i);
+}
+
+/*
+ * Record that address a (at instance inst) is down.
+ *
+ * No-op on rank 0, if a is already marked down, or if entity_map holds an
+ * instance for a that compares greater than inst.  Otherwise a is added to
+ * `down`; a known address is dropped from entity_map and the sender for
+ * inst.rank (if any) is stopped, while an unknown address has its pending
+ * lookup state discarded.
+ */
+void Rank::mark_down(msg_addr_t a, entity_inst_t& inst)
+{
+  if (my_rank == 0) return;   // ugh.. rank0 already handles this stuff in the namer
+
+  lock.Lock();
+  if (down.count(a) == 0) {
+    const bool known = entity_map.count(a) > 0;
+
+    if (known && entity_map[a] > inst) {
+      // our entry compares greater than the one being reported; do nothing!
+      dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+      derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+    } else {
+      down.insert(a);
+
+      if (known) {
+        // know it
+        assert(entity_map[a] <= inst);
+        dout(10) << "mark_down " << a << " inst " << inst << endl;
+        derr(10) << "mark_down " << a << " inst " << inst << endl;
+
+        entity_map.erase(a);
+
+        // stop talking to that rank.
+        if (rank_sender.count(inst.rank)) {
+          rank_sender[inst.rank]->stop();
+          rank_sender.erase(inst.rank);
+        }
+      } else {
+        // don't know it
+        dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+        derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+
+        // NOTE(review): messages parked in waiting_for_lookup[a] are dropped
+        // from the map without being deleted here -- confirm that is intended.
+        waiting_for_lookup.erase(a);
+        looking_up.erase(a);
+      }
+    }
+  }
+  lock.Unlock();
+}
+
+// Forward to Rank::mark_up().  An entity must not mark itself up.
+void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  assert(a != get_myaddr());
+  rank.mark_up(a, i);
+}
+
+/*
+ * Record that address a is up at instance i.
+ *
+ * No-op on rank 0.  Clears a from `down`; if a is unknown, or the known
+ * instance compares less than i, the mapping is replaced and a connection
+ * to i is started.  An equal or greater known instance is only logged.
+ */
+void Rank::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  if (my_rank == 0) return;
+
+  lock.Lock();
+
+  dout(10) << "mark_up " << a << " inst " << i << endl;
+  derr(10) << "mark_up " << a << " inst " << i << endl;
+
+  down.erase(a);
+
+  assert(i.rank != my_rank);     // hrm?
+
+  hash_map<msg_addr_t, entity_inst_t>::iterator cur = entity_map.find(a);
+  if (cur == entity_map.end() ||
+      cur->second < i) {
+    // new to us, or i supersedes what we had.
+    entity_map[a] = i;
+    connect_rank(i);
+  } else if (cur->second == i) {
+    dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+    derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+  } else {
+    // what we have compares greater than i; keep it.
+    dout(-10) << "mark_up " << a << " inst " << i << " < " << cur->second << endl;
+    derr(-10) << "mark_up " << a << " inst " << i << " < " << cur->second << endl;
+  }
+
+  //if (waiting_for_lookup.count(a))
+  //lookup(a);
+
+  lock.Unlock();
+}
+
diff --git a/branches/sage/cephmds2/msg/NewMessenger.h b/branches/sage/cephmds2/msg/NewMessenger.h
new file mode 100644
index 0000000000000..a1c7af6e5c83b
--- /dev/null
+++ b/branches/sage/cephmds2/msg/NewMessenger.h
@@ -0,0 +1,305 @@
+#ifndef __NEWMESSENGER_H
+#define __NEWMESSENGER_H
+
+
+#include <list>
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+
+#include "include/types.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "Messenger.h"
+#include "Message.h"
+#include "tcp.h"
+
+
+
+
+/* Rank - per-process
+ *
+ * Owns the directory of local entities, the map of known remote entity
+ * instances, and the sender/receiver threads used to reach other ranks.
+ * A single global instance, `rank`, is declared at the bottom of this header.
+ */
+class Rank : public Dispatcher {
+ 
+  class EntityMessenger;
+  class Sender;
+  class Receiver;
+
+  // namer
+  // Dispatcher for name-service requests (connect/register/lookup/...).
+  class Namer : public Dispatcher {
+  public:
+    EntityMessenger *messenger;  // namerN
+
+    // counters used to hand out the next rank / entity numbers.
+    int nrank;
+    int nclient, nmds, nosd, nmon;
+    
+    // messages held back per entity address until they can be answered.
+    map<msg_addr_t, list<Message*> > waiting;
+
+    Namer(EntityMessenger *msgr);
+    ~Namer();
+
+    void handle_connect(class MNSConnect*);
+    void handle_register(class MNSRegister *m);
+    void handle_started(Message *m);
+    void handle_lookup(class MNSLookup *m);
+    void handle_unregister(Message *m);
+    void handle_failure(class MNSFailure *m);
+
+    void dispatch(Message *m); 
+
+    void manual_insert_inst(const entity_inst_t &inst);
+
+  };
+
+  // incoming
+  // Thread that owns the listening socket.
+  class Accepter : public Thread {
+  public:
+    bool done;
+
+    tcpaddr_t listen_addr;
+    int       listen_sd;
+    
+    Accepter() : done(false) {}
+    
+    void *entry();
+    // flag the thread as done, close the listening socket, and join it.
+    void stop() {
+      done = true;
+      ::close(listen_sd);
+      join();
+    }
+    int start();
+  } accepter;
+  
+
+  // Thread reading messages from one connected socket.
+  class Receiver : public Thread {
+  public:
+    int sd;
+    bool done;
+
+    Receiver(int _sd) : sd(_sd), done(false) {}
+    
+    void *entry();
+    // flag the thread as done and close its socket; the thread is not joined here.
+    void stop() {
+      done = true;
+      ::close(sd);
+      //join();
+    }
+    Message *read_message();
+  };
+
+
+  // outgoing
+  // Thread draining a queue of messages bound for one remote instance.
+  class Sender : public Thread {
+  public:
+    entity_inst_t inst;
+    bool done;
+    int sd;
+
+    set<msg_addr_t> entities;
+    list<Message*> q;
+
+    // lock/cond guard q and done (see stop() and send() below).
+    Mutex lock;
+    Cond cond;
+    
+    Sender(const entity_inst_t& i, int s=0) : inst(i), done(false), sd(s) {}
+    virtual ~Sender() {}
+    
+    void *entry();
+
+    int connect();
+    void fail_and_requeue(list<Message*>& ls);
+    void finish();
+
+    // set the done flag and wake the thread.
+    void stop() {
+      lock.Lock();
+      done = true;
+      cond.Signal();
+      lock.Unlock();
+    }
+    
+    // append one message to the queue and wake the thread.
+    void send(Message *m) {
+      lock.Lock();
+      q.push_back(m);
+      cond.Signal();
+      lock.Unlock();
+    }    
+    // move all of ls onto the queue (ls is left empty) and wake the thread.
+    void send(list<Message*>& ls) {
+      lock.Lock();
+      q.splice(q.end(), ls);
+      cond.Signal();
+      lock.Unlock();
+    }
+
+    int write_message(Message *m);
+  };
+
+
+
+  // messenger interface
+  // Messenger handed to one local entity; has its own queue and dispatch thread.
+  class EntityMessenger : public Messenger {
+    // lock/cond guard dispatch_queue and the stop flag.
+    Mutex lock;
+    Cond cond;
+    list<Message*> dispatch_queue;
+    bool stop;
+
+    class DispatchThread : public Thread {
+      EntityMessenger *m;
+    public:
+      DispatchThread(EntityMessenger *_m) : m(_m) {}
+      void *entry() {
+        m->dispatch_entry();
+        return 0;
+      }
+    } dispatch_thread;
+    void dispatch_entry();
+
+  public:
+    // append one message to the dispatch queue and wake the dispatch thread.
+    void queue_message(Message *m) {
+      lock.Lock();
+      dispatch_queue.push_back(m);
+      cond.Signal();
+      lock.Unlock();
+    }
+    // same, for a batch.  ls is taken by value, so the caller's list is
+    // copied and left untouched.
+    void queue_messages(list<Message*> ls) {
+      lock.Lock();
+      dispatch_queue.splice(dispatch_queue.end(), ls);
+      cond.Signal();
+      lock.Unlock();
+    }
+
+  public:
+    EntityMessenger(msg_addr_t myaddr);
+    ~EntityMessenger();
+
+    void ready();
+    bool is_stopped() { return stop; }
+
+    // block until the dispatch thread exits.
+    void wait() {
+      dispatch_thread.join();
+    }
+    
+    virtual void callback_kick() {} 
+    virtual int shutdown();
+    virtual void prepare_send_message(msg_addr_t dest);
+    virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+    virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst);
+
+    virtual void mark_down(msg_addr_t a, entity_inst_t& i);
+    virtual void mark_up(msg_addr_t a, entity_inst_t& i);
+    //virtual void reset(msg_addr_t a);
+  };
+
+
+  // Thread used when g_conf.ms_single_dispatch is set: one dispatcher for
+  // the whole rank instead of one per entity.
+  class SingleDispatcher : public Thread {
+    Rank *rank;
+  public:
+    SingleDispatcher(Rank *r) : rank(r) {}
+    void *entry() {
+      rank->single_dispatcher_entry();
+      return 0;
+    }
+  } single_dispatcher;
+
+  Cond            single_dispatch_cond;
+  bool            single_dispatch_stop;
+  list<Message*>  single_dispatch_queue;
+
+  // messages held per address until that entity calls ready().
+  map<msg_addr_t, list<Message*> > waiting_for_ready;
+
+  void single_dispatcher_entry();
+  void _submit_single_dispatch(Message *m);
+
+
+  // Rank stuff
+ public:
+  Mutex lock;
+  Cond  wait_cond;  // for wait()
+  
+  // my rank
+  int   my_rank;
+  Cond  waiting_for_rank;
+
+  // my instance
+  entity_inst_t my_inst;
+  
+  // lookup
+  hash_map<msg_addr_t, entity_inst_t> entity_map;
+  hash_set<msg_addr_t>                entity_unstarted;
+  
+  // messages parked per destination until its instance is known.
+  map<msg_addr_t, list<Message*> > waiting_for_lookup;
+  set<msg_addr_t>                  looking_up;
+
+  // addresses currently marked down (see mark_down/mark_up).
+  hash_set<msg_addr_t>            down;
+  
+  // register
+  map<int, Cond* >        waiting_for_register_cond;
+  map<int, msg_addr_t >   waiting_for_register_result;
+  
+  // local
+  map<msg_addr_t, EntityMessenger*> local;
+  
+  // remote
+  hash_map<int, Sender*> rank_sender;
+  
+  set<Receiver*>    receivers;   
+
+  list<Sender*>     sender_reap_queue;
+  list<Receiver*>   receiver_reap_queue;
+    
+  EntityMessenger *messenger;   // rankN
+  Namer           *namer;
+
+
+  void show_dir();
+
+  void lookup(msg_addr_t addr);
+  
+  void dispatch(Message *m);
+  void handle_connect_ack(class MNSConnectAck *m);
+  void handle_register_ack(class MNSRegisterAck *m);
+  void handle_lookup_reply(class MNSLookupReply *m);
+  
+  Sender *connect_rank(const entity_inst_t& inst);
+
+  void mark_down(msg_addr_t addr, entity_inst_t& i);
+  void mark_up(msg_addr_t addr, entity_inst_t& i);
+
+  tcpaddr_t get_listen_addr() { return accepter.listen_addr; }
+
+  void reaper();
+
+
+public:
+  Rank(int r=-1);
+  ~Rank();
+
+  int find_ns_addr(tcpaddr_t &tcpaddr);
+
+  void set_namer(const tcpaddr_t& ns);
+  void start_namer();
+
+  int start_rank();
+  void wait();
+
+  EntityMessenger *register_entity(msg_addr_t addr);
+  void unregister_entity(EntityMessenger *ms);
+
+  void submit_message(Message *m, const entity_inst_t& inst);  
+  void prepare_dest(msg_addr_t dest);
+  void submit_message(Message *m);  
+  void submit_messages(list<Message*>& ls);  
+
+  // create a new messenger
+  EntityMessenger *new_entity(msg_addr_t addr);
+
+} ;
+
+extern Rank rank;
+
+#endif
diff --git a/branches/sage/cephmds2/msg/NewerMessenger.cc b/branches/sage/cephmds2/msg/NewerMessenger.cc
new file mode 100644
index 0000000000000..d1ed3fb00fdb3
--- /dev/null
+++ b/branches/sage/cephmds2/msg/NewerMessenger.cc
@@ -0,0 +1,1791 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "NewerMessenger.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "config.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+#include "messages/MNSFailure.h"
+
+//#include "messages/MFailure.h"
+
+#include <netdb.h>
+
+
+#undef dout
+#define dout(l)  if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " "
+#define derr(l)  if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " "
+
+
+
+#include "tcp.cc"
+
+
+Rank rank;
+
+
+/********************************************
+ * Namer
+ */
+
+// Construct the namer on top of msgr and install it as msgr's dispatcher.
+// Only valid on rank 0.  Rank numbering for newly connecting ranks starts
+// at g_conf.num_mon.
+Rank::Namer::Namer(EntityMessenger *msgr) :
+  messenger(msgr),
+  nrank(0), nclient(0), nmds(0), nosd(0), nmon(0)
+{
+  assert(rank.my_rank == 0);
+  // overrides the 0 from the initializer list above.
+  nrank = g_conf.num_mon;
+  
+  // announce myself
+  // (writing the address to .ceph_ns is currently disabled.)
+  /*
+  cerr << "ceph ns is " << rank.accepter.listen_addr << endl;
+  cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl;
+  int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT);
+  ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t));
+  ::fchmod(fd, 0755);
+  ::close(fd);
+  */
+
+  // ok
+  messenger->set_dispatcher(this);
+}
+
+// Nothing to clean up; removal of .ceph_ns is disabled along with its creation.
+Rank::Namer::~Namer()
+{
+  //::unlink(".ceph_ns");
+}
+
+
+/*
+ * Dispatcher entry point for name-service requests.
+ *
+ * Holds rank.lock while the matching handler runs.  MSG_FAILURE_ACK is
+ * simply discarded; any unexpected type trips an assert.
+ */
+void Rank::Namer::dispatch(Message *m)
+{
+  rank.lock.Lock();
+
+  switch (m->get_type()) {
+  case MSG_NS_CONNECT:    handle_connect((class MNSConnect*)m);   break;
+  case MSG_NS_REGISTER:   handle_register((class MNSRegister*)m); break;
+  case MSG_NS_STARTED:    handle_started(m);                      break;
+  case MSG_NS_UNREGISTER: handle_unregister(m);                   break;
+  case MSG_NS_LOOKUP:     handle_lookup((class MNSLookup*)m);     break;
+  case MSG_NS_FAILURE:    handle_failure((class MNSFailure*)m);   break;
+
+  case MSG_FAILURE_ACK:
+    // nothing to do with these.
+    delete m;
+    break;
+
+  default:
+    assert(0);
+  }
+
+  rank.lock.Unlock();
+}
+
+/*
+ * A new rank is connecting: assign it the next rank number, record its
+ * instance (address + rank) as unstarted, and reply with an MNSConnectAck
+ * sent directly to that instance.
+ */
+void Rank::Namer::handle_connect(MNSConnect *m)
+{
+  const int newrank = nrank++;
+  dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl;
+
+  // create (or reuse) the directory entry for the new rank and fill it in.
+  entity_inst_t& inst = rank.entity_map[MSG_ADDR_RANK(newrank)];
+  inst.addr = m->get_addr();
+  inst.rank = newrank;
+  rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank));
+
+  messenger->send_message(new MNSConnectAck(newrank),
+                          MSG_ADDR_RANK(newrank), inst);
+  delete m;
+}
+
+// Record inst in the rank's entity_map under the rank address derived from
+// inst.rank, replacing any existing entry.  No locking is done here.
+void Rank::Namer::manual_insert_inst(const entity_inst_t &inst)
+{
+  rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst;
+}
+
+/*
+ * Register an entity on behalf of a rank.
+ *
+ * If the requested address is "new", the next mds/osd/client number is
+ * allocated for it; otherwise the requested address is used as-is.  The
+ * entity is recorded at the sender's instance, marked unstarted, and an
+ * MNSRegisterAck carrying the final address is sent back.
+ */
+void Rank::Namer::handle_register(MNSRegister *m)
+{
+  dout(10) << "namer.handle_register from rank " << m->get_rank()
+          << " addr " << m->get_entity() << endl;
+
+  // pick id
+  msg_addr_t entity = m->get_entity();
+  if (entity.is_new()) {
+    // make up a new address of the requested type.
+    switch (entity.type()) {
+    case MSG_ADDR_MDS_BASE:    entity = MSG_ADDR_MDS(nmds++);       break;
+    case MSG_ADDR_OSD_BASE:    entity = MSG_ADDR_OSD(nosd++);       break;
+    case MSG_ADDR_CLIENT_BASE: entity = MSG_ADDR_CLIENT(nclient++); break;
+    default:
+      assert(0);
+    }
+  }
+  // else: a specific address was asked for; keep it.
+
+  // register
+  if (!rank.entity_map.count(entity)) {
+    dout(1) << "namer.handle_register registering " << entity
+            << " inst " << m->get_source_inst()
+            << endl;
+  } else {
+    dout(1) << "namer.handle_register re-registering " << entity
+            << " inst " << m->get_source_inst()
+            << " (was " << rank.entity_map[entity] << ")"
+            << endl;
+  }
+  rank.entity_map[entity] = m->get_source_inst();
+  rank.entity_unstarted.insert(entity);
+
+  // reply w/ new id
+  MNSRegisterAck *ack = new MNSRegisterAck(m->get_tid(), entity);
+  messenger->send_message(ack, m->get_source(), rank.entity_map[entity]);
+
+  delete m;
+}
+
+/*
+ * An entity reports that it has started: clear its unstarted flag and
+ * re-dispatch any requests that were held back waiting for it.
+ *
+ * Takes ownership of m and deletes it, like the other handlers do.
+ */
+void Rank::Namer::handle_started(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(10) << "namer.handle_started from entity " << who << endl;
+
+  assert(rank.entity_unstarted.count(who));
+  rank.entity_unstarted.erase(who);
+  
+  // anybody waiting?
+  if (waiting.count(who)) {
+    // detach the list first: re-dispatched requests may park themselves
+    // on waiting[] again.
+    list<Message*> ls;
+    ls.swap(waiting[who]);
+    waiting.erase(who);
+    
+    dout(10) << "doing waiters on " << who << endl;
+    for (list<Message*>::iterator it = ls.begin();
+         it != ls.end();
+         it++) 
+      dispatch(*it);
+  }
+
+  // the notification itself is no longer needed; previously it was leaked.
+  delete m;
+}
+
+// An entity is going away: remove it from the rank's entity_map.  When only
+// two entries remain in the map the namer shuts its own messenger down.
+// Called from dispatch() with rank.lock held.
+void Rank::Namer::handle_unregister(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(1) << "namer.handle_unregister entity " << who << endl;
+
+  rank.show_dir();
+  
+  assert(rank.entity_map.count(who));
+  rank.entity_map.erase(who);
+
+  rank.show_dir();
+
+  // shut myself down?  kick watcher.
+  // presumably the two remaining entries are the namer and rank 0 -- TODO confirm.
+  if (rank.entity_map.size() == 2) {
+    dout(10) << "namer.handle_unregister stopping namer" << endl;
+    // shutdown() goes through Rank::unregister_entity(), which takes
+    // rank.lock itself, so the lock is released around the call.
+    rank.lock.Unlock();
+    messenger->shutdown();
+    // NOTE(review): the messenger member is left pointing at freed memory after this.
+    delete messenger;
+    rank.lock.Lock();
+  }
+
+  delete m;
+}
+
+
+/*
+ * Answer a lookup request for one entity address.
+ *
+ * If the entity is not registered yet, or is registered but has not
+ * reported started, the request is parked on waiting[] (and kept alive).
+ * Otherwise a reply carrying the entity's instance is sent to the
+ * requester's instance and the request is deleted.
+ */
+void Rank::Namer::handle_lookup(MNSLookup *m) 
+{
+  msg_addr_t want = m->get_entity();
+
+  // have it?
+  const bool registered = rank.entity_map.count(want) != 0;
+  if (!registered) {
+    dout(10) << "namer " << m->get_source() << " lookup '" << want << "' -> dne" << endl;
+    waiting[want].push_back(m);
+    return;
+  }
+
+  // registered, but has it started yet?
+  if (rank.entity_unstarted.count(want)) {
+    dout(10) << "namer " << m->get_source() << " lookup '" << want << "' -> unstarted" << endl;
+    waiting[want].push_back(m);
+    return;
+  }
+
+  // look it up!
+  MNSLookupReply *reply = new MNSLookupReply(m);
+  reply->entity_map[want] = rank.entity_map[want];
+
+  dout(10) << "namer " << m->get_source()
+           << " lookup '" << want
+           << "' -> " << rank.entity_map[want] << endl;
+
+  messenger->send_message(reply, m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+/*
+ * An instance has been reported failed: drop every entity registered at
+ * that instance from entity_map and entity_unstarted.
+ *
+ * Done in two passes so the map is not modified while it is being walked.
+ */
+void Rank::Namer::handle_failure(MNSFailure *m)
+{
+  dout(10) << "namer.handle_failure inst " << m->get_inst()
+           << endl;
+
+  // pass 1: collect the entities living on the failed instance.
+  list<msg_addr_t> doomed;
+  hash_map<msg_addr_t,entity_inst_t>::iterator e = rank.entity_map.begin();
+  while (e != rank.entity_map.end()) {
+    const bool elsewhere = (e->second != m->get_inst());
+    if (!elsewhere)
+      doomed.push_back(e->first);
+    e++;
+  }
+
+  // pass 2: forget them.
+  // (notifying the monitor about failed osds is currently disabled.)
+  while (!doomed.empty()) {
+    msg_addr_t who = doomed.front();
+    doomed.pop_front();
+
+    dout(10) << "namer.handle_failure inst " << m->get_inst()
+             << ", removing " << who << endl;
+
+    rank.entity_map.erase(who);
+    rank.entity_unstarted.erase(who);
+  }
+
+  delete m;
+}
+
+
+
+/********************************************
+ * Accepter
+ */
+
+/*
+ * Create the listening socket and start the accepter thread.
+ *
+ * Binds to INADDR_ANY on a kernel-chosen port, starts listening, then
+ * rewrites listen_addr to (this host's address as resolved from its
+ * hostname, chosen port) so it can be advertised to peers.
+ *
+ * Returns 0 on success, -1 if this host's name cannot be resolved (the
+ * socket is closed again and the thread is not started).
+ */
+int Rank::Accepter::start()
+{
+  // bind to a socket
+  dout(10) << "accepter.start binding to listen " << endl;
+
+  /* socket creation */
+  listen_sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(listen_sd >= 0);   // 0 is a valid descriptor; only negative is failure.
+
+  /* bind to port */
+  memset((char*)&listen_addr, 0, sizeof(listen_addr));
+  listen_addr.sin_family = AF_INET;
+  listen_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  listen_addr.sin_port = 0;   // let the kernel pick a port.
+
+  int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr));
+  assert(rc >= 0);
+
+  // find out which port we got.
+  socklen_t llen = sizeof(listen_addr);
+  rc = getsockname(listen_sd, (sockaddr*)&listen_addr, &llen);
+  assert(rc >= 0);
+
+  int myport = listen_addr.sin_port;   // already in network byte order.
+
+  // listen!
+  rc = ::listen(listen_sd, 1000);
+  assert(rc >= 0);
+
+  //dout(10) << "accepter.start listening on " << myport << endl;
+
+  // my address is...
+  // leave the last byte untouched so the name is always NUL-terminated,
+  // even if gethostname() truncates.
+  char host[100];
+  memset(host, 0, sizeof(host));
+  gethostname(host, sizeof(host) - 1);
+  //dout(10) << "accepter.start my hostname is " << host << endl;
+
+  struct hostent *myhostname = gethostbyname( host );
+  if (!myhostname) {
+    // previously this was dereferenced unconditionally.
+    derr(0) << "accepter.start unable to resolve my hostname '" << host << "'" << endl;
+    ::close(listen_sd);
+    return -1;
+  }
+
+  struct sockaddr_in my_addr;
+  memset(&my_addr, 0, sizeof(my_addr));
+
+  my_addr.sin_family = myhostname->h_addrtype;
+  memcpy((char *) &my_addr.sin_addr.s_addr,
+         myhostname->h_addr_list[0],
+         myhostname->h_length);
+  my_addr.sin_port = myport;
+
+  // advertise the resolved address rather than INADDR_ANY.
+  listen_addr = my_addr;
+
+  dout(10) << "accepter.start listen addr is " << listen_addr << endl;
+
+  // start thread
+  create();
+
+  return 0;
+}
+
+void *Rank::Accepter::entry()
+{
+  // Accept thread: wrap every incoming connection in a new Pipe and record it
+  // in rank.pipes.  Runs until done is set or ::accept() fails.
+  dout(10) << "accepter starting" << endl;
+
+  while (!done) {
+    struct sockaddr_in addr;
+    socklen_t slen = sizeof(addr);
+    int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen);
+    if (sd >= 0) {   // 0 is a legal descriptor; failure is reported as -1
+      dout(10) << "accepted incoming on sd " << sd << endl;
+      rank.lock.Lock();
+      Pipe *p = new Pipe(sd);
+      rank.pipes.insert(p);
+      rank.lock.Unlock();
+    } else {
+      dout(10) << "no incoming connection?" << endl;
+      break;
+    }
+  }
+
+  return 0;
+}
+
+
+
+/**************************************
+ * Pipe
+ */
+
+int Rank::Pipe::accept()
+{
+  // Server-side handshake on the sd my creator got from accept(): announce myself, read the peer's
+  // identity, start the writer, register the pipe.  Returns 0; on a handshake error sets done and returns -1.
+  int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
+  if (rc < 0) {
+    ::close(sd);
+    sd = -1;     // reader() also calls ::close(sd) when it reaps; never close the same fd number twice
+    done = true;
+    return -1;
+  }
+
+  // identify peer.  NOTE(review): read_message() tests tcp_read() with '!', here it is 'rc < 0' -- confirm the convention
+  rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst));
+  if (rc < 0) {
+    dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl;
+    ::close(sd);
+    sd = -1;     // see above
+    done = true;
+    return -1;
+  }
+
+  // create writer thread.
+  writer_running = true;
+  writer_thread.create();
+  // register pipe.
+  if (peer_inst.rank >= 0) {
+    rank.lock.Lock();
+    {
+      if (rank.rank_pipe.count(peer_inst.rank) == 0) {
+        // install a pipe!
+        dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl;
+        rank.rank_pipe[peer_inst.rank] = this;
+      } else {
+        // low ranks' Pipes "win"
+        if (peer_inst.rank < rank.my_inst.rank || 
+            rank.my_inst.rank < 0) {
+          dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst 
+                    << ", already had pipe, but switching to this new one" << endl;
+          // switch to this new Pipe
+          rank.rank_pipe[peer_inst.rank]->close();  // close old one
+          rank.rank_pipe[peer_inst.rank] = this;
+        } else {
+          dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst 
+                    << ", already had pipe, sticking with it" << endl;
+        }
+      }
+    }
+    rank.lock.Unlock();
+  } else {
+    dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl;
+  }
+
+  return 0;   // success.
+}
+
+int Rank::Pipe::connect()
+{
+  // Client-side handshake: connect to peer_inst.addr, swap identities, register the pipe and start the reader.  Returns 0, or <0 on failure.
+  dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl;
+  // create socket?
+  sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(sd >= 0);   // 0 is a valid descriptor; only -1 signals failure
+  
+  // bind any port
+  struct sockaddr_in myAddr;
+  memset(&myAddr, 0, sizeof(myAddr));   // don't hand bind() uninitialised padding (Accepter::start zeroes its address too)
+  myAddr.sin_family = AF_INET;
+  myAddr.sin_addr.s_addr = htonl(INADDR_ANY);
+  myAddr.sin_port = htons( 0 );    
+  int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr));
+  assert(rc>=0);
+
+  // connect!
+  rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(myAddr));
+  if (rc < 0) return rc;
+
+  // identify peer.  NOTE(review): the result of this read is never checked, so inst may be used unfilled -- confirm
+  entity_inst_t inst;
+  rc = tcp_read(sd, (char*)&inst, sizeof(inst));
+  if (inst.rank < 0) 
+    inst = peer_inst;   // i know better than they do.
+  if (peer_inst != inst && inst.rank > 0) {
+    derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl;
+    assert(0);
+    return -1;
+  }
+
+  // identify myself
+  rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
+  if (rc < 0) 
+    return -1;
+  
+  // register pipe
+  rank.lock.Lock();
+  {
+    if (rank.rank_pipe.count(peer_inst.rank) == 0) {
+      dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl;
+      rank.rank_pipe[peer_inst.rank] = this;
+    } else {
+      // this is normal.
+      dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl;
+    }
+  }
+  rank.lock.Unlock();
+
+  // start reader
+  reader_running = true;
+  reader_thread.create();  
+  
+  return 0;
+}
+
+
+void Rank::Pipe::close()
+{
+  // Ask the writer to shut this pipe down: drop our rank_pipe registration and
+  // queue a MSG_CLOSE.  Does nothing if a close was already queued.
+  if (sent_close) {
+    dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl;
+    return;
+  }
+  dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl;
+
+  // unreg ourselves, but only if the pipe registered for this rank is really us
+  rank.lock.Lock();
+  hash_map<int,Pipe*>::iterator mine = rank.rank_pipe.find(peer_inst.rank);
+  if (mine != rank.rank_pipe.end() && mine->second == this) {
+    dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl;
+    rank.rank_pipe.erase(mine);
+  }
+  rank.lock.Unlock();
+
+  // queue close message.
+  dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl;
+  lock.Lock();
+  q.push_back(new MGenericMessage(MSG_CLOSE));
+  cond.Signal();
+  sent_close = true;
+  lock.Unlock();
+}
+
+
+/* read msgs from socket.
+ * on accepted (server) pipes this thread also runs the
+ * accept() handshake before it starts reading.
+ */
+void Rank::Pipe::reader()   // read messages off the socket and route them until the pipe is done
+{
+  if (server)    // NOTE(review): accept()'s result is ignored here; on failure it sets done, so the loop below is skipped
+    accept();
+
+  // loop.
+  while (!done) {
+    Message *m = read_message();
+    if (!m || m->get_type() == 0) {   // NOTE(review): type 0 is presumably MSG_CLOSE (writer() tests the named constant) -- confirm
+      if (m) {
+	delete m;
+	dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl;
+      } else {
+	derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl;
+      }
+
+      if (!sent_close)   // queue our own close unless one is already queued
+	close();
+
+      done = true;
+      cond.Signal();  // wake up writer too.
+      break;
+    }
+
+    dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl;
+
+    EntityMessenger *entity = 0;   // set only when m must be queued on a local entity after rank.lock is dropped
+
+    rank.lock.Lock();
+    {
+      // sanity check the sender against the directory.
+      // NOTE(review): the log text below prints the two insts in the opposite order to the comparison it reports.
+      if (rank.entity_map.count(m->get_source()) &&
+          rank.entity_map[m->get_source()] > m->get_source_inst()) {
+        derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source() 
+                << " inst " << m->get_source_inst() 
+                << " > " << rank.entity_map[m->get_source()] 
+                << ", WATCH OUT " << *m << endl;
+        assert(0);
+      }
+
+      if (m->get_dest().type() == MSG_ADDR_RANK_BASE) {
+        // ours.
+        rank.dispatch(m);
+      } else {
+        if (g_conf.ms_single_dispatch) {
+          // submit to single dispatch queue
+          rank._submit_single_dispatch(m);
+        } else {
+          if (rank.local.count(m->get_dest())) {
+            // find entity
+            entity = rank.local[m->get_dest()];
+          } else {
+            derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl;
+            assert(0);  // FIXME do this differently
+            //rank.waiting_for_lookup[m->get_dest()].push_back(m);
+          }
+        }
+      }
+    }
+    rank.lock.Unlock();
+    
+    if (entity) 
+      entity->queue_message(m);        // queue
+  }
+
+  // whichever of reader/writer finishes last closes sd and queues the pipe for Rank::reaper().
+  // reap?
+  bool reap = false;
+  lock.Lock();
+  {
+    reader_running = false;
+    if (!writer_running) reap = true;
+  }
+  lock.Unlock();
+
+  if (reap) {
+    dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl;
+    ::close(sd);
+    rank.lock.Lock();
+    {
+      rank.pipe_reap_queue.push_back(this);
+      rank.wait_cond.Signal();
+    }
+    rank.lock.Unlock();
+  }
+}
+
+
+/* write msgs to socket.
+ * on outgoing (client) pipes this thread first runs connect().
+ */
+void Rank::Pipe::writer()   // send queued messages over the socket until the pipe is done and q is empty
+{
+  if (!server) {
+    int rc = connect();
+    if (rc < 0) {
+      derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl;
+      done = true;
+      list<Message*> out;   // empty: fail() still reports whatever is already waiting in q
+      fail(out);
+    }
+  }
+
+  // loop.
+  lock.Lock();
+  while (!q.empty() || !done) {
+    
+    if (!q.empty()) {
+      dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl;
+      
+      // grab outgoing list
+      list<Message*> out;
+      out.swap(q);
+      
+      // drop lock while i send these
+      lock.Unlock();
+      
+      while (!out.empty()) {
+        Message *m = out.front();
+        out.pop_front();
+
+        dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl;
+
+        // stamp.
+        m->set_source_inst(rank.my_inst);
+        
+        // marshall
+        if (m->empty_payload())
+          m->encode_payload();
+        
+        if (write_message(m) < 0) {
+          // failed!  put m back at the head so fail() reports it along with the rest of the batch.
+          derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() << endl;
+          out.push_front(m);
+          fail(out);
+          done = true;
+          break;
+        }
+
+        // did i just send a close?  (anything left in this batch is still sent)
+        if (m->get_type() == MSG_CLOSE) 
+          done = true;
+
+        // clean up
+        delete m;
+      }
+
+      lock.Lock();
+      continue;
+    }
+    
+    // wait
+    dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl;
+    cond.Wait(lock);
+  }
+  lock.Unlock(); 
+  
+  dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl;
+
+  // reap?  whichever of reader/writer finishes last closes sd and queues the pipe for Rank::reaper().
+  bool reap = false;
+  lock.Lock();
+  {
+    writer_running = false;
+    if (!reader_running) reap = true;
+  }
+  lock.Unlock();
+  
+  if (reap) {
+    dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl;
+    ::close(sd);
+    rank.lock.Lock();
+    {
+      rank.pipe_reap_queue.push_back(this);
+      rank.wait_cond.Signal();
+    }
+    rank.lock.Unlock();
+  }
+}
+
+
+Message *Rank::Pipe::read_message()
+{
+  // Read one message off sd: a msg_envelope_t, then env.nchunks chunks, each
+  // an int length followed by that many payload bytes.  Returns the decoded
+  // message, or 0 if a socket read fails or a chunk length is invalid.
+  msg_envelope_t env;
+  if (!tcp_read( sd, (char*)&env, sizeof(env) ))
+    return 0;
+  
+  dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type 
+           << " src " << env.source << " dst " << env.dest
+           << " nchunks=" << env.nchunks
+           << endl;
+  
+  // payload
+  bufferlist blist;
+  for (int i=0; i<env.nchunks; i++) {
+    int size;
+    if (!tcp_read( sd, (char*)&size, sizeof(size) )) return 0;
+    if (size < 0) {   // lengths come straight off the wire; never size a buffer from a negative one
+      derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got bad chunk len " << size << endl;
+      return 0;
+    }
+    if (size == 0) continue;
+
+    bufferptr bp(size);
+    if (!tcp_read( sd, bp.c_str(), size )) return 0;
+    blist.push_back(bp);
+
+    dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got frag " << i << " of " << env.nchunks 
+             << " len " << bp.length() << endl;
+  }
+  
+  // unmarshall message
+  size_t s = blist.length();
+  Message *m = decode_message(env, blist);
+  
+  dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got " << s << " byte message from " 
+           << m->get_source() << endl;
+  return m;
+}
+
+
+
+int Rank::Pipe::write_message(Message *m)   // write m's envelope and payload to sd; returns 0, or -1 on a write error
+{
+  // get envelope, buffers
+  msg_envelope_t *env = &m->get_envelope();
+  bufferlist blist;
+  blist.claim( m->get_payload() );   // presumably moves the payload buffers into blist -- confirm against bufferlist
+  
+#ifdef TCP_KEEP_CHUNKS
+  env->nchunks = blist.buffers().size();
+#else
+  env->nchunks = 1;
+#endif
+
+  dout(20)// << g_clock.now() 
+            << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m 
+            << " to " << m->get_dest()
+            << endl;
+  
+  // send envelope
+  int r = tcp_write( sd, (char*)env, sizeof(*env) );
+  if (r < 0) { 
+    derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m
+             << " to " << m->get_dest() << endl; 
+    return -1;
+  }
+
+  // payload
+#ifdef TCP_KEEP_CHUNKS
+  // send chunk-wise: every buffer goes out as its own (length, data) pair
+  int i = 0;
+  for (list<bufferptr>::const_iterator it = blist.buffers().begin();
+       it != blist.buffers().end();
+       it++) {
+    dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl;
+    int size = (*it).length();
+    r = tcp_write( sd, (char*)&size, sizeof(size) );
+    if (r < 0) { 
+      derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl; 
+      return -1;
+    }
+    r = tcp_write( sd, (*it).c_str(), size );
+    if (r < 0) { 
+      derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; 
+      return -1;
+    }
+    i++;
+  }
+#else
+  // one big chunk: a single total length, then every buffer's bytes back to back
+  int size = blist.length();
+  r = tcp_write( sd, (char*)&size, sizeof(size) );
+  if (r < 0) { 
+    derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; 
+    return -1;
+  }
+  dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl;
+
+  for (list<bufferptr>::const_iterator it = blist.buffers().begin();
+       it != blist.buffers().end();
+       it++) {
+    if ((*it).length() == 0) continue;  // blank buffer.
+    r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() );
+    if (r < 0) { 
+      derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; 
+      return -1;
+    }
+  }
+#endif
+  
+  return 0;
+}
+
+
+void Rank::Pipe::fail(list<Message*>& out)
+{
+  // Give up on this pipe: drop its rank_pipe registration, then report every
+  // unsent message (out first, then anything still queued) to the dispatcher
+  // of the local entity that sent it.  Queued MSG_CLOSE messages are freed.
+  derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl;
+
+  // tell namer
+  if (!rank.messenger) {
+    derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl;
+    assert(0);
+  }
+
+  // FIXME: possible race before i reclaim lock here?
+
+  // deactivate myself
+  rank.lock.Lock();
+  hash_map<int,Pipe*>::iterator reg = rank.rank_pipe.find(peer_inst.rank);
+  if (reg != rank.rank_pipe.end() && reg->second == this)
+    rank.rank_pipe.erase(reg);
+  rank.lock.Unlock();
+
+  // what do i do about reader()?   FIXME
+
+  // sort my messages by (source) dispatcher, dest.
+  typedef map<msg_addr_t, list<Message*> > by_dest_t;
+  typedef map<Dispatcher*, by_dest_t> by_dis_t;
+  by_dis_t by_dis;
+
+  lock.Lock();
+  q.splice(q.begin(), out);   // include out at front of queue
+  while (!q.empty()) {
+    Message *m = q.front();
+    q.pop_front();
+
+    if (m->get_type() == MSG_CLOSE) {
+      delete m;
+    }
+    else if (rank.local.count(m->get_source())) {
+      Dispatcher *dis = rank.local[m->get_source()]->get_dispatcher();
+      by_dis[dis][m->get_dest()].push_back(m);
+    }
+    else {
+      // oh well.  sending entity musta just shut down?
+      assert(0);
+      delete m;
+    }
+  }
+  lock.Unlock();
+
+  // report failure(s) to dispatcher(s)
+  for (by_dis_t::iterator d = by_dis.begin(); d != by_dis.end(); ++d) {
+    for (by_dest_t::iterator t = d->second.begin(); t != d->second.end(); ++t) {
+      list<Message*>& failed = t->second;
+      for (list<Message*>::iterator f = failed.begin();
+           f != failed.end();
+           ++f) {
+        derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **f << " to " << t->first << " inst " << peer_inst << endl;
+        d->first->ms_handle_failure(*f, t->first, peer_inst);
+      }
+    }
+  }
+}
+
+
+
+
+
+
+/********************************************
+ * Rank
+ */
+
+Rank::Rank(int r)   // r: my rank; start_rank() treats a negative value as "ask namer0 for one"
+  : single_dispatcher(this), my_rank(r), namer(0)
+{
+  // no namer until start_namer() creates one
+}
+Rank::~Rank()
+{
+  //FIXME
+  delete namer;   // deleting a null pointer is a no-op, so no guard is needed
+}
+
+
+void Rank::_submit_single_dispatch(Message *m)
+{
+  // Caller holds lock.  Queue m for the single dispatch thread, or park it in
+  // waiting_for_ready if its destination is not a local, ready entity.
+  assert(lock.is_locked());
+  if (!local.count(m->get_dest()) || !local[m->get_dest()]->is_ready()) {
+    waiting_for_ready[m->get_dest()].push_back(m);
+    return;
+  }
+  rank.single_dispatch_queue.push_back(m);
+  rank.single_dispatch_cond.Signal();
+}
+
+
+void Rank::single_dispatcher_entry()
+{
+  // Body of the single dispatch thread: deliver queued messages in batches
+  // until single_dispatch_stop is set and the queue has drained.
+  lock.Lock();
+  while (!single_dispatch_stop || !single_dispatch_queue.empty()) {
+    if (single_dispatch_queue.empty()) {
+      single_dispatch_cond.Wait(lock);   // nothing to do yet
+      continue;
+    }
+
+    // grab the whole batch, then deliver it with the lock released
+    list<Message*> batch;
+    batch.swap(single_dispatch_queue);
+    lock.Unlock();
+
+    while (!batch.empty()) {
+      Message *m = batch.front();
+      batch.pop_front();
+      dout(1) //<< g_clock.now() 
+              << "---- " 
+              << m->get_source()// << ':' << m->get_source_port() 
+              << " to " << m->get_dest()// << ':' << m->get_dest_port()
+              << " ---- " << m->get_type_name() 
+              << " ---- " << m 
+              << endl;
+      if (m->get_dest().type() != MSG_ADDR_RANK_BASE) {
+        assert(local.count(m->get_dest()));
+        local[m->get_dest()]->dispatch(m);
+      } else {
+        rank.dispatch(m);
+      }
+    }
+    lock.Lock();
+  }
+  lock.Unlock();
+}
+
+
+/*
+ * note: assumes lock is held
+ */
+void Rank::reaper()
+{
+  // Caller holds lock.  Join and free every pipe sitting in pipe_reap_queue.
+  dout(10) << "reaper" << endl;
+  assert(lock.is_locked());
+  while (!pipe_reap_queue.empty()) {
+    Pipe *dead = pipe_reap_queue.front();
+    pipe_reap_queue.pop_front();
+    dout(10) << "reaper reaping pipe " << dead->get_peer_inst() << endl;
+    assert(pipes.count(dead));
+    pipes.erase(dead);
+    dead->join();
+    dout(10) << "reaper reaped pipe " << dead->get_peer_inst() << endl;
+    delete dead;
+  }
+}
+
+
+int Rank::start_rank()   // bring this rank up: start listening, obtain a rank if needed, create the rank's messenger
+{
+  dout(10) << "start_rank" << endl;
+
+  // bind to a socket
+  if (accepter.start() < 0) 
+    return -1;
+
+  // start single thread dispatcher?
+  if (g_conf.ms_single_dispatch) {
+    single_dispatch_stop = false;
+    single_dispatcher.create();
+  }    
+
+  lock.Lock();
+
+  // my_inst
+  my_inst.addr = accepter.listen_addr;
+  my_inst.rank = my_rank;
+
+  if (my_rank < 0) {
+    dout(10) << "start_rank connecting to namer0" << endl;
+    
+    // connect to namer (its address must already be in entity_map, e.g. via set_namer())
+    assert(entity_map.count(MSG_ADDR_NAMER(0)));
+    Pipe *pipe = connect_rank(entity_map[MSG_ADDR_NAMER(0)]);
+    
+    // send
+    Message *m = new MNSConnect(accepter.listen_addr);
+    m->set_dest(MSG_ADDR_NAMER(0), 0);
+    pipe->send(m);
+    
+    // wait: handle_connect_ack() sets my_rank and signals waiting_for_rank
+    while (my_rank < 0) 
+      waiting_for_rank.Wait(lock);
+    assert(my_rank >= 0);    
+    
+    dout(10) << "start_rank got rank " << my_rank << endl;
+    
+    // create rank entity
+    entity_map[MSG_ADDR_RANK(my_rank)] = my_inst;
+    local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank));
+    messenger->set_dispatcher(this);
+  } else {
+    // create my rank (rank was given to the constructor; this path also adds it to entity_unstarted)
+    msg_addr_t raddr = MSG_ADDR_RANK(my_rank);
+    entity_map[raddr] = my_inst;
+    entity_unstarted.insert(raddr);
+    local[raddr] = messenger = new EntityMessenger(raddr);
+    messenger->set_dispatcher(this);
+    
+    dout(1) << "start_rank " << my_rank << " at " << my_inst << endl;
+  } 
+
+  lock.Unlock();
+  return 0;
+}
+
+void Rank::start_namer()   // host namer0 on this rank and point namer_inst at ourselves
+{
+  msg_addr_t naddr = MSG_ADDR_NAMER(0);
+  entity_map[naddr] = my_inst;
+  EntityMessenger *msgr = new EntityMessenger(naddr);   // namer0's local messenger
+  local[naddr] = msgr;
+  namer = new Namer(msgr);
+  namer_inst = my_inst;
+}
+
+void Rank::set_namer(const tcpaddr_t& ns) {  // record namer0 as rank 0 at address ns, in both entity_map and namer_inst
+  entity_inst_t& known = entity_map[MSG_ADDR_NAMER(0)];
+  namer_inst.rank = known.rank = 0;
+  namer_inst.addr = known.addr = ns;
+}
+
+/* connect_rank
+ * NOTE: assumes rank.lock held.
+ */
+Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst)
+{
+  // Create a Pipe for inst, make it the registered pipe for inst.rank, and
+  // add it to pipes.  The caller must hold rank.lock; inst must not be us.
+  assert(rank.lock.is_locked());
+  assert(inst != rank.my_inst);
+  dout(10) << "connect_rank to " << inst << endl;
+
+  Pipe *fresh = new Pipe(inst);
+  pipes.insert(fresh);
+  rank.rank_pipe[inst.rank] = fresh;
+
+  return fresh;
+}
+
+
+
+
+
+void Rank::show_dir()
+{
+  // Debug aid: log every entity_map entry, tagging the ones that are also in local.
+  dout(10) << "show_dir ---" << endl;
+
+  typedef hash_map<msg_addr_t, entity_inst_t>::iterator dir_iter;
+
+  for (dir_iter e = entity_map.begin(); e != entity_map.end(); ++e) {
+    const char *tag = "";
+    if (local.count(e->first))
+      tag = " local ";
+    dout(10) << "show_dir entity_map " << e->first << " -> " << e->second << tag << endl;
+  }
+}
+
+
+/* lookup
+ * NOTE: assumes the rank lock (Rank::lock) is held by the caller
+ */
+void Rank::lookup(msg_addr_t addr)
+{
+  // Send namer0 an MNSLookup for addr and mark addr as being looked up.
+  // The caller holds lock; asking again while a lookup is pending asserts.
+  dout(10) << "lookup " << addr << endl;
+  assert(lock.is_locked());
+  assert(looking_up.count(addr) == 0);
+
+  looking_up.insert(addr);
+  messenger->send_message(new MNSLookup(addr), MSG_ADDR_NAMER(0), namer_inst);
+}
+
+
+
+/* register_entity 
+ */
+Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr)   // register addr with the directory, block for the ack, return a new local messenger
+{
+  dout(10) << "register_entity " << addr << endl;
+  lock.Lock();
+  
+  // register with namer; id ties the eventual ack back to this call
+  static long reg_attempt = 0;
+  long id = ++reg_attempt;
+  
+  Message *reg = new MNSRegister(addr, my_rank, id);
+  reg->set_source(MSG_ADDR_RANK(my_rank), 0);
+  reg->set_source_inst(my_inst);
+  reg->set_dest(MSG_ADDR_DIRECTORY, 0);
+  
+  // prepare cond (lives on this stack frame; its map entry is erased again before we return)
+  Cond cond;
+  waiting_for_register_cond[id] = &cond;
+  
+  // send request
+  lock.Unlock();
+  submit_message(reg);
+  lock.Lock();
+  
+  // wait until handle_register_ack() has stored a result for id
+  while (!waiting_for_register_result.count(id))
+    cond.Wait(lock);
+  
+  // grab result (the ack's address replaces the one we asked for)
+  addr = waiting_for_register_result[id];
+  dout(10) << "register_entity got " << addr << endl;
+  
+  // clean up
+  waiting_for_register_cond.erase(id);
+  waiting_for_register_result.erase(id);
+  
+  // create messenger
+  EntityMessenger *msgr = new EntityMessenger(addr);
+
+  // add to directory
+  entity_map[addr] = my_inst;
+  local[addr] = msgr;
+
+  // was anyone waiting?  (submit_messages runs with lock still held here)
+  if (waiting_for_lookup.count(addr)) {
+    submit_messages(waiting_for_lookup[addr]);
+    waiting_for_lookup.erase(addr);
+  }
+
+  lock.Unlock();
+  return msgr;
+}
+
+void Rank::unregister_entity(EntityMessenger *msgr)
+{
+  // Remove msgr's entity from the local tables, tell namer0 about it (unless it
+  // is namer0 or rank0 itself), and wake wait() once few local entities remain.
+  lock.Lock();
+  msg_addr_t who = msgr->get_myaddr();
+  dout(10) << "unregister_entity " << who << endl;
+
+  // remove from local directory.
+  assert(local.count(who));
+  local.erase(who);
+  if (my_rank > 0) {
+    assert(entity_map.count(who));
+    entity_map.erase(who);
+  } // else namer will do it.
+
+  // tell namer.
+  if (who != MSG_ADDR_NAMER(0) && who != MSG_ADDR_RANK(0))
+    msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER),
+                       MSG_ADDR_NAMER(0), namer_inst);
+
+  // kick wait()?
+  if (local.size() <= 2)
+    wait_cond.Signal();
+  lock.Unlock();
+}
+
+
+void Rank::submit_messages(list<Message*>& ls)   // submit every message in ls, in order, then empty ls
+{
+  for (list<Message*>::iterator cur = ls.begin(); cur != ls.end(); ++cur)
+    submit_message(*cur);
+  ls.clear();
+}
+
+
+
+void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)   // send m to its dest at a known instance (no directory lookup)
+{
+  const msg_addr_t dest = m->get_dest();
+
+  // lookup: at most one of these is set under the lock; the hand-off happens after it is released
+  EntityMessenger *entity = 0;
+  Pipe *pipe = 0;
+
+  lock.Lock();
+  {
+    // local?
+    if (dest_inst.rank == my_inst.rank) {
+      if (local.count(dest)) {
+        // local
+        dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+        if (g_conf.ms_single_dispatch) {
+          _submit_single_dispatch(m);
+        } else {
+          entity = local[dest];
+        }
+      } else {
+        // mid-register
+        dout(20) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but mid-register, waiting." << endl;
+        assert(0);  // hmpf
+        waiting_for_lookup[dest].push_back(m);
+      }
+    }
+    else {
+      // remote.  NOTE(review): unlike the one-argument overload, an existing pipe is reused without comparing its peer inst.
+      if (rank_pipe.count( dest_inst.rank )) {
+        //&&
+        //rank_pipe[dest_inst.rank]->inst == dest_inst) {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl;
+        // connected.
+        pipe = rank_pipe[ dest_inst.rank ];
+      } else {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl;
+        // not connected.
+        pipe = connect_rank( dest_inst );
+      }
+    }
+  }
+  lock.Unlock();
+  
+  // do it
+  if (entity) {  
+    // local!
+    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+    entity->queue_message(m);
+  } 
+  else if (pipe) {
+    // remote!
+    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+    pipe->send(m);
+  } 
+}
+
+
+void Rank::submit_message(Message *m)   // send m to its dest, resolving the destination through local / entity_map / namer lookup
+{
+  const msg_addr_t dest = m->get_dest();
+
+  // lookup: at most one of these is set under the lock; the hand-off happens after it is released
+  EntityMessenger *entity = 0;
+  Pipe *pipe = 0;
+
+  lock.Lock();
+  {
+    if (local.count(dest)) {
+      dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+
+      // local
+      if (g_conf.ms_single_dispatch) {
+        _submit_single_dispatch(m);
+      } else {
+        entity = local[dest];
+      }
+    } else if (entity_map.count( dest )) {
+      // remote, known rank addr.
+      entity_inst_t inst = entity_map[dest];
+
+      if (inst == my_inst) {
+        dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl;
+        waiting_for_lookup[dest].push_back(m);
+      }
+      else if (rank_pipe.count( inst.rank ) &&
+          rank_pipe[inst.rank]->get_peer_inst() == inst) {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl;
+        // connected.
+        pipe = rank_pipe[ inst.rank ];
+      } else {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl;
+        // not connected.  NOTE(review): connect_rank() overwrites rank_pipe[inst.rank]; a stale pipe there is not closed on this path.
+        pipe = connect_rank( inst );
+      }
+    } else {
+      // unknown dest rank or rank addr: park m until a lookup reply arrives
+      if (looking_up.count(dest) == 0) {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl;
+        lookup(dest);
+      } else {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl;
+      }
+      waiting_for_lookup[dest].push_back(m);
+    }
+  }
+  lock.Unlock();
+  
+  // do it
+  if (entity) {  
+    // local!
+    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+    entity->queue_message(m);
+  } 
+  else if (pipe) {
+    // remote!
+    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+    pipe->send(m);
+  } 
+}
+
+
+
+
+void Rank::dispatch(Message *m) 
+{
+  // Handle a message addressed to the rank itself.  Only the three name-service
+  // replies are expected (anything else asserts); each handler frees m.
+  lock.Lock();
+  dout(10) << "dispatching " << *m << endl;
+
+  const int type = m->get_type();
+
+  if (type == MSG_NS_CONNECTACK) {
+    handle_connect_ack((MNSConnectAck*)m);
+  }
+  else if (type == MSG_NS_REGISTERACK) {
+    handle_register_ack((MNSRegisterAck*)m);
+  }
+  else if (type == MSG_NS_LOOKUPREPLY) {
+    handle_lookup_reply((MNSLookupReply*)m);
+  }
+  else {
+    // not something the rank knows how to handle
+    assert(0);
+  }
+
+  lock.Unlock();
+}
+
+void Rank::handle_connect_ack(MNSConnectAck *m) 
+{
+  // namer0 has assigned us a rank: adopt it, rebuild my_inst, and wake start_rank().
+  const int assigned = m->get_rank();
+  delete m;
+  dout(10) << "handle_connect_ack, my rank is " << assigned << endl;
+
+  my_rank = assigned;
+  my_inst.rank = assigned;
+  my_inst.addr = accepter.listen_addr;
+  waiting_for_rank.SignalAll();
+  // logger!
+  /*dout(10) << "logger" << endl;
+  char names[100];
+  sprintf(names, "rank%d", my_rank);
+  string name = names;
+  
+  if (g_conf.tcp_log) {
+    logger = new Logger(name, (LogType*)&rank_logtype);
+    rank_logtype.add_set("num");
+    rank_logtype.add_inc("in");
+    rank_logtype.add_inc("inb");
+    rank_logtype.add_inc("dis");
+    rank_logtype.add_set("inq");
+    rank_logtype.add_set("inqb");
+    rank_logtype.add_set("outq");
+    rank_logtype.add_set("outqb");
+  }
+  */
+}
+
+
+void Rank::handle_register_ack(MNSRegisterAck *m)   // hand the ack's entity to the register_entity() call waiting on its tid; frees m
+{
+  dout(10) << "handle_register_ack " << m->get_entity() << endl;
+  if (waiting_for_register_cond.count(m->get_tid())) {   // no waiter: operator[] would insert a null Cond* and crash
+    waiting_for_register_result[m->get_tid()] = m->get_entity();
+    waiting_for_register_cond[m->get_tid()]->Signal();
+  }
+  delete m;
+}
+
+void Rank::handle_lookup_reply(MNSLookupReply *m) 
+{
+  // Apply a lookup reply: update entity_map and release the messages parked in waiting_for_lookup.  Frees m.
+  dout(10) << "got lookup reply" << endl;
+
+  for (map<msg_addr_t,entity_inst_t>::iterator it = m->entity_map.begin();
+       it != m->entity_map.end();
+       it++) {
+    dout(10) << "lookup got " << it->first << " at " << it->second << endl;
+    msg_addr_t addr = it->first;
+    entity_inst_t inst = it->second;
+    looking_up.erase(addr);   // answered: lookup() asserts addr is not pending, and the retry below calls it
+    if (entity_map.count(addr) &&
+        entity_map[addr] > inst) {
+      dout(10) << "ignoring lookup results for " << addr << ", "
+               << entity_map[addr] << " > " << inst << endl;
+      continue;
+    }
+
+    // update map.
+    entity_map[addr] = inst;
+
+    if (inst.rank == my_rank) {
+      // local
+      dout(10) << "delivering lookup results locally" << endl;
+      if (local.count(addr)) {
+        if (g_conf.ms_single_dispatch) {
+          single_dispatch_queue.splice(single_dispatch_queue.end(), waiting_for_lookup[addr]);
+          single_dispatch_cond.Signal();   // wake the dispatch thread, as the other enqueue sites do
+        } else {
+          local[addr]->queue_messages(waiting_for_lookup[addr]);
+        }
+        waiting_for_lookup.erase(addr);
+      } else
+        lookup(addr);  // try again!
+
+    } else {
+      // remote
+      if (rank_pipe.count(inst.rank) == 0) 
+        connect_rank(inst);
+      else if (rank_pipe[inst.rank]->get_peer_inst() != inst) {
+        dout(0) << "lookup got rank addr change, WATCH OUT" << endl;
+        // FIXME BUG possible message loss weirdness?
+        rank_pipe[inst.rank]->close();
+        rank_pipe.erase(inst.rank);
+        connect_rank(inst);
+      }
+      
+      // take waiters
+      Pipe *pipe = rank_pipe[inst.rank];
+      assert(pipe);
+      
+      if (waiting_for_lookup.count(addr)) {
+        pipe->send(waiting_for_lookup[addr]);
+        waiting_for_lookup.erase(addr);
+      }
+    }
+  }
+
+  delete m;
+}
+
+
+void Rank::wait()
+{
+  // Block until the local entities are gone, shutting the rank's own messenger down when only one
+  // is left; then stop the single-dispatch thread (if used) and close and reap every pipe.
+  lock.Lock();
+  while (1) {
+    reaper();   // reap dead pipes
+
+    if (local.size() == 0) {
+      dout(10) << "wait: everything stopped" << endl;
+      break;   // everything stopped.
+    }
+
+    if (local.size() == 1 &&
+        !messenger->is_stopped()) {
+      dout(10) << "wait: stopping rank" << endl;
+      lock.Unlock();
+      messenger->shutdown();
+      delete messenger;   // NOTE(review): messenger is left dangling; confirm shutdown() always empties local before the next pass
+      lock.Lock();
+      continue;
+    }
+
+    wait_cond.Wait(lock);
+  }
+  lock.Unlock();
+
+  // done!  clean up.
+  // stop dispatch thread
+  if (g_conf.ms_single_dispatch) {
+    dout(10) << "wait: stopping dispatch thread" << endl;
+    lock.Lock();
+    single_dispatch_stop = true;
+    single_dispatch_cond.Signal();
+    lock.Unlock();
+    single_dispatcher.join();
+  }
+
+  // reap pipes
+  lock.Lock();
+  {
+    dout(10) << "wait: closing pipes" << endl;
+    list<Pipe*> toclose;   // copy first: Pipe::close() erases entries from rank_pipe
+    for (hash_map<int,Pipe*>::iterator i = rank_pipe.begin();
+         i != rank_pipe.end();
+         i++)
+      toclose.push_back(i->second);
+    for (list<Pipe*>::iterator i = toclose.begin();
+         i != toclose.end();
+         i++)
+      (*i)->close();
+    dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl;
+    reaper();   // pipes queued for reaping while we weren't waiting signalled nobody; collect them before sleeping
+    while (!pipes.empty()) {
+      wait_cond.Wait(lock);
+      reaper();
+    }
+  }
+  lock.Unlock();
+
+  dout(10) << "wait: done." << endl;
+}
+
+
+
+int Rank::find_ns_addr(tcpaddr_t &nsa)
+{
+  // Locate the name server: first a raw tcpaddr_t stored in ./.ceph_ns, then a
+  // hostname in $CEPH_NAMESERVER.  Fills nsa and returns 0, or returns -1.
+  int fd = ::open(".ceph_ns",O_RDONLY);
+  if (fd >= 0) {   // 0 is a valid descriptor
+    int got = ::read(fd, (void*)&nsa, sizeof(nsa));
+    ::close(fd);
+    if (got == (int)sizeof(nsa)) {
+      cout << "ceph ns is " << nsa << endl;
+      return 0;
+    }
+    cerr << ".ceph_ns is short or unreadable, ignoring it" << endl;
+  }
+
+  // env var?
+  char *nsaddr = getenv("CEPH_NAMESERVER");
+  if (nsaddr) {
+    // getenv() returns just the value (no "NAME=" prefix), so use it as-is
+    if (tcp_hostlookup(nsaddr, nsa) < 0) {
+      cout << "can't resolve " << nsaddr << endl;
+      return -1;
+    }
+    cout << "ceph ns is " << nsa << endl;
+    return 0;
+  }
+  cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl;
+  return -1;
+}
+
+
+
+/**********************************
+ * EntityMessenger
+ */
+
+// Bind to myaddr; the dispatch thread is only constructed here (ready() creates it).
+Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr)
+  : Messenger(myaddr),
+    stop(false),
+    dispatch_thread(this)
+{}
+// Destructor: performs no explicit cleanup.
+Rank::EntityMessenger::~EntityMessenger()
+{}
+// Dispatch-thread main loop: deliver queued messages in batches until stop is set.
+void Rank::EntityMessenger::dispatch_entry()
+{
+  lock.Lock();
+  while (!stop) {
+    if (dispatch_queue.empty()) {
+      // nothing queued: sleep until queue_message()/shutdown() signals cond
+      cond.Wait(lock);
+      continue;
+    }
+    // take the whole backlog so delivery runs without holding the lock
+    list<Message*> batch;
+    batch.swap(dispatch_queue);
+    lock.Unlock();
+
+    // deliver
+    while (!batch.empty()) {
+      Message *msg = batch.front();
+      batch.pop_front();
+      dout(1)
+        << "---- "
+        << msg->get_source()
+        << " to " << msg->get_dest()
+        << " ---- " << msg->get_type_name()
+        << " ---- " << msg->get_source_inst()
+        << " ---- " << msg
+        << endl;
+      dispatch(msg);
+    }
+    lock.Lock();
+  }
+  lock.Unlock();
+}
+// Mark this entity ready: start (or release) message delivery, then notify the namer.
+void Rank::EntityMessenger::ready()
+{
+  dout(10) << "ready " << get_myaddr() << endl;
+
+  if (!g_conf.ms_single_dispatch) {
+    // start my dispatch thread
+    dispatch_thread.create();
+  } else {
+    // single-dispatch mode: move anything parked for us onto the shared queue
+    rank.lock.Lock();
+    map<msg_addr_t, list<Message*> >::iterator parked = rank.waiting_for_ready.find(get_myaddr());
+    if (parked != rank.waiting_for_ready.end()) {
+      rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), parked->second);
+      rank.waiting_for_ready.erase(parked);
+      rank.single_dispatch_cond.Signal();
+    }
+    rank.lock.Unlock();
+  }
+
+  // tell namer
+  if (get_myaddr() != MSG_ADDR_NAMER(0) && get_myaddr() != MSG_ADDR_RANK(0))
+    send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0), rank.namer_inst);
+}
+
+// Unregister from the rank and stop the dispatch thread.  Always returns 0.
+int Rank::EntityMessenger::shutdown()
+{
+  dout(10) << "shutdown " << get_myaddr() << endl;
+  // deregister
+  rank.unregister_entity(this);
+
+  // stop my dispatch thread
+  if (!dispatch_thread.am_self()) {
+    // called from another thread: set the flag under the lock, wake the loop, join
+    dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl;
+    lock.Lock();
+    stop = true;
+    cond.Signal();
+    lock.Unlock();
+    dispatch_thread.join();
+    return 0;
+  }
+  // called on the dispatch thread itself: only set the flag, no join
+  dout(1) << "shutdown i am dispatch, setting stop flag" << endl;
+  stop = true;
+  return 0;
+}
+
+// Make sure the rank has a pipe entry for inst.rank, connecting if it does not.
+// The check and the connect both happen under rank.lock.
+void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst)
+{
+  rank.lock.Lock();
+  const bool have_pipe = rank.rank_pipe.count(inst.rank) != 0;
+  if (!have_pipe)
+    rank.connect_rank(inst);
+  rank.lock.Unlock();
+}
+// Send m to dest at a known instance: fill in the envelope (source, dest, source
+// instance), log it, and pass it to the rank.  Always returns 0.
+int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest,
+                                        const entity_inst_t& inst,
+                                        int port, int fromport)
+{
+  // set envelope
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+  m->set_source_inst(rank.my_inst);
+
+  dout(1) << "--> "
+          << m->get_source() << " to " << m->get_dest()
+          << " ---- " << m->get_type_name()
+          << " ---- " << rank.my_inst << " --> " << inst
+          << " ---- " << m
+          << endl;
+
+  // hand over to the rank for delivery to inst
+  rank.submit_message(m, inst);
+  return 0;
+}
+
+// Deprecated variant without a destination instance (the log line says so):
+// fills in the envelope, logs, and passes m to the rank.  Always returns 0.
+int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+{
+  // set envelope
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+  m->set_source_inst(rank.my_inst);
+
+  dout(1) << "--> "
+          << m->get_source() << " to " << m->get_dest()
+          << " ---- " << m->get_type_name()
+          << " ---- " << rank.my_inst << " --> ? (DEPRECATED)"
+          << " ---- " << m
+          << endl;
+
+  // no instance supplied: use the single-argument submit
+  rank.submit_message(m);
+
+  return 0;
+}
+
+// Report another entity as down; passing our own address is a caller bug (asserted).
+void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i)
+{
+  assert(a != get_myaddr());   // must refer to somebody else
+  rank.mark_down(a, i);
+}
+// Entity a at instance inst was reported down: drop what we know about it unless our record compares greater.
+void Rank::mark_down(msg_addr_t a, entity_inst_t& inst)
+{
+  //if (my_rank == 0) return;   // ugh.. rank0 already handles this stuff in the namer
+  lock.Lock();
+  if (entity_map.count(a) == 0) {
+    // don't know it
+    dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+    // forget pending lookup state; NOTE(review): Message* still queued for a are not deleted here
+    waiting_for_lookup.erase(a);
+    looking_up.erase(a);
+  }
+  else if (entity_map[a] > inst) {
+    // the instance on record compares greater than the reported one: leave it alone
+    dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+    // do nothing!
+  }
+  else {
+    // know it
+    assert(entity_map[a] <= inst);
+    dout(10) << "mark_down " << a << " inst " << inst << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << endl;
+    entity_map.erase(a);
+
+    // close the pipe to that rank, then drop the entry (erase by key, after close)
+    if (rank_pipe.count(inst.rank)) {
+      Pipe *gone = rank_pipe[inst.rank];
+      gone->close();
+      rank_pipe.erase(inst.rank);
+    }
+
+    // kill rank# too?  only if i'm the namer.
+    if (my_rank == 0)
+      entity_map.erase(MSG_ADDR_RANK(inst.rank));
+  }
+  lock.Unlock();
+}
+// Report another entity as up; passing our own address is a caller bug (asserted).
+void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  assert(a != get_myaddr());   // must refer to somebody else
+  rank.mark_up(a, i);
+}
+// Entity a was reported up at instance i: record it and connect unless we already hold an equal or greater one.
+void Rank::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  if (my_rank == 0) return;
+  lock.Lock();
+  dout(10) << "mark_up " << a << " inst " << i << endl;
+  derr(10) << "mark_up " << a << " inst " << i << endl;
+
+  assert(i.rank != my_rank);     // hrm?
+  hash_map<msg_addr_t, entity_inst_t>::iterator known = entity_map.find(a);
+  if (known == entity_map.end() || known->second < i) {
+    // nothing on record, or the record compares less than i: store i and connect
+    entity_map[a] = i;
+    connect_rank(i);
+  }
+  else if (known->second == i) {
+    dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+    derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+  }
+  else {
+    dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
+    derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
+  }
+
+  //if (waiting_for_lookup.count(a))
+  //lookup(a);
+  lock.Unlock();
+}
+
diff --git a/branches/sage/cephmds2/msg/NewerMessenger.h b/branches/sage/cephmds2/msg/NewerMessenger.h
new file mode 100644
index 0000000000000..6a4e003352aa8
--- /dev/null
+++ b/branches/sage/cephmds2/msg/NewerMessenger.h
@@ -0,0 +1,343 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __NEWMESSENGER_H
+#define __NEWMESSENGER_H
+
+
+#include <list>
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+
+#include "include/types.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "Messenger.h"
+#include "Message.h"
+#include "tcp.h"
+
+
+
+
+/* Rank - per-process messenger: local entities, name lookup state, and pipes to other ranks.
+ */
+class Rank : public Dispatcher {
+ 
+  class EntityMessenger;
+  class Pipe;
+
+  // namer
+  class Namer : public Dispatcher {
+  public:
+    EntityMessenger *messenger;  // namerN
+
+    int nrank;
+    int nclient, nmds, nosd, nmon;
+    
+    map<msg_addr_t, list<Message*> > waiting;
+
+    Namer(EntityMessenger *msgr);
+    ~Namer();
+    // one handler per name-service message type; the definitions are not in this header
+    void handle_connect(class MNSConnect*);
+    void handle_register(class MNSRegister *m);
+    void handle_started(Message *m);
+    void handle_lookup(class MNSLookup *m);
+    void handle_unregister(Message *m);
+    void handle_failure(class MNSFailure *m);
+
+    void dispatch(Message *m); 
+
+    void manual_insert_inst(const entity_inst_t &inst);
+
+  };
+
+  // incoming
+  class Accepter : public Thread {
+  public:
+    bool done;
+
+    tcpaddr_t listen_addr;
+    int       listen_sd;
+    
+    Accepter() : done(false) {}
+    
+    void *entry();
+    void stop() {   // flag done, close the listening socket, then join the thread
+      done = true;
+      ::close(listen_sd);
+      join();
+    }
+    int start();
+  } accepter;
+  
+
+  // Pipe: a socket (sd) to one peer, with a locked send queue and reader/writer threads
+  class Pipe {
+  protected:
+    int sd;
+    bool done;
+    entity_inst_t peer_inst;
+    bool server;
+    bool sent_close;
+
+    bool reader_running;
+    bool writer_running;
+
+    list<Message*> q;
+    Mutex lock;
+    Cond cond;
+    
+    int accept();   // server handshake
+    int connect();  // client handshake
+    void reader();
+    void writer();
+
+    Message *read_message();
+    int write_message(Message *m);
+    void fail(list<Message*>& ls);
+
+    // threads
+    class Reader : public Thread {
+      Pipe *pipe;
+    public:
+      Reader(Pipe *p) : pipe(p) {}
+      void *entry() { pipe->reader(); return 0; }
+    } reader_thread;
+    friend class Reader;
+
+    class Writer : public Thread {
+      Pipe *pipe;
+    public:
+      Writer(Pipe *p) : pipe(p) {}
+      void *entry() { pipe->writer(); return 0; }
+    } writer_thread;
+    friend class Writer;
+
+  public:
+    Pipe(int s) : sd(s),
+      done(false), server(true), 
+      sent_close(false),
+      reader_running(false), writer_running(false),
+      reader_thread(this), writer_thread(this) {
+      // server: only the reader thread is started here
+      reader_running = true;
+      reader_thread.create();
+    }
+    Pipe(const entity_inst_t &pi) : sd(0),
+      done(false), peer_inst(pi), server(false), 
+      sent_close(false),
+      reader_running(false), writer_running(false),
+      reader_thread(this), writer_thread(this) {
+      // client: only the writer thread is started here
+      writer_running = true;
+      writer_thread.create();
+    }
+
+    // public constructors
+    static const Pipe& Server(int s);
+    static const Pipe& Client(const entity_inst_t& pi);
+
+    entity_inst_t& get_peer_inst() { return peer_inst; }
+
+    void close();
+    void join() {   // waits for the writer thread, then the reader thread
+      writer_thread.join();
+      reader_thread.join();
+    }
+
+    void send(Message *m) {   // queue one message and signal cond
+      lock.Lock();
+      q.push_back(m);
+      cond.Signal();
+      lock.Unlock();
+    }    
+    void send(list<Message*>& ls) {   // splice all of ls onto the queue (ls ends up empty), signal cond
+      lock.Lock();
+      q.splice(q.end(), ls);
+      cond.Signal();
+      lock.Unlock();
+    }
+  };
+
+
+
+  // messenger interface
+  class EntityMessenger : public Messenger {
+    Mutex lock;   // taken by queue_message()/queue_messages() around dispatch_queue
+    Cond cond;
+    list<Message*> dispatch_queue;
+    bool stop;
+
+    class DispatchThread : public Thread {
+      EntityMessenger *m;
+    public:
+      DispatchThread(EntityMessenger *_m) : m(_m) {}
+      void *entry() {
+        m->dispatch_entry();
+        return 0;
+      }
+    } dispatch_thread;
+    void dispatch_entry();
+
+  public:
+    void queue_message(Message *m) {
+      lock.Lock();
+      dispatch_queue.push_back(m);
+      cond.Signal();
+      lock.Unlock();
+    }
+    void queue_messages(list<Message*> ls) {   // ls is taken by value: the caller's list is left untouched
+      lock.Lock();
+      dispatch_queue.splice(dispatch_queue.end(), ls);
+      cond.Signal();
+      lock.Unlock();
+    }
+
+  public:
+    EntityMessenger(msg_addr_t myaddr);
+    ~EntityMessenger();
+
+    void ready();
+    bool is_stopped() { return stop; }
+
+    void wait() {   // joins the dispatch thread
+      dispatch_thread.join();
+    }
+    
+    virtual void callback_kick() {} 
+    virtual int shutdown();
+    virtual void prepare_dest(const entity_inst_t& inst);
+    virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+    virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst,
+							 int port=0, int fromport=0);
+
+    virtual void mark_down(msg_addr_t a, entity_inst_t& i);
+    virtual void mark_up(msg_addr_t a, entity_inst_t& i);
+    //virtual void reset(msg_addr_t a);
+  };
+
+  // thread whose entry() runs single_dispatcher_entry() on the Rank it was given
+  class SingleDispatcher : public Thread {
+    Rank *rank;
+  public:
+    SingleDispatcher(Rank *r) : rank(r) {}
+    void *entry() {
+      rank->single_dispatcher_entry();
+      return 0;
+    }
+  } single_dispatcher;
+
+  Cond            single_dispatch_cond;
+  bool            single_dispatch_stop;
+  list<Message*>  single_dispatch_queue;
+
+  map<msg_addr_t, list<Message*> > waiting_for_ready;
+
+  void single_dispatcher_entry();
+  void _submit_single_dispatch(Message *m);
+
+
+  // Rank stuff
+ public:
+  Mutex lock;
+  Cond  wait_cond;  // for wait()
+  
+  // my rank
+  int   my_rank;
+  Cond  waiting_for_rank;
+
+  // my instance
+  entity_inst_t my_inst;
+  
+  // lookup
+  hash_map<msg_addr_t, entity_inst_t> entity_map;
+  hash_set<msg_addr_t>                entity_unstarted;
+  
+  map<msg_addr_t, list<Message*> > waiting_for_lookup;
+  set<msg_addr_t>                  looking_up;
+
+  // register
+  map<int, Cond* >        waiting_for_register_cond;
+  map<int, msg_addr_t >   waiting_for_register_result;
+  
+  // local
+  map<msg_addr_t, EntityMessenger*> local;
+  
+  // remote
+  hash_map<int, Pipe*> rank_pipe;
+
+  set<Pipe*>      pipes;
+  list<Pipe*>     pipe_reap_queue;
+    
+  EntityMessenger *messenger;   // rankN
+  Namer           *namer;
+
+  entity_inst_t    namer_inst;
+
+  void show_dir();
+
+  void lookup(msg_addr_t addr);
+  
+  void dispatch(Message *m);
+  void handle_connect_ack(class MNSConnectAck *m);
+  void handle_register_ack(class MNSRegisterAck *m);
+  void handle_lookup_reply(class MNSLookupReply *m);
+  
+  Pipe *connect_rank(const entity_inst_t& inst);
+
+  void mark_down(msg_addr_t addr, entity_inst_t& i);
+  void mark_up(msg_addr_t addr, entity_inst_t& i);
+
+  tcpaddr_t get_listen_addr() { return accepter.listen_addr; }
+
+  void reaper();
+
+
+public:
+  Rank(int r=-1);
+  ~Rank();
+
+  int find_ns_addr(tcpaddr_t &tcpaddr);
+
+  void set_namer(const tcpaddr_t& ns);
+  void start_namer();
+
+  int start_rank();
+  void wait();
+
+  EntityMessenger *register_entity(msg_addr_t addr);
+  void unregister_entity(EntityMessenger *ms);
+
+  void submit_message(Message *m, const entity_inst_t& inst);  
+  void prepare_dest(const entity_inst_t& inst);
+  void submit_message(Message *m);  
+  void submit_messages(list<Message*>& ls);  
+
+  // create a new messenger
+  EntityMessenger *new_entity(msg_addr_t addr);
+
+} ;
+
+
+
+extern Rank rank;   // declaration only: the Rank object named `rank` is defined in another translation unit
+
+#endif
diff --git a/branches/sage/cephmds2/msg/RWLock.h b/branches/sage/cephmds2/msg/RWLock.h
new file mode 100644
index 0000000000000..83b84c6faf370
--- /dev/null
+++ b/branches/sage/cephmds2/msg/RWLock.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef _RWLock_Posix_
+#define _RWLock_Posix_
+
+#include <pthread.h>
+// Thin wrapper around a POSIX pthread_rwlock_t created with default attributes.
+class RWLock
+{
+  mutable pthread_rwlock_t L;
+
+  public:
+  RWLock() {
+    pthread_rwlock_init(&L, NULL);
+  }
+
+  // The lock must not be held when the object is destroyed.  Unlocking a lock the
+  // caller does not hold is undefined, so the destructor only destroys it.
+  virtual ~RWLock() {
+    pthread_rwlock_destroy(&L);
+  }
+
+  void unlock() {
+    pthread_rwlock_unlock(&L);
+  }
+  void get_read() {
+    pthread_rwlock_rdlock(&L);    
+  }
+  void put_read() { unlock(); }
+  void get_write() {
+    pthread_rwlock_wrlock(&L);
+  }
+  void put_write() { unlock(); }
+};
+
+#endif // !_RWLock_Posix_
diff --git a/branches/sage/cephmds2/msg/SerialMessenger.h b/branches/sage/cephmds2/msg/SerialMessenger.h
new file mode 100644
index 0000000000000..d03e7377d2826
--- /dev/null
+++ b/branches/sage/cephmds2/msg/SerialMessenger.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __SERIAL_MESSENGER_H
+#define __SERIAL_MESSENGER_H
+
+#include "Dispatcher.h"
+#include "Message.h"
+// Abstract messenger interface: incoming messages arrive via dispatch(), outgoing ones go through send()/sendrecv().
+class SerialMessenger : public Dispatcher {
+ public:
+  virtual void dispatch(Message *m) = 0;      // i receive my messages here
+  virtual void send(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0;          // doesn't block
+  virtual Message *sendrecv(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0;  // blocks for matching reply
+};
+
+#endif
diff --git a/branches/sage/cephmds2/msg/TCPDirectory.cc b/branches/sage/cephmds2/msg/TCPDirectory.cc
new file mode 100644
index 0000000000000..111f6ee69f2f3
--- /dev/null
+++ b/branches/sage/cephmds2/msg/TCPDirectory.cc
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "TCPDirectory.h"
+
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+//#include "messages/MNSUnregister.h"
+
+#include "config.h"
+#undef dout   // drop any earlier definition before installing the one below
+#define dout(x)   if (x <= g_conf.debug || x <= g_conf.debug_ns) cout << "nameserver: " 
+// dout(x) above streams to cout, prefixed "nameserver: ", when x <= g_conf.debug or x <= g_conf.debug_ns.
+void tcp_open(int rank);   // declaration only; not defined in this file
+
+// A new process connected: give it the next rank, remember its address, and ack.
+void TCPDirectory::handle_connect(MNSConnect *m)
+{
+  int newrank = nrank++;
+  dout(2) << "connect from new rank " << newrank << " " << m->get_addr() << endl;
+
+  // the rank's own entity address -> rank
+  dir[MSG_ADDR_RANK(newrank)] = newrank;
+  messenger->map_entity_rank(MSG_ADDR_RANK(newrank), newrank);
+
+  // rank -> tcp address
+  rank_addr[newrank] = m->get_addr();
+  messenger->map_rank_addr(newrank, m->get_addr());
+  messenger->send_message(new MNSConnectAck(newrank), MSG_ADDR_RANK(newrank));
+  delete m;
+}
+
+
+// Register an entity for a rank: choose (or validate) its address, put it on hold, and ack.
+void TCPDirectory::handle_register(MNSRegister *m)
+{
+  dout(10) << "register from rank " << m->get_rank() << " addr " << MSG_ADDR_NICE(m->get_entity()) << endl;
+  
+  // pick id
+  int from_rank = m->get_rank();
+  msg_addr_t addr = m->get_entity();
+
+  if (!addr.is_new()) {
+    // specific address!
+    assert(dir.count(addr) == 0);  // make sure it doesn't exist yet.
+  } else {
+    // make up a new address!
+    switch (addr.type()) {
+    case MSG_ADDR_RANK_BASE:         // stupid client should be able to figure this out
+      addr = MSG_ADDR_RANK(from_rank);
+      break;
+
+    case MSG_ADDR_MDS_BASE:
+      addr = MSG_ADDR_MDS(nmds++);
+      break;
+
+    case MSG_ADDR_OSD_BASE:
+      addr = MSG_ADDR_OSD(nosd++);
+      break;
+
+    case MSG_ADDR_CLIENT_BASE:
+      addr = MSG_ADDR_CLIENT(nclient++);
+      break;
+
+    default:
+      assert(0);
+    }
+  }
+
+  dout(2) << "registered " << MSG_ADDR_NICE(addr) << endl;
+
+  // register
+  dir[addr] = from_rank;
+
+  // a rank's own address is mapped right away so the ack below can be sent;
+  // every other entity waits for its STARTED message
+  if (addr == MSG_ADDR_RANK(from_rank))
+    messenger->map_entity_rank(addr, from_rank);
+
+  hold.insert(addr);
+
+  ++version;
+  update_log[version] = addr;
+
+  // reply w/ new id
+  messenger->send_message(new MNSRegisterAck(m->get_tid(), addr), MSG_ADDR_RANK(from_rank));
+  delete m;
+}
+// The sender has started: take it off hold, map it to its rank, and replay parked lookups.
+void TCPDirectory::handle_started(Message *m)
+{
+  msg_addr_t entity = m->get_source();
+
+  dout(3) << "start signal from " << MSG_ADDR_NICE(entity) << endl;
+  hold.erase(entity);
+  messenger->map_entity_rank(entity, dir[entity]);
+
+  // waiters?
+  if (waiting.count(entity)) {
+    list<Message*> ls;
+    ls.splice(ls.begin(), waiting[entity]);
+    waiting.erase(entity);
+
+    dout(10) << "doing waiter on " << MSG_ADDR_NICE(entity) << endl;
+    for (list<Message*>::iterator it = ls.begin(); it != ls.end(); ++it)
+      dispatch(*it);
+  }
+
+  // free the STARTED message itself (it used to leak); connect/register/lookup free theirs too
+  delete m;
+}
+// Remove the sending entity from the directory; stop the name server once dir has at most two entries left.
+void TCPDirectory::handle_unregister(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(2) << "unregister from entity " << MSG_ADDR_NICE(who) << endl;
+  
+  assert(dir.count(who));
+  dir.erase(who);
+  
+  // shutdown?
+  if (dir.size() <= 2) {
+    dout(2) << "dir is empty except for me, shutting down" << endl;
+    tcpmessenger_stop_nameserver();
+  }
+  else {
+    if (0) {
+      dout(10) << "dir size now " << dir.size() << endl;
+      for (hash_map<msg_addr_t, int>::iterator it = dir.begin();
+           it != dir.end();
+           it++) {
+        dout(10) << " dir: " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl;
+      }
+    }
+  }
+  delete m;   // free the request (it used to leak); `who` is a copy taken above
+}
+
+// Answer an entity lookup with its rank and address, or park the request until the entity is usable.
+void TCPDirectory::handle_lookup(MNSLookup *m) 
+{
+  msg_addr_t target = m->get_entity();
+
+  // have it?
+  const bool unknown = (dir.count(target) == 0);
+  if (unknown || hold.count(target)) {
+    dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(target) << "' -> dne or on hold" << endl;
+    waiting[target].push_back(m);   // kept (not freed): handle_started() re-dispatches it
+    return;
+  }
+
+  // look it up!  
+  MNSLookupReply *reply = new MNSLookupReply(m);
+  int rank = dir[target];
+  reply->entity_map[target] = rank;
+  reply->rank_addr[rank] = rank_addr[rank];
+
+  dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(target) << "' -> rank " << rank << endl;
+
+  messenger->send_message(reply, m->get_source(), m->get_source_port());
+  delete m;
+}
diff --git a/branches/sage/cephmds2/msg/TCPDirectory.h b/branches/sage/cephmds2/msg/TCPDirectory.h
new file mode 100644
index 0000000000000..1b54bb010e906
--- /dev/null
+++ b/branches/sage/cephmds2/msg/TCPDirectory.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __TCPDIRECTORY_H
+#define __TCPDIRECTORY_H
+
+/*
+ * rank   -- a process (listening on some host:port)
+ * entity -- a logical entity (osd123, mds3, client3245, etc.)
+ *
+ * multiple entities can coexist on a single rank.
+ */
+
+#include "Dispatcher.h"
+#include "TCPMessenger.h"
+
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include <sys/types.h>
+//#include <sys/stat.h>
+#include <fcntl.h>
+// Name-server directory (rank 0): tracks entity -> rank and rank -> address, and serves MSG_NS_* requests.
+class TCPDirectory : public Dispatcher {
+ protected:
+  // how i communicate
+  TCPMessenger *messenger;
+
+  // directory
+  hash_map<msg_addr_t, int> dir;        // entity -> rank
+  hash_map<int, tcpaddr_t>  rank_addr;  // rank -> ADDR (e.g. host:port)
+  __uint64_t                version;    // incremented for every registration
+  map<__uint64_t, msg_addr_t>  update_log;  // version -> entity registered at that version
+  int                       nrank;      // next rank number to assign
+  int                       nclient, nmds, nosd;  // next client/mds/osd numbers to assign
+
+  set<msg_addr_t>           hold;       // registered, but no STARTED message seen yet
+  map<msg_addr_t, list<Message*> > waiting;  // lookups parked until the entity is usable
+
+  // messages
+  void handle_connect(class MNSConnect*);
+  void handle_register(class MNSRegister *m);
+  void handle_started(Message *m);
+  void handle_lookup(class MNSLookup *m);
+  void handle_unregister(Message *m);
+
+ public:
+  // Take over dispatching for m, claim rank 0, and announce our address on stdout and in .ceph_ns.
+  TCPDirectory(TCPMessenger *m) : 
+    messenger(m),
+    version(0),
+    nrank(0), nclient(0), nmds(0), nosd(0) { 
+    messenger->set_dispatcher(this);
+    // i am rank 0!
+    dir[MSG_ADDR_DIRECTORY] = 0;
+    rank_addr[0] = m->get_tcpaddr();
+    ++nrank;
+    // announce nameserver
+    cout << "export CEPH_NAMESERVER=" << m->get_tcpaddr() << endl;
+
+    // open() needs an explicit mode whenever O_CREAT is given; skip the write if it fails
+    int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT, 0755);
+    if (fd >= 0) {
+      ::write(fd, (void*)&m->get_tcpaddr(), sizeof(tcpaddr_t));
+      ::fchmod(fd, 0755);
+      ::close(fd);
+    }
+  }
+  ~TCPDirectory() {
+    ::unlink(".ceph_ns");
+  }
+
+  // Route a name-service message to the handler for its type; any other type asserts.
+  void dispatch(Message *m) {
+    switch (m->get_type()) {
+    case MSG_NS_CONNECT:
+      handle_connect((class MNSConnect*)m);
+      break;
+    case MSG_NS_REGISTER:
+      handle_register((class MNSRegister*)m);
+      break;
+    case MSG_NS_STARTED:
+      handle_started(m);
+      break;
+    case MSG_NS_UNREGISTER:
+      handle_unregister(m);
+      break;
+    case MSG_NS_LOOKUP:
+      handle_lookup((class MNSLookup*)m);
+      break;
+    default:
+      assert(0);
+    }
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/msg/TCPMessenger.cc b/branches/sage/cephmds2/msg/TCPMessenger.cc
new file mode 100644
index 0000000000000..2c594bb528df6
--- /dev/null
+++ b/branches/sage/cephmds2/msg/TCPMessenger.cc
@@ -0,0 +1,1454 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "config.h"
+#include "include/error.h"
+
+#include "common/Timer.h"
+#include "common/Mutex.h"
+
+#include "TCPMessenger.h"
+#include "Message.h"
+
+#include <iostream>
+#include <cassert>
+using namespace std;
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include <errno.h>
+# include <netdb.h>
+# include <sys/socket.h>
+# include <netinet/in.h>
+# include <arpa/inet.h>
+#include <sys/select.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/types.h>
+
+#include <unistd.h>
+
+#include "messages/MGenericMessage.h"
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+
+#include "TCPDirectory.h"
+
+#include "common/Logger.h"
+
+#define DBL 18   // debug level used by the dout(DBL) calls in this file
+
+//#define TCP_SERIALMARSHALL  // do NOT turn this off until you check messages/* encode_payload methods
+//#define TCP_SERIALOUT       // be paranoid/annoying and send messages in same thread
+
+// messenger / directory pointers; all start out null
+TCPMessenger *rankmessenger = 0; // NOTE(review): presumably this rank's own messenger -- confirm
+
+TCPDirectory *nameserver = 0;    // only defined on rank 0
+TCPMessenger *nsmessenger = 0;
+
+// logging and statistics
+/***************************/
+LogType rank_logtype;
+Logger *logger;   // created in handle_connect_ack() when g_conf.tcp_log is set
+
+int stat_num = 0;
+off_t stat_inq = 0, stat_inqb = 0;   // bumped per message / per payload byte when a message is put on `incoming`
+off_t stat_disq = 0, stat_disqb = 0;   // NOTE(review): presumably the dispatch-queue counterparts -- confirm
+off_t stat_outq = 0, stat_outqb = 0;   // NOTE(review): presumably the out-queue counterparts -- confirm
+/***************************/
+
+
+// local directory
+hash_map<msg_addr_t, TCPMessenger*>  directory;  // local
+hash_set<msg_addr_t>                 directory_ready;
+Mutex                         directory_lock;
+
+// connecting
+struct sockaddr_in listen_addr;     // my listen addr
+int                listen_sd = 0;
+int                my_rank = -1;   // -1 until the connect ack assigns one
+Cond               waiting_for_rank;   // SignalAll()ed once my_rank has been set
+
+// register
+long regid = 0;
+map<int, Cond* >        waiting_for_register_cond;   // tid -> Cond signalled by the register ack
+map<int, msg_addr_t >   waiting_for_register_result;   // tid -> entity carried by the register ack
+
+// incoming messages
+list<Message*>                incoming;
+Mutex                         incoming_lock;   // held while messages are appended to `incoming`
+Cond                          incoming_cond;   // signalled after messages were appended
+
+// outgoing messages
+/*
+list<Message*>                outgoing;
+Mutex                         outgoing_lock;
+Cond                          outgoing_cond;
+*/
+// Thread subclass with a locked message queue fed by send(); entry() is defined outside this class body.
+class OutThread : public Thread {
+public:
+  Mutex lock;   // taken by stop() and send() around done / q
+  Cond cond;   // signalled when done is set or a message is queued
+  list<Message*> q;   // messages queued by send()
+  bool done;   // set by stop()
+
+  OutThread() : done(false) {}
+  virtual ~OutThread() {}
+
+  void *entry();
+  // stop(): set done under the lock, signal cond, then join the thread
+  void stop() {
+    lock.Lock();
+    done = true;
+    cond.Signal();
+    lock.Unlock();
+    join();
+  }
+  // send(): append m to q under the lock and signal cond
+  void send(Message *m) {
+    lock.Lock();
+    q.push_back(m);
+    cond.Signal();
+    lock.Unlock();
+  }
+} single_out_thread;
+
+Mutex lookup_lock;  // held by RankServer::dispatch() around every handler call
+hash_map<msg_addr_t, int> entity_rank;      // entity -> rank
+hash_map<int, int>        rank_sd;   // outgoing sockets, rank -> sd
+hash_map<int, OutThread*> rank_out;   // rank -> OutThread; NOTE(review): presumably the per-rank sender -- confirm
+hash_map<int, tcpaddr_t>  rank_addr; // rank -> tcpaddr
+map<msg_addr_t, list<Message*> > waiting_for_lookup;   // messages parked per entity until a lookup reply names its rank
+
+
+/* this process */
+bool tcp_done = false;     // set this flag to stop the event loop
+
+
+// threads
+pthread_t dispatch_thread_id = 0;   // thread id of the event loop.  init value == nobody
+pthread_t out_thread_id = 0;        // NOTE(review): comment was copied from the line above; presumably the output thread.  init value == nobody
+pthread_t listen_thread_id = 0;
+map<int, pthread_t>      in_threads;    // sd -> threadid
+
+//bool pending_timer = false;
+
+// per-rank fun
+
+
+// debug
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug) cout << "[TCP " << my_rank /*(<< " " << getpid() << "." << pthread_self() */  << "] "
+// dout(l) above streams to cout, prefixed "[TCP <my_rank>] ", when l <= g_conf.debug.
+
+#include "tcp.cc"
+// tcp.cc is textually included above, so it is compiled with the globals and the dout macro defined before it.
+// some declarations
+void tcp_open(int rank);
+int tcp_send(Message *m);
+void tcpmessenger_kick_dispatch_loop();
+OutThread *tcp_lookup(Message *m);
+// Rank accessor.
+// Returns my_rank as it currently stands; it is initialised to -1 and
+// overwritten when the connect ack is handled.
+int tcpmessenger_get_rank()
+{ return my_rank; }
+
+// Find the name server address: CEPH_NAMESERVER first, else the .ceph_ns file.  Returns 0 on success, -1 otherwise.
+int tcpmessenger_findns(tcpaddr_t &nsa)
+{
+  char *nsaddr = 0;
+  bool have_nsa = false;
+  const char *shown = 0;   // text logged as the source of the address
+
+  // env var?
+  nsaddr = getenv("CEPH_NAMESERVER");
+  if (nsaddr) {
+    // getenv() returns only the value.  Skip a "...=" prefix if one is present,
+    // but stop at the terminator instead of running off the end of the string.
+    char *eq = nsaddr;
+    while (*eq && *eq != '=') eq++;
+    if (*eq == '=') nsaddr = eq + 1;
+    shown = nsaddr;
+  }
+  else {
+    // file?  (descriptor 0 is valid; only a negative return means open failed)
+    int fd = ::open(".ceph_ns",O_RDONLY);
+    if (fd >= 0) {
+      int got = ::read(fd, (void*)&nsa, sizeof(nsa));
+      ::close(fd);
+      if (got == (int)sizeof(nsa)) {   // only trust a complete record
+        have_nsa = true;
+        shown = "from .ceph_ns";
+      }
+    }
+  }
+
+  if (!nsaddr && !have_nsa) {
+    cerr << "i need ceph ns addr.. either CEPH_NAMESERVER env var or --ns blah" << endl;
+    return -1;
+  }
+  
+  // look up nsaddr?
+  if (!have_nsa && tcpmessenger_lookup(nsaddr, nsa) < 0) {
+    return -1;
+  }
+  dout(2) << "ceph ns is " << shown << " or " << nsa << endl;
+  return 0;
+}
+
+
+
+/** rankserver
+ *
+ * one per rank.  handles entity->rank lookup replies.
+ */
+
+class RankServer : public Dispatcher {
+public:
+  // Entry point for name-service replies; the whole call runs under lookup_lock.
+  void dispatch(Message *m) {
+    lookup_lock.Lock();
+    dout(DBL) << "rankserver dispatching " << *m << endl;
+
+    switch (m->get_type()) {
+    case MSG_NS_CONNECTACK:
+      handle_connect_ack(static_cast<MNSConnectAck*>(m));
+      break;
+
+    case MSG_NS_REGISTERACK:
+      handle_register_ack(static_cast<MNSRegisterAck*>(m));
+      break;
+
+    case MSG_NS_LOOKUPREPLY:
+      handle_lookup_reply(static_cast<MNSLookupReply*>(m));
+      break;
+
+    default:
+      assert(0);   // no other message type is handled here
+    }
+
+    lookup_lock.Unlock();
+  }
+  // Adopt the rank carried by the ack, wake everyone waiting for it, and
+  // (when g_conf.tcp_log is set) create the per-rank logger.
+  void handle_connect_ack(MNSConnectAck *m) {
+    // terminate the log line (without endl it ran into the next message)
+    dout(DBL) << "my rank is " << m->get_rank() << endl;
+    my_rank = m->get_rank();
+
+    // now that i know my rank,
+    entity_rank[MSG_ADDR_RANK(my_rank)] = my_rank; 
+    rank_addr[my_rank] = listen_addr;
+    waiting_for_rank.SignalAll();
+
+    delete m;
+
+    // logger!
+    dout(DBL) << "logger" << endl;
+    char names[100];
+    sprintf(names, "rank%d", my_rank);
+    string name = names;
+
+    if (g_conf.tcp_log) {
+      logger = new Logger(name, (LogType*)&rank_logtype);
+      rank_logtype.add_set("num");
+      rank_logtype.add_inc("in");
+      rank_logtype.add_inc("inb");
+      rank_logtype.add_inc("dis");
+      rank_logtype.add_set("inq");
+      rank_logtype.add_set("inqb");
+      rank_logtype.add_set("outq");
+      rank_logtype.add_set("outqb");
+    }
+  }
+
+  void handle_register_ack(MNSRegisterAck *m) {
+    long tid = m->get_tid();
+    waiting_for_register_result[tid] = m->get_entity();
+    waiting_for_register_cond[tid]->Signal();
+    delete m;
+  }
+  
+  void handle_lookup_reply(MNSLookupReply *m) {
+    list<Message*> waiting;
+    dout(DBL) << "got lookup reply" << endl;
+
+    for (map<msg_addr_t, int>::iterator it = m->entity_rank.begin();
+         it != m->entity_rank.end();
+         it++) {
+      dout(DBL) << "lookup got " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl;
+      entity_rank[it->first] = it->second;
+      
+      if (it->second == my_rank) {
+        // deliver locally
+        dout(-DBL) << "delivering lookup results locally" << endl;
+        incoming_lock.Lock();
+        
+        for (list<Message*>::iterator i = waiting_for_lookup[it->first].begin();
+             i != waiting_for_lookup[it->first].end();
+             i++) {
+          stat_inq++;
+          stat_inqb += (*i)->get_payload().length();
+          (*i)->decode_payload();
+          incoming.push_back(*i);
+        }
+        incoming_cond.Signal();
+        incoming_lock.Unlock();
+      } else {
+        // take waiters
+        waiting.splice(waiting.begin(), waiting_for_lookup[it->first]);
+      }
+      waiting_for_lookup.erase(it->first);
+      
+    }
+
+    for (map<int,tcpaddr_t>::iterator it = m->rank_addr.begin();
+         it != m->rank_addr.end();
+         it++) {
+      dout(DBL) << "lookup got rank " << it->first << " addr " << it->second << endl;
+      rank_addr[it->first] = it->second;
+
+      // open it now
+      if (rank_sd.count(it->first) == 0)
+            tcp_open(it->first);
+    }
+
+    // send waiting messages
+#ifdef TCP_SERIALOUT
+    for (list<Message*>::iterator it = waiting.begin();
+         it != waiting.end();
+         it++) {
+      OutThread *outt = tcp_lookup(*it);
+      assert(outt);
+      tcp_send(*it);
+    }
+#else
+    for (list<Message*>::iterator it = waiting.begin();
+         it != waiting.end();
+         it++) {
+      OutThread *outt = tcp_lookup(*it);
+      assert(outt);
+      outt->send(*it);
+//      dout(0) << "lookup done, splicing in " << *it << endl;
+    }
+#endif
+
+    delete m;
+  }
+  
+} rankserver;
+
+
+// Context whose completion wakes the process-wide dispatch loop.
+class C_TCPKicker : public Context {
+  // Logs the kick and forwards to tcpmessenger_kick_dispatch_loop();
+  // the result code r is not used.
+  void finish(int r) {
+    dout(DBL) << "timer kick" << endl;
+    tcpmessenger_kick_dispatch_loop();
+  }
+};
+
+// Wake the dispatch loop; simply forwards to tcpmessenger_kick_dispatch_loop().
+void TCPMessenger::callback_kick()
+{
+  tcpmessenger_kick_dispatch_loop();
+}
+
+
+/**
+ * Parse a "host:port" string and resolve it into ta.
+ *
+ * str is modified in place: the first ':' is overwritten with a NUL so the
+ * host part can be used as a C string.  The port digits are converted with
+ * atoi() and stored in ta.sin_port exactly as parsed (no byte swapping is
+ * done here).
+ *
+ * Returns 0 on success, -1 when there is no ':' or the host does not resolve.
+ */
+extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta)
+{
+  // split at the first colon
+  char *colon = strchr(str, ':');
+  if (colon == 0) {
+    cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl;
+    return -1;
+  }
+  *colon = 0;
+
+  char *host = str;
+  char *port = colon + 1;
+  const int portnum = atoi(port);
+
+  // resolve the host name
+  struct hostent *he = gethostbyname(host);
+  if (he == 0) {
+    cerr << "host " << host << " not found" << endl;
+    return -1;
+  }
+
+  // fill in the address
+  memset(&ta, 0, sizeof(ta));
+  ta.sin_family = he->h_addrtype;
+  memcpy((char *)&ta.sin_addr, he->h_addr, he->h_length);
+  ta.sin_port = portnum;
+
+  cout << "lookup '" << host << ":" << port << "' -> " << ta << endl;
+  return 0;
+}
+
+
+
+/*****
+ * global methods for process-wide startup, shutdown.
+ */
+
+/**
+ * Create the process-wide listening socket and work out the address peers
+ * should use to reach it.
+ *
+ * The socket is bound to an ephemeral port on INADDR_ANY; the port the
+ * kernel picked is read back with getsockname() and combined with the
+ * first address of this host's own name to form listen_addr.
+ *
+ * Returns 0 on success, or -1 (with the listening socket closed again) if
+ * this host's name cannot be resolved.  Socket, bind, getsockname and
+ * listen failures trip asserts.
+ */
+int tcpmessenger_init()
+{
+  // LISTEN
+  dout(DBL) << "binding to listen " << endl;
+  
+  /* socket creation */
+  listen_sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(listen_sd > 0);
+  
+  /* bind to port */
+  memset((char*)&listen_addr, 0, sizeof(listen_addr));
+  listen_addr.sin_family = AF_INET;
+  listen_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  listen_addr.sin_port = 0;
+  
+  int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr));
+  assert(rc >= 0);
+
+  // find out which port we were given
+  socklen_t llen = sizeof(listen_addr);
+  rc = getsockname(listen_sd, (sockaddr*)&listen_addr, &llen);
+  assert(rc >= 0);
+
+  // kept exactly as the kernel reported it
+  int myport = listen_addr.sin_port;
+
+  // listen!
+  rc = ::listen(listen_sd, 1000);
+  assert(rc >= 0);
+
+  dout(DBL) << "listening on " << myport << endl;
+  
+  // my address is...
+  char host[100];
+  host[0] = 0;
+  gethostname(host, sizeof(host));
+  host[sizeof(host)-1] = 0;   // gethostname may leave a truncated name unterminated
+  dout(DBL) << "my hostname is " << host << endl;
+
+  struct hostent *myhostname = gethostbyname( host ); 
+  if (!myhostname) {
+    // previously dereferenced unchecked
+    cerr << "host " << host << " not found" << endl;
+    ::close(listen_sd);
+    return -1;
+  }
+
+  struct sockaddr_in my_addr;  
+  memset(&my_addr, 0, sizeof(my_addr));
+
+  my_addr.sin_family = myhostname->h_addrtype;
+  memcpy((char *) &my_addr.sin_addr.s_addr, 
+         myhostname->h_addr_list[0], 
+         myhostname->h_length);
+  my_addr.sin_port = myport;
+
+  listen_addr = my_addr;
+
+  dout(DBL) << "listen addr is " << listen_addr << endl;
+
+  // register to execute timer events
+  //g_timer.set_messenger_kicker(new C_TCPKicker());
+
+
+  dout(DBL) << "init done" << endl;
+  return 0;
+}
+
+
+// on first rank only
+/**
+ * Start the name server in this process, which thereby becomes rank 0.
+ * On return diraddr holds this process's listen address.
+ */
+void tcpmessenger_start_nameserver(tcpaddr_t& diraddr)
+{
+  dout(DBL) << "starting nameserver on " << MSG_ADDR_NICE(MSG_ADDR_DIRECTORY) << endl;
+
+  // messenger for the directory entity, then the directory itself on top
+  nsmessenger = new TCPMessenger(MSG_ADDR_DIRECTORY);
+  nameserver = new TCPDirectory(nsmessenger);
+
+  // rank 0 is me; the directory lives on rank 0
+  my_rank = 0;
+  entity_rank[MSG_ADDR_DIRECTORY] = 0;
+
+  // my listen address is rank 0's address, and also what the caller gets
+  rank_addr[0] = listen_addr;
+  diraddr = rank_addr[0];
+}
+/**
+ * Shut down and destroy the name server's messenger, if there is one.
+ * The global pointer is cleared before shutdown() is called.
+ */
+void tcpmessenger_stop_nameserver()
+{
+  if (!nsmessenger)
+    return;
+
+  dout(DBL) << "shutting down nsmessenger" << endl;
+
+  // detach first, then tear down
+  TCPMessenger *doomed = nsmessenger;
+  nsmessenger = 0;
+  doomed->shutdown();
+  delete doomed;
+}
+
+// on all ranks
+/**
+ * Connect to the name server at ns and create this process's rank
+ * messenger.  When the rank is not known yet (my_rank < 0) the messenger
+ * is created with MSG_ADDR_RANK_NEW.
+ */
+void tcpmessenger_start_rankserver(tcpaddr_t& ns)
+{
+  // the directory is rank 0 at the given address; open the link now
+  rank_addr[0] = ns;
+  entity_rank[MSG_ADDR_DIRECTORY] = 0;
+  tcp_open(0);
+
+  if (my_rank < 0) {
+    // rank unknown: start rank messenger, and discover my rank.
+    rankmessenger = new TCPMessenger(MSG_ADDR_RANK_NEW);
+  } else {
+    // i know my rank
+    rankmessenger = new TCPMessenger(MSG_ADDR_RANK(my_rank));
+  }
+}
+/**
+ * Shut down and destroy the rank messenger, if there is one.
+ * The global pointer is cleared only after the messenger is deleted.
+ */
+void tcpmessenger_stop_rankserver()
+{
+  if (!rankmessenger)
+    return;
+
+  dout(DBL) << "shutting down rankmessenger" << endl;
+  rankmessenger->shutdown();
+  delete rankmessenger;
+  rankmessenger = 0;
+}
+
+
+
+
+
+
+/**
+ * Close every socket recorded in rank_sd.  Always returns 0.
+ */
+int tcpmessenger_shutdown() 
+{
+  dout(DBL) << "tcpmessenger_shutdown barrier" << endl;
+  dout(2) << "tcpmessenger_shutdown closing all sockets etc" << endl;
+
+  // walk the rank -> socket table and close each descriptor
+  hash_map<int,int>::iterator p = rank_sd.begin();
+  while (p != rank_sd.end()) {
+    ::close(p->second);
+    p++;
+  }
+
+  return 0;
+}
+
+
+
+
+/***
+ * internal send/recv
+ */
+
+
+
+
+/*
+ * recv a Message*
+ */
+
+
+
+/**
+ * Read one message from socket sd.
+ *
+ * Wire layout: a msg_envelope_t, then env.nchunks chunks, each an int
+ * length followed by that many payload bytes.  The chunks are gathered
+ * into a bufferlist and handed to decode_message().
+ *
+ * Returns the decoded message, or 0 when a read fails, a chunk length is
+ * negative, or the envelope has type 0 (the "dummy" envelope).
+ */
+Message *tcp_recv(int sd)
+{
+  // envelope
+  dout(DBL) << "tcp_recv receiving message from sd " << sd  << endl;
+  
+  msg_envelope_t env;
+  if (!tcp_read( sd, (char*)&env, sizeof(env) ))
+    return 0;
+
+  if (env.type == 0) {
+    dout(DBL) << "got dummy env, bailing" << endl;
+    return 0;
+  }
+
+  dout(DBL) << "tcp_recv got envelope type=" << env.type << " src " << MSG_ADDR_NICE(env.source) << " dst " << MSG_ADDR_NICE(env.dest) << " nchunks=" << env.nchunks << endl;
+  
+  // payload
+  bufferlist blist;
+  for (int i=0; i<env.nchunks; i++) {
+    // chunk length; a failed read must not leave us sizing a buffer
+    // from an uninitialized value
+    int size;
+    if (!tcp_read( sd, (char*)&size, sizeof(size) ))
+      return 0;
+    if (size < 0) {
+      dout(0) << "tcp_recv got bad frag len " << size << " on sd " << sd << ", bailing" << endl;
+      return 0;
+    }
+
+    bufferptr bp = new buffer(size);
+    
+    if (!tcp_read( sd, bp.c_str(), size )) return 0;
+
+    blist.push_back(bp);
+
+    dout(DBL) << "tcp_recv got frag " << i << " of " << env.nchunks << " len " << bp.length() << endl;
+  }
+  
+  // unmarshall message
+  size_t s = blist.length();
+  Message *m = decode_message(env, blist);
+
+  // account for the message and its bytes (payload + envelope)
+  if (logger) {
+    logger->inc("in");
+    logger->inc("inb", s+sizeof(env));
+  }
+
+  dout(DBL) << "tcp_recv got " << s << " byte message from " << MSG_ADDR_NICE(m->get_source()) << endl;
+
+  return m;
+}
+
+
+
+
+/**
+ * Open the outgoing connection to the given rank.
+ *
+ * Connects a fresh socket to rank_addr[rank], records it in rank_sd, and
+ * assigns the rank its sender: a new OutThread when g_conf.tcp_multi_out
+ * is set, otherwise the shared single_out_thread (started here if it is
+ * not running yet).  Socket, bind and connect failures trip asserts, as
+ * does opening a rank that already has a socket.
+ */
+void tcp_open(int rank)
+{
+  dout(DBL) << "tcp_open to rank " << rank << " at " << rank_addr[rank] << endl;
+
+  // create socket?
+  int sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(sd > 0);
+  
+  // bind any port
+  struct sockaddr_in myAddr;
+  memset(&myAddr, 0, sizeof(myAddr));   // don't hand bind() uninitialized padding
+  myAddr.sin_family = AF_INET;
+  myAddr.sin_addr.s_addr = htonl(INADDR_ANY);
+  myAddr.sin_port = htons( 0 );    
+  
+  int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr));
+  assert(rc>=0);
+
+  // connect!
+  int r = connect(sd, (sockaddr*)&rank_addr[rank], sizeof(myAddr));
+  assert(r >= 0);
+
+  //dout(DBL) << "tcp_open connected to " << who << endl;
+  assert(rank_sd.count(rank) == 0);
+  rank_sd[rank] = sd;
+
+  if (g_conf.tcp_multi_out) {
+    // one sender thread per peer rank
+    rank_out[rank] = new OutThread();
+    rank_out[rank]->create();
+  } else {
+    // everyone shares the single sender thread
+    rank_out[rank] = &single_out_thread;
+    if (!single_out_thread.is_started())
+      single_out_thread.create();
+  }
+}
+
+
+// Encode m's payload unless it already has one.
+void tcp_marshall(Message *m)
+{
+  if (!m->empty_payload())
+    return;   // already encoded
+
+  m->encode_payload();
+}
+
+/**
+ * Find the sender for m's destination and stamp m with the socket to use.
+ *
+ * If the destination's rank is not known, m is parked in
+ * waiting_for_lookup and 0 is returned.  An MNSLookup request is sent to
+ * the directory unless one is already outstanding for that destination.
+ *
+ * Otherwise the connection to the rank is opened if needed, m's tcp sd is
+ * set, and the rank's OutThread is returned.
+ */
+OutThread *tcp_lookup(Message *m)
+{
+  msg_addr_t dest = m->get_dest();
+
+  if (entity_rank.count(dest) == 0) {
+    // don't know where dest lives: ask (once) and wait.
+    if (!waiting_for_lookup.count(dest)) {
+      dout(DBL) << "lookup on " << MSG_ADDR_NICE(dest) << " for " << m << endl;
+      MNSLookup *req = new MNSLookup(dest);
+      rankmessenger->send_message(req, MSG_ADDR_DIRECTORY);
+    } else {
+      dout(DBL) << "already looking up " << MSG_ADDR_NICE(dest) << endl;
+    }
+
+    // park the message until the reply arrives
+    waiting_for_lookup[dest].push_back(m);
+    return 0;
+  }
+
+  // known destination
+  int rank = entity_rank[dest];
+
+  if (!rank_sd.count(rank))   // should only happen on rank0?
+    tcp_open(rank);
+  assert(rank_sd.count(rank));
+
+  m->set_tcp_sd( rank_sd[rank] );
+  return rank_out[rank];
+}
+
+
+/*
+ * send a Message* over the wire.  ** do not block **.
+ *
+ * The socket is the one previously stored on the message (get_tcp_sd()).
+ * Writes the envelope, then the payload: with TCP_KEEP_CHUNKS one
+ * length-prefixed chunk per buffer, otherwise a single total length
+ * followed by every buffer back to back.  Any write error is reported on
+ * cerr and trips assert(0).  The message is deleted before returning;
+ * the return value is always 0.
+ */
+int tcp_send(Message *m)
+{
+  /*int rank = entity_rank[m->get_dest()];
+  //if (rank_sd.count(rank) == 0) tcp_open(rank);
+  assert(rank_sd.count(rank));
+
+  int sd = rank_sd[rank];
+  assert(sd);
+  */
+  int sd = m->get_tcp_sd();
+  assert(sd);
+
+  // get envelope, buffers
+  msg_envelope_t *env = &m->get_envelope();
+  bufferlist blist;
+  blist.claim( m->get_payload() );
+
+  // the chunk count goes out in the envelope, so set it before writing
+#ifdef TCP_KEEP_CHUNKS
+  env->nchunks = blist.buffers().size();
+#else
+  env->nchunks = 1;
+#endif
+
+  // HACK osd -> client only
+  //if (m->get_source() >= MSG_ADDR_OSD(0) && m->get_source() < MSG_ADDR_CLIENT(0) &&
+ // m->get_dest() >= MSG_ADDR_CLIENT(0))
+  dout(DBL) << g_clock.now() << " sending " << m << " " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) 
+      //<< " rank " << rank 
+            << " sd " << sd << endl;
+  
+  // send envelope
+  int r = tcp_write( sd, (char*)env, sizeof(*env) );
+  if (r < 0) { cerr << "error sending envelope for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); }
+
+  // payload
+#ifdef TCP_KEEP_CHUNKS
+  // send chunk-wise
+  int i = 0;
+  for (list<bufferptr>::iterator it = blist.buffers().begin();
+       it != blist.buffers().end();
+       it++) {
+    dout(DBL) << "tcp_sending frag " << i << " len " << (*it).length() << endl;
+    // each chunk: int length, then the bytes
+    int size = (*it).length();
+    r = tcp_write( sd, (char*)&size, sizeof(size) );
+    if (r < 0) { cerr << "error sending chunk len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); }
+    r = tcp_write( sd, (*it).c_str(), size );
+    if (r < 0) { cerr << "error sending data chunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); }
+    i++;
+  }
+#else
+  // one big chunk
+  // total length first, then every buffer in order with no per-buffer prefix
+  int size = blist.length();
+  r = tcp_write( sd, (char*)&size, sizeof(size) );
+  if (r < 0) { cerr << "error sending data len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); }
+  for (list<bufferptr>::iterator it = blist.buffers().begin();
+       it != blist.buffers().end();
+       it++) {
+    r = tcp_write( sd, (*it).c_str(), (*it).length() );
+    if (r < 0) { cerr << "error sending data megachunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << " : len " << (*it).length() << endl; assert(0); }
+  }
+#endif
+
+  // hose message
+  delete m;
+  return 0;
+}
+
+
+
+
+
+/** tcp_outthread
+ * this thread watches the outgoing queue, and encodes+sends any queued
+ * messages.  it runs until `done` is set and the queue has drained.
+ */
+
+void* OutThread::entry() 
+{
+  lock.Lock();
+
+  for (;;) {
+    if (q.empty()) {
+      // nothing queued: either we are finished, or we sleep until kicked
+      if (done)
+        break;
+
+      dout(DBL) << "outthread sleeping" << endl;
+      cond.Wait(lock);
+      continue;
+    }
+
+    dout(DBL) << "outthread grabbing message(s)" << endl;
+
+    // steal the whole queue so the lock can be released while sending
+    list<Message*> batch;
+    batch.splice(batch.begin(), q);
+    lock.Unlock();
+
+    while (!batch.empty()) {
+      Message *msg = batch.front();
+      batch.pop_front();
+
+      dout(DBL) << "outthread sending " << msg << endl;
+
+      // encode here unless the sender already did it serially
+      if (!g_conf.tcp_serial_marshall) 
+        tcp_marshall(msg);
+
+      tcp_send(msg);
+    }
+
+    // back under the lock before looking at the queue again
+    lock.Lock();
+  }
+
+  dout(DBL) << "outthread done" << endl;
+  lock.Unlock();  
+  return 0;
+}
+
+
+
+/** tcp_inthread
+ * read incoming messages from a given peer.
+ * give received and decoded messages to dispatch loop.
+ *
+ * r carries the socket descriptor, packed into the pointer by the thread
+ * that started us.  The loop ends when tcp_done is set or tcp_recv()
+ * returns 0.  The socket is not closed here.
+ */
+void *tcp_inthread(void *r)
+{
+  // go through long: a direct pointer->int cast loses bits and is
+  // rejected by the compiler where pointers are wider than int
+  int sd = (int)(long)r;
+
+  dout(DBL) << "tcp_inthread reading on sd " << sd << endl;
+
+  while (!tcp_done) {
+    Message *m = tcp_recv(sd);
+    if (!m) break;
+    msg_addr_t who = m->get_source();
+
+    dout(20) << g_clock.now() <<  " inthread got " << m << " from sd " << sd << " who is " << who << endl;
+
+    // give to dispatch loop
+    size_t sz = m->get_payload().length();
+
+    if (g_conf.tcp_multi_dispatch) {
+      // per-messenger queues: hand the message to its destination
+      const msg_addr_t dest = m->get_dest();
+      TCPMessenger *messenger = 0;
+      directory_lock.Lock();
+      // test before indexing: operator[] would insert a null entry for an
+      // unknown dest, which directory.count() elsewhere would then see
+      if (directory.count(dest))
+        messenger = directory[ dest ];
+      directory_lock.Unlock();
+
+      if (messenger) {
+        messenger->dispatch_queue(m);
+      } else {
+        dout(0) << "dest " << dest << " dne" << endl;
+        delete m;   // nobody will ever consume it
+      }
+
+    } else {
+      // single dispatch queue
+      incoming_lock.Lock();
+      {
+        //dout(-20) << "in1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl;
+        //assert(stat_inq == incoming.size());
+        incoming.push_back(m);
+        incoming_cond.Signal();
+        
+        stat_inq++;
+        //assert(stat_inq == incoming.size());
+        //dout(-20) << "in2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl;
+        stat_inqb += sz;
+      }
+      incoming_lock.Unlock();
+    }
+
+    // "in"/"inb" are not counted here
+    //logger->inc("in");
+    //logger->inc("inb", sz);
+  }
+
+  dout(DBL) << "tcp_inthread closing " << sd << endl;
+
+  //::close(sd);
+  return 0;  
+}
+
+/** tcp_acceptthread
+ * accept incoming connections from peers.
+ * start a tcp_inthread for each, and remember it in in_threads keyed by
+ * the accepted socket.  the loop ends when tcp_done is set or accept()
+ * stops yielding a usable descriptor.
+ */
+void *tcp_acceptthread(void *)
+{
+  dout(DBL) << "tcp_acceptthread starting" << endl;
+
+  while (!tcp_done) {
+    //dout(DBL) << "accepting, left = " << left << endl;
+
+    struct sockaddr_in addr;
+    socklen_t slen = sizeof(addr);
+    int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen);
+    if (sd > 0) {
+      dout(DBL) << "accepted incoming on sd " << sd << endl;
+
+      // the descriptor travels to the reader packed into the void* arg;
+      // widen through long so the cast matches pointer size
+      pthread_t th;
+      int r = pthread_create(&th,
+                             NULL,
+                             tcp_inthread,
+                             (void*)(long)sd);
+      if (r != 0) {
+        // no reader was started: th is indeterminate and must not be
+        // recorded, and nobody would ever service this socket
+        dout(0) << "tcp_acceptthread couldn't start reader for sd " << sd << ", closing" << endl;
+        ::close(sd);
+        continue;
+      }
+      in_threads[sd] = th;
+    } else {
+      dout(DBL) << "no incoming connection?" << endl;
+      break;
+    }
+  }
+  return 0;
+}
+
+
+
+
+/** dispatch_entry
+ * body of a messenger's own dispatch thread: wait for messages on this
+ * messenger's incoming queue and dispatch them, until incoming_stop is
+ * set and the queue has drained.
+ *
+ * incoming, incoming_lock and incoming_cond are this messenger's members
+ * here.  the stat_* counters are not members of this class.
+ */
+void TCPMessenger::dispatch_entry()
+{
+  incoming_lock.Lock();
+  while (!incoming.empty() || !incoming_stop) {
+    if (!incoming.empty()) {
+      // grab incoming messages  
+      list<Message*> in;
+      in.splice(in.begin(), incoming);
+
+      // move the queued counts over to the "being dispatched" counters
+      assert(stat_disq == 0);
+      stat_disq = stat_inq;
+      stat_disqb = stat_inqb;
+      stat_inq = 0;
+      stat_inqb = 0;
+    
+      // drop lock while we deliver
+      //assert(stat_inq == incoming.size());
+      incoming_lock.Unlock();
+
+      // dispatch!
+      while (!in.empty()) {
+        Message *m = in.front();
+        in.pop_front();
+      
+        stat_disq--;
+        stat_disqb -= m->get_payload().length();
+        if (logger) {
+          logger->set("inq", stat_inq+stat_disq);
+          // byte gauge: queued bytes + bytes still being dispatched
+          // (used to add the message count stat_disq by mistake)
+          logger->set("inqb", stat_inqb+stat_disqb);
+          logger->inc("dis");
+        }
+          
+        dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << 
+          "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() <<
+          " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " 
+                << m 
+                << endl;
+        
+        dispatch(m);
+      }
+
+      // retake the lock before the loop condition is re-tested; without
+      // this the next pass would Wait() on, or Unlock(), a mutex we no
+      // longer hold
+      incoming_lock.Lock();
+      continue;
+    }
+
+    // sleep
+    dout(DBL) << "dispatch: waiting for incoming messages" << endl;
+    incoming_cond.Wait(incoming_lock);
+    dout(DBL) << "dispatch: woke up" << endl;
+  }
+  incoming_lock.Unlock();
+}
+
+
+/**
+ * Process-wide dispatch thread.
+ *
+ * Repeatedly takes everything off the shared incoming queue and delivers
+ * it with the lock released.  Connect acks and messages addressed to this
+ * rank go to rankserver; everything else goes to the messenger registered
+ * in `directory` for the destination (an unknown destination trips
+ * assert(0)).  The loop exits once tcp_done is set and the queue is
+ * empty, after which g_timer is shut down.
+ */
+void* tcp_dispatchthread(void*)
+{
+  dout(5) << "tcp_dispatchthread start pid " << getpid() << endl;
+
+  while (1) {
+    // inq?
+    incoming_lock.Lock();
+
+    // done?
+    if (tcp_done && incoming.empty()) {
+      incoming_lock.Unlock();
+      break;
+    }
+
+    // wait?
+    if (incoming.empty()) {
+      // wait
+      dout(DBL) << "dispatch: incoming empty, waiting for incoming messages" << endl;
+      incoming_cond.Wait(incoming_lock);
+      dout(DBL) << "dispatch: woke up" << endl;
+    }
+
+    // grab incoming messages  
+    // (may be nothing if we were only kicked; the loop then just comes
+    // back around to the done/wait checks)
+    //dout(-20) << "dis1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl;
+    //assert(stat_inq == incoming.size());
+
+    list<Message*> in;
+    in.splice(in.begin(), incoming);
+
+    // move the queued counts over to the "being dispatched" counters
+    assert(stat_disq == 0);
+    stat_disq = stat_inq;
+    stat_disqb = stat_inqb;
+    stat_inq = 0;
+    stat_inqb = 0;
+    //assert(stat_inq == incoming.size());
+    //dout(-20) << "dis2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl;
+
+    // drop lock while we deliver
+    incoming_lock.Unlock();
+
+    // dispatch!
+    while (!in.empty()) {
+      Message *m = in.front();
+      in.pop_front();
+      
+      stat_disq--;
+      stat_disqb -= m->get_payload().length();
+      if (logger) {
+        logger->set("inq", stat_inq+stat_disq);
+        // byte gauge: queued bytes + bytes still being dispatched
+        // (used to add the message count stat_disq by mistake)
+        logger->set("inqb", stat_inqb+stat_disqb);
+        logger->inc("dis");
+      }
+      
+      dout(DBL) << "dispatch doing " << *m << endl;
+      
+      // for rankserver?
+      if (m->get_type() == MSG_NS_CONNECTACK ||        // i just connected
+          m->get_dest() == MSG_ADDR_RANK(my_rank)) {
+        dout(DBL) <<  " giving to rankserver" << endl;
+        rankserver.dispatch(m);
+        continue;
+      }
+      
+      // ok
+      msg_addr_t dest = m->get_dest();
+      directory_lock.Lock();
+      if (directory.count(dest)) {
+        Messenger *who = directory[ dest ];
+        directory_lock.Unlock();          
+        
+        dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << 
+          "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() <<
+          " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " 
+                << *m 
+                << endl;
+        
+        who->dispatch(m);
+      } else {
+        directory_lock.Unlock();
+        dout (1) << "---- i don't know who " << MSG_ADDR_NICE(dest) << " " << dest << " is." << endl;
+        assert(0);
+      }
+    }
+    assert(stat_disq == 0);
+
+  }
+
+
+  g_timer.shutdown();
+
+  dout(5) << "tcp_dispatchthread exiting loop" << endl;
+  return 0;
+}
+
+
+// start the process-wide worker threads: the accept thread, the dispatch
+// thread and, unless g_conf.tcp_multi_out is set, the shared sender thread.
+// always returns 0.
+int tcpmessenger_start()
+{
+  dout(5) << "starting accept thread" << endl;
+  pthread_create(&listen_thread_id,
+                 NULL,
+                 tcp_acceptthread,
+                 0);                
+
+  dout(5) << "starting dispatch thread" << endl;
+  
+  // start a thread
+  pthread_create(&dispatch_thread_id, 
+                 NULL, 
+                 tcp_dispatchthread,
+                 0);
+
+
+  /*
+  dout(5) << "starting outgoing thread" << endl;
+  pthread_create(&out_thread_id, 
+                 NULL, 
+                 tcp_outthread,
+                 0);
+  */
+  // tcp_open() also starts the shared sender on first use, so only create
+  // it here if that has not already happened
+  if (!g_conf.tcp_multi_out &&
+      !single_out_thread.is_started())
+    single_out_thread.create();
+  return 0;
+}
+
+
+/*
+ * kick the dispatch loop: signal incoming_cond so a sleeping dispatch
+ * thread wakes up and re-examines its state.
+ * not supported with g_conf.tcp_multi_dispatch (asserts).
+ */
+
+void tcpmessenger_kick_dispatch_loop()
+{
+  if (!g_conf.tcp_multi_dispatch) {
+    // just one
+    dout(DBL) << "kicking" << endl;
+    incoming_lock.Lock();
+    dout(DBL) << "prekick" << endl;
+    incoming_cond.Signal();
+    incoming_lock.Unlock();
+    dout(DBL) << "kicked" << endl;
+    return;
+  }
+
+  // per-messenger dispatch threads would all need kicking; not implemented
+  assert(0);
+  /*for (hash_map<msg_addr_t, TCPMessenger*>::iterator i = directory.begin();
+       i != directory.end();
+       i++)
+    i->second->dispatch_kick();
+  */
+}
+
+/*
+void tcpmessenger_kick_outgoing_loop()
+{
+  outgoing_lock.Lock();
+  outgoing_cond.Signal();
+  outgoing_lock.Unlock();
+}
+*/
+
+
+// wait for thread to finish
+
+/**
+ * Block until the messenger layer has finished.
+ *
+ * Without g_conf.tcp_multi_dispatch: kick the dispatch loop and join the
+ * dispatch thread.  With it: sleep on incoming_cond until tcp_done is set.
+ */
+void tcpmessenger_wait()
+{
+  if (!g_conf.tcp_multi_dispatch) {
+    // old way
+    dout(10) << "tcpmessenger_wait waking up dispatch loop" << endl;
+    tcpmessenger_kick_dispatch_loop();
+
+    dout(10) << "tcpmessenger_wait waiting for thread to finished." << endl;
+    void *thread_result;
+    pthread_join(dispatch_thread_id, &thread_result);
+    dout(10) << "tcpmessenger_wait thread finished." << endl;
+    return;
+  }
+
+  // new way
+  incoming_lock.Lock();
+  while (!tcp_done)
+    incoming_cond.Wait(incoming_lock);
+  incoming_lock.Unlock();
+}
+
+
+
+
+/**
+ * Register addr with the name server and return the entity address the
+ * name server answers with.
+ *
+ * If this process does not know its rank yet, it first sends an
+ * MNSConnect and blocks until the rank has been learned.  Both requests
+ * are sent from the calling thread, and the function blocks until the
+ * register reply for its transaction id shows up.  lookup_lock is held
+ * throughout, except while waiting on the condition variables.
+ */
+msg_addr_t register_entity(msg_addr_t addr) 
+{
+  lookup_lock.Lock();
+
+  // pick a transaction id and park a condition variable under it
+  long tid = ++regid;
+  Cond reply_cond;
+  waiting_for_register_cond[tid] = &reply_cond;
+
+  if (my_rank < 0) {
+    dout(DBL) << "register_entity don't know my rank, connecting" << endl;
+
+    // introduce ourselves to the nameserver; the ack carries our rank
+    Message *hello = new MNSConnect(listen_addr);
+    hello->set_dest(MSG_ADDR_DIRECTORY, 0);
+    tcp_marshall(hello);
+    OutThread *hello_out = tcp_lookup(hello);
+    assert(hello_out);
+    tcp_send(hello);
+
+    // sleep until the rank has been filled in
+    while (my_rank < 0) 
+      waiting_for_rank.Wait(lookup_lock);
+    assert(my_rank > 0);
+  }
+
+  // now the actual registration request
+  dout(DBL) << "register_entity " << MSG_ADDR_NICE(addr) << endl;
+  Message *req = new MNSRegister(addr, my_rank, tid);
+  req->set_dest(MSG_ADDR_DIRECTORY, 0);
+  tcp_marshall(req);
+  OutThread *req_out = tcp_lookup(req);
+  assert(req_out);
+  tcp_send(req);
+
+  // sleep until a result has been posted under our tid
+  while (!waiting_for_register_result.count(tid)) 
+    reply_cond.Wait(lookup_lock);
+
+  // collect the answer and drop our bookkeeping entries
+  msg_addr_t assigned = waiting_for_register_result[tid];
+  waiting_for_register_result.erase(tid);
+  waiting_for_register_cond.erase(tid);
+
+  dout(DBL) << "register_entity got " << MSG_ADDR_NICE(assigned) << endl;
+
+  lookup_lock.Unlock();
+
+  return assigned;
+}
+
+
+
+/***********
+ * Tcpmessenger class implementation
+ */
+
+
+/**
+ * Create a messenger for the given entity address.
+ *
+ * Every address except MSG_ADDR_DIRECTORY is first registered through
+ * register_entity(), and the address it returns is the one actually
+ * used.  The messenger is then entered into the process-wide directory.
+ */
+TCPMessenger::TCPMessenger(msg_addr_t myaddr) : 
+  Messenger(myaddr), 
+  dispatch_thread(this)
+{
+  // the directory itself needs no registration; everyone else does
+  if (myaddr != MSG_ADDR_DIRECTORY)
+    myaddr = register_entity(myaddr);
+
+  // adopt the (possibly reassigned) address
+  set_myaddr( myaddr );
+
+  // enter myself into the messenger directory and bump the count
+  directory_lock.Lock();
+  directory[myaddr] = this;
+  stat_num++;
+  if (logger)
+    logger->set("num", stat_num);
+  directory_lock.Unlock();
+
+  // register to execute timer events
+  //g_timer.set_messenger_kicker(new C_TCPKicker());
+  //  g_timer.set_messenger(this);
+}
+
+
+/**
+ * Mark this messenger as ready in directory_ready and, unless it is the
+ * directory itself, send MSG_NS_STARTED to the name server from the
+ * calling thread.
+ */
+void TCPMessenger::ready()
+{
+  directory_lock.Lock();
+  directory_ready.insert(get_myaddr());
+  directory_lock.Unlock();
+
+  // the directory does not report to itself
+  if (get_myaddr() == MSG_ADDR_DIRECTORY)
+    return;
+
+  // started!  tell namer we are up and running.
+  lookup_lock.Lock();
+  Message *started = new MGenericMessage(MSG_NS_STARTED);
+  started->set_source(get_myaddr(), 0);
+  started->set_dest(MSG_ADDR_DIRECTORY, 0);
+  tcp_marshall(started);
+  OutThread *sender = tcp_lookup(started);
+  assert(sender);
+  tcp_send(started);
+  lookup_lock.Unlock();
+}
+
+
+// Nothing to release here; in particular the logger is deliberately not
+// deleted (the delete below is commented out).
+TCPMessenger::~TCPMessenger()
+{
+  //delete logger;
+}
+
+// Returns a reference to listen_addr, which is not a member of this class.
+tcpaddr_t& TCPMessenger::get_tcpaddr() 
+{
+  return listen_addr;
+}
+
+// Record, under lookup_lock, that entity e lives on rank r.
+void TCPMessenger::map_entity_rank(msg_addr_t e, int r)
+{
+  lookup_lock.Lock();
+  {
+    entity_rank[e] = r;
+  }
+  lookup_lock.Unlock();
+}
+
+// Record, under lookup_lock, that rank r is reachable at address a.
+void TCPMessenger::map_rank_addr(int r, tcpaddr_t a)
+{
+  lookup_lock.Lock();
+  {
+    rank_addr[r] = a;
+  }
+  lookup_lock.Unlock();
+}
+
+
+// Number of messages still queued (stat_inq) plus those taken off the
+// queue but not yet dispatched (stat_disq).  Read without any lock.
+int TCPMessenger::get_dispatch_queue_len() 
+{
+  return stat_inq+stat_disq;
+}
+
+
+/**
+ * Take this messenger out of service.
+ *
+ * Sends MSG_NS_UNREGISTER to the name server (except from the name
+ * server's own messenger), removes this messenger from the directory and,
+ * if only one other entry is left while a rank messenger exists, stops the
+ * rank server as well.
+ *
+ * If this was the last directory entry, the whole transport is wound
+ * down: incoming sockets are closed, tcp_done is set and announced, and
+ * the sender thread(s) are stopped.  Always returns 0.
+ */
+int TCPMessenger::shutdown()
+{
+  dout(DBL) << "shutdown by " << MSG_ADDR_NICE(get_myaddr()) << endl;
+
+  // dont' send unregistery from nsmessenger shutdown!
+  if (this != nsmessenger && 
+      (my_rank > 0 || nsmessenger)) {
+    dout(DBL) << "sending unregister from " << MSG_ADDR_NICE(get_myaddr()) << " to ns" << endl;
+    send_message(new MGenericMessage(MSG_NS_UNREGISTER),
+                 MSG_ADDR_DIRECTORY);
+  }
+
+  // remove me from the directory
+  directory_lock.Lock();
+  directory.erase(get_myaddr());
+  
+  // last one?
+  bool lastone = directory.empty();  
+  //dout(1) << "lastone = " << lastone << " .. " << directory.size() << endl;
+  
+
+  // or almost last one?
+  if (rankmessenger && directory.size() == 1) {
+    // stopping the rank server re-enters shutdown(), so let go of the lock
+    directory_lock.Unlock();
+    tcpmessenger_stop_rankserver();
+    directory_lock.Lock();
+  }
+
+  stat_num--;
+  if (logger) logger->set("num", stat_num);
+
+  directory_lock.Unlock();
+
+  // last one?
+  if (lastone) {
+    dout(2) << "shutdown last tcpmessenger on rank " << my_rank << " shut down" << endl;
+    //pthread_t whoami = pthread_self();
+    
+    // no more timer events
+    //g_timer.unset_messenger();
+    
+    // close incoming sockets
+    //void *r;
+    for (map<int,pthread_t>::iterator it = in_threads.begin();
+         it != in_threads.end();
+         it++) {
+      dout(DBL) << "closing reader on sd " << it->first << endl;      
+      ::close(it->first);
+      //pthread_join(it->second, &r);
+    }
+
+    if (g_conf.tcp_multi_dispatch) {
+      // kill off dispatch threads
+      // NOTE(review): lastone means the directory was already empty when
+      // sampled above, so this loop looks like it never stops anything.
+      dout(DBL) << "killing dispatch threads" << endl;
+      for (hash_map<msg_addr_t,TCPMessenger*>::iterator it = directory.begin();
+           it != directory.end();
+           it++) 
+        it->second->dispatch_stop();
+    }
+
+    dout(DBL) << "setting tcp_done" << endl;
+
+    // kick/kill incoming thread
+    // tcp_done is watched by tcp_dispatchthread and tcpmessenger_wait,
+    // which sleep on the process-wide lock/cond.  inside this class the
+    // bare names resolve to this messenger's own members, so qualify them
+    // (as send_message does).  wake everyone: there can be more than one
+    // sleeper, and each re-checks its own condition.
+    ::incoming_lock.Lock();
+    tcp_done = true;
+    ::incoming_cond.SignalAll();
+    ::incoming_lock.Unlock();
+
+    // finish off outgoing thread
+    dout(10) << "waiting for outgoing to finish" << endl;
+    if (g_conf.tcp_multi_out) {
+      for (hash_map<int,OutThread*>::iterator it = rank_out.begin();
+           it != rank_out.end();
+           it++) {
+        it->second->stop();
+        delete it->second;
+      }
+    } else {
+      single_out_thread.stop();
+    }
+
+    
+    /*
+
+    dout(15) << "whoami = " << whoami << ", thread = " << dispatch_thread_id << endl;
+    if (whoami == thread_id) {
+      // i am the event loop thread, just set flag!
+      dout(15) << "  set tcp_done=true" << endl;
+      tcp_done = true;
+    }
+    */
+  } 
+  return 0;
+}
+
+
+
+
+/***
+ * public messaging interface
+ */
+
+
+/* note: send_message _MUST_ be non-blocking */
+/**
+ * Send m to dest:port, stamped as coming from this messenger's fromport.
+ *
+ * If dest is a messenger in this process that has been marked ready, m is
+ * appended to the process-wide incoming queue.  Otherwise it is sent
+ * remotely: in this thread when g_conf.tcp_serial_out is set, else by
+ * handing it to the destination rank's OutThread.  If the destination's
+ * rank is still unknown, tcp_lookup() keeps the message until the lookup
+ * completes.  Always returns 0.
+ */
+int TCPMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+{
+  // stamp the envelope
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+  m->set_lamport_send_stamp( get_lamport() );
+
+  dout(4) << "--> " << m->get_type_name() 
+          << " from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() 
+          << " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() 
+          << " ---- " << m 
+          << endl;
+
+  // is dest a ready messenger in this very process?
+  TCPMessenger *local = 0;
+  directory_lock.Lock();
+  if (directory.count(dest) && directory_ready.count(dest))
+    local = directory[dest];
+  directory_lock.Unlock();
+
+  if (!local) {
+    // remote!
+    const bool send_inline = g_conf.tcp_serial_out;
+
+    if (g_conf.tcp_serial_marshall)
+      tcp_marshall(m);
+
+    lookup_lock.Lock();
+    OutThread *sender = tcp_lookup(m);
+    if (send_inline) {
+      // send in this thread, still under the lookup lock
+      if (sender)
+        tcp_send(m);
+      lookup_lock.Unlock();
+    } else {
+      // hand off to the sender thread once the lock is released
+      lookup_lock.Unlock();
+      if (sender)
+        sender->send(m);
+    }
+    return 0;
+  }
+
+  // local!  queue on the process-wide incoming list (the :: matters: the
+  // bare names would pick this messenger's own members)
+  ::incoming_lock.Lock();
+  dout(20) << " queueing locally for " << dest << " " << m << endl;
+  ::incoming.push_back(m);
+  ::incoming_cond.Signal();
+  stat_inq++;
+  stat_inqb += m->get_payload().length();
+  ::incoming_lock.Unlock();
+
+  return 0;
+}
+
+
+
+
diff --git a/branches/sage/cephmds2/msg/TCPMessenger.h b/branches/sage/cephmds2/msg/TCPMessenger.h
new file mode 100644
index 0000000000000..5cafbe470214b
--- /dev/null
+++ b/branches/sage/cephmds2/msg/TCPMessenger.h
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __TCPMESSENGER_H
+#define __TCPMESSENGER_H
+
+#include "Messenger.h"
+#include "Dispatcher.h"
+#include "common/Thread.h"
+
+#include "tcp.h"
+
+class Timer;
+
+
+// Messenger implementation that moves messages over TCP sockets.
+// Besides the public send/shutdown interface it carries an optional
+// per-messenger dispatch thread with its own incoming queue.
+class TCPMessenger : public Messenger {
+ protected:
+
+  //class Logger *logger;  // for logging
+  
+  // per-messenger dispatch queue state (see dispatch_* below)
+  bool           incoming_stop;   // set by dispatch_stop(); first assigned in dispatch_start()
+  Mutex          incoming_lock;   // guards incoming and incoming_stop
+  list<Message*> incoming;        // messages waiting for dispatch_entry()
+  Cond           incoming_cond;   // signalled on enqueue and on stop
+
+  // thread wrapper that runs the owning messenger's dispatch_entry()
+  class DispatchThread : public Thread {
+    TCPMessenger *m;
+  public:
+    DispatchThread(TCPMessenger *_m) : m(_m) {}
+    void *entry() {
+      m->dispatch_entry();
+      return 0;
+    }
+  } dispatch_thread;
+
+  // body of dispatch_thread
+  void dispatch_entry();
+
+public:
+  // clear the stop flag and start the dispatch thread
+  void dispatch_start() {
+    incoming_stop = false;
+    dispatch_thread.create();
+  }
+  /*  void dispatch_kick() {
+    incoming_lock.Lock();
+    incoming_cond.Signal();
+    incoming_lock.Unlock();
+    }*/
+  // set the stop flag, wake the dispatch thread, and join it
+  void dispatch_stop() {
+    incoming_lock.Lock();
+    incoming_stop = true;
+    incoming_cond.Signal();
+    incoming_lock.Unlock();
+    dispatch_thread.join();
+  }
+  // append m to this messenger's incoming queue and wake the dispatch thread
+  void dispatch_queue(Message *m) {
+    incoming_lock.Lock();
+    incoming.push_back(m);
+    incoming_cond.Signal();
+    incoming_lock.Unlock();
+  }
+
+ public:
+  TCPMessenger(msg_addr_t myaddr);
+  ~TCPMessenger();
+
+  void ready();
+
+  tcpaddr_t& get_tcpaddr();
+  void map_entity_rank(msg_addr_t e, int r);
+  void map_rank_addr(int r, tcpaddr_t a);
+
+  int get_dispatch_queue_len();
+
+  void callback_kick();
+
+  // shut this messenger down.
+  virtual int shutdown();
+
+  // message interface
+  virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+};
+
+/**
+ * these are all ONE per process.
+ */
+
+extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta);
+
+extern int tcpmessenger_findns(tcpaddr_t &nsa);
+
+extern int tcpmessenger_init();
+extern int tcpmessenger_start();   // start thread
+extern void tcpmessenger_wait();    // wait for thread to finish.
+extern int tcpmessenger_shutdown();   // finalize MPI
+
+extern void tcpmessenger_start_nameserver(tcpaddr_t& ta);  // on rank 0
+extern void tcpmessenger_stop_nameserver();   // on rank 0
+extern void tcpmessenger_start_rankserver(tcpaddr_t& ta);  // on all ranks
+extern void tcpmessenger_stop_rankserver();   // on all ranks
+
+extern int tcpmessenger_get_rank();
+
+
+#endif
diff --git a/branches/sage/cephmds2/msg/error.c b/branches/sage/cephmds2/msg/error.c
new file mode 100644
index 0000000000000..15cd16a2ca9da
--- /dev/null
+++ b/branches/sage/cephmds2/msg/error.c
@@ -0,0 +1,77 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "include/error.h"
+
+#define EXIT_USAGE_ERROR -1	/* error codes for program exit */
+#define EXIT_SYSTEM_ERROR -2
+#define EXIT_GENERIC_ERROR -3
+#define MSGSIZ 1024		/* maximum error message length */
+
+/* print usage error message and exit */
+void userror(const char *use, const char *fmt, ...)
+{
+  char msg[MSGSIZ];
+  int len;
+
+  va_list ap;
+  va_start(ap, fmt);
+
+  len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap);
+  len += snprintf(msg+len, MSGSIZ-len, "\n");
+  len += snprintf(msg+len, MSGSIZ-len, use);
+  fprintf(stderr, "%s\n", msg);
+  exit(EXIT_USAGE_ERROR);
+
+  va_end(ap);
+}
+
+/* print system error message and exit */
+void syserror(const char *fmt, ...)
+{
+  char msg[MSGSIZ];
+  int len;
+
+  va_list ap;
+  va_start(ap, fmt);
+
+  len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap);
+  len += snprintf(msg+len, MSGSIZ-len, ": %s\n", strerror(errno));
+  fprintf(stderr, "%s", msg);
+  exit(EXIT_SYSTEM_ERROR);
+
+  va_end(ap);
+}
+
+/* print error message and exit */
+void exiterror(const char *fmt, ...)
+{
+  char msg[MSGSIZ];
+  va_list ap;
+
+  /* format at offset 0 of msg; vsnprintf truncates to MSGSIZ-1 chars
+   * and always NUL-terminates when the size is non-zero. */
+  va_start(ap, fmt);
+  if (vsnprintf(msg, MSGSIZ, fmt, ap) < 0)
+    msg[0] = 0;   /* formatting failed; print an empty message */
+  va_end(ap);     /* must run before exit(), which never returns */
+  fprintf(stderr, "%s\n", msg);
+  exit(EXIT_GENERIC_ERROR);
+}
+
+/* print error message */
+void error(const char *fmt, ...)
+{
+  char msg[MSGSIZ];
+  int len;
+
+  va_list ap;
+  va_start(ap, fmt);
+
+  len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap);
+  fprintf(stderr, "%s\n", msg);
+
+  va_end(ap);
+}
diff --git a/branches/sage/cephmds2/msg/mpistarter.cc b/branches/sage/cephmds2/msg/mpistarter.cc
new file mode 100644
index 0000000000000..79391f78210d2
--- /dev/null
+++ b/branches/sage/cephmds2/msg/mpistarter.cc
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <mpi.h>
+
+#include "TCPMessenger.h"
+
+/*
+ * start up TCPMessenger via MPI.
+ */ 
+
+pair<int,int> mpi_bootstrap_tcp(int& argc, char**& argv)
+{
+  tcpmessenger_init();
+  tcpmessenger_start();
+
+  // exchange addresses with other nodes (MPI is finalized again before returning)
+  MPI_Init(&argc, &argv);
+  
+  int mpi_world;
+  int mpi_rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+  //dout(1) << "i am " << mpi_rank << " of " << mpi_world << endl;
+  
+  // rank 0 starts the nameserver with ta; every other rank only zeroes ta
+  tcpaddr_t ta;
+  if (mpi_rank == 0) {
+    dout(30) << "i am rank 0, starting ns directory" << endl;
+    tcpmessenger_start_nameserver(ta);
+  } else {
+    memset(&ta, 0, sizeof(ta));
+  }
+
+  // broadcast rank 0's ta to all ranks as sizeof(ta) raw chars
+  int r = MPI_Bcast(&ta, sizeof(ta), MPI_CHAR,
+                    0, MPI_COMM_WORLD);
+
+  dout(30) << "r = " << r << " ns tcpaddr is " << ta << endl;   // r is only logged, not checked
+  tcpmessenger_start_rankserver(ta);
+  
+  MPI_Barrier(MPI_COMM_WORLD);   // each rank gets here after its start_rankserver call returned
+  //g_clock.tare();
+  MPI_Finalize();
+
+  return pair<int,int>(mpi_rank, mpi_world);   // (rank, world size)
+}
+
+
diff --git a/branches/sage/cephmds2/msg/new_mpistarter.cc b/branches/sage/cephmds2/msg/new_mpistarter.cc
new file mode 100644
index 0000000000000..fc9da720f19ee
--- /dev/null
+++ b/branches/sage/cephmds2/msg/new_mpistarter.cc
@@ -0,0 +1,43 @@
+#include <mpi.h>
+#include "NewMessenger.h"
+
+/*
+ * start up NewMessenger via MPI.
+ */ 
+
+pair<int,int> mpi_bootstrap_new(int& argc, char**& argv)
+{
+  MPI_Init(&argc, &argv);
+  
+  int mpi_world;
+  int mpi_rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+  tcpaddr_t nsaddr;
+  memset(&nsaddr, 0, sizeof(nsaddr));
+  
+  if (mpi_rank == 0) {
+    // i am root: start with the zeroed address, then publish my listen address.
+    rank.my_rank = 0;  
+    rank.start_rank(nsaddr);
+    nsaddr = rank.get_listen_addr();
+  }
+
+  int r = MPI_Bcast(&nsaddr, sizeof(nsaddr), MPI_CHAR,
+                    0, MPI_COMM_WORLD);
+  // r is only logged below, not checked
+  dout(30) << "r = " << r << " ns tcpaddr is " << nsaddr << endl;
+
+  if (mpi_rank != 0) {
+    rank.start_rank(nsaddr);   // non-root ranks start with root's broadcast address
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  //g_clock.tare();
+
+  MPI_Finalize();
+
+  return pair<int,int>(mpi_rank, mpi_world);   // (rank, world size)
+}
diff --git a/branches/sage/cephmds2/msg/tcp.cc b/branches/sage/cephmds2/msg/tcp.cc
new file mode 100644
index 0000000000000..1a448a91cb2c6
--- /dev/null
+++ b/branches/sage/cephmds2/msg/tcp.cc
@@ -0,0 +1,87 @@
+
+#include "tcp.h"
+
+/******************
+ * tcp crap
+ */
+
+bool tcp_read(int sd, char *buf, int len)
+{
+  // read exactly len bytes into buf; false if the peer closed or recv failed.
+  while (len > 0) {
+    int got = ::recv( sd, buf, len, 0 );
+    if (got == 0) {
+      dout(18) << "tcp_read socket " << sd << " closed" << endl;
+      return false;
+    }
+    if (got < 0) {
+      if (errno == EINTR) continue;  // interrupted by a signal, not a failure
+      dout(18) << "tcp_read bailing with " << got << endl;
+      return false;
+    }
+    len -= got;
+    buf += got;
+  }
+  return true;
+}
+
+int tcp_write(int sd, char *buf, int len)
+{
+  // write exactly len bytes from buf; returns 0 on success, or send()'s
+  // negative result on the first error that is not a signal interruption.
+  assert(len > 0);
+  while (len > 0) {
+    int did = ::send( sd, buf, len, 0 );
+    if (did < 0) {
+      if (errno == EINTR) continue;  // interrupted by a signal, just retry
+      dout(1) << "tcp_write error did = " << did << "  errno " << errno << " " << strerror(errno) << endl;
+      return did;
+    }
+    // a short send only advances the cursor; the loop sends the rest
+    len -= did;
+    buf += did;
+  }
+  return 0;
+}
+
+
+int tcp_hostlookup(char *str, tcpaddr_t& ta)
+{
+  char *host = str;
+  char *port = 0;
+  // parse "host:port" from str and fill ta; returns 0, or -1 if there is no ':' or the host is unknown
+  for (int i=0; str[i]; i++) {
+    if (str[i] == ':') {
+      port = str+i+1;
+      str[i] = 0;   // cuts the caller's buffer at the ':' (str is modified in place)
+      break;
+    }
+  }
+  if (!port) {
+    cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl;
+    return -1;
+  } 
+  //cout << "host '" << host << "' port '" << port << "'" << endl;
+
+  int iport = atoi(port);   // no validation: non-numeric text yields 0
+  
+  struct hostent *myhostname = gethostbyname( host ); 
+  if (!myhostname) {
+    cerr << "host " << host << " not found" << endl;
+    return -1;
+  }
+
+  memset(&ta, 0, sizeof(ta));
+
+  //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl;
+
+  ta.sin_family = myhostname->h_addrtype;
+  memcpy((char *)&ta.sin_addr,
+         myhostname->h_addr, 
+         myhostname->h_length);   // NOTE(review): h_length is trusted to fit in sin_addr
+  ta.sin_port = iport;   // NOTE(review): stored as-is, no htons() here -- confirm users of ta expect that
+    
+  cout << "lookup '" << host << ":" << port << "' -> " << ta << endl;
+
+  return 0;
+}
diff --git a/branches/sage/cephmds2/msg/tcp.h b/branches/sage/cephmds2/msg/tcp.h
new file mode 100644
index 0000000000000..f38388d456a8c
--- /dev/null
+++ b/branches/sage/cephmds2/msg/tcp.h
@@ -0,0 +1,37 @@
+#ifndef __TCP_H
+#define __TCP_H
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+typedef struct sockaddr_in tcpaddr_t;
+
+using std::ostream;
+
+inline ostream& operator<<(ostream& out, const tcpaddr_t &a)
+{
+  // Print the four in-memory bytes of sin_addr as a dotted quad,
+  // then ":" and sin_port exactly as stored (no byte-order conversion).
+  unsigned char quad[4];
+  memcpy((char*)quad, (char*)&a.sin_addr.s_addr, 4);
+  for (int i = 0; i < 4; i++)
+    out << (unsigned)quad[i] << (i < 3 ? "." : ":");
+  out << (int)a.sin_port;
+  return out;
+}
+
+extern bool tcp_read(int sd, char *buf, int len);
+extern int tcp_write(int sd, char *buf, int len);
+extern int tcp_hostlookup(char *str, tcpaddr_t& ta);
+
+inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) {  // field-wise: strncmp would stop at the first 0 byte of the struct
+  return a.sin_family == b.sin_family && a.sin_port == b.sin_port && a.sin_addr.s_addr == b.sin_addr.s_addr;
+}
+inline bool operator!=(const tcpaddr_t& a, const tcpaddr_t& b) {  // exact negation of operator== above
+  return !(a == b);
+}
+
+
+#endif
diff --git a/branches/sage/cephmds2/newsyn.cc b/branches/sage/cephmds2/newsyn.cc
new file mode 100644
index 0000000000000..43fd1b2373391
--- /dev/null
+++ b/branches/sage/cephmds2/newsyn.cc
@@ -0,0 +1,420 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include <fcntl.h>
+
+#include "config.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "mon/Monitor.h"
+#include "client/Client.h"
+#include "client/SyntheticClient.h"
+
+#include "msg/NewerMessenger.h"
+
+#include "common/Timer.h"
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+class C_Test : public Context {   // Context whose finish() only prints the result code it was given
+public:
+  void finish(int r) {
+    cout << "C_Test->finish(" << r << ")" << endl;
+  }
+};
+
+
+/*
+ * start up NewMessenger via MPI.
+ */ 
+#include <mpi.h>
+
+pair<int,int> mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap)
+{
+  MPI_Init(&argc, &argv);
+  // fills *monmap on every rank and returns (MPI rank, MPI world size); MPI stays initialized.
+  int mpi_world;
+  int mpi_rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+  // first, synchronize clocks.
+  MPI_Barrier(MPI_COMM_WORLD);
+  //dout(-10) << "tare" << endl;
+  g_clock.tare();
+  
+  // start up all monitors at known addresses.
+  entity_inst_t moninst[mpi_world];  // only care about first g_conf.num_mon of these.
+
+  if (mpi_rank < g_conf.num_mon) {
+    rank.my_rank = mpi_rank;  
+    rank.start_rank();   // bind and listen
+
+    moninst[mpi_rank].rank = mpi_rank;
+    moninst[mpi_rank].addr = rank.get_listen_addr();
+
+    //cerr << mpi_rank << " at " << rank.get_listen_addr() << endl;
+  } 
+  // NOTE(review): on rank 0 the send buffer below aliases the receive buffer -- confirm against the MPI spec
+  MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR,
+              moninst, sizeof(entity_inst_t), MPI_CHAR,
+              0, MPI_COMM_WORLD);
+
+  if (mpi_rank == 0) {
+    rank.start_namer();
+    
+    for (int i=0; i<g_conf.num_mon; i++) {
+      cerr << "mon" << i << " is at " << moninst[i] << endl;
+      monmap->mon_inst[i] = moninst[i];
+      if (i) rank.namer->manual_insert_inst(monmap->get_inst(i));
+    }
+  }
+
+  // distribute monmap.  rank 0 encodes it and also leaves a copy in .ceph_monmap.
+  bufferlist bl;
+  int bl_len = 0;
+  if (mpi_rank == 0) {
+    monmap->encode(bl);
+    bl_len = bl.length();
+    int fd = ::open(".ceph_monmap", O_WRONLY|O_CREAT|O_TRUNC, 0755);  // O_CREAT requires a mode argument
+    if (fd >= 0) {
+      ::write(fd, (void*)bl.c_str(), bl.length());
+      ::fchmod(fd, 0755);
+      ::close(fd);
+    } else cerr << "unable to write .ceph_monmap" << endl;
+  }
+  MPI_Bcast(&bl_len, 1, MPI_INT, 0, MPI_COMM_WORLD);  // send the size first: Bcast counts must match on all ranks
+  if (mpi_rank > 0) {
+    bufferptr bp(bl_len);   // exactly as large as the encoded map
+    bl.append(bp);
+  }
+  MPI_Bcast(bl.c_str(), bl_len, MPI_CHAR, 0, MPI_COMM_WORLD);
+
+  if (mpi_rank > 0) {
+    monmap->decode(bl);
+    rank.set_namer(monmap->get_inst(0).addr);
+  }
+
+  if (mpi_rank >= g_conf.num_mon) {
+    rank.start_rank();
+  }
+
+  // wait for everyone!
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  return pair<int,int>(mpi_rank, mpi_world);
+}
+
+utime_t tick_start;
+int tick_count = 0;
+
+class C_Tick : public Context {   // timer event: logs a tick line and schedules the next C_Tick
+public:
+  void finish(int) {
+    utime_t now = g_clock.now() - tick_start;   // time elapsed since tick_start
+    dout(0) << "tick +" << g_conf.tick << " -> " << now << "  (" << tick_count << ")" << endl;
+    tick_count += g_conf.tick;
+    utime_t next = tick_start;
+    next.sec_ref() += tick_count;   // deadline is tick_start + tick_count seconds, not "now + tick"
+    g_timer.add_event_at(next, new C_Tick);
+  }
+};
+
+class C_Die : public Context {   // timer event that terminates the process
+public:
+  void finish(int) {
+    cerr << "die" << endl;
+    exit(1);   // exit status 1, no cleanup beyond what exit() itself does
+  }
+};
+
+class C_Debug : public Context {   // timer event: overwrite g_conf's debug fields with those of g_debug_after_conf
+  public:
+  void finish(int) {
+    int size = (char*)&g_conf.debug_after - (char*)&g_conf.debug;   // distance in BYTES; typed pointer subtraction would count elements
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);   // copies [debug, debug_after)
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+
+int main(int argc, char **argv) 
+{
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  // pull out "--kill_osd_after <osd> <seconds>" triples; everything else stays in args.
+  map<int,int> kill_osd_after;
+  if (1) {
+    vector<char*> nargs;
+    for (unsigned i=0; i<args.size(); i++) {
+      if (strcmp(args[i],"--kill_osd_after") == 0 && i+2 < args.size()) {  // both values must be present
+        int o = atoi(args[++i]);
+        int w = atoi(args[++i]);
+        kill_osd_after[o] = w;
+      }
+      else {
+        nargs.push_back( args[i] );   // an incomplete --kill_osd_after lands here too
+      }
+    }
+    args.swap(nargs);
+  }
+
+  parse_config_options(args);
+  parse_syn_options(args);
+
+  if (g_conf.kill_after) 
+    g_timer.add_event_after(g_conf.kill_after, new C_Die);
+  if (g_conf.debug_after) 
+    g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+
+  if (g_conf.tick) {
+    tick_start = g_clock.now();
+    g_timer.add_event_after(g_conf.tick, new C_Tick);
+  }
+
+  vector<char*> nargs;
+  for (unsigned i=0; i<args.size(); i++) {
+    //cout << "a " << args[i] << endl;
+    // unknown arg, pass it on.
+    nargs.push_back(args[i]);
+  }
+
+  args = nargs;
+  if (!args.empty()) {
+    for (unsigned i=0; i<args.size(); i++)
+      cerr << "stray arg " << args[i] << endl;
+  }
+  assert(args.empty());
+
+
+  // start up messenger via MPI
+  MonMap *monmap = new MonMap(g_conf.num_mon);
+  pair<int,int> mpiwho = mpi_bootstrap_new(argc, argv, monmap);
+  int myrank = mpiwho.first;
+  int world = mpiwho.second;
+
+  int need = 0;   // number of MPI ranks this configuration requires
+  if (g_conf.ms_skip_rank0) need++;
+  need += NUMMDS;
+  if (g_conf.ms_stripe_osds)
+    need++;
+  else
+    need += NUMOSD;
+  if (NUMCLIENT) {
+    if (!g_conf.ms_overlay_clients)
+      need += 1;
+  }
+  assert(need <= world);
+
+  if (myrank == 0)
+    cerr << "nummds " << NUMMDS << "  numosd " << NUMOSD << "  numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl;
+  
+
+  char hostname[100];
+  gethostname(hostname,100);
+  int pid = getpid();
+
+  int started = 0;   // how many mds/osd/client entities this rank hosts
+
+  //if (myrank == 0) g_conf.debug = 20;
+  
+  // create mon
+  if (myrank < g_conf.num_mon) {
+    Monitor *mon = new Monitor(myrank, rank.register_entity(MSG_ADDR_MON(myrank)), monmap);
+    mon->init();
+  }
+
+  
+  // wait for monitors to start.
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  // okay, home free!
+  MPI_Finalize();
+
+
+  // create mds
+  map<int,MDS*> mds;
+  map<int,OSD*> mdsosd;
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != g_conf.ms_skip_rank0+i) continue;
+    Messenger *m = rank.register_entity(MSG_ADDR_MDS(i));
+    cerr << "mds" << i << " on tcprank " << rank.my_rank << " " << hostname << "." << pid << endl;
+    mds[i] = new MDS(i, m, monmap);
+    mds[i]->init();
+    started++;
+
+    if (g_conf.mds_local_osd) {
+      mdsosd[i] = new OSD(i+10000, rank.register_entity(MSG_ADDR_OSD(i+10000)), monmap);
+      mdsosd[i]->init();
+    }
+  }
+  
+  // create osd
+  map<int,OSD*> osd;
+  int max_osd_nodes = world - NUMMDS - g_conf.ms_skip_rank0;  // assumes 0 clients, if we stripe.
+  int osds_per_node = (max_osd_nodes > 0) ? (NUMOSD-1)/max_osd_nodes + 1 : 1;  // never divide by zero when no rank is left for osds
+  for (int i=0; i<NUMOSD; i++) {
+    if (g_conf.ms_stripe_osds) {
+      if (myrank != g_conf.ms_skip_rank0+NUMMDS + i / osds_per_node) continue;
+    } else {
+      if (myrank != g_conf.ms_skip_rank0+NUMMDS + i) continue;
+    }
+
+    if (kill_osd_after.count(i))
+      g_timer.add_event_after(kill_osd_after[i], new C_Die);
+
+    Messenger *m = rank.register_entity(MSG_ADDR_OSD(i));
+    cerr << "osd" << i << " on tcprank " << rank.my_rank <<  " " << hostname << "." << pid << endl;
+    osd[i] = new OSD(i, m, monmap);
+    osd[i]->init();
+    started++;
+  }
+  
+  if (g_conf.ms_overlay_clients) sleep(5);
+
+  // create client
+  int skip_osd = NUMOSD;
+  if (g_conf.ms_overlay_clients) 
+    skip_osd = 0;        // put clients with osds too!
+  int client_nodes = world - NUMMDS - skip_osd - g_conf.ms_skip_rank0;
+  int clients_per_node = 1;
+  if (NUMCLIENT && client_nodes > 0) clients_per_node = (NUMCLIENT-1) / client_nodes + 1;
+  set<int> clientlist;
+  map<int,Client *> client;//[NUMCLIENT];
+  map<int,SyntheticClient *> syn;//[NUMCLIENT];
+  for (int i=0; i<NUMCLIENT; i++) {
+    //if (myrank != NUMMDS + NUMOSD + i % client_nodes) continue;
+    if (myrank != g_conf.ms_skip_rank0+NUMMDS + skip_osd + i / clients_per_node) continue;
+    clientlist.insert(i);
+    client[i] = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), monmap);
+
+    // logger?
+    if (client_logger == 0) {
+      char s[80];
+      sprintf(s,"clnode.%d", myrank);
+      client_logger = new Logger(s, &client_logtype);
+
+      client_logtype.add_inc("lsum");
+      client_logtype.add_inc("lnum");
+      client_logtype.add_inc("lwsum");
+      client_logtype.add_inc("lwnum");
+      client_logtype.add_inc("lrsum");
+      client_logtype.add_inc("lrnum");
+      client_logtype.add_inc("trsum");
+      client_logtype.add_inc("trnum");
+      client_logtype.add_inc("wrlsum");
+      client_logtype.add_inc("wrlnum");
+      client_logtype.add_inc("lstatsum");
+      client_logtype.add_inc("lstatnum");
+      client_logtype.add_inc("ldirsum");
+      client_logtype.add_inc("ldirnum");
+      client_logtype.add_inc("readdir");
+      client_logtype.add_inc("stat");
+    }
+
+    client[i]->init();
+    started++;
+
+    syn[i] = new SyntheticClient(client[i]);
+  }
+
+  if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl;
+
+  int nclients = 0;
+  for (set<int>::iterator it = clientlist.begin();
+       it != clientlist.end();
+       it++) {
+    int i = *it;
+
+    //cerr << "starting synthetic client" << i << " on rank " << myrank << endl;
+    client[i]->mount();
+    syn[i]->start_thread();
+    
+    nclients++;
+  }
+  if (nclients) {
+    cerr << nclients << " clients on tcprank " << rank.my_rank << " " << hostname << "." << pid << endl;
+  }
+
+  for (set<int>::iterator it = clientlist.begin();
+       it != clientlist.end();
+       it++) {
+    int i = *it;
+
+    //      cout << "waiting for synthetic client" << i << " to finish" << endl;
+    syn[i]->join_thread();
+    delete syn[i];
+    
+    client[i]->unmount();
+    //cout << "client" << i << " unmounted" << endl;
+    client[i]->shutdown();
+
+    delete client[i];
+  }
+  
+
+  if (myrank && !started) {
+    //dout(1) << "IDLE" << endl;
+    cerr << "idle on tcprank " << rank.my_rank << " " << hostname << "." << pid << endl; 
+    //rank.stop_rank();
+  } 
+
+  // wait for everything to finish
+  rank.wait();
+
+  if (started) cerr << "newsyn finishing" << endl;
+
+  return 0;  // whatever, cleanup hangs sometimes (stopping ebofs threads?).
+
+
+  // cleanup (unreachable: the return above skips it on purpose)
+  for (map<int,MDS*>::iterator i = mds.begin(); i != mds.end(); i++)
+    delete i->second;
+  for (map<int,OSD*>::iterator i = mdsosd.begin(); i != mdsosd.end(); i++)
+    delete i->second;
+  for (map<int,OSD*>::iterator i = osd.begin(); i != osd.end(); i++)
+    delete i->second;
+  /*
+  for (map<int,Client*>::iterator i = client.begin(); i != client.end(); i++)
+    delete i->second;
+  for (map<int,SyntheticClient*>::iterator i = syn.begin(); i != syn.end(); i++)
+    delete i->second;
+  */
+  /*
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
+    delete mds[i];
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(i),world)) continue;
+    delete osd[i];
+  }
+  for (int i=0; i<NUMCLIENT; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue;
+    delete client[i];
+  }
+  */
+
+  
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/osd/Ager.cc b/branches/sage/cephmds2/osd/Ager.cc
new file mode 100644
index 0000000000000..8e21a5b871e0b
--- /dev/null
+++ b/branches/sage/cephmds2/osd/Ager.cc
@@ -0,0 +1,326 @@
+
+#include "include/types.h"
+
+#include "Ager.h"
+#include "ObjectStore.h"
+
+#include "config.h"
+#include "common/Clock.h"
+
+// ick
+#include "ebofs/Ebofs.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+
+int myrand()
+{
+  // Reseed rand() with the next value of a private counter (0, 1, 2, ...)
+  // on every call and return the first number of that sequence, so the
+  // stream of results is identical on every run.
+  static int next_seed = 0;
+  srand(next_seed);
+  next_seed++;
+  return rand();
+}
+
+
+object_t Ager::age_get_oid() {   // front of age_free_oids if any; else age_cur_oid, whose bno is then bumped
+  if (age_free_oids.empty()) {
+    object_t fresh = age_cur_oid;
+    age_cur_oid.bno++;
+    return fresh;
+  }
+  object_t recycled = age_free_oids.front();
+  age_free_oids.pop_front();
+  return recycled;
+}
+
+ssize_t Ager::age_pick_size() {   // limit/2 + 1 plus a pseudo-random share (0..99 two-hundredths) of limit
+  const ssize_t limit = file_size_distn.sample() * 1024;
+  return limit/2 + (myrand() % 100) * limit/200 + 1;
+}
+
+bool start_debug = false;
+
+__uint64_t Ager::age_fill(float pc, utime_t until) {
+  int max = 1024*1024;   // size of the zeroed source buffer, and the largest single write
+  bufferptr bp(max);
+  bp.zero();
+  bufferlist bl;
+  bl.push_back(bp);
+  __uint64_t wrote = 0;   // running total in 4096-byte units (rounded up per object)
+  while (1) {
+    if (g_clock.now() > until) break;
+    
+    struct statfs st;
+    store->statfs(&st);
+    float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks);   // despite the name: fraction of blocks NOT free
+    float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks);  // to write to
+    //float a = (float)(st.f_bfree) / (float)st.f_blocks;
+    //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << endl;
+    if (free >= pc) {
+      dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << endl;
+      break;
+    }
+
+    // make sure we can write to it..
+    if (avail > .98 ||
+        avail - free > .02) 
+      store->sync();
+
+    object_t oid = age_get_oid();
+    
+    int b = myrand() % 10;   // pick one of the 10 age_objects buckets
+    age_objects[b].push_back(oid);
+    
+    ssize_t s = age_pick_size();
+    wrote += (s + 4095) / 4096;
+
+
+
+
+    dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl;
+    
+
+    if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) { 
+      /*
+
+
+      1005700
+?
+1005000
+1005700
+      1005710
+      1005725ULL
+      1005750ULL
+      1005800
+      1006000
+
+//  99  1000500 ? 1000750 1006000
+*/
+      g_conf.debug_ebofs = 30;
+      g_conf.ebofs_verify = true;      
+    }
+
+    off_t off = 0;
+    while (s) {   // write the object in pieces of at most max bytes, all taken from the zeroed buffer
+      ssize_t t = MIN(s, max);
+      bufferlist sbl;
+      sbl.substr_of(bl, 0, t);
+      store->write(oid, off, t, sbl, false);
+      off += t;
+      s -= t;
+    }
+    oid.bno++;   // only changes the local copy, which goes out of scope right after
+  }
+
+  return wrote*4; // KB
+}
+
+void Ager::age_empty(float pc) {
+  int nper = 20;   // every nper-th iteration takes the "sync" branch below
+  int n = nper;
+
+  //g_conf.ebofs_verify = true;
+
+  while (1) {
+    struct statfs st;
+    store->statfs(&st);
+    float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks);   // despite the name: fraction of blocks NOT free
+    float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks);  // to write to
+    dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << endl;//" stopping" << endl;
+    if (free <= pc) {
+      dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << endl;
+      break;
+    }
+    
+    int b = myrand() % 10;   // pick one of the 10 age_objects buckets
+    n--;
+    if (n == 0 || age_objects[b].empty()) {
+      dout(2) << "age_empty sync" << endl;
+      //sync();
+      //sync();
+      n = nper;
+      continue;   // NOTE(review): if every bucket is empty while free > pc, this loop looks like it never ends
+    }
+    object_t oid = age_objects[b].front();
+    age_objects[b].pop_front();
+    
+    dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << endl;
+    
+    store->remove(oid);
+    age_free_oids.push_back(oid);   // age_get_oid() hands these out again
+  }
+
+  g_conf.ebofs_verify = false;
+}
+
+void pfrag(__uint64_t written, ObjectStore::FragmentationStat &st)
+{
+  cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg fr\tnum<2\tsum<2\tnum<4\tsum<4\t..." 
+       << endl;
+  cout << written
+       << "\t" << st.total
+       << "\t" << st.num_extent
+       << "\t" << st.avg_extent
+       << "\t" << st.avg_extent_per_object
+       << "\t" << st.avg_extent_jump 
+       << "\t" << st.total_free
+       << "\t" << st.num_free_extent
+       << "\t" << st.avg_free_extent;
+    
+  int n = st.num_extent;   // extents not yet covered by the columns printed so far
+  for (__uint64_t i=1; i <= 30; i += 1) {
+    cout << "\t" << st.extent_dist[i];
+    cout << "\t" << st.extent_dist_sum[i];
+    //cout << "\ta " << (st.extent_dist[i] ? (st.extent_dist_sum[i] / st.extent_dist[i]):0);
+    n -= st.extent_dist[i];
+    if (n == 0) break;   // every extent is accounted for; skip the remaining columns
+  }
+  cout << endl;
+}
+
+
+void Ager::age(int time,
+               float high_water,    // fill to this %
+               float low_water,     // then empty to this %
+               int count,         // this many times
+               float final_water,   // and end here ( <= low_water)
+               int fake_size_mb) { 
+  // runs up to count fill/empty cycles, stopping early once `time` seconds have passed.
+  store->_fake_writes(true);
+  srand(0);
+
+  utime_t start = g_clock.now();
+  utime_t until = start;
+  until.sec_ref() += time;
+  
+  int elapsed = 0;
+  int freelist_inc = 60;   // seconds between periodic freelist dumps
+  utime_t nextfl = start;
+  nextfl.sec_ref() += freelist_inc;
+
+  while (age_objects.size() < 10) age_objects.push_back( list<object_t>() );
+  
+  if (fake_size_mb) {
+    int fake_bl = fake_size_mb * 256;   // presumably MB -> 4 KB blocks (256 per MB); TODO confirm block size
+    struct statfs st;
+    store->statfs(&st);
+    float f = (float)fake_bl / (float)st.f_blocks;
+    high_water = (float)high_water * f;   // scale all three water marks by fake_bl / f_blocks
+    low_water = (float)low_water * f;
+    final_water = (float)final_water * f;
+    dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl;
+  }
+  
+  // init size distn (once)
+  if (!did_distn) {
+    did_distn = true;
+    age_cur_oid = object_t(0,1);
+    file_size_distn.add(1, 19.0758125+0.65434375);
+    file_size_distn.add(512, 35.6566);
+    file_size_distn.add(1024, 27.7271875);
+    file_size_distn.add(2*1024, 16.63503125);
+    //file_size_distn.add(4*1024, 106.82384375);
+    //file_size_distn.add(8*1024, 81.493375);
+    //file_size_distn.add(16*1024, 14.13553125);
+    //file_size_distn.add(32*1024, 2.176);
+    //file_size_distn.add(256*1024, 0.655938);
+    //file_size_distn.add(512*1024, 0.1480625);
+    //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
+    file_size_distn.normalize();
+  }
+  
+  // clear
+  for (int i=0; i<10; i++)
+    age_objects[i].clear();
+  
+  ObjectStore::FragmentationStat st;
+
+  __uint64_t wrote = 0;   // sum of age_fill() results, i.e. KB
+
+  for (int c=1; c<=count; c++) {
+    if (g_clock.now() > until) break;
+    
+    //if (c == 7) start_debug = true;
+    
+    dout(1) << "#age " << c << "/" << count << " filling to " << high_water << endl;
+    __uint64_t w = age_fill(high_water, until);
+    //dout(1) << "age wrote " << w << endl;
+    wrote += w;
+    //store->sync();
+    //store->_get_frag_stat(st);
+    //pfrag(st);
+
+
+    if (c == count) {
+      dout(1) << "#age final empty to " << final_water << endl;
+      age_empty(final_water);    
+    } else {
+      dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << endl;
+      age_empty(low_water);
+    }
+    //store->sync();
+    //store->sync();
+
+    // show frag state
+    store->_get_frag_stat(st);
+    pfrag(wrote / (1024ULL*1024ULL) ,  // GB
+          st);
+
+    // dump freelist?
+    if (g_clock.now() > nextfl) {
+      elapsed += freelist_inc;
+      save_freelist(elapsed);
+      nextfl.sec_ref() += freelist_inc;
+    }
+  }
+
+  // dump the freelist
+  save_freelist(0);
+  exit(0);   // hack
+
+  // ok!  (unreachable: the exit(0) above ends the process first)
+  store->_fake_writes(false);
+  store->sync();
+  store->sync();
+  dout(1) << "age finished" << endl;
+}  
+
+
+void Ager::load_freelist()
+{
+  dout(1) << "load_freelist" << endl;
+  struct stat st;
+  
+  int r = ::stat("ebofs.freelist", &st);
+  assert(r == 0);
+
+  bufferptr bp(st.st_size);
+  bufferlist bl;
+  bl.push_back(bp);
+  int fd = ::open("ebofs.freelist", O_RDONLY);
+  assert(fd >= 0);   // never import from a buffer that was not filled
+  ssize_t got = ::read(fd, bl.c_str(), st.st_size);
+  assert(got == (ssize_t)st.st_size);   // a failed or short read would leave garbage in bl
+  ::close(fd);
+
+  ((Ebofs*)store)->_import_freelist(bl);
+  store->sync();
+  store->sync();
+}
+
+void Ager::save_freelist(int el)
+{
+  dout(1) << "save_freelist " << el << endl;   // writes the exported freelist to "ebofs.freelist.<el>"
+  char s[100];
+  snprintf(s, sizeof(s), "ebofs.freelist.%d", el);
+  bufferlist bl;
+  ((Ebofs*)store)->_export_freelist(bl);
+  ::unlink(s);
+  int fd = ::open(s, O_CREAT|O_WRONLY, 0644);   // O_CREAT requires the mode argument
+  ::fchmod(fd, 0644);   // pins the mode regardless of umask; fails harmlessly if open failed
+  ::write(fd, bl.c_str(), bl.length());
+  ::close(fd);
+}
diff --git a/branches/sage/cephmds2/osd/Ager.h b/branches/sage/cephmds2/osd/Ager.h
new file mode 100644
index 0000000000000..864c23fce8e14
--- /dev/null
+++ b/branches/sage/cephmds2/osd/Ager.h
@@ -0,0 +1,42 @@
+#ifndef __AGER_H
+#define __AGER_H
+
+#include "include/types.h"
+#include "include/Distribution.h"
+#include "ObjectStore.h"
+#include "common/Clock.h"
+
+#include <list>
+#include <vector>
+using namespace std;
+
+class Ager {   // fills and empties an ObjectStore in cycles (see age()) and dumps/loads its freelist
+  ObjectStore *store;
+
+ private:
+  list<object_t>           age_free_oids;   // oids of removed objects, handed out again by age_get_oid()
+  object_t                 age_cur_oid;     // next never-used oid
+  vector< list<object_t> > age_objects;     // live objects, spread over buckets
+  Distribution file_size_distn; //kb
+  bool         did_distn;                   // true once file_size_distn has been set up
+
+  void age_empty(float pc);
+  __uint64_t age_fill(float pc, utime_t until);
+  ssize_t age_pick_size();
+  object_t age_get_oid();
+
+ public:
+  Ager(ObjectStore *s) : store(s), did_distn(false) {} 
+
+  void age(int time,
+           float high_water,    // fill to this %
+          float low_water,     // then empty to this %
+          int count,         // this many times
+          float final_water,   // and end here ( <= low_water)
+          int fake_size_mb=0);
+
+  void save_freelist(int);
+  void load_freelist();
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osd/BDBMap.h b/branches/sage/cephmds2/osd/BDBMap.h
new file mode 100644
index 0000000000000..203a4ca9dce8f
--- /dev/null
+++ b/branches/sage/cephmds2/osd/BDBMap.h
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __BERKELEYDB_H
+#define __BERKELEYDB_H
+
+#include <db.h>
+#include <unistd.h>
+
+#include <list>
+using namespace std;
+
+
+template<typename K, typename D>
+class BDBMap {
+ private:
+  DB *dbp;   // open Berkeley DB handle, or 0 when closed
+  // K and D are stored as raw bytes (sizeof(K) / sizeof(D)), so both must be plain fixed-size types.
+ public:
+  BDBMap() : dbp(0) {}
+  ~BDBMap() {
+    close();
+  }
+
+  bool is_open() { return dbp ? true:false; }
+
+  // open/close
+  int open(const char *fn) {
+    //cout << "open " << fn << endl;
+    // creates the btree file fn if needed; asserts on any failure, so it only ever returns 0.
+    int r;
+    if ((r = db_create(&dbp, NULL, 0)) != 0) {
+      cerr << "db_create: " << db_strerror(r) << endl;
+      assert(0);
+    }
+
+    dbp->set_errfile(dbp, stderr);
+    dbp->set_errpfx(dbp, "bdbmap");
+
+    r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644);
+    if (r != 0) {
+      dbp->err(dbp, r, "%s", fn);
+    }
+    assert(r == 0);
+    return 0;
+  }
+  void close() {
+    if (dbp) {
+      dbp->close(dbp,0);
+      dbp = 0;
+    }
+  }
+  void remove(const char *fn) {
+    // DB->remove must run on a handle that was never opened, so drop ours
+    // and use a throwaway one; the call destroys that handle either way.
+    close();
+    DB *fresh = 0;
+    if (db_create(&fresh, NULL, 0) == 0)
+      fresh->remove(fresh, fn, 0, 0);
+    else ::unlink(fn);
+  }
+  
+  // accessors
+  int put(K key,
+          D data) {
+    DBT k;
+    memset(&k, 0, sizeof(k)); 
+    k.data = &key;
+    k.size = sizeof(K);
+    DBT d;
+    memset(&d, 0, sizeof(d));
+    d.data = &data;
+    d.size = sizeof(data);
+    return dbp->put(dbp, NULL, &k, &d, 0);
+  }
+
+  int get(K key,
+          D& data) {
+    DBT k;
+    memset(&k, 0, sizeof(k)); 
+    k.data = &key;
+    k.size = sizeof(key);
+    DBT d;
+    memset(&d, 0, sizeof(d));
+    d.data = &data;
+    d.ulen = sizeof(data);      // capacity of the caller's buffer
+    d.flags = DB_DBT_USERMEM;   // copy the record into `data` instead of returning library-owned memory
+    return dbp->get(dbp, NULL, &k, &d, 0);
+  }
+
+  int del(K key) {
+    DBT k;
+    memset(&k, 0, sizeof(k)); 
+    k.data = &key;
+    k.size = sizeof(key);
+    return dbp->del(dbp, NULL, &k, 0);
+  }
+
+  int list_keys(list<K>& ls) {
+    DBC *cursor = 0;
+    int r = dbp->cursor(dbp, NULL, &cursor, 0);
+    assert(r == 0);
+
+    DBT k,d;
+    memset(&k, 0, sizeof(k));
+    memset(&d, 0, sizeof(d));
+    // walk every record in cursor order and append a copy of each key to ls
+    while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) {
+      K key;
+      assert(k.size == sizeof(key));
+      memcpy(&key, k.data, k.size);
+      ls.push_back(key);
+    }
+    if (r != DB_NOTFOUND) {
+      dbp->err(dbp, r, "DBcursor->get");
+      assert(r == DB_NOTFOUND);
+    }
+
+    cursor->c_close(cursor);
+    return 0;
+  }
+  
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osd/Fake.h b/branches/sage/cephmds2/osd/Fake.h
new file mode 100644
index 0000000000000..01fa4afcf3cb8
--- /dev/null
+++ b/branches/sage/cephmds2/osd/Fake.h
@@ -0,0 +1,249 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __FAKE_H
+#define __FAKE_H
+
+#include "include/types.h"
+
+#include <list>
+#include <set>
+#include <ext/hash_map>
+using namespace std;
+using namespace __gnu_cxx;
+
+class FakeStoreCollections {
+ private:
+  Mutex faker_lock;
+  ObjectStore *store;
+  hash_map<coll_t, set<object_t> > fakecollections;
+
+ public:
+  FakeStoreCollections(ObjectStore *s) : store(s) {}
+
+  // faked collections
+  int list_collections(list<coll_t>& ls) {
+    faker_lock.Lock();
+    int r = 0;
+    for (hash_map< coll_t, set<object_t> >::iterator p = fakecollections.begin();
+         p != fakecollections.end();
+         p++) {
+      r++;
+      ls.push_back(p->first);
+    }
+    faker_lock.Unlock();
+    return r;
+  }
+
+  int create_collection(coll_t c,
+                        Context *onsafe=0) {
+    faker_lock.Lock();
+    fakecollections[c].size();
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  int destroy_collection(coll_t c,
+                         Context *onsafe=0) {
+    int r = 0;
+    faker_lock.Lock();
+    if (fakecollections.count(c)) {
+      fakecollections.erase(c);
+      //fakecattr.erase(c);
+      if (onsafe) store->sync(onsafe);
+    } else 
+      r = -1;
+    faker_lock.Unlock();
+    return r;
+  }
+
+  int collection_stat(coll_t c, struct stat *st) {
+    return collection_exists(c) ? 0:-1;
+  }
+
+  bool collection_exists(coll_t c) {
+    faker_lock.Lock();
+    int r = fakecollections.count(c);
+    faker_lock.Unlock();
+    return r;
+  }
+
+  int collection_add(coll_t c, object_t o,
+                     Context *onsafe=0) {
+    faker_lock.Lock();
+    fakecollections[c].insert(o);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  // Remove o from collection c (no-op when c doesn't exist); always returns 0.
+  int collection_remove(coll_t c, object_t o, Context *onsafe=0) {
+    faker_lock.Lock();
+    if (fakecollections.count(c)) fakecollections[c].erase(o);  // a bare operator[] would create c
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  // Append the members of collection c to o; returns the number appended (0 when c doesn't exist).
+  int collection_list(coll_t c, list<object_t>& o) {
+    faker_lock.Lock();
+    int r = 0;
+    // look c up with find(): operator[] would create it, making collection_exists(c) true afterwards
+    hash_map<coll_t, set<object_t> >::iterator q = fakecollections.find(c);
+    if (q != fakecollections.end())
+      for (set<object_t>::iterator p = q->second.begin(); p != q->second.end(); ++p, ++r)
+        o.push_back(*p);
+    faker_lock.Unlock();
+    return r;
+  }
+
+};
+
+class FakeStoreAttrs {
+ private:
+  
+  class FakeAttrSet {
+  public:
+    map<string, bufferptr> attrs;
+    
+    int getattr(const char *name, void *value, size_t size) {
+      string n = name;
+      if (attrs.count(n)) {
+        size_t l = MIN( attrs[n].length(), size );
+        bufferlist bl;
+        bl.append(attrs[n]);
+        bl.copy(0, l, (char*)value);
+        return l;
+      }
+      return -1;
+    }
+    int getattrs(map<string,bufferptr>& aset) {
+      aset = attrs;
+      return 0;
+    }
+    int setattrs(map<string,bufferptr>& aset) {
+      attrs = aset;
+      return 0;
+    }
+    
+    int setattr(const char *name, const void *value, size_t size) {
+      string n = name;
+      bufferptr bp = buffer::copy((char*)value, size);
+      attrs[n] = bp;
+      return 0;
+    }
+    
+    int listattr(char *attrs, size_t size) {
+      assert(0);
+      return 0;
+    }
+
+    int rmattr(const char *name) {
+      string n = name;
+      attrs.erase(n);
+      return 0;
+    }
+    
+    bool empty() { return attrs.empty(); }
+  };
+
+  Mutex faker_lock;
+  ObjectStore *store;
+  hash_map<object_t, FakeAttrSet> fakeoattrs;
+  hash_map<coll_t, FakeAttrSet> fakecattrs;
+
+ public:
+  FakeStoreAttrs(ObjectStore *s) : store(s) {}
+
+  int setattr(object_t oid, const char *name,
+              const void *value, size_t size,
+              Context *onsafe=0) {
+    faker_lock.Lock();
+    int r = fakeoattrs[oid].setattr(name, value, size);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return r;
+  }
+  int setattrs(object_t oid, map<string,bufferptr>& aset) {
+    faker_lock.Lock();
+    int r = fakeoattrs[oid].setattrs(aset);
+    faker_lock.Unlock();
+    return r;
+  }
+  // Copy up to size bytes of attr name on oid into value; returns bytes copied, or -1 if absent.
+  // The count() guard keeps a read from inserting an empty attr set for an unknown oid.
+  int getattr(object_t oid, const char *name, void *value, size_t size) {
+    faker_lock.Lock();
+    int r = fakeoattrs.count(oid) ? fakeoattrs[oid].getattr(name, value, size) : -1;
+    faker_lock.Unlock();
+    return r; }
+  int getattrs(object_t oid, map<string,bufferptr>& aset) {
+    faker_lock.Lock();
+    int r = fakeoattrs[oid].getattrs(aset);
+    faker_lock.Unlock();
+    return r;
+  }
+  int rmattr(object_t oid, const char *name,
+             Context *onsafe=0) {
+    faker_lock.Lock();
+    int r = fakeoattrs[oid].rmattr(name);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return r;
+  }
+
+  int listattr(object_t oid, char *attrs, size_t size) {
+    faker_lock.Lock();
+    int r = fakeoattrs[oid].listattr(attrs,size);
+    faker_lock.Unlock();
+    return r;
+  }
+
+  int collection_setattr(coll_t c, const char *name,
+                         void *value, size_t size,
+                         Context *onsafe=0) {
+    faker_lock.Lock();
+    int r = fakecattrs[c].setattr(name, value, size);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return r;
+  }
+  int collection_rmattr(coll_t c, const char *name,
+                        Context *onsafe=0) {
+    faker_lock.Lock();
+    int r = fakecattrs[c].rmattr(name);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return r;
+  }
+  int collection_getattr(coll_t c, const char *name,
+                         void *value, size_t size) {
+    faker_lock.Lock();
+    int r = fakecattrs[c].getattr(name, value, size);
+    faker_lock.Unlock();
+    return r;
+  }
+  int collection_listattr(coll_t c, char *attrs, size_t size) {
+    faker_lock.Lock();
+    int r = fakecattrs[c].listattr(attrs,size);
+    faker_lock.Unlock();
+    return r;
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osd/FakeStore.cc b/branches/sage/cephmds2/osd/FakeStore.cc
new file mode 100644
index 0000000000000..c2f573a81038f
--- /dev/null
+++ b/branches/sage/cephmds2/osd/FakeStore.cc
@@ -0,0 +1,364 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "FakeStore.h"
+#include "include/types.h"
+
+#include "common/Timer.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <dirent.h>
+//#include <sys/xattr.h>
+//#include <sys/vfs.h>
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug) cout << "osd" << whoami << ".fakestore "
+
+#include "include/buffer.h"
+
+#include <map>
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+// crap-a-crap hash
+#define HASH_DIRS       0x80
+#define HASH_MASK       0x7f
+// end crap hash
+
+
+
+
+
+
+
+// Optionally run "mount <g_conf.fakestore_dev>", then check that basedir can be stat()ed.
+int FakeStore::mount()
+{
+  if (g_conf.fakestore_dev) {
+    dout(0) << "mounting" << endl;
+    char cmd[100];
+    snprintf(cmd, sizeof(cmd), "mount %s", g_conf.fakestore_dev);  // bounded: a long device path must not overrun cmd
+    system(cmd);
+  }
+
+  string mydir;
+  get_dir(mydir);
+  dout(5) << "init with basedir " << mydir << endl;
+
+  // make sure global base dir exists
+  struct stat st;
+  int r = ::stat(basedir.c_str(), &st);
+  if (r != 0) {
+    dout(1) << "unable to stat basedir " << basedir << ", r = " << r << endl;
+    return r;
+  }
+
+  // all okay.
+  return 0;
+}
+
+// Optionally run "umount <g_conf.fakestore_dev>"; always returns 0.
+int FakeStore::umount()
+{
+  dout(5) << "finalize" << endl;
+  if (g_conf.fakestore_dev) {
+    char cmd[100];
+    dout(0) << "umounting" << endl;
+    snprintf(cmd, sizeof(cmd), "umount %s", g_conf.fakestore_dev);  // bounded: a long device path must not overrun cmd
+    system(cmd);
+  }
+
+  // nothing
+  return 0;
+}
+
+
+// Run ::statfs() on this OSD's data directory (see get_dir) and fill *buf.
+int FakeStore::statfs(struct statfs *buf) {
+  string osd_dir;
+  get_dir(osd_dir);
+  return ::statfs(osd_dir.c_str(), buf);
+}
+
+
+
+
+void FakeStore::get_dir(string& dir) {   // dir = "<basedir>/<whoami>"
+  char id[30];
+  sprintf(id, "%d", whoami);
+  dir = basedir + "/" + id;
+}
+// fn = "<basedir>/<whoami>/<hash bucket as 2 hex digits>/<ino>.<bno>.<rev>" for object oid.
+void FakeStore::get_oname(object_t oid, string& fn) {
+  char s[100];
+  static hash<object_t> H;
+  // %02x expects an unsigned int; cast in case H() yields a wider type (e.g. size_t), which is undefined through varargs
+  snprintf(s, sizeof(s), "%d/%02x/%016llx.%08x.%d", whoami, (unsigned)(H(oid) & HASH_MASK), oid.ino, oid.bno, oid.rev);
+  fn = basedir + "/" + s;
+}
+
+
+
+// Unlink every entry directly inside mydir whose name doesn't start with '.'.
+void FakeStore::wipe_dir(string mydir)
+{
+  DIR *dh = ::opendir(mydir.c_str());
+  if (!dh) {
+    dout(1) << "mkfs couldn't read dir " << mydir << endl;
+    return;
+  }
+
+  dout(10) << "wiping " << mydir << endl;
+  for (struct dirent *de = ::readdir(dh); de != 0; de = ::readdir(dh)) {
+    if (de->d_name[0] == '.') continue;   // skips ".", ".." and any other dot entry
+    dout(25) << "mkfs unlinking " << de->d_name << endl;
+    string path = mydir + "/" + de->d_name;
+    ::unlink(path.c_str());
+  }
+
+  ::closedir(dh);
+}
+
+// Create (or wipe) this OSD's data directory and its HASH_DIRS hashed subdirectories.
+// When g_conf.fakestore_dev is set the device is mounted first and umounted afterwards.
+int FakeStore::mkfs()
+{
+  if (g_conf.fakestore_dev) {
+    dout(0) << "mounting" << endl;
+    char cmd[100];
+    snprintf(cmd, sizeof(cmd), "mount %s", g_conf.fakestore_dev);  // bounded: a long device path must not overrun cmd
+    system(cmd);
+  }
+
+  int r = 0;
+  struct stat st;
+  string mydir;
+  get_dir(mydir);
+
+  dout(1) << "mkfs in " << mydir << endl;
+
+  // make sure my dir exists
+  r = ::stat(mydir.c_str(), &st);
+  if (r != 0) {
+    dout(10) << "creating " << mydir << endl;
+    mkdir(mydir.c_str(), 0755);
+    r = ::stat(mydir.c_str(), &st);
+    if (r != 0) {
+      dout(1) << "couldnt create dir, r = " << r << endl;
+      return r;
+    }
+  }
+  else wipe_dir(mydir);
+
+  // hashed bits too
+  for (int i=0; i<HASH_DIRS; i++) {
+    char s[4];
+    sprintf(s, "%02x", i);
+    string subdir = mydir + "/" + s;
+    r = ::stat(subdir.c_str(), &st);
+    if (r != 0) {
+      dout(2) << " creating " << subdir << endl;
+      ::mkdir(subdir.c_str(), 0755);
+      r = ::stat(subdir.c_str(), &st);
+      if (r != 0) {
+        dout(1) << "couldnt create subdir, r = " << r << endl;
+        return r;
+      }
+    }
+    else
+      wipe_dir( subdir );
+  }
+  
+  if (g_conf.fakestore_dev) {
+    char cmd[100];
+    dout(0) << "umounting" << endl;
+    snprintf(cmd, sizeof(cmd), "umount %s", g_conf.fakestore_dev);  // bounded, as above
+    system(cmd);
+  }
+
+  dout(1) << "mkfs done in " << mydir << endl;
+
+  return r;
+}
+
+
+
+// An object exists iff FakeStore::stat() succeeds on it, i.e. its
+// backing file can be stat()ed.
+bool FakeStore::exists(object_t oid)
+{
+  struct stat unused;
+  const bool found = (stat(oid, &unused) == 0);
+  return found;
+}
+
+  
+// stat() the file backing oid into *st; returns ::stat()'s result.
+int FakeStore::stat(object_t oid, struct stat *st)
+{
+  dout(20) << "stat " << oid << endl;
+
+  string path;
+  get_oname(oid, path);
+  return ::stat(path.c_str(), st);
+}
+ 
+ 
+
+// Unlink the file backing oid, then pass onsafe (if any) to sync(); returns ::unlink()'s result.
+int FakeStore::remove(object_t oid, Context *onsafe) {
+  dout(20) << "remove " << oid << endl;
+
+  string path;
+  get_oname(oid, path);
+  const int rc = ::unlink(path.c_str());
+  if (onsafe != 0) sync(onsafe);
+  return rc; }
+
+// Truncate the file backing oid to size bytes, then pass onsafe (if any) to sync().
+int FakeStore::truncate(object_t oid, off_t size, Context *onsafe)
+{
+  dout(20) << "truncate " << oid << " size " << size << endl;
+  string path;
+  get_oname(oid, path);
+  const int rc = ::truncate(path.c_str(), size);
+  if (onsafe != 0) sync(onsafe);
+  return rc;
+}
+
+// Read up to len bytes at offset from oid's file and append them to bl (len == 0 is replaced by the file's size).
+// Returns bytes read (0 if the seek didn't land on offset), or a negative value if open()/read() failed.
+int FakeStore::read(object_t oid, 
+                    off_t offset, size_t len,
+                    bufferlist& bl) {
+  dout(20) << "read " << oid << " len " << len << " off " << offset << endl;
+  string fn;
+  get_oname(oid,fn);
+  int fd = ::open(fn.c_str(), O_RDONLY);
+  if (fd < 0) {
+    dout(10) << "read couldn't open " << fn.c_str() << " errno " << errno << " " << strerror(errno) << endl;
+    return fd;
+  }
+  ::flock(fd, LOCK_EX);    // lock for safety
+  
+  off_t actual = lseek(fd, offset, SEEK_SET);
+  ssize_t got = 0;   // signed: ::read() returns -1 on error, which a size_t would turn into a huge length
+  if (len == 0) {
+    struct stat st;
+    fstat(fd, &st);
+    len = st.st_size;
+  }
+
+  if (actual == offset) {
+    bufferptr bptr(len);  // prealloc space for entire read
+    got = ::read(fd, bptr.c_str(), len);
+    if (got > 0) {            // on error or EOF nothing is appended to bl
+      bptr.set_length(got);   // properly size the buffer
+      bl.push_back( bptr );   // put it in the target bufferlist
+    }
+  }
+  ::flock(fd, LOCK_UN);
+  ::close(fd);
+  return got; }
+
+
+// Write the buffers of bl to oid's file starting at offset, creating the file if needed,
+// then pass onsafe (if any) to sync().  Returns the number of bytes written, or a
+// negative value if the file couldn't be opened or the very first write failed.
+int FakeStore::write(object_t oid, 
+                     off_t offset, size_t len,
+                     bufferlist& bl, 
+                     Context *onsafe)
+{
+  dout(20) << "write " << oid << " len " << len << " off " << offset << endl;
+
+  string fn;
+  get_oname(oid,fn);
+  
+  ::mknod(fn.c_str(), 0644, 0);  // in case it doesn't exist yet.
+
+  int flags = O_WRONLY;//|O_CREAT;
+  int fd = ::open(fn.c_str(), flags);
+  if (fd < 0) {
+    dout(1) << "write couldn't open " << fn.c_str() << " flags " << flags << " errno " << errno << " " << strerror(errno) << endl;
+    return fd;
+  }
+  ::flock(fd, LOCK_EX);    // lock for safety
+  //::fchmod(fd, 0664);
+  
+  // seek
+  off_t actual = lseek(fd, offset, SEEK_SET);
+  int did = 0;
+  assert(actual == offset);
+
+  // write buffers
+  for (list<bufferptr>::const_iterator it = bl.buffers().begin();
+       it != bl.buffers().end();
+       it++) {
+    int r = ::write(fd, (char*)(*it).c_str(), (*it).length());
+    if (r < 0) {   // r == 0 is not an error: it is what a zero-length buffer yields
+      dout(1) << "couldn't write to " << fn.c_str() << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl;
+      if (did == 0) did = r;   // nothing written at all: report the failure, as the open() path does
+      break;                   // don't write the remaining buffers at the wrong file position
+    }
+    did += r;
+  }
+
+  ::flock(fd, LOCK_UN);
+
+  // schedule sync
+  if (onsafe) sync(onsafe);
+
+  ::close(fd);
+  
+  return did;
+}
+
+
+class C_FakeSync : public Context {   // wraps another Context and counts it in *n while it is pending
+public:
+  Context *c;   // the wrapped completion
+  int *n;       // pending-sync counter (FakeStore::sync passes &unsync)
+  C_FakeSync(Context *c_, int *n_) : c(c_), n(n_) {
+    ++*n;       // one more fake sync outstanding
+  }
+  void finish(int r) {
+    c->finish(r);   // NOTE(review): c is finished but never deleted here -- confirm who owns the wrapped Context
+    --(*n);         // this fake sync is done
+    //cout << "sync, " << *n << " still unsync" << endl;
+  }
+};
+
+// Fake a sync: hand onsafe, wrapped in a C_FakeSync, to g_timer with a delay of g_conf.fakestore_fake_sync.
+void FakeStore::sync(Context *onsafe)
+{
+  if (!g_conf.fakestore_fake_sync) {
+    assert(0); // a real sync is not implemented anymore
+    return;
+  }
+  Context *wrapped = new C_FakeSync(onsafe, &unsync);
+  g_timer.add_event_after((float)g_conf.fakestore_fake_sync, wrapped);
+}
+
+
+
diff --git a/branches/sage/cephmds2/osd/FakeStore.h b/branches/sage/cephmds2/osd/FakeStore.h
new file mode 100644
index 0000000000000..eaa4126e84e46
--- /dev/null
+++ b/branches/sage/cephmds2/osd/FakeStore.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __FAKESTORE_H
+#define __FAKESTORE_H
+
+#include "ObjectStore.h"
+#include "common/ThreadPool.h"
+#include "common/Mutex.h"
+
+#include "Fake.h"
+//#include "FakeStoreBDBCollections.h"
+
+
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+// fake attributes in memory, if we need to.
+
+
+class FakeStore : public ObjectStore, 
+                  public FakeStoreAttrs,
+                  public FakeStoreCollections {
+  string basedir;
+  int whoami;
+  
+  int unsync;
+
+  Mutex lock;
+
+  // fns
+  void get_dir(string& dir);
+  void get_oname(object_t oid, string& fn);
+  void wipe_dir(string mydir);
+
+
+ public:
+  FakeStore(char *base, int whoami) : FakeStoreAttrs(this), FakeStoreCollections(this)
+  {
+    this->basedir = base;
+    this->whoami = whoami;
+    unsync = 0;
+  }
+
+
+  int mount();
+  int umount();
+  int mkfs();
+
+  int statfs(struct statfs *buf);
+
+  // ------------------
+  // objects
+  int pick_object_revision_lt(object_t& oid) {
+    return 0;
+  }
+  bool exists(object_t oid);
+  int stat(object_t oid, struct stat *st);
+  int remove(object_t oid, Context *onsafe);
+  int truncate(object_t oid, off_t size, Context *onsafe);
+  int read(object_t oid, 
+           off_t offset, size_t len,
+           bufferlist& bl);
+  int write(object_t oid, 
+            off_t offset, size_t len,
+            bufferlist& bl, 
+            Context *onsafe);
+
+  void sync(Context *onsafe);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osd/FakeStoreBDBCollections.h b/branches/sage/cephmds2/osd/FakeStoreBDBCollections.h
new file mode 100644
index 0000000000000..97316d2642674
--- /dev/null
+++ b/branches/sage/cephmds2/osd/FakeStoreBDBCollections.h
@@ -0,0 +1,168 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __FAKESTOREBDBCOLLECTIONS_H
+#define __FAKESTOREBDBCOLLECTIONS_H
+
+#include "BDBMap.h"
+#include "ObjectStore.h"
+#include "common/Mutex.h"
+
+#define BDBHASH_DIRS       128LL
+#define BDBHASH_FUNC(x)    (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS)
+
+class FakeStoreBDBCollections {
+ private:
+  int whoami;
+  string basedir;
+
+  Mutex bdblock;
+
+  // collection dbs
+  BDBMap<coll_t, int>                 collections;
+  map<coll_t, BDBMap<object_t, int>*> collection_map;
+  
+  // dirs
+  void get_dir(string& dir) {
+    char s[30];
+    sprintf(s, "%d", whoami);
+    dir = basedir + "/" + s;
+  }
+  void get_collfn(coll_t c, string &fn) {
+    char s[100];
+    sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c);
+    fn = basedir + "/" + s;
+  }
+
+  void open_collections() {
+    string cfn;
+    get_dir(cfn);
+    cfn += "/collections";
+    collections.open(cfn.c_str());  
+    list<coll_t> ls;
+    collections.list_keys(ls);
+  }
+  // Close the master collections db, then close and free every cached per-collection db.
+  void close_collections() {
+    if (collections.is_open())
+      collections.close();
+    for (map<coll_t, BDBMap<object_t, int>*>::iterator it = collection_map.begin();
+         it != collection_map.end(); ++it) {
+      it->second->close();
+      delete it->second;   // the maps are new'd in open_collection(); clear() alone would leak them
+    }
+    collection_map.clear();
+  }
+  
+  // Open the per-collection db for c and cache it in collection_map; returns open()'s status.
+  int open_collection(coll_t c) {
+    if (collection_map.count(c))
+      return 0;  // already open.
+    string fn;
+    get_collfn(c,fn);
+    BDBMap<object_t,int> *m = new BDBMap<object_t,int>;  // must match collection_map's value type
+    int r = m->open(fn.c_str());
+    if (r == 0) collection_map[c] = m;
+    else delete m;  // failed: don't leak the map or cache an unusable entry
+    return r;
+  }
+  
+ public:
+  FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {}
+  ~FakeStoreBDBCollections() {
+    close_collections();
+  }
+
+  int list_collections(list<coll_t>& ls) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+    
+    ls.clear();
+    collections.list_keys(ls);
+    bdblock.Unlock();
+    return 0;
+  }
+  int create_collection(coll_t c) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+    
+    collections.put(c, 1);
+    open_collection(c);
+    bdblock.Unlock();
+    return 0;
+  }
+  int destroy_collection(coll_t c) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+    
+    collections.del(c);
+    
+    open_collection(c);
+    collection_map[c]->close();
+    
+    string fn;
+    get_collfn(c,fn);
+    collection_map[c]->remove(fn.c_str());
+    delete collection_map[c];
+    collection_map.erase(c);
+    bdblock.Unlock();
+    return 0;
+  }
+  int collection_stat(coll_t c, struct stat *st) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+    
+    string fn;
+    get_collfn(c,fn);
+    int r = ::stat(fn.c_str(), st);
+    bdblock.Unlock();
+    return r;
+  }
+  // True if the collection's backing file can be stat()ed.
+  bool collection_exists(coll_t c) {
+    // collection_stat() takes bdblock itself; taking it here as well would
+    // self-deadlock unless Mutex is recursive.
+    struct stat st;
+    return collection_stat(c, &st) == 0;
+  }
+  int collection_add(coll_t c, object_t o) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+    
+    open_collection(c);
+    collection_map[c]->put(o,1);
+    bdblock.Unlock();
+    return 0;
+  }
+  int collection_remove(coll_t c, object_t o) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+    
+    open_collection(c);
+    collection_map[c]->del(o);
+    bdblock.Unlock();
+    return 0;
+  }
+  int collection_list(coll_t c, list<object_t>& o) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+    
+    open_collection(c);
+    collection_map[c]->list_keys(o);
+    bdblock.Unlock();
+    return 0;
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osd/OBFSStore.cc b/branches/sage/cephmds2/osd/OBFSStore.cc
new file mode 100644
index 0000000000000..e82c6f804721d
--- /dev/null
+++ b/branches/sage/cephmds2/osd/OBFSStore.cc
@@ -0,0 +1,244 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "OBFSStore.h"
+
+extern "C" {
+#include "../../uofs/uofs.h"
+}
+
+#include "common/Timer.h"
+
+#include "include/types.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <dirent.h>
+
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug) cout << "osd" << whoami << ".obfs "
+
+// Record identity and copy the optional param-file / device paths; nothing is opened here.
+OBFSStore::OBFSStore(int whoami, char *param, char *dev)
+  : FakeStoreAttrs(this), FakeStoreCollections(this)   // both bases only declare an (ObjectStore*) constructor
+{
+    this->whoami = whoami;
+    this->mounted = -1;
+    this->bdev_id = -1;
+    memset(this->param, 0, sizeof(this->param));
+    memset(this->dev, 0, sizeof(this->dev));
+    if (dev) strncpy(this->dev, dev, sizeof(this->dev) - 1);   // size-1: the zeroed last byte always terminates
+    if (param) strncpy(this->param, param, sizeof(this->param) - 1);
+}
+
+// Open the block device, format it and mount the OBFS on it; returns 0 on success, -1 on failure.
+int OBFSStore::mount(void)
+{
+    dout(0) << "OBFS init!" << endl;
+    if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) {
+        dout(0) << "device open FAILED on " << this->dev << ", errno " << errno << endl;
+        return -1;
+    }
+    this->mkfs();   // NOTE(review): this formats the device on every mount, before the first mount attempt -- confirm that is intended
+    this->mounted = uofs_mount(this->bdev_id, 
+                               g_conf.uofs_cache_size,
+                               g_conf.uofs_min_flush_pages,
+                               this->whoami);
+    switch (this->mounted) {
+        case -1:
+            this->mkfs();
+            //retry to mount
+            dout(0) << "remount the OBFS" << endl;
+            this->mounted = uofs_mount(this->bdev_id, 
+                                       g_conf.uofs_cache_size,
+                                       g_conf.uofs_min_flush_pages,
+                                       this->whoami);
+            assert(this->mounted >= 0);
+            break;
+        case -2: 
+            //fsck
+            dout(0) << "Need fsck! Simply formatted for now!" << endl;
+            this->mkfs();
+            this->mounted = uofs_mount(this->bdev_id, 
+                                       g_conf.uofs_cache_size,
+                                       g_conf.uofs_min_flush_pages,
+                                       this->whoami);
+            assert(this->mounted >= 0);
+            break;
+        case 0:
+            //success
+            break;
+        default:
+            break;
+    }
+
+    if (this->mounted >= 0) {   // braces matter: dout() expands to an if, which would otherwise capture the else
+        dout(0) << "successfully mounted!" << endl;
+    } else {
+        dout(0) << "error in mounting obfsstore!" << endl;
+    }
+    return (this->mounted >= 0) ? 0 : -1;   // report a failed mount instead of always claiming success
+}
+
+int OBFSStore::mkfs(void)
+{
+  /*int    donode_size_byte     = 1024,
+        bd_ratio                = 10,
+        reg_size_mb             = 256,
+        sb_size_kb              = 4,
+        lb_size_kb              = 1024,
+      nr_hash_table_buckets   = 1023,
+      delay_allocation        = 1,
+      flush_interval        = 5;
+    FILE    *param;
+  */
+    
+    
+    if (this->mounted >= 0)
+      return 0;
+
+    dout(0) << "OBFS.mkfs!" << endl;
+    /*
+    if (strlen(this->param) > 0) {
+        param = fopen(this->param, "r");
+        if (param) {
+            //fscanf(param, "Block Device: %s\n", this->dev);
+            fscanf(param, "Donode Size: %d\n", &donode_size_byte);
+            fscanf(param, "Block vs Donode Ratio: %d\n", &bd_ratio);
+            fscanf(param, "Region Size: %d MB\n", &reg_size_mb);
+            fscanf(param, "Small Block Size: %d KB\n", &sb_size_kb);
+            fscanf(param, "Large Block Size: %d KB\n", &lb_size_kb);
+            fscanf(param, "Hash Table Buckets: %d\n", &nr_hash_table_buckets);
+            fscanf(param, "Delayed Allocation: %d\n", &delay_allocation);
+        } else {
+            dout(0) << "read open FAILED on "<< this->param <<", errno " << errno << endl;
+            dout(0) << "use default parameters" << endl;
+        }
+    } else
+        dout(0) << "use default parameters" << endl;
+    */
+
+    if (this->bdev_id <= 0)
+        if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) {
+            dout(0) << "device open FAILED on "<< this->dev <<", errno " << errno << endl;
+            return -1;
+        }
+    
+    dout(0) << "start formating!" << endl;
+
+    uofs_format(this->bdev_id,
+                g_conf.uofs_onode_size, 
+                g_conf.uofs_block_meta_ratio, 
+                g_conf.uofs_segment_size,
+                g_conf.uofs_small_block_size,
+                g_conf.uofs_large_block_size,
+                g_conf.uofs_nr_hash_buckets,
+                g_conf.uofs_delay_allocation, 
+                0,//g_conf.uofs_dev_force_size,
+                g_conf.uofs_flush_interval, 
+                0);
+
+    dout(0) << "formatting complete!" << endl;
+    return 0;
+}
+
+int OBFSStore::umount(void)
+{
+    uofs_shutdown();
+    close(this->bdev_id);
+
+    return 0;
+}
+
+int OBFSStore::statfs(struct statfs *sfs) 
+{
+  return 0;
+}
+
+bool OBFSStore::exists(object_t oid)
+{
+    //dout(0) << "calling function exists!" << endl;
+    return uofs_exist(oid);
+}
+
+int OBFSStore::stat(object_t oid, struct stat *st)
+{
+    dout(0) << "calling function stat!" << endl;
+    if (uofs_exist(oid)) return 0;
+    return -1;
+}
+
+int OBFSStore::remove(object_t oid)
+{
+    dout(0) << "calling remove function!" << endl;
+    return uofs_del(oid);
+}
+
+int OBFSStore::truncate(object_t oid, off_t size)
+{
+    dout(0) << "calling truncate function!" << endl;
+    //return uofs_truncate(oid, size);
+    return -1;
+}
+
+int OBFSStore::read(object_t oid, size_t len, 
+            off_t offset, bufferlist &bl)
+{
+    //dout(0) << "calling read function!" << endl;
+    //dout(0) << oid << " 0  " << len << " " << offset << " 100" << endl;
+
+  // FIXME: page-align this and we can avoid a memcpy...
+  bl.push_back(new buffer(len));
+  return uofs_read(oid, bl.c_str(), offset, len);
+}
+
+// Write the first len bytes of bl to oid starting at offset, one uofs_write() per buffer segment.
+// If fsync is set, uofs_sync() follows.  Returns the sum of the uofs_* return values.
+int OBFSStore::write(object_t oid, size_t len,
+                     off_t offset, bufferlist& bl, bool fsync)
+{
+    int ret = 0;
+    size_t left = len;   // bytes of the request not yet handed to uofs_write()
+    for (list<bufferptr>::iterator p = bl.buffers().begin();
+         p != bl.buffers().end() && left > 0;
+         p++) {
+      // each segment goes at its own offset with its own length; reusing the request's
+      // offset/len for every segment would overlap the writes and read past short segments
+      size_t n = (*p).length() < left ? (*p).length() : left;
+      ret += uofs_write(oid, (*p).c_str(), offset, n, 0);
+      offset += n;
+      left -= n;
+    }
+    if (fsync)
+        ret += uofs_sync(oid);
+    return ret; }
+
+
+// Write without syncing, then hand onflush to g_timer with a delay of g_conf.uofs_fake_sync.
+int OBFSStore::write(object_t oid, size_t len, off_t offset, bufferlist& bl, Context *onflush)
+{
+  int written = write(oid, len, offset, bl, false);
+  g_timer.add_event_after((float)g_conf.uofs_fake_sync, onflush);
+  return written;
+}
diff --git a/branches/sage/cephmds2/osd/OBFSStore.h b/branches/sage/cephmds2/osd/OBFSStore.h
new file mode 100644
index 0000000000000..cb4a6afc815d7
--- /dev/null
+++ b/branches/sage/cephmds2/osd/OBFSStore.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef _OBFSSTORE_H_
+#define _OBFSSTORE_H_
+
+#include "ObjectStore.h"
+#include "Fake.h"
+
+class OBFSStore : public ObjectStore, 
+                  public FakeStoreAttrs, 
+                  public FakeStoreCollections {
+  int    whoami;
+  int    bdev_id;
+  int    mounted;
+  char    dev[128];
+  char    param[128];
+  
+ public:
+  OBFSStore(int whoami, char *param, char *dev);
+  
+  int mount(void);
+  int umount(void);
+  int mkfs(void);
+  
+  int statfs(struct statfs *);
+
+  bool exists(object_t oid);
+  int stat(object_t oid, struct stat *st);
+  
+  int remove(object_t oid);
+  int truncate(object_t oid, off_t size);
+  
+  int read(object_t oid, size_t len, 
+           off_t offset, bufferlist& bl);
+  int write(object_t oid, size_t len, 
+            off_t offset, bufferlist& bl,
+            bool fsync);
+  int write(object_t oid, size_t len, 
+            off_t offset, bufferlist& bl,
+            Context *onflush);
+  
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osd/OSD.cc b/branches/sage/cephmds2/osd/OSD.cc
new file mode 100644
index 0000000000000..67e84746229b0
--- /dev/null
+++ b/branches/sage/cephmds2/osd/OSD.cc
@@ -0,0 +1,3498 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "include/types.h"
+
+#include "OSD.h"
+#include "OSDMap.h"
+
+#ifdef USE_OBFS
+# include "OBFSStore.h"
+#else
+# include "FakeStore.h"
+#endif
+
+#include "ebofs/Ebofs.h"
+
+#include "Ager.h"
+
+
+#include "msg/Messenger.h"
+#include "msg/Message.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+
+#include "common/Logger.h"
+#include "common/LogType.h"
+#include "common/Timer.h"
+#include "common/ThreadPool.h"
+
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <sys/stat.h>
+
+
+#include "config.h"
+#undef dout
+#define  dout(l)    if ((l)<=g_conf.debug || (l)<=g_conf.debug_osd) cout << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "   // level-gated debug stream; expands to a bare `if`, so brace any if/else wrapped around a dout
+#define  derr(l)    if ((l)<=g_conf.debug || (l)<=g_conf.debug_osd) cerr << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "   // same gate as dout, but writes to cerr
+
+char *osd_base_path = "./osddata";   // handed to the FakeStore constructor in OSD::OSD
+char *ebofs_base_path = "./dev";     // prefix of the candidate device paths probed in OSD::OSD
+// NOTE(review): both point at string literals through non-const char*; writing through them is undefined behaviour.
+
+object_t SUPERBLOCK_OBJECT(0,0);     // object that write_superblock() writes and read_superblock() reads
+
+
+// <hack> force remount hack for performance testing FakeStore
+class C_Remount : public Context {
+  OSD *target;   // OSD whose store is remounted when this context finishes
+public:
+  C_Remount(OSD *o) : target(o) {}
+  void finish(int) {   // the int argument is ignored
+    target->force_remount();
+  }
+};
+
+void OSD::force_remount()   // umount then mount the store with osd_lock held (perf-testing hack)
+{
+  dout(0) << "forcing remount" << endl;
+  osd_lock.Lock();
+  int r = store->umount();
+  if (r < 0) { derr(0) << "force_remount: umount failed, r = " << r << endl; }
+  r = store->mount();
+  assert(r>=0);   // same requirement init() places on mount(); nothing works on an unmounted store
+  osd_lock.Unlock();
+  dout(0) << "finished remount" << endl;
+}
+// </hack>
+
+
+// cons/des
+
+LogType osd_logtype;
+
+OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev)   // dev: explicit store path, or null to probe under ebofs_base_path
+{
+  whoami = id;
+  messenger = m;
+  monmap = mm;
+
+  osdmap = 0;
+  boot_epoch = 0;
+
+  last_tid = 0;
+  num_pulling = 0;
+
+  state = STATE_BOOTING;
+
+  hb_stat_ops = 0;
+  hb_stat_qlen = 0;
+
+  pending_ops = 0;
+  waiting_for_no_ops = false;
+
+  // these are only created by init(); zero them so ~OSD(), shutdown() and
+  // heartbeat() never test or delete an indeterminate pointer.
+  threadpool = 0;  logger = 0;  next_heartbeat = 0;
+  if (g_conf.osd_remount_at) 
+    g_timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this));
+
+  // init object store
+  // try in this order:
+  // dev/osd$num
+  // dev/osd.$hostname
+  // dev/osd.all
+
+  if (dev) {
+    strcpy(dev_path,dev);   // NOTE(review): unbounded copy; dev_path's size is declared outside this file -- confirm it fits
+  } else {
+    char hostname[100] = {0};
+    gethostname(hostname, sizeof(hostname) - 1);   // last byte stays 0 even if the name is truncated
+    
+    sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami);
+    
+    struct stat sta;
+    if (::lstat(dev_path, &sta) != 0)   // per-id path missing: fall back to the per-host path
+      sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname);    
+    
+    if (::lstat(dev_path, &sta) != 0)   // still missing: fall back to the shared path (not checked further)
+      sprintf(dev_path, "%s/osd.all", ebofs_base_path);
+  }
+
+  if (g_conf.ebofs) {
+    store = new Ebofs(dev_path);
+    //store->_fake_writes(true);
+  }
+#ifdef USE_OBFS
+  else if (g_conf.uofs) {
+    store = new OBFSStore(whoami, NULL, dev_path);
+  }
+#endif
+  else {
+    store = new FakeStore(osd_base_path, whoami); 
+  }
+
+}
+
+OSD::~OSD()
+{
+  delete threadpool;  threadpool = 0;   // delete on a null pointer is a no-op, so no guard is needed
+  delete osdmap;      osdmap = 0;
+  //if (monitor) { delete monitor; monitor = 0; }
+  delete messenger;   messenger = 0;
+  delete logger;      logger = 0;
+  delete store;       store = 0;
+}
+
+int OSD::init()   // mkfs (optional) and mount the store, then start logging, worker threads, the monitor boot message and the heartbeat; returns 0
+{
+  osd_lock.Lock();
+  {
+    // mkfs?
+    if (g_conf.osd_mkfs) {
+      dout(2) << "mkfs" << endl;
+      store->mkfs();
+
+      // make up a superblock
+      //superblock.fsid = ???;
+      superblock.whoami = whoami;
+    }
+    
+    // mount.
+    dout(2) << "mounting " << dev_path << endl;
+    int r = store->mount();
+    assert(r>=0);
+
+    if (g_conf.osd_mkfs) {
+      // age?
+      if (g_conf.osd_age_time != 0) {
+        dout(2) << "age" << endl;
+        Ager ager(store);
+        if (g_conf.osd_age_time < 0) 
+          ager.load_freelist();
+        else 
+          ager.age(g_conf.osd_age_time, 
+                   g_conf.osd_age, 
+                   g_conf.osd_age - .05, 
+                   50000, 
+                   g_conf.osd_age - .05);
+      }
+    }
+    else {
+      dout(2) << "boot" << endl;
+      // read superblock.  load_pgs() and the whoami check below depend on it,
+      // so insist on success the same way mount() is checked above.
+      r = read_superblock();
+      assert(r>=0);
+      // load up pgs (as they previously existed)
+      load_pgs();
+
+      dout(2) << "superblock: i am osd" << superblock.whoami << endl;
+      assert(whoami == superblock.whoami);
+    }
+
+    
+    // log
+    char name[80];
+    sprintf(name, "osd%02d", whoami);
+    logger = new Logger(name, (LogType*)&osd_logtype);
+    osd_logtype.add_set("opq");
+    osd_logtype.add_inc("op");
+    osd_logtype.add_inc("c_rd");
+    osd_logtype.add_inc("c_rdb");
+    osd_logtype.add_inc("c_wr");
+    osd_logtype.add_inc("c_wrb");
+    
+    osd_logtype.add_inc("r_push");
+    osd_logtype.add_inc("r_pushb");
+    osd_logtype.add_inc("r_wr");
+    osd_logtype.add_inc("r_wrb");
+    
+    osd_logtype.add_inc("rlnum");
+
+    osd_logtype.add_set("numpg");
+    osd_logtype.add_set("pingset");
+
+    osd_logtype.add_set("buf");
+
+    osd_logtype.add_inc("map");
+    osd_logtype.add_inc("mapi");
+    osd_logtype.add_inc("mapidup");
+    osd_logtype.add_inc("mapf");
+    osd_logtype.add_inc("mapfdup");
+    
+    // request thread pool (its own name buffer, so the logger name above is not shadowed)
+    {
+      char tpname[80];
+      sprintf(tpname,"osd%d.threadpool", whoami);
+      threadpool = new ThreadPool<OSD*, pg_t>(tpname, g_conf.osd_maxthreads, 
+                                              static_dequeueop,
+                                              this);
+    }
+    
+    // i'm ready!
+    messenger->set_dispatcher(this);
+    
+    // announce to monitor i exist and have booted.
+    int mon = monmap->pick_mon();
+    messenger->send_message(new MOSDBoot(superblock), MSG_ADDR_MON(mon), monmap->get_inst(mon));
+    
+    // start the heart
+    next_heartbeat = new C_Heartbeat(this);
+    g_timer.add_event_after(g_conf.osd_heartbeat_interval, next_heartbeat);
+  }
+  osd_lock.Unlock();
+
+  //dout(0) << "osd_rep " << g_conf.osd_rep << endl;
+
+  return 0;
+}
+
+int OSD::shutdown()
+{
+  dout(1) << "shutdown, timer has " << g_timer.num_event << endl;
+
+  if (next_heartbeat)
+    g_timer.cancel_event(next_heartbeat);
+  state = STATE_STOPPING;
+
+  // finish ops
+  wait_for_no_ops();
+
+  // stop threads
+  delete threadpool;
+  threadpool = 0;
+
+  // close pgs
+  hash_map<pg_t, PG*>::iterator pgi = pg_map.begin();
+  while (pgi != pg_map.end()) {
+    delete pgi->second;
+    ++pgi;
+  }
+  pg_map.clear();
+
+  // shut everything else down
+  //monitor->shutdown();
+  messenger->shutdown();
+  // umount with osd_lock released, then take it back before returning the umount result
+  osd_lock.Unlock();
+  const int rc = store->umount();
+  osd_lock.Lock();
+  return rc;
+}
+
+
+
+void OSD::write_superblock(ObjectStore::Transaction& t)
+{
+  // queue (not apply) a raw-byte image of the in-memory superblock on t
+  const size_t sb_bytes = sizeof(superblock);
+  dout(10) << "write_superblock " << superblock << endl;
+  bufferlist payload;
+  payload.append((char*)&superblock, sb_bytes);
+  t.write(SUPERBLOCK_OBJECT, 0, sb_bytes, payload);
+}
+
+int OSD::read_superblock()
+{
+  // Load the superblock image from the store, then decode the map of its current_epoch; -1 on a short read, else 0.
+  const size_t want = sizeof(superblock);
+  bufferlist raw;
+  int r = store->read(SUPERBLOCK_OBJECT, 0, want, raw);
+  if (raw.length() != want) {
+    dout(10) << "read_superblock failed, r = " << r << ", i got " << raw.length() << " bytes, not " << want << endl;
+    return -1;
+  }
+  raw.copy(0, want, (char*)&superblock);
+  dout(10) << "read_superblock " << superblock << endl;
+
+  // load up "current" osdmap
+  assert(!osdmap);
+  osdmap = new OSDMap;
+  bufferlist mapbl;
+  get_map_bl(superblock.current_epoch, mapbl);
+  osdmap->decode(mapbl);
+
+  assert(whoami == superblock.whoami);  // fixme!
+  return 0;
+}
+
+
+// object locks
+
+PG *OSD::lock_pg(pg_t pgid)   // locking wrapper: runs _lock_pg() with osd_lock held
+{
+  osd_lock.Lock();
+  PG *locked = _lock_pg(pgid);
+  osd_lock.Unlock();
+  return locked;
+}
+
+PG *OSD::_lock_pg(pg_t pgid)   // take the per-PG lock, queueing behind earlier waiters; returns pg_map[pgid]
+{
+  assert(pg_map.count(pgid));
+  // already held?  line up on a stack-allocated Cond and sleep until the lock is free AND we are first in line.
+  if (pg_lock.count(pgid)) {
+    Cond c;
+    dout(15) << "lock_pg " << pgid << " waiting as " << &c << endl;
+    //cerr << "lock_pg " << pgid << " waiting as " << &c << endl;
+
+    list<Cond*>& ls = pg_lock_waiters[pgid];   // this PG's queue of waiters (created on first use by operator[])
+    ls.push_back(&c);
+    // c.Wait() is handed osd_lock; both conditions are re-checked after every wakeup
+    while (pg_lock.count(pgid) ||
+           ls.front() != &c)
+      c.Wait(osd_lock);
+
+    assert(ls.front() == &c);
+    ls.pop_front();
+    if (ls.empty())
+      pg_lock_waiters.erase(pgid);   // _unlock_pg() uses the presence of this entry to mean "someone is waiting"
+  }
+
+  dout(15) << "lock_pg " << pgid << endl;
+  pg_lock.insert(pgid);
+
+  return pg_map[pgid];  
+}
+
+void OSD::unlock_pg(pg_t pgid)   // locking wrapper: runs _unlock_pg() with osd_lock held
+{
+  osd_lock.Lock();
+  _unlock_pg(pgid);
+  osd_lock.Unlock();
+}
+
+void OSD::_unlock_pg(pg_t pgid) 
+{
+  // release the per-PG lock; unlock_pg() is the wrapper that takes osd_lock around this
+  assert(pg_lock.count(pgid));
+  pg_lock.erase(pgid);
+
+  if (!pg_lock_waiters.count(pgid)) {
+    // nobody waiting
+    dout(15) << "unlock_pg " << pgid << endl;
+    return;
+  }
+  // someone is in line: wake the waiter at the head of the queue
+  Cond *next = pg_lock_waiters[pgid].front();
+  assert(next);
+  dout(15) << "unlock_pg " << pgid << " waking up next guy " << next << endl;
+  next->Signal();
+}
+
+void OSD::_remove_pg(pg_t pgid) 
+{
+  dout(10) << "_remove_pg " << pgid << endl;
+
+  // list what the store holds in this PG's collection
+  list<object_t> objects;
+  store->collection_list(pgid, objects);
+
+  // one transaction: every listed object, the collection, and the log object
+  ObjectStore::Transaction t;
+  list<object_t>::iterator cur = objects.begin();
+  while (cur != objects.end()) {
+    t.remove(*cur);
+    ++cur;
+  }
+  t.remove_collection(pgid);
+  t.remove(object_t(1,pgid));  // log too
+  store->apply_transaction(t);
+
+  // hose from memory
+  delete pg_map[pgid];
+  pg_map.erase(pgid);
+}
+
+
+void OSD::activate_pg(pg_t pgid, epoch_t epoch)
+{
+  // Activate pgid if it still exists and is a crashed, replaying PG in which
+  // this OSD has role 0 and whose same_primary_since is no newer than epoch.
+  osd_lock.Lock();
+
+  if (pg_map.count(pgid)) {
+    PG *pg = _lock_pg(pgid);
+    const bool eligible =
+      pg->is_crashed() &&
+      pg->is_replay() &&
+      pg->get_role() == 0 &&
+      pg->info.history.same_primary_since <= epoch;
+    if (eligible) {
+      ObjectStore::Transaction t;
+      pg->activate(t);
+      store->apply_transaction(t);
+    }
+    _unlock_pg(pgid);
+  }
+
+  // finishers?  move them off the shared list while the lock is still held,
+  // then dispatch them after it has been dropped.
+  list<Message*> waiting;
+  if (!finished.empty())
+    waiting.splice(waiting.begin(), finished);
+
+  osd_lock.Unlock();
+
+  for (list<Message*>::iterator it = waiting.begin();
+       it != waiting.end();
+       ++it)
+    dispatch(*it);
+}
+
+
+// -------------------------------------
+
+void OSD::heartbeat()   // periodic tick: log queue stats, ping PG primaries, schedule the next C_Heartbeat; runs under osd_lock
+{
+  osd_lock.Lock();
+  // PGs whose last_heartbeat is older than "since" (one interval before now) get pinged this round
+  utime_t now = g_clock.now();
+  utime_t since = now;
+  since.sec_ref() -= g_conf.osd_heartbeat_interval;
+
+  // calc my stats
+  float avg_qlen = 0;
+  if (hb_stat_ops) avg_qlen = (float)hb_stat_qlen / (float)hb_stat_ops;   // guard avoids dividing by zero ops
+
+  dout(5) << "heartbeat " << now 
+	  << ": ops " << hb_stat_ops
+	  << ", avg qlen " << avg_qlen
+	  << endl;
+  
+  // reset until next time around
+  hb_stat_ops = 0;
+  hb_stat_qlen = 0;
+
+  // send pings
+  set<int> pingset;   // a set, so each target osd is pinged once however many PGs it leads
+  for (hash_map<pg_t, PG*>::iterator i = pg_map.begin();
+       i != pg_map.end();
+       i++) {
+    PG *pg = i->second;
+
+    // we want to ping the primary.
+    if (pg->get_role() <= 0) continue;   
+    if (pg->acting.size() < 1) continue; 
+
+    if (pg->last_heartbeat < since) {
+      pg->last_heartbeat = now;
+      pingset.insert(pg->acting[0]);   // acting[0] is the osd that gets the ping
+    }
+  }
+  for (set<int>::iterator i = pingset.begin();
+       i != pingset.end();
+       i++) {
+    _share_map_outgoing( MSG_ADDR_OSD(*i), osdmap->get_inst(*i) );   // catch the peer up on maps before pinging it
+    messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen), 
+                            MSG_ADDR_OSD(*i), osdmap->get_inst(*i));
+  }
+
+  if (logger) logger->set("pingset", pingset.size());
+
+  // hack: fake reorg?
+  if (osdmap && g_conf.fake_osdmap_updates) {
+    int mon = monmap->pick_mon();
+    if ((rand() % g_conf.fake_osdmap_updates) == 0) {   // fires on average once per fake_osdmap_updates heartbeats
+      //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) {
+      messenger->send_message(new MOSDIn(osdmap->get_epoch()),
+                              MSG_ADDR_MON(mon), monmap->get_inst(mon));
+    }
+    /*
+      if (osdmap->is_out(whoami)) {
+      messenger->send_message(new MOSDIn(osdmap->get_epoch()),
+                              MSG_ADDR_MON(mon), monmap->get_inst(mon));
+      } 
+      else if ((rand() % g_conf.fake_osdmap_updates) == 0) {
+      //messenger->send_message(new MOSDOut(osdmap->get_epoch()),
+      //MSG_ADDR_MON(mon), monmap->get_inst(mon));
+      }
+    }
+    */
+  }
+
+  // schedule next!  randomly.
+  next_heartbeat = new C_Heartbeat(this);
+  float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval;   // .5 plus 0.0 .. 0.9 of an interval
+  g_timer.add_event_after(wait, next_heartbeat);
+
+  osd_lock.Unlock();  
+}
+
+
+
+// --------------------------------------
+// dispatch
+
+bool OSD::_share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch)
+{
+  // Returns true if an incremental map was sent to who.
+  bool sent = false;
+
+  // client with an old map?
+  if (who.is_client() && epoch < osdmap->get_epoch()) {
+    dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
+    send_incremental_map(epoch, who, inst, true);
+    sent = true;
+  }
+
+  // osd peer with an old map?
+  if (!who.is_osd())
+    return sent;
+
+  // remember the newest epoch this peer has shown us
+  if (peer_map_epoch[who] < epoch)
+    peer_map_epoch[who] = epoch;
+
+  // older than ours?
+  if (peer_map_epoch[who] < osdmap->get_epoch()) {
+    dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
+    send_incremental_map(epoch, who, inst, true);
+    peer_map_epoch[who] = osdmap->get_epoch();  // so we don't send it again.
+    sent = true;
+  }
+
+  return sent;
+}
+
+
+void OSD::_share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst) 
+{
+  // Bring an osd peer up to our map epoch before we talk to it, but only
+  // when we have a recorded epoch for that peer.
+  assert(dest.is_osd());
+  if (!dest.is_osd()) return;
+
+  if (!peer_map_epoch.count(dest)) {
+    // no idea about peer's epoch.
+    // ??? send recent ???
+    // do nothing.
+    return;
+  }
+
+  epoch_t known = peer_map_epoch[dest];
+  if (known >= osdmap->get_epoch()) return;
+  send_incremental_map(known, dest, inst, true);
+  peer_map_epoch[dest] = osdmap->get_epoch();
+}
+
+
+
+void OSD::dispatch(Message *m)   // route one incoming message by type, then re-dispatch anything left on "finished"
+{
+  // lock!
+  osd_lock.Lock();
+  // every case below runs with osd_lock held
+  switch (m->get_type()) {
+
+    // -- don't need lock (osd_lock is held here regardless) -- 
+  case MSG_PING:
+    dout(10) << "ping from " << m->get_source() << endl;
+    delete m;
+    break;
+
+    // -- don't need OSDMap --
+
+    /*
+    // host monitor
+  case MSG_PING_ACK:
+  case MSG_FAILURE_ACK:
+    monitor->proc_message(m);
+    break;
+    */
+
+    // map and replication
+  case MSG_OSD_MAP:
+    handle_osd_map((MOSDMap*)m);   // handle_osd_map deletes m itself
+    break;
+
+    // osd
+  case MSG_SHUTDOWN:
+    shutdown();
+    delete m;
+    break;
+    
+    
+
+    // -- need OSDMap --
+
+  default:
+    {
+      // no map?  starting up?
+      if (!osdmap) {
+        dout(7) << "no OSDMap, not booted" << endl;
+        waiting_for_osdmap.push_back(m);   // parked, not dropped; handle_osd_map hands this list to take_waiters()
+        break;
+      }
+      
+      // down?
+      if (osdmap->is_down(whoami)) {
+        dout(7) << "i am marked down, dropping " << *m << endl;
+        delete m;
+        break;
+      }
+
+
+      
+
+      // need OSDMap
+      switch (m->get_type()) {
+
+      case MSG_OSD_PING:
+        // take note.
+        handle_osd_ping((MOSDPing*)m);
+        break;
+        
+      case MSG_OSD_PG_NOTIFY:
+        handle_pg_notify((MOSDPGNotify*)m);
+        break;
+      case MSG_OSD_PG_QUERY:
+        handle_pg_query((MOSDPGQuery*)m);
+        break;
+      case MSG_OSD_PG_LOG:
+        handle_pg_log((MOSDPGLog*)m);
+        break;
+      case MSG_OSD_PG_REMOVE:
+        handle_pg_remove((MOSDPGRemove*)m);
+        break;
+
+      case MSG_OSD_OP:
+        handle_op((MOSDOp*)m);
+        break;
+        
+        // for replication etc.
+      case MSG_OSD_OPREPLY:
+        handle_op_reply((MOSDOpReply*)m);
+        break;
+        
+        
+      default:
+        dout(1) << " got unknown message " << m->get_type() << endl;
+        assert(0);   // m is not deleted on this path
+      }
+    }
+  }
+
+  // finishers?
+  if (!finished.empty()) {
+    list<Message*> waiting;
+    waiting.splice(waiting.begin(), finished);   // take the whole list while the lock is still held
+
+    osd_lock.Unlock();
+    // each recursive dispatch() call takes osd_lock again on entry
+    for (list<Message*>::iterator it = waiting.begin();
+         it != waiting.end();
+         it++) {
+      dispatch(*it);
+    }
+    return;
+  }
+  
+  osd_lock.Unlock();
+}
+
+
+void OSD::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+{
+  // Delivery of m to dest failed: osd -> drop and report to a monitor,
+  // mon -> resend to another monitor, anything else -> drop.
+  if (g_conf.ms_die_on_failure) exit(0);
+
+  if (dest.is_osd()) {
+    // failed osd.  drop message, report to mon.
+    int reporter = monmap->pick_mon();
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst 
+            << ", dropping and reporting to mon" << reporter << endl;
+    messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()),
+                            MSG_ADDR_MON(reporter), monmap->get_inst(reporter));
+    delete m;
+    return;
+  }
+
+  if (dest.is_mon()) {
+    // resend to a different monitor.
+    int alt = monmap->pick_mon(true);
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst 
+            << ", resending to mon" << alt << endl;
+    messenger->send_message(m, MSG_ADDR_MON(alt), monmap->get_inst(alt));
+    return;
+  }
+
+  // client?
+  dout(0) << "ms_handle_failure " << dest << " inst " << inst << ", dropping" << endl;
+  delete m;
+}
+
+bool OSD::ms_lookup(msg_addr_t dest, entity_inst_t& inst)
+{
+  // only osd addresses are resolved here (through osdmap); anything else trips the assert
+  if (!dest.is_osd()) {
+    assert(0);
+    return false;
+  }
+  assert(osdmap);
+  return osdmap->get_inst(dest.num(), inst);
+}
+
+
+
+
+void OSD::handle_osd_ping(MOSDPing *m)
+{
+  // share maps with the sender if it is behind, record its reported queue length, free the message
+  dout(20) << "osdping from " << m->get_source() << endl;
+  _share_map_incoming(m->get_source(), m->get_source_inst(), m->map_epoch);
+
+  peer_qlen[m->get_source().num()] = m->avg_qlen;
+
+  //if (!m->ack)
+  //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true),
+  //m->get_source());
+
+  delete m;
+}
+
+
+
+
+// =====================================================
+// MAP
+
+void OSD::wait_for_new_map(Message *m)
+{
+  // only the first waiter asks a monitor for maps newer than ours; every
+  // caller, first or not, is then queued on waiting_for_osdmap.
+  const bool first_waiter = waiting_for_osdmap.empty();
+  if (first_waiter) {
+    int target = monmap->pick_mon();
+    messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), MSG_ADDR_MON(target), monmap->get_inst(target));
+  }
+  waiting_for_osdmap.push_back(m);
+}
+
+
+/** handle_osd_map
+ * assimilate new OSDMap(s).  scan pgs, etc.
+ */
+void OSD::handle_osd_map(MOSDMap *m)   // store the maps carried by m, advance epoch by epoch as far as possible, persist state, delete m
+{
+  wait_for_no_ops();
+  
+  assert(osd_lock.is_locked());
+
+  ObjectStore::Transaction t;
+  
+  if (osdmap) {
+    dout(3) << "handle_osd_map epochs [" 
+            << m->get_first() << "," << m->get_last() 
+            << "], i have " << osdmap->get_epoch()
+            << endl;
+  } else {
+    dout(3) << "handle_osd_map epochs [" 
+            << m->get_first() << "," << m->get_last() 
+            << "], i have none"
+            << endl;
+    osdmap = new OSDMap;
+    boot_epoch = m->get_last(); // hrm...?
+  }
+  // NOTE(review): init() registers "map" but not "mapmsg" -- confirm Logger accepts a key that was never added
+  logger->inc("mapmsg");
+
+  // store them?
+  for (map<epoch_t,bufferlist>::iterator p = m->maps.begin();
+       p != m->maps.end();
+       p++) {
+    object_t oid = get_osdmap_object_name(p->first);
+    if (store->exists(oid)) {
+      dout(10) << "handle_osd_map already had full map epoch " << p->first << endl;
+      logger->inc("mapfdup");
+      bufferlist bl;
+      get_map_bl(p->first, bl);   // read back only to report its size in the debug line below
+      dout(10) << " .. it is " << bl.length() << " bytes" << endl;
+      continue;
+    }
+
+    dout(10) << "handle_osd_map got full map epoch " << p->first << endl;
+    //t.write(oid, 0, p->second.length(), p->second);
+    store->write(oid, 0, p->second.length(), p->second, 0);   // written straight to the store, not queued on t
+    // widen the [oldest_map, newest_map] range in the superblock; an oldest_map of 0 is treated as unset
+    if (p->first > superblock.newest_map)
+      superblock.newest_map = p->first;
+    if (p->first < superblock.oldest_map ||
+        superblock.oldest_map == 0)
+      superblock.oldest_map = p->first;
+
+    logger->inc("mapf");
+  }
+  for (map<epoch_t,bufferlist>::iterator p = m->incremental_maps.begin();   // same treatment for the incremental maps
+       p != m->incremental_maps.end();
+       p++) {
+    object_t oid = get_inc_osdmap_object_name(p->first);
+    if (store->exists(oid)) {
+      dout(10) << "handle_osd_map already had incremental map epoch " << p->first << endl;
+      logger->inc("mapidup");
+      bufferlist bl;
+      get_inc_map_bl(p->first, bl);
+      dout(10) << " .. it is " << bl.length() << " bytes" << endl;
+      continue;
+    }
+
+    dout(10) << "handle_osd_map got incremental map epoch " << p->first << endl;
+    //t.write(oid, 0, p->second.length(), p->second);
+    store->write(oid, 0, p->second.length(), p->second, 0);
+
+    if (p->first > superblock.newest_map)
+      superblock.newest_map = p->first;
+    if (p->first < superblock.oldest_map ||
+        superblock.oldest_map == 0)
+      superblock.oldest_map = p->first;
+
+    logger->inc("mapi");
+  }
+
+  // advance if we can
+  bool advanced = false;
+  
+  if (m->get_source().is_mon() && is_booting()) 
+    advanced = true;   // a map from a monitor while booting counts as progress even if no epoch is applied below
+  // step one epoch at a time: prefer an incremental (from m or the store), else a full map, else ask a monitor and stop
+  epoch_t cur = superblock.current_epoch;
+  while (cur < superblock.newest_map) {
+    bufferlist bl;   // unused: each branch below declares its own bl, which shadows this one
+    if (m->incremental_maps.count(cur+1) ||
+        store->exists(get_inc_osdmap_object_name(cur+1))) {
+      dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << endl;
+      
+      bufferlist bl;
+      if (m->incremental_maps.count(cur+1))
+        bl = m->incremental_maps[cur+1];
+      else
+        get_inc_map_bl(cur+1, bl);
+
+      OSDMap::Incremental inc;
+      int off = 0;
+      inc.decode(bl, off);
+
+      osdmap->apply_incremental(inc);
+
+      // archive the full map
+      bl.clear();
+      osdmap->encode(bl);
+      t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl);
+
+      // notify messenger
+      for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
+           i != inc.new_down.end();
+           i++) {
+        int osd = i->first;
+        if (osd == whoami) continue;
+        messenger->mark_down(MSG_ADDR_OSD(osd), i->second);
+        peer_map_epoch.erase(MSG_ADDR_OSD(osd));   // forget what map epoch we believed that peer had
+      
+        // kick any replica ops
+        for (hash_map<pg_t,PG*>::iterator it = pg_map.begin();
+             it != pg_map.end();
+             it++) {
+          PG *pg = it->second;
+
+          _lock_pg(pg->info.pgid);
+          {
+            list<PG::RepOpGather*> ls;  // do async; repop_ack() may modify pg->repop_gather
+            for (map<tid_t,PG::RepOpGather*>::iterator p = pg->repop_gather.begin();
+                 p != pg->repop_gather.end();
+                 p++) {
+              //dout(-1) << "checking repop tid " << p->first << endl;
+              if (p->second->waitfor_ack.count(osd) ||
+                  p->second->waitfor_commit.count(osd)) 
+                ls.push_back(p->second);
+            }
+            for (list<PG::RepOpGather*>::iterator p = ls.begin();
+                 p != ls.end();
+                 p++)
+              repop_ack(pg, *p, -1, true, osd);   // called for every gather still waiting on the downed osd
+          }
+          _unlock_pg(pg->info.pgid);
+        }
+      }
+      for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
+           i != inc.new_up.end();
+           i++) {
+        if (i->first == whoami) continue;
+        messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
+        peer_map_epoch.erase(MSG_ADDR_OSD(i->first));
+      }
+    }
+    else if (m->maps.count(cur+1) ||
+             store->exists(get_osdmap_object_name(cur+1))) {
+      dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << endl;
+      bufferlist bl;
+      if (m->maps.count(cur+1))
+        bl = m->maps[cur+1];
+      else
+        get_map_bl(cur+1, bl);
+      osdmap->decode(bl);
+
+      // FIXME BUG: need to notify messenger of ups/downs!!
+    }
+    else {
+      dout(10) << "handle_osd_map missing epoch " << cur+1 << endl;
+      int mon = monmap->pick_mon();
+      messenger->send_message(new MOSDGetMap(cur), MSG_ADDR_MON(mon), monmap->get_inst(mon));
+      break;
+    }
+
+    cur++;
+    superblock.current_epoch = cur;
+    advance_map(t);
+    advanced = true;
+  }
+
+  // all the way?
+  if (advanced && cur == superblock.newest_map) {
+    // yay!
+    activate_map(t);
+    
+    // process waiters
+    take_waiters(waiting_for_osdmap);
+  }
+
+  // write updated pg state to store
+  for (hash_map<pg_t,PG*>::iterator i = pg_map.begin();
+       i != pg_map.end();
+       i++) {
+    pg_t pgid = i->first;
+    PG *pg = i->second;
+    t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info));   // raw bytes of pg->info
+  }
+
+  // superblock and commit
+  write_superblock(t);
+  store->apply_transaction(t);
+
+  //if (osdmap->get_epoch() == 1) store->sync();     // in case of early death, blah
+
+  delete m;
+}
+
+
+/** 
+ * scan placement groups, initiate any replication
+ * activities.
+ */
+void OSD::advance_map(ObjectStore::Transaction& t)   // apply the current osdmap epoch to the PGs: create them on mkfs, otherwise re-evaluate acting sets
+{
+  dout(7) << "advance_map epoch " << osdmap->get_epoch() 
+          << "  " << pg_map.size() << " pgs"
+          << endl;
+  
+  if (osdmap->is_mkfs()) {
+    ps_t maxps = 1ULL << osdmap->get_pg_bits();
+    ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits();
+    dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << endl;
+    assert(osdmap->get_epoch() == 1);
+
+    //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << endl;
+    logger->set_start( osdmap->get_ctime() );
+
+    // create PGs
+    for (int nrep = 1; 
+         nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep);    // for low osd counts..  hackish bleh
+         nrep++) {
+      for (ps_t ps = 0; ps < maxps; ++ps) {
+	vector<int> acting;
+	pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep);   // uses the loop counter nrep
+	int nrep = osdmap->pg_to_acting_osds(pgid, acting);   // NOTE: shadows the loop counter from here to the end of this body
+	int role = osdmap->calc_pg_role(whoami, acting, nrep);
+	if (role < 0) continue;   // this osd has no role in the PG: do not create it
+	
+	PG *pg = create_pg(pgid, t);
+	pg->set_role(role);
+	pg->acting.swap(acting);
+	pg->last_epoch_started_any = 
+	  pg->info.last_epoch_started = 
+	  pg->info.history.same_since = 
+	  pg->info.history.same_primary_since = 
+	    pg->info.history.same_acker_since = osdmap->get_epoch();
+	pg->activate(t);
+	
+	dout(7) << "created " << *pg << endl;
+      }
+
+      for (ps_t ps = 0; ps < maxlps; ++ps) {
+	// local PG too
+	vector<int> acting;
+	pg_t pgid = osdmap->ps_osd_nrep_to_pg(ps, whoami, nrep);
+	int nrep = osdmap->pg_to_acting_osds(pgid, acting);   // shadows the loop counter, as in the loop above
+	int role = osdmap->calc_pg_role(whoami, acting, nrep);
+	// unlike the loop above there is no role < 0 check: the PG is created whatever the role
+	PG *pg = create_pg(pgid, t);
+	pg->acting.swap(acting);
+	pg->set_role(role);
+	pg->last_epoch_started_any = 
+	  pg->info.last_epoch_started = 
+	  pg->info.history.same_primary_since = 
+	  pg->info.history.same_acker_since = 
+	  pg->info.history.same_since = osdmap->get_epoch();
+	pg->activate(t);
+	
+	dout(7) << "created " << *pg << endl;
+      }
+    }
+
+    dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << endl;
+
+  } else {
+    // scan existing pg's
+    for (hash_map<pg_t,PG*>::iterator it = pg_map.begin();
+         it != pg_map.end();
+         it++) {
+      pg_t pgid = it->first;
+      PG *pg = it->second;
+      
+      // did i finish this epoch?
+      if (pg->is_active()) {
+        pg->info.last_epoch_finished = osdmap->get_epoch()-1;
+      }      
+
+      // get new acting set
+      vector<int> tacting;
+      int nrep = osdmap->pg_to_acting_osds(pgid, tacting);
+      int role = osdmap->calc_pg_role(whoami, tacting, nrep);
+
+      // no change?
+      if (tacting == pg->acting) 
+        continue;
+
+      // -- there was a change! --
+      _lock_pg(pgid);
+      // snapshot the old view before it is overwritten
+      int oldrole = pg->get_role();
+      int oldprimary = pg->get_primary();
+      int oldacker = pg->get_acker();
+      vector<int> oldacting = pg->acting;
+      
+      // update PG
+      pg->acting.swap(tacting);   // after the swap tacting holds the previous acting set
+      pg->set_role(role);
+      
+      // did primary|acker change?
+      pg->info.history.same_since = osdmap->get_epoch();
+      if (oldprimary != pg->get_primary()) {
+        pg->info.history.same_primary_since = osdmap->get_epoch();
+        pg->cancel_recovery();
+      }
+      if (oldacker != pg->get_acker()) {
+        pg->info.history.same_acker_since = osdmap->get_epoch();
+      }
+
+      // deactivate.
+      pg->state_clear(PG::STATE_ACTIVE);
+      
+      // reset primary state?
+      if (oldrole == 0 || pg->get_role() == 0)
+        pg->clear_primary_state();
+      
+      // apply any repops in progress.
+      if (oldacker == whoami) {
+        // apply repops
+        for (map<tid_t,PG::RepOpGather*>::iterator p = pg->repop_gather.begin();
+             p != pg->repop_gather.end();
+             p++) {
+          if (!p->second->applied)
+            apply_repop(pg, p->second);
+          delete p->second->op;   // both the op message and its gather are freed here
+          delete p->second;
+        }
+        pg->repop_gather.clear();
+        
+        // and repop waiters
+        for (map<tid_t, list<Message*> >::iterator p = pg->waiting_for_repop.begin();
+             p != pg->waiting_for_repop.end();
+             p++)
+          for (list<Message*>::iterator pm = p->second.begin();
+               pm != p->second.end();
+               pm++)
+            delete *pm;   // these waiters are deleted, not requeued
+        pg->waiting_for_repop.clear();
+      }
+
+      if (role != oldrole) {
+        // old primary?
+        if (oldrole == 0) {
+          pg->state_clear(PG::STATE_CLEAN);
+
+          // take replay queue waiters
+          list<Message*> ls;
+          for (map<eversion_t,MOSDOp*>::iterator it = pg->replay_queue.begin();   // this "it" shadows the pg_map iterator of the outer loop
+               it != pg->replay_queue.end();
+               it++)
+            ls.push_back(it->second);
+          pg->replay_queue.clear();
+          take_waiters(ls);
+
+          // take active waiters
+          take_waiters(pg->waiting_for_active);
+          
+          // take object waiters
+          for (hash_map<object_t, list<Message*> >::iterator it = pg->waiting_for_missing_object.begin();
+               it != pg->waiting_for_missing_object.end();
+               it++)
+            take_waiters(it->second);
+          pg->waiting_for_missing_object.clear();
+        }
+        
+        // new primary?
+        if (role == 0) {
+          // i am new primary
+          pg->state_clear(PG::STATE_STRAY);
+        } else {
+          // i am now replica|stray.  we need to send a notify.
+          pg->state_set(PG::STATE_STRAY);
+
+          if (nrep == 0) {   // pg_to_acting_osds() returned 0 for this PG
+            pg->state_set(PG::STATE_CRASHED);
+            dout(1) << *pg << " is crashed" << endl;
+          }
+        }
+        
+        // my role changed.
+        dout(10) << *pg << " " << oldacting << " -> " << pg->acting 
+                 << ", role " << oldrole << " -> " << role << endl; 
+        
+      } else {
+        // no role change.
+        // did primary change?
+        if (pg->get_primary() != oldprimary) {    
+          // we need to announce
+          pg->state_set(PG::STATE_STRAY);
+          
+          dout(10) << *pg << " " << oldacting << " -> " << pg->acting 
+                   << ", acting primary " 
+                   << oldprimary << " -> " << pg->get_primary() 
+                   << endl;
+        } else {
+          // primary is the same.
+          if (role == 0) {
+            // i am (still) primary. but my replica set changed.
+            pg->state_clear(PG::STATE_CLEAN);
+            pg->state_clear(PG::STATE_REPLAY);
+
+            dout(10) << *pg << " " << oldacting << " -> " << pg->acting
+                     << ", replicas changed" << endl;
+          }
+        }
+      }
+      
+
+      _unlock_pg(pgid);
+    }
+  }
+}
+
+/** activate_map
+ * Walk every PG in pg_map after a map change:
+ *  - active PGs get info.last_epoch_started set to the current epoch;
+ *  - inactive PGs for which our role is 0 rebuild their prior set and peer;
+ *  - stray PGs with a non-negative primary are added to that primary's notify.
+ * Unless the map is a mkfs map, the gathered notifies and queries are then
+ * sent and the "numpg" log value is set to the size of pg_map.
+ */
+void OSD::activate_map(ObjectStore::Transaction& t)
+{
+  dout(7) << "activate_map version " << osdmap->get_epoch() << endl;
+
+  map< int, list<PG::Info> > notifies;        // primary osd -> infos to send
+  map< int, map<pg_t,PG::Query> > queries;    // peer osd -> pgid -> query
+
+  for (hash_map<pg_t,PG*>::iterator p = pg_map.begin(); p != pg_map.end(); ++p) {
+    PG *cur = p->second;
+
+    if (cur->is_active()) {
+      // already serving: just record that it was started in this epoch
+      cur->info.last_epoch_started = osdmap->get_epoch();
+      continue;
+    }
+
+    if (cur->get_role() == 0 && !cur->is_active()) {
+      // role 0 but not active yet: (re)start peering
+      cur->build_prior();
+      cur->peer(t, queries);
+      continue;
+    }
+
+    if (!cur->is_stray())
+      continue;
+    if (cur->get_primary() < 0)
+      continue;
+
+    // stray with a known primary: tell that primary what we hold
+    notifies[cur->get_primary()].push_back(cur->info);
+  }
+
+  // hack: on a mkfs map nothing is sent (and "numpg" is not updated)
+  if (!osdmap->is_mkfs()) {
+    do_notifies(notifies);
+    do_queries(queries);
+    logger->set("numpg", pg_map.size());
+  }
+}
+
+
+/** send_incremental_map
+ * Build one MOSDMap covering epochs (since, current] and send it to dest.
+ * Epochs are visited newest first.  An incremental is used when the store
+ * has one; otherwise the full map for that epoch is attached, and unless
+ * 'full' is set the walk stops at that first full map.
+ */
+void OSD::send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full)
+{
+  dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
+           << " to " << dest << endl;
+
+  MOSDMap *reply = new MOSDMap;
+
+  for (epoch_t cur = osdmap->get_epoch(); cur > since; --cur) {
+    bufferlist bl;
+
+    if (get_inc_map_bl(cur, bl)) {
+      reply->incremental_maps[cur].claim(bl);
+      continue;
+    }
+
+    if (!get_map_bl(cur, bl)) {
+      assert(0);  // we should have all maps.
+      continue;
+    }
+
+    reply->maps[cur].claim(bl);
+    if (!full)
+      break;
+  }
+
+  messenger->send_message(reply, dest, inst);
+}
+
+/** get_map_bl
+ * Read the stored full-map object for epoch e into bl.
+ * Returns false when the store read reports a negative result.
+ */
+bool OSD::get_map_bl(epoch_t e, bufferlist& bl)
+{
+  if (store->read(get_osdmap_object_name(e), 0, 0, bl) < 0)
+    return false;
+  return true;
+}
+
+/** get_inc_map_bl
+ * Read the stored incremental-map object for epoch e into bl.
+ * Returns false when the store read reports a negative result.
+ */
+bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl)
+{
+  if (store->read(get_inc_osdmap_object_name(e), 0, 0, bl) < 0)
+    return false;
+  return true;
+}
+
+/** get_map
+ * Reconstruct the map for 'epoch' into m: search backwards from 'epoch' for
+ * the newest stored full map, remembering the incrementals passed on the
+ * way, then replay those incrementals oldest-first on top of it.
+ * Asserts if an epoch has neither a full nor an incremental map stored, or
+ * if no full map is found above epoch 0.
+ */
+void OSD::get_map(epoch_t epoch, OSDMap &m)
+{
+  list<OSDMap::Incremental> pending;   // incrementals newer than the base, oldest first
+  epoch_t base = epoch;
+
+  // find a complete map
+  while (base > 0) {
+    bufferlist bl;
+    if (get_map_bl(base, bl)) {
+      m.decode(bl);
+      break;
+    }
+
+    OSDMap::Incremental inc;
+    bool got = get_inc_map(base, inc);
+    assert(got);
+    pending.push_front(inc);
+    base--;
+  }
+  assert(base > 0);
+
+  // apply incrementals, one per epoch in (base, epoch]
+  for (epoch_t cur = base + 1; cur <= epoch; cur++) {
+    m.apply_incremental( pending.front() );
+    pending.pop_front();
+  }
+}
+
+
+/** get_inc_map
+ * Load and decode the incremental map for epoch e into inc.
+ * Returns false (leaving inc untouched) if the incremental is not readable.
+ */
+bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc)
+{
+  bufferlist bl;
+  const bool found = get_inc_map_bl(e, bl);
+  if (found) {
+    int off = 0;
+    inc.decode(bl, off);
+  }
+  return found;
+}
+
+
+
+
+
+/** require_current_map
+ * Returns true only when ep equals our current map epoch.
+ * Otherwise m is consumed and false is returned: a message from an older
+ * epoch is deleted, one from a newer epoch is passed to wait_for_new_map().
+ */
+bool OSD::require_current_map(Message *m, epoch_t ep) 
+{
+  bool current = false;
+
+  if (ep < osdmap->get_epoch()) {
+    // older map: discard and ignore.
+    dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << endl;
+    delete m;
+  } else if (ep > osdmap->get_epoch()) {
+    // newer map: park the message until we catch up.
+    dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << endl;
+    wait_for_new_map(m);
+  } else {
+    assert(ep == osdmap->get_epoch());
+    current = true;
+  }
+
+  return current;
+}
+
+
+/*
+ * require that our map is at least as new as the sender's ('epoch'), and
+ * that the message does not predate boot_epoch.
+ *
+ * returns false after consuming m when the check fails: a message from a
+ * newer map is passed to wait_for_new_map(), a pre-boot message is deleted.
+ */
+bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch)
+{
+  dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << endl;
+
+  bool ok = true;
+
+  if (epoch > osdmap->get_epoch()) {
+    // sender is ahead of us.
+    dout(7) << "  from newer map epoch " << epoch << " > " << osdmap->get_epoch() << endl;
+    wait_for_new_map(m);
+    ok = false;
+  } else if (epoch < boot_epoch) {
+    // sent before we booted.
+    dout(7) << "  from pre-boot epoch " << epoch << " < " << boot_epoch << endl;
+    delete m;
+    ok = false;
+  }
+
+  return ok;
+}
+
+
+
+
+// ======================================================
+// REPLICATION
+
+// PG
+
+/** pg_exists
+ * True if the object store reports a collection for pgid.
+ */
+bool OSD::pg_exists(pg_t pgid) 
+{
+  const bool on_disk = store->collection_exists(pgid);
+  return on_disk;
+}
+
+/** create_pg
+ * Allocate a PG for pgid, register it in pg_map and queue creation of its
+ * collection on transaction t.  The PG must be unknown both in memory and
+ * in the store (asserted).  Returns the new PG.
+ */
+PG *OSD::create_pg(pg_t pgid, ObjectStore::Transaction& t)
+{
+  hash_map<pg_t,PG*>::iterator existing = pg_map.find(pgid);
+  if (existing != pg_map.end()) {
+    dout(0) << "create_pg on " << pgid << ", already have " << *existing->second << endl;
+  }
+  assert(existing == pg_map.end());
+  assert(!pg_exists(pgid));
+
+  PG *fresh = new PG(this, pgid);
+  pg_map[pgid] = fresh;
+
+  t.create_collection(pgid);
+
+  return fresh;
+}
+
+
+
+
+/** get_pg
+ * Look up pgid in pg_map; returns 0 when we do not have that PG.
+ */
+PG *OSD::get_pg(pg_t pgid)
+{
+  hash_map<pg_t,PG*>::iterator p = pg_map.find(pgid);
+  if (p == pg_map.end())
+    return 0;
+  return p->second;
+}
+
+/** load_pgs
+ * Populate the (empty) pg_map from the collections found in the store.
+ * For each collection: create the PG, read its "info" attribute and its
+ * log, then compute the acting set and our role under the current osdmap.
+ */
+void OSD::load_pgs()
+{
+  dout(10) << "load_pgs" << endl;
+  assert(pg_map.empty());
+
+  list<coll_t> colls;
+  store->list_collections(colls);
+
+  list<coll_t>::iterator c = colls.begin();
+  for ( ; c != colls.end(); ++c) {
+    pg_t pgid = *c;
+
+    PG *loaded = new PG(this, pgid);
+    pg_map[pgid] = loaded;
+
+    // on-disk state: info attribute, then log
+    store->collection_getattr(pgid, "info", &loaded->info, sizeof(loaded->info));
+    loaded->read_log(store);
+
+    // generate state for current mapping
+    int nrep = osdmap->pg_to_acting_osds(pgid, loaded->acting);
+    loaded->set_role( osdmap->calc_pg_role(whoami, loaded->acting, nrep) );
+
+    dout(10) << "load_pgs loaded " << *loaded << " " << loaded->log << endl;
+  }
+}
+ 
+/**
+ * project_pg_history - walk the stored osdmaps backwards from (current epoch - 1) down to 'from', comparing pgid's acting set in each old map with its acting set in the current map.
+ * h is updated in place: same_since, same_primary_since and same_acker_since may each be moved to e+1 for an epoch e where the respective comparison differs.
+ */
+void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from)
+{
+  dout(15) << "project_pg_history " << pgid
+           << " from " << from << " to " << osdmap->get_epoch()
+           << ", start " << h
+           << endl;
+  // 'last' is the acting set under the current map; it is not updated inside the loop below.
+  vector<int> last;
+  osdmap->pg_to_acting_osds(pgid, last);
+  // NOTE(review): if epoch_t is unsigned and from == 0, the test e >= from can never become false -- confirm callers never pass 0.
+  for (epoch_t e = osdmap->get_epoch()-1;
+       e >= from;
+       e--) {
+    // rebuild the map as of epoch e (get_map asserts if it cannot) and compute the pg's acting set under it
+    OSDMap oldmap;
+    get_map(e, oldmap);
+
+    vector<int> acting;
+    oldmap.pg_to_acting_osds(pgid, acting);
+
+    // acting set change?  NOTE(review): the guard is e <= h.same_since while the loop exit below tests h.same_since > e; confirm the direction of this comparison is intended.
+    if (acting != last && 
+        e <= h.same_since) {
+      dout(15) << "project_pg_history " << pgid << " changed in " << e+1 
+                << " from " << acting << " -> " << last << endl;
+      h.same_since = e+1;
+    }
+
+    // primary change?  (either set empty, or first entries differ); same guard shape as above -- same review question applies.
+    if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) &&
+        e <= h.same_primary_since) {
+      dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << endl;
+      h.same_primary_since = e+1;
+    
+      if (g_conf.osd_rep == OSD_REP_PRIMARY)   // in this mode same_acker_since simply follows same_primary_since
+        h.same_acker_since = h.same_primary_since;
+    }
+
+    // acker change?  outside OSD_REP_PRIMARY mode the last entry of each acting set is compared instead of the first.
+    if (g_conf.osd_rep != OSD_REP_PRIMARY) {
+      if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) &&
+          e <= h.same_acker_since) {
+        dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << endl;
+        h.same_acker_since = e+1;
+      }
+    }
+    // stop early once all three markers are newer than the epoch being examined.
+    if (h.same_since > e &&
+        h.same_primary_since > e &&
+        h.same_acker_since > e) break;
+  }
+
+  dout(15) << "project_pg_history end " << h << endl;
+}
+
+
+/** do_notifies
+ * For every primary in notify_list other than ourselves, send one
+ * MOSDPGNotify carrying the PG::Info entries gathered for it.  Our map is
+ * shared with the destination (via _share_map_outgoing) before the send.
+ */
+
+void OSD::do_notifies(map< int, list<PG::Info> >& notify_list) 
+{
+  typedef map< int, list<PG::Info> >::iterator notify_iter;
+
+  for (notify_iter p = notify_list.begin(); p != notify_list.end(); ++p) {
+    const int primary = p->first;
+    list<PG::Info>& infos = p->second;
+
+    if (primary != whoami) {
+      dout(7) << "do_notify osd" << primary << " on " << infos.size() << " PGs" << endl;
+      MOSDPGNotify *notify = new MOSDPGNotify(osdmap->get_epoch(), infos);
+      _share_map_outgoing(MSG_ADDR_OSD(primary), osdmap->get_inst(primary));
+      messenger->send_message(notify, MSG_ADDR_OSD(primary), osdmap->get_inst(primary));
+    } else {
+      dout(7) << "do_notify osd" << primary << " is self, skipping" << endl;
+    }
+  }
+}
+
+
+/** do_queries
+ * Send one MOSDPGQuery per peer in query_map, carrying every PG query
+ * queued for that peer.  Our map is shared with the peer (via
+ * _share_map_outgoing) before the send.
+ */
+void OSD::do_queries(map< int, map<pg_t,PG::Query> >& query_map)
+{
+  typedef map< int, map<pg_t,PG::Query> >::iterator query_iter;
+
+  query_iter q = query_map.begin();
+  while (q != query_map.end()) {
+    const int peer = q->first;
+    dout(7) << "do_queries querying osd" << peer
+            << " on " << q->second.size() << " PGs" << endl;
+
+    MOSDPGQuery *query = new MOSDPGQuery(osdmap->get_epoch(),
+                                         q->second);
+    _share_map_outgoing(MSG_ADDR_OSD(peer), osdmap->get_inst(peer));
+    messenger->send_message(query, MSG_ADDR_OSD(peer), osdmap->get_inst(peer));
+
+    ++q;
+  }
+}
+
+
+
+
+/** PGNotify
+ * from non-primary to primary
+ * includes PG::Info.  For each listed PG: create it if unknown (and the primary did not change since the message epoch), record the sender's info, then either mark the sender clean or (re)peer.
+ * NOTE: called with opqueue active.  Consumes m (handed to require_same_or_newer_map on failure, deleted at the end otherwise).
+ */
+void OSD::handle_pg_notify(MOSDPGNotify *m)
+{
+  dout(7) << "handle_pg_notify from " << m->get_source() << endl;
+  int from = MSG_ADDR_NUM(m->get_source());
+  // on failure the helper has already deleted or queued m
+  if (!require_same_or_newer_map(m, m->get_epoch())) return;
+  // one transaction collects the on-disk changes for all PGs in this message; applied once after the loop
+  ObjectStore::Transaction t;
+  
+  // look for unknown PGs i'm primary for; queries generated by peer() accumulate here and are sent after the loop
+  map< int, map<pg_t,PG::Query> > query_map;
+
+  for (list<PG::Info>::iterator it = m->get_pg_list().begin();
+       it != m->get_pg_list().end();
+       it++) {
+    pg_t pgid = it->pgid;
+    PG *pg;
+    // two ways in: PG unknown (create it) or PG known (lock it); both leave the loop body holding the pg lock unless they 'continue'
+    if (pg_map.count(pgid) == 0) {
+      // same primary?  project the sender's history forward to our current epoch first
+      PG::Info::History history = it->history;
+      project_pg_history(pgid, history, m->get_epoch());
+
+      if (m->get_epoch() < history.same_primary_since) {
+        dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in "
+                 << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl;
+        continue;   // nothing was created or locked for this pgid
+      }
+      
+      // ok, create PG!  role is set to 0 unconditionally here
+      pg = create_pg(pgid, t);
+      osdmap->pg_to_acting_osds(pgid, pg->acting);
+      pg->set_role(0);
+      pg->info.history = history;
+      // seed last_epoch_started_any from the notifier's info before building the prior set
+      pg->last_epoch_started_any = it->last_epoch_started;
+      pg->build_prior();
+      // persist the new pg's info as part of the shared transaction
+      t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));
+      
+      dout(10) << *pg << " is new" << endl;
+    
+      // kick any waiters
+      if (waiting_for_pg.count(pgid)) {
+        take_waiters(waiting_for_pg[pgid]);
+        waiting_for_pg.erase(pgid);
+      }
+      // lock the freshly created pg so both branches reach the common code below with the lock held
+      _lock_pg(pgid);
+    } else {
+      // already had it.  am i (still) the primary?
+      pg = _lock_pg(pgid);
+      if (m->get_epoch() < pg->info.history.same_primary_since) {
+        dout(10) << *pg << " handle_pg_notify primary changed in "
+                 << pg->info.history.same_primary_since
+                 << " (msg from " << m->get_epoch() << ")" << endl;
+        _unlock_pg(pgid);
+        continue;
+      }
+    }
+
+    // ok!  pg is locked from here to the _unlock_pg at the bottom of the loop body
+    
+    // stray?  i.e. the sender is not in the acting set but reports it was started at some epoch
+    bool acting = pg->is_acting(from);
+    if (!acting && (*it).last_epoch_started > 0) {
+      dout(10) << *pg << " osd" << from << " has stray content: " << *it << endl;
+      pg->stray_set.insert(from);
+      pg->state_clear(PG::STATE_CLEAN);
+    }
+
+    // save info.  'had' remembers whether this peer had already reported before it is overwritten
+    bool had = pg->peer_info.count(from);
+    pg->peer_info[from] = *it;
+
+    if (had) {   // repeat notify from this peer
+      if (pg->is_active() && 
+          (*it).is_clean() && acting) {
+        pg->clean_set.insert(from);
+        dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set  
+                 << "): " << *it << endl;
+        if (pg->is_all_clean()) {
+          dout(-10) << *pg << " now clean on all replicas" << endl;
+          pg->state_set(PG::STATE_CLEAN);
+          pg->clean_replicas();
+        }
+      } else {
+        // hmm, maybe keep an eye out for cases where we see this, but peer should happen.
+        dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << endl;
+      }
+    } else {   // first notify from this peer
+      // adjust prior?  only when the peer reports a later last_epoch_started than any seen so far
+      if (it->last_epoch_started > pg->last_epoch_started_any) 
+        pg->adjust_prior();
+      
+      // peer
+      pg->peer(t, query_map);
+    }
+
+    _unlock_pg(pgid);
+  }
+  // apply everything queued above; a non-zero result is treated as fatal
+  unsigned tr = store->apply_transaction(t);
+  assert(tr == 0);
+
+  do_queries(query_map);
+  
+  delete m;
+}
+
+
+
+/** PGLog
+ * from non-primary to primary
+ *  includes log and info
+ * from primary to non-primary
+ *  includes log for use in recovery
+ * NOTE: called with opqueue active.
+ *
+ * Consumes m: it is either handed to require_same_or_newer_map() (which
+ * deletes or queues it on failure) or deleted here.  The pg lock taken by
+ * this function is released on every path that returns after taking it.
+ */
+
+void OSD::handle_pg_log(MOSDPGLog *m) 
+{
+  int from = MSG_ADDR_NUM(m->get_source());
+  const pg_t pgid = m->get_pgid();
+
+  if (!require_same_or_newer_map(m, m->get_epoch())) return;
+  if (pg_map.count(pgid) == 0) {
+    dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << endl;
+    assert(m->get_epoch() < osdmap->get_epoch());
+    delete m;
+    return;
+  }
+
+  PG *pg = _lock_pg(pgid);
+  assert(pg);
+
+  if (m->get_epoch() < pg->info.history.same_since) {
+    dout(10) << "handle_pg_log " << *pg 
+            << " from " << m->get_source() 
+            << " is old, discarding"
+            << endl;
+    // release the lock taken above; returning with it held would leave
+    // the pg locked after this stale message is dropped.
+    _unlock_pg(pgid);
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_pg_log " << *pg 
+          << " got " << m->log << " " << m->missing
+          << " from " << m->get_source() << endl;
+
+  //m->log.print(cout);
+  
+  ObjectStore::Transaction t;
+
+  if (pg->is_primary()) {
+    // i am PRIMARY.  we must have asked this peer for its log or summary.
+    assert(pg->peer_log_requested.count(from) ||
+           pg->peer_summary_requested.count(from));
+    
+    pg->proc_replica_log(m->log, m->missing, from);
+
+    // peer, and send whatever further queries that produces
+    map< int, map<pg_t,PG::Query> > query_map;
+    pg->peer(t, query_map);
+    do_queries(query_map);
+
+  } else {
+    // i am REPLICA
+    dout(10) << *pg << " got " << m->log << " " << m->missing << endl;
+
+    // merge log
+    pg->merge_log(m->log, m->missing, from);
+    pg->proc_missing(m->log, m->missing, from);
+    assert(pg->missing.num_lost() == 0);
+
+    // ok activate!
+    pg->activate(t);
+  }
+
+  // a non-zero result from the store is treated as fatal
+  unsigned tr = store->apply_transaction(t);
+  assert(tr == 0);
+
+  _unlock_pg(pgid);
+
+  delete m;
+}
+
+
+/** PGQuery
+ * from primary to replica | stray
+ * NOTE: called with opqueue active.
+ *
+ * For each queried PG, either queue our PG::Info for a single notify back
+ * to the sender (INFO queries) or send an MOSDPGLog carrying info, missing
+ * and the requested flavour of log.  A PG we do not have is created when
+ * the current map gives us a role > 0 for it; when our role is negative an
+ * empty PG::Info is returned instead.  Consumes m.
+ */
+void OSD::handle_pg_query(MOSDPGQuery *m) 
+{
+  dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << endl;
+  int from = MSG_ADDR_NUM(m->get_source());
+  
+  // on failure the helper has already deleted or queued m
+  if (!require_same_or_newer_map(m, m->get_epoch())) return;
+
+  map< int, list<PG::Info> > notify_list;
+  
+  for (map<pg_t,PG::Query>::iterator it = m->pg_list.begin();
+       it != m->pg_list.end();
+       it++) {
+    pg_t pgid = it->first;
+    PG *pg = 0;
+
+    if (pg_map.count(pgid) == 0) {
+      // same acting set?  project the sender's history to our current epoch.
+      PG::Info::History history = it->second.history;
+      project_pg_history(pgid, history, m->get_epoch());
+
+      if (m->get_epoch() < history.same_since) {
+        // report the epoch that was actually tested (same_since)
+        dout(10) << " pg " << pgid << " dne, and pg has changed in "
+                 << history.same_since << " (msg from " << m->get_epoch() << ")" << endl;
+        continue;
+      }
+
+      // get active rush mapping
+      vector<int> acting;
+      int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+      int role = osdmap->calc_pg_role(whoami, acting, nrep);
+
+      if (role < 0) {
+        dout(10) << " pg " << pgid << " dne, and i am not an active replica" << endl;
+        PG::Info empty(pgid);
+        notify_list[from].push_back(empty);
+        continue;
+      }
+      assert(role > 0);   // role 0 is not expected on the receiving side of a query
+
+      // create the pg and persist its info right away, in its own transaction
+      ObjectStore::Transaction t;
+      pg = create_pg(pgid, t);
+      pg->acting.swap( acting );
+      pg->set_role(role);
+      pg->info.history = history;
+
+      t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));
+      store->apply_transaction(t);
+
+      dout(10) << *pg << " dne (before), but i am role " << role << endl;
+      _lock_pg(pgid);
+    } else {
+      pg = _lock_pg(pgid);
+      
+      // same primary?
+      if (m->get_epoch() < pg->info.history.same_since) {
+        dout(10) << *pg << " handle_pg_query primary changed in "
+                 << pg->info.history.same_since
+                 << " (msg from " << m->get_epoch() << ")" << endl;
+        _unlock_pg(pgid);
+        continue;
+      }
+    }
+
+    // ok, process query!  pg is locked from here to the end of the loop body.
+    assert(!pg->acting.empty());
+    assert(from == pg->acting[0]);
+
+    if (it->second.type == PG::Query::INFO) {
+      // info
+      dout(10) << *pg << " sending info" << endl;
+      notify_list[from].push_back(pg->info);
+    } else {
+      // named 'mlog' so it does not shadow the incoming query 'm', which is
+      // still needed (and deleted) after the loop.
+      MOSDPGLog *mlog = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid());
+      mlog->info = pg->info;
+      mlog->missing = pg->missing;
+
+      if (it->second.type == PG::Query::LOG) {
+        dout(10) << *pg << " sending info+missing+log since split " << it->second.split
+                 << " from floor " << it->second.floor 
+                 << endl;
+        if (!mlog->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) {
+          // fall through to the BACKLOG handling below
+          dout(10) << *pg << "  divergent, sending backlog" << endl;
+          it->second.type = PG::Query::BACKLOG;
+        }
+      }
+
+      if (it->second.type == PG::Query::BACKLOG) {
+        dout(10) << *pg << " sending info+missing+backlog" << endl;
+        if (pg->log.backlog) {
+          mlog->log = pg->log;
+        } else {
+          // generate the backlog just long enough to copy it into the reply
+          pg->generate_backlog();
+          mlog->log = pg->log;
+          pg->drop_backlog();
+        }
+      } 
+      else if (it->second.type == PG::Query::FULLLOG) {
+        dout(10) << *pg << " sending info+missing+full log" << endl;
+        mlog->log.copy_non_backlog(pg->log);
+      }
+
+      dout(10) << *pg << " sending " << mlog->log << " " << mlog->missing << endl;
+      //mlog->log.print(cout);
+
+      _share_map_outgoing(MSG_ADDR_OSD(from), osdmap->get_inst(from));
+      messenger->send_message(mlog, MSG_ADDR_OSD(from), osdmap->get_inst(from));
+    }    
+
+    _unlock_pg(pgid);
+  }
+  
+  do_notifies(notify_list);   
+
+  delete m;
+}
+
+
+/** PGRemove
+ * Remove every PG named in the message that we actually have.  Each such
+ * PG is locked, asserted to have role -1, removed, and unlocked again.
+ * Consumes m.
+ */
+void OSD::handle_pg_remove(MOSDPGRemove *m)
+{
+  dout(7) << "handle_pg_remove from " << m->get_source() << endl;
+  
+  if (!require_same_or_newer_map(m, m->get_epoch())) return;
+
+  set<pg_t>::iterator p = m->pg_list.begin();
+  for ( ; p != m->pg_list.end(); ++p) {
+    pg_t pgid = *p;
+
+    if (pg_map.count(pgid) != 0) {
+      PG *victim = _lock_pg(pgid);
+
+      dout(10) << *victim << " removing." << endl;
+      assert(victim->get_role() == -1);
+
+      _remove_pg(pgid);
+
+      // unlock.  there shouldn't be any waiters, since we're a stray, and pg is presumably clean.
+      assert(pg_lock_waiters.count(pgid) == 0);
+      _unlock_pg(pgid);
+    } else {
+      dout(10) << " don't have pg " << pgid << endl;
+    }
+  }
+
+  delete m;
+}
+
+
+
+
+
+
+/*** RECOVERY ***/
+
+/** pull - request object from a peer
+ * The peer and the wanted version are taken from pg->missing; oid must have
+ * a known location there and must not already be in objects_pulling (both
+ * asserted).  Sends an OSD_OP_PULL and records the pull in the pg.
+ */
+void OSD::pull(PG *pg, object_t oid)
+{
+  assert(pg->missing.loc.count(oid));
+  eversion_t want = pg->missing.missing[oid];
+  int source = pg->missing.loc[oid];
+  
+  dout(7) << *pg << " pull " << oid
+          << " v " << want 
+          << " from osd" << source
+          << endl;
+
+  // build and send the pull request
+  tid_t tid = ++last_tid;
+  MOSDOp *req = new MOSDOp(tid, messenger->get_myaddr(),
+                           oid, pg->get_pgid(),
+                           osdmap->get_epoch(),
+                           OSD_OP_PULL);
+  req->set_version(want);
+  messenger->send_message(req, MSG_ADDR_OSD(source), osdmap->get_inst(source));
+  
+  // remember that this object is in flight
+  assert(pg->objects_pulling.count(oid) == 0);
+  num_pulling++;
+  pg->objects_pulling[oid] = want;
+}
+
+
+/** push - send object to a peer
+ * Reads the object's data, its "version" attribute and its full attribute
+ * set from the store in one transaction (failure is asserted), then sends
+ * them to osd 'dest' as an OSD_OP_PUSH.
+ */
+void OSD::push(PG *pg, object_t oid, int dest)
+{
+  // what we are going to ship
+  bufferlist data;
+  eversion_t ver;
+  int ver_len = sizeof(ver);
+  map<string,bufferptr> attrs;
+  
+  // read data+attrs
+  ObjectStore::Transaction rd;
+  rd.read(oid, 0, 0, &data);
+  rd.getattr(oid, "version", &ver, &ver_len);
+  rd.getattrs(oid, attrs);
+  unsigned rc = store->apply_transaction(rd);
+  
+  assert(rc == 0);  // !!!
+
+  // ok
+  dout(7) << *pg << " push " << oid << " v " << ver 
+          << " size " << data.length()
+          << " to osd" << dest
+          << endl;
+
+  logger->inc("r_push");
+  logger->inc("r_pushb", data.length());
+  
+  // send
+  MOSDOp *msg = new MOSDOp(++last_tid, MSG_ADDR_OSD(whoami),
+                           oid, pg->info.pgid, osdmap->get_epoch(), 
+                           OSD_OP_PUSH); 
+  msg->set_offset(0);
+  msg->set_length(data.length());   // must precede set_data(): that call claims the buffer
+  msg->set_data(data);
+  msg->set_version(ver);
+  msg->set_attrset(attrs);
+  
+  messenger->send_message(msg, MSG_ADDR_OSD(dest), osdmap->get_inst(dest));
+}
+
+
+/** op_pull
+ * process request to pull an entire object.
+ * NOTE: called from opqueue.
+ *
+ * Consumes op: it is deleted here on every path, except when
+ * waitfor_missing_object() returns true (presumably having queued op for
+ * later -- TODO confirm against that helper).
+ */
+void OSD::op_pull(MOSDOp *op, PG *pg)
+{
+  const object_t oid = op->get_oid();
+  int from = op->get_source().num();
+
+  dout(7) << *pg << " op_pull " << oid << " v " << op->get_version()
+          << " from " << op->get_source()
+          << endl;
+
+  // is a replica asking?  are they missing it?
+  if (pg->is_primary()) {
+    // primary
+    assert(pg->peer_missing.count(from));  // we had better know this, from the peering process.
+
+    if (!pg->peer_missing[from].is_missing(oid)) {
+      dout(7) << *pg << " op_pull replica isn't actually missing it, we must have already pushed to them" << endl;
+      delete op;
+      return;
+    }
+
+    // do we have it yet?
+    if (waitfor_missing_object(op, pg))
+      return;
+  } else {
+    // non-primary
+    if (pg->missing.is_missing(oid)) {
+      dout(7) << *pg << " op_pull not primary, and missing " << oid << ", ignoring" << endl;
+      delete op;
+      return;
+    }
+  }
+    
+  // push it back!
+  push(pg, oid, from);
+
+  // request served.  push() is given only pg/oid/dest, so op is no longer
+  // needed; free it like the other terminal paths above do.
+  delete op;
+}
+
+
+/** op_push
+ * process a pushed object: write it locally, mark it no longer missing,
+ * advance last_complete, forward it to replicas that still miss it (when we
+ * are primary), continue recovery and wake ops waiting for the object.
+ * NOTE: called from opqueue.
+ *
+ * Consumes op on every path.
+ */
+void OSD::op_push(MOSDOp *op, PG *pg)
+{
+  object_t oid = op->get_oid();
+  eversion_t v = op->get_version();
+
+  if (!pg->missing.is_missing(oid)) {
+    dout(7) << *pg << " op_push not missing " << oid << endl;
+    delete op;   // nothing to do with it; don't leak the message
+    return;
+  }
+  
+  dout(7) << *pg << " op_push " 
+          << oid 
+          << " v " << v 
+          << " size " << op->get_length() << " " << op->get_data().length()
+          << endl;
+
+  assert(op->get_data().length() == op->get_length());
+  
+  // write object and add it to the PG
+  ObjectStore::Transaction t;
+  t.remove(oid);  // in case old version exists
+  t.write(oid, 0, op->get_length(), op->get_data());
+  t.setattrs(oid, op->get_attrset());
+  t.collection_add(pg->info.pgid, oid);
+
+  // close out pull op?
+  // NOTE(review): num_pulling is decremented even when oid is not in
+  // objects_pulling (i.e. a push we never pulled for) -- confirm intended.
+  num_pulling--;
+  if (pg->objects_pulling.count(oid))
+    pg->objects_pulling.erase(oid);
+  pg->missing.got(oid, v);
+
+
+  // raise last_complete?  walk complete_to forward over log entries whose
+  // objects are no longer missing.
+  assert(pg->log.complete_to != pg->log.log.end());
+  while (pg->log.complete_to != pg->log.log.end()) {
+    if (pg->missing.missing.count(pg->log.complete_to->oid)) break;
+    if (pg->info.last_complete < pg->log.complete_to->version)
+      pg->info.last_complete = pg->log.complete_to->version;
+    pg->log.complete_to++;
+  }
+  dout(10) << *pg << " last_complete now " << pg->info.last_complete << endl;
+  
+  
+  // apply to disk!
+  t.collection_setattr(pg->info.pgid, "info", &pg->info, sizeof(pg->info));
+  unsigned r = store->apply_transaction(t);
+  assert(r == 0);
+
+
+
+  // am i primary?  are others missing this too?
+  if (pg->is_primary()) {
+    for (unsigned i=1; i<pg->acting.size(); i++) {
+      int peer = pg->acting[i];
+      assert(pg->peer_missing.count(peer));
+      if (pg->peer_missing[peer].is_missing(oid)) {
+        // ok, push it, and they (will) have it now.
+        pg->peer_missing[peer].got(oid, v);
+        push(pg, oid, peer);
+      }
+    }
+  }
+
+  // continue recovery
+  pg->do_recovery();
+  
+  // kick waiters, and drop the now-served entry (same take-then-erase
+  // pattern as used for waiting_for_pg).
+  if (pg->waiting_for_missing_object.count(oid)) {
+    take_waiters(pg->waiting_for_missing_object[oid]);
+    pg->waiting_for_missing_object.erase(oid);
+  }
+
+  delete op;
+}
+
+
+
+
+// op_rep_modify
+//
+// commit (to disk) callback for a replicated modify applied on a non-acker: finish() sends the commit to destosd via OSD::op_rep_modify_commit, but only after ack() has been called.
+class C_OSD_RepModifyCommit : public Context {
+public:
+  OSD *osd;
+  MOSDOp *op;
+  int destosd;            // osd the commit is sent to
+  // pg last_complete value captured at construction; passed along with the commit
+  eversion_t pg_last_complete;
+  // lock/cond protect the acked/waiting handshake between finish() and ack()
+  Mutex lock;
+  Cond cond;
+  bool acked;             // set once ack() has run
+  bool waiting;           // set while finish() is blocked waiting for ack()
+  // both flags start false
+  C_OSD_RepModifyCommit(OSD *o, MOSDOp *oo, int dosd, eversion_t lc) : 
+    osd(o), op(oo), destosd(dosd), pg_last_complete(lc),
+    acked(false), waiting(false) { }
+  void finish(int r) {    // r is not used
+    lock.Lock();
+    assert(!waiting);
+    while (!acked) {      // block until ack() has been called
+      waiting = true;
+      cond.Wait(lock);
+    }
+    assert(acked);
+    lock.Unlock();
+    osd->op_rep_modify_commit(op, destosd, pg_last_complete);
+  }
+  void ack() {            // must be called exactly once (asserted)
+    lock.Lock();
+    assert(!acked);
+    acked = true;
+    if (waiting) cond.Signal();
+    // still under the lock here, so a finish() woken above cannot proceed until the Unlock below
+    // discard my reference to buffer
+    op->get_data().clear();
+    // release; finish() may now run to completion
+    lock.Unlock();
+  }
+};
+
+/** op_rep_modify_commit
+ * Send the commit reply for a replicated modify to osd 'ackerosd',
+ * stamping it with 'last_complete', then free the op.
+ */
+void OSD::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete)
+{
+  dout(10) << "rep_modify_commit on op " << *op
+           << ", sending commit to osd" << ackerosd
+           << endl;
+
+  // build the reply (last constructor argument is true for the commit)
+  MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true);
+  reply->set_pg_complete_thru(last_complete);
+
+  messenger->send_message(reply, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd));
+
+  // we own the op; done with it.
+  delete op;
+}
+
+// process a modification operation
+
+// commit callback for a modify: forwards to OSD::op_modify_commit with the
+// pg id, rep tid and last_complete value captured at construction.
+class C_OSD_WriteCommit : public Context {
+public:
+  OSD *osd;
+  pg_t pgid;
+  tid_t rep_tid;
+  eversion_t pg_last_complete;
+
+  C_OSD_WriteCommit(OSD *owner, pg_t pg, tid_t tid, eversion_t last_complete)
+    : osd(owner),
+      pgid(pg),
+      rep_tid(tid),
+      pg_last_complete(last_complete)
+  {}
+
+  // the result code is not used.
+  void finish(int r) {
+    osd->op_modify_commit(pgid, rep_tid, pg_last_complete);
+  }
+};
+
+
+/** op_rep_modify
+ * process a replicated modify.  On the acker (chain/splay modes) the change is queued on the repop gather's transaction and applied later; elsewhere it is applied locally right away with a commit callback.
+ * NOTE: called from opqueue.
+ */
+void OSD::op_rep_modify(MOSDOp *op, PG *pg)
+{ 
+  object_t oid = op->get_oid();
+  eversion_t nv = op->get_version();
+  // opname is only used for the debug line below
+  const char *opname = MOSDOp::get_opname(op->get_op());
+
+  // check crev: stays 0 if the getattr does not fill it in (the call's result is not checked)
+  objectrev_t crev = 0;
+  store->getattr(oid, "crev", (char*)&crev, sizeof(crev));
+
+  dout(10) << "op_rep_modify " << opname 
+           << " " << oid 
+           << " v " << nv 
+           << " " << op->get_offset() << "~" << op->get_length()
+           << " in " << *pg
+           << endl;  
+  
+  // we better not be missing this.
+  assert(!pg->missing.is_missing(oid));
+
+  // prepare our transaction (only used on the non-acker path below)
+  ObjectStore::Transaction t;
+
+  // am i acker?  repop stays 0 unless we are; ackerosd defaults to the first acting osd
+  PG::RepOpGather *repop = 0;
+  int ackerosd = pg->acting[0];
+
+  if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) {
+    ackerosd = pg->get_acker();
+  
+    if (pg->is_acker()) {
+      // i am tail acker.  reuse the gather for this rep tid if one exists, else start one.
+      if (pg->repop_gather.count(op->get_rep_tid())) {
+        repop = pg->repop_gather[ op->get_rep_tid() ];
+      } else {
+        repop = new_repop_gather(pg, op);
+      }
+      
+      // infer ack from source: receiving the op from fromosd counts as that osd's ack
+      int fromosd = op->get_source().num();
+      get_repop_gather(repop);
+      {
+        //assert(repop->waitfor_ack.count(fromosd));   // no, we may come thru here twice.
+        repop->waitfor_ack.erase(fromosd);
+      }
+      put_repop_gather(pg, repop);
+
+      // prepare dest socket
+      //messenger->prepare_send_message(op->get_client());
+    }
+
+    // chain?  forward?
+    if (g_conf.osd_rep == OSD_REP_CHAIN && !pg->is_acker()) {
+      // chain rep, not at the tail yet.  forward to the next rank, wrapping to rank 1 past the end.
+      int myrank = osdmap->calc_pg_rank(whoami, pg->acting);
+      int next = myrank+1;
+      if (next == (int)pg->acting.size())
+	next = 1;
+      issue_repop(pg, op, pg->acting[next]);	
+    }
+  }
+
+  // do op?  oncommit is only created on the non-acker path
+  C_OSD_RepModifyCommit *oncommit = 0;
+
+  logger->inc("r_wr");
+  logger->inc("r_wrb", op->get_length());
+  
+  if (repop) {
+    // acker.  we'll apply later.  (WRNOOP ops queue nothing.)
+    if (op->get_op() != OSD_OP_WRNOOP) {
+      prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to());
+      prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg);
+    }
+  } else {
+    // middle|replica.  build the local transaction (nothing for WRNOOP).
+    if (op->get_op() != OSD_OP_WRNOOP) {
+      prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to());
+      prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg);
+    }
+    // the commit callback will send the commit to ackerosd once ack() has been called on it
+    oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, pg->info.last_complete);
+
+    // apply log update. and possibly update itself.
+    unsigned tr = store->apply_transaction(t, oncommit);
+    if (tr != 0 &&   // no errors
+        tr != 2) {   // or error on collection_add
+      cerr << "error applying transaction: r = " << tr << endl;
+      assert(tr == 0);
+    }
+  }
+  
+  // ack?
+  if (repop) {
+    // (logical) local ack.  this may induce the actual update.
+    get_repop_gather(repop);
+    {
+      assert(repop->waitfor_ack.count(whoami));
+      repop->waitfor_ack.erase(whoami);
+    }
+    put_repop_gather(pg, repop);
+  } 
+  else {
+    // send ack to acker?  (not in chain mode)
+    if (g_conf.osd_rep != OSD_REP_CHAIN) {
+      MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false);
+      messenger->send_message(ack, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd));
+    }
+
+    // ack myself.  this releases the commit callback, which blocks in finish() until acked.
+    assert(oncommit);
+    oncommit->ack(); 
+  }
+}
+
+
+// =========================================================
+// OPS
+
+void OSD::handle_op(MOSDOp *op)
+{
+  const pg_t pgid = op->get_pg();
+  PG *pg = get_pg(pgid);
+
+
+  logger->set("buf", buffer_total_alloc);
+
+  // update qlen stats
+  hb_stat_ops++;
+  hb_stat_qlen += pending_ops;
+
+
+  // require same or newer map
+  if (!require_same_or_newer_map(op, op->get_map_epoch())) return;
+
+  // share our map with sender, if they're old
+  _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch());
+
+  // what kind of op?
+  bool read = op->get_op() < 10;   // read, stat.  but not pull.
+
+  if (!op->get_source().is_osd()) {
+    // REGULAR OP (non-replication)
+
+    // note original source
+    op->set_client_inst( op->get_source_inst() );
+    op->clear_payload();    // and hose encoded payload (in case we forward)
+
+    // have pg?
+    if (!pg) {
+      dout(7) << "hit non-existent pg " 
+              << pgid 
+              << ", waiting" << endl;
+      waiting_for_pg[pgid].push_back(op);
+      return;
+    }
+    
+    if (read) {
+      // read. am i the (same) acker?
+      if (//pg->get_acker() != whoami ||
+          op->get_map_epoch() < pg->info.history.same_acker_since) {
+        dout(7) << "acting acker is osd" << pg->get_acker()
+                << " since " << pg->info.history.same_acker_since 
+                << ", dropping" << endl;
+        assert(op->get_map_epoch() < osdmap->get_epoch());
+        delete op;
+        return;
+      }
+    } else {
+      // write. am i the (same) primary?
+      if (pg->get_primary() != whoami ||
+          op->get_map_epoch() < pg->info.history.same_primary_since) {
+        dout(7) << "acting primary is osd" << pg->get_primary()
+                << " since " << pg->info.history.same_primary_since 
+                << ", dropping" << endl;
+        assert(op->get_map_epoch() < osdmap->get_epoch());
+        delete op;
+        return;
+      }
+    }
+    
+    // must be active.
+    if (!pg->is_active()) {
+      // replay?
+      if (op->get_version().version > 0) {
+        if (op->get_version() > pg->info.last_update) {
+          dout(7) << *pg << " queueing replay at " << op->get_version()
+                  << " for " << *op << endl;
+          pg->replay_queue[op->get_version()] = op;
+          return;
+        } else {
+          dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update 
+                  << " for " << *op
+                  << ", will queue for WRNOOP" << endl;
+        }
+      }
+      
+      dout(7) << *pg << " not active (yet)" << endl;
+      pg->waiting_for_active.push_back(op);
+      return;
+    }
+    
+    // missing object?
+    if (read && op->get_oid().rev > 0) {
+      // versioned read.  hrm.
+      // are we missing a revision that we might need?
+      object_t moid = op->get_oid();
+      if (pick_missing_object_rev(moid, pg)) {
+	// is there a local revision we might use instead?
+	object_t loid = op->get_oid();
+	if (store->pick_object_revision_lt(loid) &&
+	    moid <= loid) {
+	  // we need moid.  pull it.
+	  dout(10) << "handle_op read on " << op->get_oid()
+		   << ", have " << loid
+		   << ", but need missing " << moid
+		   << ", pulling" << endl;
+	  pull(pg, moid);
+	  pg->waiting_for_missing_object[moid].push_back(op);
+	  return;
+	} 
+	  
+	dout(10) << "handle_op read on " << op->get_oid()
+		 << ", have " << loid
+		 << ", don't need missing " << moid 
+		 << endl;
+      }
+    } else {
+      // live revision.  easy.
+      if (op->get_op() != OSD_OP_PUSH &&
+	  waitfor_missing_object(op, pg)) return;
+    }
+
+    dout(7) << "handle_op " << *op << " in " << *pg << endl;
+    
+    
+    // balance reads?
+    if (read &&
+	g_conf.osd_balance_reads &&
+	pg->get_acker() == whoami) {
+      // test
+      if (false) {
+	if (pg->acting.size() > 1) {
+	  int peer = pg->acting[1];
+	  dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl;
+	  messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer));
+	  return;
+	}
+      }
+      
+      // am i above my average?
+      float my_avg = hb_stat_qlen / hb_stat_ops;
+      if (pending_ops > my_avg) {
+	// is there a peer who is below my average?
+	for (unsigned i=1; i<pg->acting.size(); ++i) {
+	  int peer = pg->acting[i];
+	  if (peer_qlen.count(peer) &&
+	      peer_qlen[peer] < my_avg) {
+	    // calculate a probability that we should redirect
+	    float p = (my_avg - peer_qlen[peer]) / my_avg;             // this is dumb.
+	    
+	    if (drand48() <= p) {
+	      // take the first one
+	      dout(-10) << "my qlen " << pending_ops << " > my_avg " << my_avg
+			<< ", p=" << p 
+			<< ", fwd to peer w/ qlen " << peer_qlen[peer]
+			<< " osd" << peer
+			<< endl;
+	      messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer));
+	      return;
+	    }
+	  }
+	}
+      }
+    }
+
+  } else {
+    // REPLICATION OP (it's from another OSD)
+
+    // have pg?
+    if (!pg) {
+      derr(-7) << "handle_rep_op " << *op 
+               << " pgid " << pgid << " dne" << endl;
+      delete op;
+      //assert(0); // wtf, shouldn't happen.
+      return;
+    }
+    
+    // check osd map: same set, or primary+acker?
+    if (g_conf.osd_rep == OSD_REP_CHAIN &&
+        op->get_map_epoch() < pg->info.history.same_since) {
+      dout(10) << "handle_rep_op pg changed " << pg->info.history
+               << " after " << op->get_map_epoch() 
+               << ", dropping" << endl;
+      delete op;
+      return;
+    }
+    if (g_conf.osd_rep != OSD_REP_CHAIN &&
+        (op->get_map_epoch() < pg->info.history.same_primary_since ||
+         op->get_map_epoch() < pg->info.history.same_acker_since)) {
+      dout(10) << "handle_rep_op pg primary|acker changed " << pg->info.history
+               << " after " << op->get_map_epoch() 
+               << ", dropping" << endl;
+      delete op;
+      return;
+    }
+
+    assert(pg->get_role() >= 0);
+    dout(7) << "handle_rep_op " << op << " in " << *pg << endl;
+  }
+  
+  if (g_conf.osd_maxthreads < 1) {
+    _lock_pg(pgid);
+    do_op(op, pg); // do it now
+    _unlock_pg(pgid);
+  } else {
+    // queue for worker threads
+    if (read) 
+      enqueue_op(0, op);     // no locking needed for reads
+    else 
+      enqueue_op(pgid, op);     
+  }
+}
+// handle_op_reply - take a reply to a replicated op from another osd; run it now or queue it.
+void OSD::handle_op_reply(MOSDOpReply *op)
+{
+  if (op->get_map_epoch() < boot_epoch) {
+    dout(3) << "replica op reply from before boot" << endl;
+    delete op;
+    return;
+  }
+
+  // must be a rep op.
+  assert(op->get_source().is_osd());
+  
+  // make sure we have the pg
+  const pg_t pgid = op->get_pg();
+  PG *pg = get_pg(pgid);
+
+  // require same or newer map
+  if (!require_same_or_newer_map(op, op->get_map_epoch())) return;
+
+  // share our map with sender, if they're old
+  _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch());
+
+  if (!pg) {
+    delete op;   // hmm.  no such pg here: drop the reply...
+    return;      // ...and stop; falling through would pass the freed op and a null pg on
+  }
+
+  if (g_conf.osd_maxthreads < 1) {
+    _lock_pg(pgid);
+    do_op(op, pg); // do it now
+    _unlock_pg(pgid);
+  } else {
+    enqueue_op(pgid, op);     // queue for worker threads
+  }
+}
+
+/*
+ * enqueue_op - append op to the queue for pgid and tell the thread pool about it.
+ * called with osd_lock held; waits on op_queue_cond while more than osd_max_opq ops are pending.
+ */
+void OSD::enqueue_op(pg_t pgid, Message *op)
+{
+  for (;;) {
+    if (!(pending_ops > g_conf.osd_max_opq)) break;   // back within the limit
+    dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << endl;
+    op_queue_cond.Wait(osd_lock);
+  }
+
+  ++pending_ops;
+  op_queue[pgid].push_back(op);
+  logger->set("opq", pending_ops);
+  threadpool->put_op(pgid);
+}
+/*
+ * dequeue_op - pop the next op queued under pgid, run it via do_op, then update the counters.
+ * NOTE: called in worker thread, without osd_lock (taken here, but released while do_op runs)
+ */
+void OSD::dequeue_op(pg_t pgid)
+{
+  Message *op = 0;
+  PG *pg = 0;
+
+  osd_lock.Lock();
+  {
+    if (pgid) {   // handle_op queues reads under pgid 0, with no pg lock
+      // lock pg
+      pg = _lock_pg(pgid);  
+    }
+
+    // get pending op
+    list<Message*> &ls  = op_queue[pgid];
+    assert(!ls.empty());
+    op = ls.front();
+    ls.pop_front();
+    
+    if (pgid) {
+      dout(10) << "dequeue_op " << op << " write pg " << pgid << " "
+               << ls.size() << " / " << (pending_ops-1) << " more pending" << endl;
+    } else {
+      dout(10) << "dequeue_op " << op << " read "
+               << ls.size() << " / " << (pending_ops-1) << " more pending" << endl;
+    }
+    
+    if (ls.empty())
+      op_queue.erase(pgid);   // ls dangles from here on; it is not used again
+  }
+  osd_lock.Unlock();
+
+  // do it
+  do_op(op, pg);
+
+  // finish
+  osd_lock.Lock();
+  {
+    if (pgid) {
+      // unlock pg
+      _unlock_pg(pgid);
+    }
+    
+    dout(10) << "dequeue_op " << op << " finish" << endl;
+    assert(pending_ops > 0);
+    
+    if (pending_ops > g_conf.osd_max_opq)   // enqueue_op may be throttled on this cond
+      op_queue_cond.Signal();
+    
+    pending_ops--;
+    logger->set("opq", pending_ops);
+    if (pending_ops == 0 && waiting_for_no_ops)
+      no_pending_ops.Signal();
+  }
+  osd_lock.Unlock();
+}
+
+
+
+/** do_op - run one message: MSG_OSD_OP by opcode, MSG_OSD_OPREPLY as a replica ack/commit
+ * object lock will be held (if multithreaded)
+ * osd_lock NOT held.
+ */
+void OSD::do_op(Message *m, PG *pg) 
+{
+  //dout(15) << "do_op " << *m << endl;
+
+  if (m->get_type() == MSG_OSD_OP) {
+    MOSDOp *op = (MOSDOp*)m;
+
+    logger->inc("op");
+
+    switch (op->get_op()) {
+      
+      // reads
+    case OSD_OP_READ:
+      op_read(op);//, pg);
+      break;
+    case OSD_OP_STAT:
+      op_stat(op);//, pg);
+      break;
+      
+      // rep stuff
+    case OSD_OP_PULL:
+      op_pull(op, pg);
+      break;
+    case OSD_OP_PUSH:
+      op_push(op, pg);
+      break;
+      
+      // writes
+    case OSD_OP_WRNOOP:
+    case OSD_OP_WRITE:
+    case OSD_OP_ZERO:
+    case OSD_OP_DELETE:
+    case OSD_OP_TRUNCATE:
+    case OSD_OP_WRLOCK:
+    case OSD_OP_WRUNLOCK:
+    case OSD_OP_RDLOCK:
+    case OSD_OP_RDUNLOCK:
+    case OSD_OP_UPLOCK:
+    case OSD_OP_DNLOCK:
+      if (op->get_source().is_osd())    // sent by another osd: replica-side modify
+        op_rep_modify(op, pg);
+      else
+        op_modify(op, pg);
+      break;
+      
+    default:
+      assert(0);
+    }
+  } 
+  else if (m->get_type() == MSG_OSD_OPREPLY) {
+    // must be replication.
+    MOSDOpReply *r = (MOSDOpReply*)m;
+    tid_t rep_tid = r->get_rep_tid();
+  
+    if (pg->repop_gather.count(rep_tid)) {
+      // oh, good.
+      int fromosd = r->get_source().num();
+      repop_ack(pg, pg->repop_gather[rep_tid], 
+                r->get_result(), r->get_commit(), 
+                fromosd, 
+                r->get_pg_complete_thru());
+      delete m;
+    } else {
+      // early ack.  keep it until new_repop_gather registers this rep_tid.
+      pg->waiting_for_repop[rep_tid].push_back(r);
+    }
+
+  } else
+    assert(0);
+}
+// wait_for_no_ops - wait on no_pending_ops (with osd_lock) until pending_ops reaches zero.
+// waiting_for_no_ops is set meanwhile so that dequeue_op signals the condition.
+void OSD::wait_for_no_ops()
+{
+  if (pending_ops > 0) {
+    dout(7) << "wait_for_no_ops - waiting for " << pending_ops << endl;
+    waiting_for_no_ops = true;
+    do {
+      no_pending_ops.Wait(osd_lock);
+    } while (pending_ops > 0);
+    waiting_for_no_ops = false;
+    assert(pending_ops == 0);
+  }
+  dout(7) << "wait_for_no_ops - none" << endl;
+}
+
+
+// ==============================
+// Object locking
+
+//
+// block_if_wrlocked: if the op's target object has a "wrlock" attr naming a different client,
+// push op onto waiting_for_wr_unlock[oid] and return true; otherwise return false.
+// 
+bool OSD::block_if_wrlocked(MOSDOp* op)
+{
+  object_t target = op->get_oid();
+
+  msg_addr_t locker;
+  int got = store->getattr(target, "wrlock", &locker, sizeof(msg_addr_t));
+  //cout << "getattr returns " << got << " on " << target << endl;
+
+  // no full-sized wrlock attr, or the lock belongs to this op's own client: nothing to wait for
+  const bool held_by_other = (got == sizeof(locker)) && (locker != op->get_client());
+  if (!held_by_other)
+    return false;
+
+  // locked for writing by someone else -- queue the op
+  waiting_for_wr_unlock[target].push_back(op);
+  return true;
+}
+
+
+
+// ===============================
+// OPS
+
+/*
+int OSD::list_missing_revs(object_t oid, set<object_t>& revs, PG *pg)
+{
+  int c = 0;
+  oid.rev = 0;
+  
+  map<object_t,eversion_t>::iterator p = pg->missing.missing.lower_bound(oid);
+  if (p == pg->missing.missing.end()) 
+    return 0;  // clearly not
+
+  while (p->first.ino == oid.ino &&
+	 p->first.bno == oid.bno) {
+    revs.insert(p->first);
+    c++;
+  }
+  return c;
+}*/
+// pick_missing_object_rev - move oid to the next entry on the pg's missing list if it is the same object.
+bool OSD::pick_missing_object_rev(object_t& oid, PG *pg)
+{
+  // first missing entry that sorts strictly after oid
+  map<object_t,eversion_t>::iterator next = pg->missing.missing.upper_bound(oid);
+
+  // a candidate only if it exists and has the same ino + bno
+  if (next == pg->missing.missing.end() ||
+      !(next->first.ino == oid.ino && next->first.bno == oid.bno))
+    return false;
+  oid = next->first;  // yes!  it's an upper bound revision for me.
+  return true;
+}
+// pick_object_rev - replace oid with the revision the store picks, if its "crev" attr is <= oid.rev.
+bool OSD::pick_object_rev(object_t& oid)
+{
+  object_t candidate = oid;
+  if (!store->pick_object_revision_lt(candidate))
+    return false; // we have no revisions of this object!
+
+  // read the candidate's crev; the attr is expected to exist
+  objectrev_t crev;
+  int r = store->getattr(candidate, "crev", &crev, sizeof(crev));
+  assert(r >= 0);
+
+  const bool usable = (crev <= oid.rev);
+  if (usable) {
+    dout(10) << "pick_object_rev choosing " << candidate << " crev " << crev << " for " << oid << endl;
+    oid = candidate;
+  }
+  return usable;
+}
+// waitfor_missing_object - if oid is on the pg's missing list, pull it (unless already pulling) and queue op.
+bool OSD::waitfor_missing_object(MOSDOp *op, PG *pg)
+{
+  const object_t oid = op->get_oid();
+
+  // not missing?  then there is nothing to wait for.
+  if (!pg->missing.missing.count(oid))
+    return false;
+
+  // we don't have it (yet).
+  eversion_t need = pg->missing.missing[oid];
+  const bool pulling = pg->objects_pulling.count(oid);
+  if (!pulling) {
+    dout(7) << "missing " << oid
+            << " v " << need
+            << " in " << *pg
+            << ", pulling"
+            << endl;
+    pull(pg, oid);
+  } else {
+    dout(7) << "missing " << oid
+            << " v " << need
+            << " in " << *pg
+            << ", already pulling"
+            << endl;
+  }
+
+  // queue the op on the pg's per-object waiting list
+  pg->waiting_for_missing_object[oid].push_back(op);
+  return true;
+}
+
+
+
+
+// READ OPS
+
+/** op_read
+ * client read op: read the requested extent from the store and send the reply to the client
+ * NOTE: called from opqueue.
+ */
+void OSD::op_read(MOSDOp *op)//, PG *pg)
+{
+  object_t oid = op->get_oid();
+  
+  // if the target object is locked for writing by another client, put 'op' to the waiting queue
+  // for _any_ op type -- eg only the locker can unlock!
+  if (block_if_wrlocked(op)) return; // op will be handled later, after the object unlocks
+ 
+  dout(10) << "op_read " << oid 
+           << " " << op->get_offset() << "~" << op->get_length() 
+    //<< " in " << *pg 
+           << endl;
+
+  long r = 0;
+  bufferlist bl;
+  
+  // versioned read (rev != 0): pick_object_rev may rewrite oid to the revision to serve
+  if (oid.rev && !pick_object_rev(oid)) {
+    // we have no revision for this request.
+    r = -EEXIST;   // NOTE(review): "nothing to read" reported as EEXIST -- confirm this is intended
+  } else {
+    // read into a buffer
+    r = store->read(oid, 
+		    op->get_offset(), op->get_length(),
+		    bl);
+  }
+  
+  // set up reply
+  MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); 
+  if (r >= 0) {
+    // success: r is used as the reply length
+    reply->set_result(0);
+    reply->set_data(bl);
+    reply->set_length(r);
+      
+    logger->inc("c_rd");
+    logger->inc("c_rdb", r);
+    
+  } else {
+    reply->set_result(r);   // error
+    reply->set_length(0);
+  }
+  
+  dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << endl;
+  
+  logger->inc("rd");
+  if (r >= 0) logger->inc("rdb", r);
+  
+  // send it
+  messenger->send_message(reply, op->get_client(), op->get_client_inst());
+  
+  delete op;
+}
+
+
+/** op_stat
+ * client stat: stat the object (or the chosen revision) and reply with result and size
+ * NOTE: called from opqueue
+ */
+void OSD::op_stat(MOSDOp *op)//, PG *pg)
+{
+  object_t oid = op->get_oid();
+
+  // if the target object is locked for writing by another client, put 'op' to the waiting queue
+  if (block_if_wrlocked(op)) return; //read will be handled later, after the object unlocks
+
+  struct stat st;
+  memset(&st, 0, sizeof(st));   // zero it: st_size is logged and sent even when no stat happens
+  int r = 0;
+
+  if (oid.rev && !pick_object_rev(oid)) {
+    // we have no revision for this request.
+    r = -EEXIST;
+  } else {
+    r = store->stat(oid, &st);
+  }
+  
+  dout(3) << "op_stat on " << oid 
+          << " r = " << r
+          << " size = " << st.st_size
+    //<< " in " << *pg
+          << endl;
+  
+  MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true);
+  reply->set_object_size(st.st_size);
+  messenger->send_message(reply, op->get_client(), op->get_client_inst());
+  
+  logger->inc("stat");
+
+  delete op;
+}
+
+
+
+/*********
+ * new repops
+ */
+// get_repop_gather - opens an update section on a repop; currently it only logs (the lock is commented out).
+void OSD::get_repop_gather(PG::RepOpGather *repop)
+{
+  //repop->lock.Lock();
+  dout(10) << "get_repop " << *repop << endl;
+}
+// apply_repop - apply the gather's transaction to the local store; must run once per repop (asserts !applied).
+void OSD::apply_repop(PG *pg, PG::RepOpGather *repop)
+{
+  dout(10) << "apply_repop  applying update on " << *repop << endl;
+  assert(!repop->applied);
+
+  Context *oncommit = new C_OSD_WriteCommit(this, pg->info.pgid, repop->rep_tid, repop->pg_local_last_complete);
+  unsigned r = store->apply_transaction(repop->t, oncommit);
+  if (r)   // a nonzero return is only logged, not treated as fatal
+    dout(-10) << "apply_repop  apply transaction return " << r << " on " << *repop << endl;
+  
+  // discard my reference to buffer
+  repop->op->get_data().clear();
+
+  repop->applied = true;
+}
+// put_repop_gather - closes an update section: send the client commit or ack if now possible, delete when done.
+void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop)
+{
+  dout(10) << "put_repop " << *repop << endl;
+
+  // commit?
+  if (repop->can_send_commit() &&
+      repop->op->wants_commit()) {
+    // send commit.
+    MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true);
+    dout(10) << "put_repop  sending commit on " << *repop << " " << reply << endl;
+    messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst());
+    repop->sent_commit = true;
+  }
+
+  // ack?  (else-if: at most one of commit/ack is sent per call)
+  else if (repop->can_send_ack() &&
+           repop->op->wants_ack()) {
+    // apply
+    apply_repop(pg, repop);
+
+    // send ack
+    MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false);
+    dout(10) << "put_repop  sending ack on " << *repop << " " << reply << endl;
+    messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst());
+    repop->sent_ack = true;
+
+    // log the time elapsed since repop->start
+    utime_t now = g_clock.now();
+    now -= repop->start;
+    logger->finc("rlsum", now);
+    logger->inc("rlnum", 1);
+  }
+
+  // done.
+  if (repop->can_delete()) {
+    // adjust peers_complete_thru
+    if (!repop->pg_complete_thru.empty()) {
+      eversion_t min = pg->info.last_complete;  // hrm....
+      for (unsigned i=0; i<pg->acting.size(); i++) {
+        if (repop->pg_complete_thru[pg->acting[i]] < min)      // note: if we haven't heard, it'll be zero, which is what we want.
+          min = repop->pg_complete_thru[pg->acting[i]];
+      }
+      
+      // peers_complete_thru only ever moves forward
+      if (min > pg->peers_complete_thru) {
+        dout(10) << "put_repop  peers_complete_thru " << pg->peers_complete_thru << " -> " << min << " in " << *pg << endl;
+        pg->peers_complete_thru = min;
+      }
+    }
+
+    dout(10) << "put_repop  deleting " << *repop << endl;
+    //repop->lock.Unlock();  
+
+    assert(pg->repop_gather.count(repop->rep_tid));
+    pg->repop_gather.erase(repop->rep_tid);
+
+    // the gather and its op are freed here; callers must not dereference repop afterwards
+    delete repop->op;
+    delete repop;
+
+  } else {
+    //repop->lock.Unlock();
+  }
+}
+// issue_repop - send a copy of client op `op` to replica `osd`.  the copy carries the op's data,
+// extent, version and rep_tid, plus the pg's peers_complete_thru as pg_trim_to.
+void OSD::issue_repop(PG *pg, MOSDOp *op, int osd)
+{
+  object_t oid = op->get_oid();
+
+  dout(7) << " issue_repop rep_tid " << op->get_rep_tid()
+          << " in " << *pg 
+          << " o " << oid
+          << " to osd" << osd
+          << endl;
+  
+  // forward the write/update/whatever
+  MOSDOp *wr = new MOSDOp(op->get_tid(),
+                          op->get_client(),
+                          oid,
+                          pg->get_pgid(),
+                          osdmap->get_epoch(),
+                          op->get_op());
+  wr->get_data() = op->get_data();   // _copy_ bufferlist
+  wr->set_length(op->get_length());
+  wr->set_offset(op->get_offset());
+  wr->set_version(op->get_version());
+
+  wr->set_rep_tid(op->get_rep_tid());
+  wr->set_pg_trim_to(pg->peers_complete_thru);
+
+  messenger->send_message(wr, MSG_ADDR_OSD(osd), osdmap->get_inst(osd));
+}
+// new_repop_gather - create the RepOpGather for op, fill its waitfor sets, and register it in pg->repop_gather.
+PG::RepOpGather *OSD::new_repop_gather(PG *pg, 
+                                       MOSDOp *op)
+{
+  dout(10) << "new_repop_gather rep_tid " << op->get_rep_tid() << " on " << *op << " in " << *pg << endl;
+
+  PG::RepOpGather *repop = new PG::RepOpGather(op, op->get_rep_tid(), 
+                                               op->get_version(), 
+                                               pg->info.last_complete);
+
+  // osds. commits all come to me.
+  for (unsigned i=0; i<pg->acting.size(); i++) {
+    int osd = pg->acting[i];
+    repop->osds.insert(osd);
+    repop->waitfor_commit.insert(osd);
+  }
+
+  // acks vary:
+  if (g_conf.osd_rep == OSD_REP_CHAIN) {
+    // chain rep. 
+    // there's my local ack...
+    repop->osds.insert(whoami);
+    repop->waitfor_ack.insert(whoami);
+    repop->waitfor_commit.insert(whoami);
+
+    // also, the previous guy will ack to me
+    int myrank = osdmap->calc_pg_rank(whoami, pg->acting);
+    if (myrank > 0) {
+      int osd = pg->acting[ myrank-1 ];
+      repop->osds.insert(osd);
+      repop->waitfor_ack.insert(osd);
+      repop->waitfor_commit.insert(osd);
+    }
+  } else {
+    // primary, splay.  all osds ack to me.
+    for (unsigned i=0; i<pg->acting.size(); i++) {
+      int osd = pg->acting[i];
+      repop->waitfor_ack.insert(osd);
+    }
+  }
+
+  repop->start = g_clock.now();   // put_repop_gather subtracts this when logging "rlsum"
+
+  pg->repop_gather[ repop->rep_tid ] = repop;
+
+  // anyone waiting?  (acks that got here before the op did)
+  if (pg->waiting_for_repop.count(repop->rep_tid)) {
+    take_waiters(pg->waiting_for_repop[repop->rep_tid]);
+    pg->waiting_for_repop.erase(repop->rep_tid);
+  }
+
+  return repop;
+}
+// repop_ack - record an ack (or commit) from osd `fromosd` on a repop, then let
+// put_repop_gather act on the new state.
+void OSD::repop_ack(PG *pg, PG::RepOpGather *repop,
+                    int result, bool commit,
+                    int fromosd, eversion_t pg_complete_thru)
+{
+  MOSDOp *client_op = repop->op;
+
+  dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *client_op
+          << " result " << result << " commit " << commit << " from osd" << fromosd
+          << " in " << *pg
+          << endl;
+
+  get_repop_gather(repop);
+
+  // either kind of reply settles this osd's ack
+  repop->waitfor_ack.erase(fromosd);
+
+  if (commit) {
+    // a commit also settles the commit and records the sender's pg_complete_thru
+    assert(repop->waitfor_commit.count(fromosd));
+    repop->waitfor_commit.erase(fromosd);
+    repop->pg_complete_thru[fromosd] = pg_complete_thru;
+  }
+
+  // may reply to the client and delete the gather
+  put_repop_gather(pg, repop);
+}
+
+
+
+
+
+/** op_modify_commit
+ * transaction commit on the acker: clear our own entry in the repop's waitfor_commit set.
+ */
+void OSD::op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru)
+{
+  PG *pg = lock_pg(pgid);
+  if (pg) {
+    if (pg->repop_gather.count(rep_tid)) {
+      PG::RepOpGather *repop = pg->repop_gather[rep_tid];
+      
+      dout(10) << "op_modify_commit " << *repop->op << endl;
+      get_repop_gather(repop);
+      {
+        assert(repop->waitfor_commit.count(whoami));
+        repop->waitfor_commit.erase(whoami);
+        repop->pg_complete_thru[whoami] = pg_complete_thru;
+      }
+      put_repop_gather(pg, repop);
+      dout(10) << "op_modify_commit done on " << repop << endl;  // pointer value only: put_repop_gather may have deleted repop
+    } else {
+      dout(10) << "op_modify_commit pg " << pgid << " rep_tid " << rep_tid << " dne" << endl;
+    }
+
+    unlock_pg(pgid);
+  } else {
+    dout(10) << "op_modify_commit pg " << pgid << " dne" << endl;
+  }
+}
+
+
+/** op_modify
+ * process client modify op: assign a version, send replica writes, then log + apply locally
+ * NOTE: called from opqueue.
+ */
+void OSD::op_modify(MOSDOp *op, PG *pg)
+{
+  object_t oid = op->get_oid();
+
+  const char *opname = MOSDOp::get_opname(op->get_op());
+
+  // are any peers missing this?
+  for (unsigned i=1; i<pg->acting.size(); i++) {
+    int peer = pg->acting[i];
+    if (pg->peer_missing.count(peer) &&
+        pg->peer_missing[peer].is_missing(oid)) {
+      // push it before this update. 
+      // FIXME, this is probably extra much work (eg if we're about to overwrite)
+      pg->peer_missing[peer].got(oid);
+      push(pg, oid, peer);
+    }
+  }
+
+  // dup op?
+  reqid_t reqid(op->get_client(), op->get_tid());
+  if (pg->log.logged_req(reqid)) {
+    dout(-3) << "op_modify " << opname << " dup op " << reqid
+             << ", doing WRNOOP" << endl;
+    op->set_op(OSD_OP_WRNOOP);
+    opname = MOSDOp::get_opname(op->get_op());
+  }
+
+  // locked by someone else?
+  // for _any_ op type -- eg only the locker can unlock!
+  if (op->get_op() != OSD_OP_WRNOOP &&  // except WRNOOP; we just want to flush
+      block_if_wrlocked(op)) 
+    return; // op will be handled later, after the object unlocks
+
+
+  // check crev
+  objectrev_t crev = 0;   // stays 0 if the getattr below fails
+  store->getattr(oid, "crev", (char*)&crev, sizeof(crev));
+
+  // assign version
+  eversion_t clone_version;
+  eversion_t nv = pg->log.top;   // WRNOOP keeps this value: no new log entry
+  if (op->get_op() != OSD_OP_WRNOOP) {
+    nv.epoch = osdmap->get_epoch();
+    nv.version++;
+    assert(nv > pg->info.last_update);
+    assert(nv > pg->log.top);
+
+    // will clone?
+    if (crev && op->get_rev() && op->get_rev() > crev) {
+      clone_version = nv;
+      nv.version++;
+    }
+
+    if (op->get_version().version) {
+      // replay!
+      if (nv.version < op->get_version().version) {
+        nv.version = op->get_version().version; 
+
+        // clone?
+        if (crev && op->get_rev() && op->get_rev() > crev) {
+          // backstep clone
+          clone_version = nv;
+          clone_version.version--;
+        }
+      }
+    }
+  }
+
+  // set version in op, for benefit of client and our eventual reply
+  op->set_version(nv);
+  
+  dout(10) << "op_modify " << opname 
+           << " " << oid 
+           << " v " << nv 
+           << " crev " << crev
+           << " rev " << op->get_rev()
+           << " " << op->get_offset() << "~" << op->get_length()
+           << endl;  
+
+  if (op->get_op() == OSD_OP_WRITE) {
+    logger->inc("c_wr");
+    logger->inc("c_wrb", op->get_length());
+  }
+
+  // share latest osd map?
+  osd_lock.Lock();
+  {
+    for (unsigned i=1; i<pg->acting.size(); i++) {
+      int osd = pg->acting[i];
+      _share_map_outgoing( MSG_ADDR_OSD(osd), osdmap->get_inst(osd) ); 
+    }
+  }
+  osd_lock.Unlock();
+
+  // issue replica writes
+  PG::RepOpGather *repop = 0;
+  bool alone = (pg->acting.size() == 1);
+  tid_t rep_tid = ++last_tid;
+  op->set_rep_tid(rep_tid);
+
+  if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) {
+    // chain rep.  send to #2 only.
+    int next = pg->acting[1];
+    if (pg->acting.size() > 2)
+      next = pg->acting[2];
+    issue_repop(pg, op, next);
+  } 
+  else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) {
+    // splay rep.  send to rest.
+    for (unsigned i=1; i<pg->acting.size(); ++i)
+    //for (unsigned i=pg->acting.size()-1; i>=1; --i)
+      issue_repop(pg, op, pg->acting[i]);
+  } else {
+    // primary rep, or alone.
+    repop = new_repop_gather(pg, op);
+
+    // send to rest.
+    if (!alone)
+      for (unsigned i=1; i<pg->acting.size(); i++)
+        issue_repop(pg, op, pg->acting[i]);
+  }
+
+  if (repop) {    
+    // we are acker.
+    if (op->get_op() != OSD_OP_WRNOOP) {
+      // log and update later.
+      prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru);
+      prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg);
+    }
+
+    // (logical) local ack.
+    // (if alone, this will apply the update.)
+    get_repop_gather(repop);
+    {
+      assert(repop->waitfor_ack.count(whoami));
+      repop->waitfor_ack.erase(whoami);
+    }
+    put_repop_gather(pg, repop);
+
+  } else {
+    // chain or splay.  apply.
+    ObjectStore::Transaction t;
+    if (op->get_op() != OSD_OP_WRNOOP) {
+      // WRNOOP keeps nv == pg->log.top, which prepare_log_transaction asserts against;
+      // as in the acker path above, leave the transaction empty for it.
+      prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru);
+      prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg);
+    }
+    C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, pg->get_acker(), pg->info.last_complete);
+    unsigned r = store->apply_transaction(t, oncommit);
+    if (r != 0 && r != 2) {   // 0 = no errors; 2 = error on collection_add (tolerated)
+      cerr << "error applying transaction: r = " << r << endl;
+      assert(r == 0);
+    }
+    oncommit->ack();
+  }
+}
+
+// prepare_log_transaction - add op's entry to pg->log (preceded by a CLONE entry at version-1
+// when rev > crev) and append the op's entry to the pg log through t.  needs version > pg->log.top.
+void OSD::prepare_log_transaction(ObjectStore::Transaction& t, 
+                                  MOSDOp *op, eversion_t& version, 
+				  objectrev_t crev, objectrev_t rev,
+				  PG *pg,
+                                  eversion_t trim_to)
+{
+  const object_t oid = op->get_oid();
+
+  // clone entry?
+  if (crev && rev && rev > crev) {
+    eversion_t cv = version;
+    cv.version--;
+    PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv,
+			    op->get_client(), op->get_tid());
+    pg->log.add(cloneentry);
+
+    dout(10) << "prepare_log_transaction " << op->get_op()
+	     << " " << cloneentry
+	     << " in " << *pg << endl;
+  }
+
+  // actual op
+  int opcode = PG::Log::Entry::MODIFY;
+  if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE;
+  PG::Log::Entry logentry(opcode, oid, version,
+                          op->get_client(), op->get_tid());
+
+  dout(10) << "prepare_log_transaction " << op->get_op()
+           << " " << logentry
+           << " in " << *pg << endl;
+
+  // append to log
+  assert(version > pg->log.top);
+  pg->log.add(logentry);
+  assert(pg->log.top == version);
+  dout(10) << "prepare_log_transaction appended to " << *pg << endl;
+
+  // write to pg log on disk (only logentry is passed here, not the clone entry)
+  pg->append_log(t, logentry, trim_to);
+}
+
+
+/** prepare_op_transaction
+ * fill t with the store updates for op: pg info, optional clone, the op itself, object attrs.
+ */
+void OSD::prepare_op_transaction(ObjectStore::Transaction& t, 
+                                 MOSDOp *op, eversion_t& version, 
+				 objectrev_t crev, objectrev_t rev,
+				 PG *pg)
+{
+  const object_t oid = op->get_oid();
+  const pg_t pgid = op->get_pg();
+
+  bool did_clone = false;
+
+  dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() )
+           << " " << oid 
+           << " v " << version
+	   << " crev " << crev
+	   << " rev " << rev
+           << " in " << *pg << endl;
+  
+  // WRNOOP does nothing.
+  if (op->get_op() == OSD_OP_WRNOOP) 
+    return;
+
+  // raise last_complete?
+  if (pg->info.last_complete == pg->info.last_update)
+    pg->info.last_complete = version;
+  
+  // raise last_update.
+  assert(version > pg->info.last_update);
+  pg->info.last_update = version;
+  
+  // write pg info
+  t.collection_setattr(pgid, "info", &pg->info, sizeof(pg->info));
+
+  // clone?
+  if (crev && rev && rev > crev) {
+    object_t noid = oid;
+    noid.rev = rev;
+    dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << endl;
+    t.clone(oid, noid);
+    did_clone = true;
+  }  
+
+  // apply the op
+  switch (op->get_op()) {
+  case OSD_OP_WRLOCK:
+    { // lock object
+      //r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit);
+      t.setattr(oid, "wrlock", &op->get_client(), sizeof(msg_addr_t));
+    }
+    break;  
+    
+  case OSD_OP_WRUNLOCK:
+    { // unlock objects
+      //r = store->rmattr(oid, "wrlock", oncommit);
+      t.rmattr(oid, "wrlock");
+      
+      // unblock all operations that were waiting for this object to become unlocked
+      if (waiting_for_wr_unlock.count(oid)) {
+        take_waiters(waiting_for_wr_unlock[oid]);
+        waiting_for_wr_unlock.erase(oid);
+      }
+    }
+    break;
+    
+  case OSD_OP_WRITE:
+    { // write
+      assert(op->get_data().length() == op->get_length());
+      bufferlist bl;
+      bl.claim( op->get_data() );  // give buffers to store; we keep *op in memory for a long time!
+      
+      //if (oid < 100000000000000ULL)  // hack hack-- don't write client data
+      t.write( oid, op->get_offset(), op->get_length(), bl );
+    }
+    break;
+    
+  case OSD_OP_ZERO:
+    {
+      assert(0);  // are you sure this is what you want?
+      // zero, remove, or truncate?
+      struct stat st;
+      int r = store->stat(oid, &st);
+      if (r >= 0) {
+	if (op->get_offset() + op->get_length() >= st.st_size) {
+	  if (op->get_offset()) 
+	    t.truncate(oid, op->get_length() + op->get_offset());  // NOTE(review): new size is offset+length (>= current size), not offset -- confirm
+	  else
+	    t.remove(oid);
+	} else {
+	  // zero.  the dumb way.  FIXME.
+	  bufferptr bp(op->get_length());
+	  bp.zero();
+	  bufferlist bl;
+	  bl.push_back(bp);
+	  t.write(oid, op->get_offset(), op->get_length(), bl);
+	}
+      } else {
+	// noop?
+	dout(10) << "apply_transaction zero on " << oid << ", but dne?  stat returns " << r << endl;
+      }
+    }
+    break;
+
+  case OSD_OP_TRUNCATE:
+    { // truncate
+      //r = store->truncate(oid, op->get_offset());
+      t.truncate(oid, op->get_length() );
+    }
+    break;
+    
+  case OSD_OP_DELETE:
+    { // delete
+      //r = store->remove(oid);
+      t.remove(oid);
+    }
+    break;
+    
+  default:  // NOTE(review): do_op also sends RDLOCK/RDUNLOCK/UPLOCK/DNLOCK to op_modify; they have no case here
+    assert(0);
+  }
+  
+  // object collection, version
+  if (op->get_op() == OSD_OP_DELETE) {
+    // remove object from c
+    t.collection_remove(pgid, oid);
+  } else {
+    // add object to c
+    t.collection_add(pgid, oid);
+    
+    // object version
+    t.setattr(oid, "version", &version, sizeof(version));
+
+    // set object crev
+    if (crev == 0 ||   // new object
+	did_clone)     // we cloned
+      t.setattr(oid, "crev", &rev, sizeof(rev));
+  }
+}
diff --git a/branches/sage/cephmds2/osd/OSD.h b/branches/sage/cephmds2/osd/OSD.h
new file mode 100644
index 0000000000000..ae5dba7a8e01a
--- /dev/null
+++ b/branches/sage/cephmds2/osd/OSD.h
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __OSD_H
+#define __OSD_H
+
+#include "msg/Dispatcher.h"
+
+#include "common/Mutex.h"
+#include "common/ThreadPool.h"
+
+#include "mon/MonMap.h"
+
+#include "ObjectStore.h"
+#include "PG.h"
+
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+#include "messages/MOSDOp.h"
+
+class Messenger;
+class Message;
+
+
+  
+
+class OSD : public Dispatcher {
+public:
+
+  /** superblock
+   */
+  OSDSuperblock superblock;
+  epoch_t  boot_epoch;      
+
+  object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); }  // full map for an epoch: second field is even (2*epoch)
+  object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); }  // incremental map: odd (2*epoch+1), so it never equals a full-map name
+  
+  void write_superblock();
+  void write_superblock(ObjectStore::Transaction& t);
+  int read_superblock();
+
+
+  /** OSD **/
+ protected:
+  Messenger *messenger;
+  int whoami;
+
+  static const int STATE_BOOTING = 1;
+  static const int STATE_ACTIVE = 2;
+  static const int STATE_STOPPING = 3;
+
+  int state;
+
+  bool is_booting() { return state == STATE_BOOTING; }
+  bool is_active() { return state == STATE_ACTIVE; }
+  bool is_stopping() { return state == STATE_STOPPING; }
+
+
+  MonMap *monmap;
+
+  class Logger      *logger;
+
+  // local store
+  char dev_path[100];
+  class ObjectStore *store;
+
+  // heartbeat
+  void heartbeat();
+
+  class C_Heartbeat : public Context {  // callback whose finish() runs OSD::heartbeat()
+    OSD *osd;
+  public:
+    C_Heartbeat(OSD *o) : osd(o) {}
+    void finish(int r) {                // r is not used
+      osd->heartbeat();
+    }
+  } *next_heartbeat;                    // pointer member declared together with the class
+
+  // global lock
+  Mutex osd_lock;
+
+  // -- stats --
+  int hb_stat_ops;  // ops since last heartbeat
+  int hb_stat_qlen; // cumulative queue length since last hb
+
+  hash_map<int, float> peer_qlen;
+  
+  // per-pg locking (serializing)
+  hash_set<pg_t>               pg_lock;
+  hash_map<pg_t, list<Cond*> > pg_lock_waiters;  
+  PG *lock_pg(pg_t pgid);
+  PG *_lock_pg(pg_t pgid);
+  void unlock_pg(pg_t pgid);
+  void _unlock_pg(pg_t pgid);
+
+  // finished waiting messages, that will go at tail of dispatch()
+  list<class Message*> finished;
+  void take_waiters(list<class Message*>& ls) {  // moves every message in ls to the tail of 'finished'; ls ends up empty
+    finished.splice(finished.end(), ls);
+  }
+  
+  // object locking
+  hash_map<object_t, list<Message*> > waiting_for_wr_unlock; /** list of operations for each object waiting for 'wrunlock' */
+
+  bool block_if_wrlocked(MOSDOp* op);
+
+  // -- ops --
+  class ThreadPool<class OSD*, pg_t>   *threadpool;
+  hash_map<pg_t, list<Message*> >       op_queue;
+  int   pending_ops;
+  bool  waiting_for_no_ops;
+  Cond  no_pending_ops;
+  Cond  op_queue_cond;
+  
+  void wait_for_no_ops();
+
+  void enqueue_op(pg_t pgid, Message *op);
+  void dequeue_op(pg_t pgid);
+  static void static_dequeueop(OSD *o, pg_t pgid) {
+    o->dequeue_op(pgid);
+  };
+
+  void do_op(Message *m, PG *pg);  // actually do it
+
+  void prepare_log_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version, 
+			       objectrev_t crev, objectrev_t rev, PG *pg, eversion_t trim_to);
+  void prepare_op_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version, 
+			      objectrev_t crev, objectrev_t rev, PG *pg);
+  
+  bool waitfor_missing_object(MOSDOp *op, PG *pg);
+  bool pick_missing_object_rev(object_t& oid, PG *pg);
+  bool pick_object_rev(object_t& oid);
+
+
+  
+ friend class PG;
+
+ protected:
+
+  // -- osd map --
+  class OSDMap  *osdmap;
+  list<class Message*> waiting_for_osdmap;
+
+  hash_map<msg_addr_t, epoch_t>  peer_map_epoch;
+  bool _share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch);
+  void _share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst);
+
+  void wait_for_new_map(Message *m);
+  void handle_osd_map(class MOSDMap *m);
+  
+  void advance_map(ObjectStore::Transaction& t);
+  void activate_map(ObjectStore::Transaction& t);
+
+  void get_map(epoch_t e, OSDMap &m);
+  bool get_map_bl(epoch_t e, bufferlist& bl);
+  bool get_inc_map_bl(epoch_t e, bufferlist& bl);
+  bool get_inc_map(epoch_t e, OSDMap::Incremental &inc);
+  
+  void send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full);
+
+
+
+  // -- replication --
+
+  // PG
+  hash_map<pg_t, PG*>      pg_map;
+  void  load_pgs();
+  bool  pg_exists(pg_t pg);
+  PG   *create_pg(pg_t pg, ObjectStore::Transaction& t);          // create new PG
+  PG   *get_pg(pg_t pg);             // return existing PG, or null
+  void  _remove_pg(pg_t pg);         // remove from store and memory
+
+  void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from);
+
+  void activate_pg(pg_t pgid, epoch_t epoch);
+
+  class C_Activate : public Context {  // callback whose finish() runs OSD::activate_pg(pgid, epoch)
+    OSD *osd;
+    pg_t pgid;
+    epoch_t epoch;
+  public:
+    C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) {}  // captures the arguments for the later call
+    void finish(int r) {               // r is not used
+      osd->activate_pg(pgid, epoch);
+    }
+  };
+
+
+  tid_t               last_tid;
+  int                 num_pulling;
+
+  hash_map<pg_t, list<Message*> >        waiting_for_pg;
+
+  // replica ops
+  void get_repop_gather(PG::RepOpGather*);
+  void apply_repop(PG *pg, PG::RepOpGather *repop);
+  void put_repop_gather(PG *pg, PG::RepOpGather*);
+  void issue_repop(PG *pg, MOSDOp *op, int osd);
+  PG::RepOpGather *new_repop_gather(PG *pg, MOSDOp *op);
+  void repop_ack(PG *pg, PG::RepOpGather *repop,
+                 int result, bool commit,
+                 int fromosd, eversion_t pg_complete_thru=0);
+  
+  void handle_rep_op_ack(MOSDOpReply *m);
+
+  // recovery
+  void do_notifies(map< int, list<PG::Info> >& notify_list);
+  void do_queries(map< int, map<pg_t,PG::Query> >& query_map);
+  void repeer(PG *pg, map< int, map<pg_t,PG::Query> >& query_map);
+
+  void pull(PG *pg, object_t oid);
+  void push(PG *pg, object_t oid, int dest);
+
+  bool require_current_map(Message *m, epoch_t v);
+  bool require_same_or_newer_map(Message *m, epoch_t e);
+
+  void handle_pg_query(class MOSDPGQuery *m);
+  void handle_pg_notify(class MOSDPGNotify *m);
+  void handle_pg_log(class MOSDPGLog *m);
+  void handle_pg_remove(class MOSDPGRemove *m);
+
+  void op_pull(class MOSDOp *op, PG *pg);
+  void op_push(class MOSDOp *op, PG *pg);
+  
+  void op_rep_modify(class MOSDOp *op, PG *pg);   // write, truncate, delete
+  void op_rep_modify_commit(class MOSDOp *op, int ackerosd, 
+                            eversion_t last_complete);
+  friend class C_OSD_RepModifyCommit;
+
+
+ public:
+  OSD(int id, Messenger *m, MonMap *mm, char *dev = 0);
+  ~OSD();
+  
+  // startup/shutdown
+  int init();
+  int shutdown();
+
+  // messages
+  virtual void dispatch(Message *m);
+  virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst);
+  virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst);
+
+  void handle_osd_ping(class MOSDPing *m);
+  void handle_op(class MOSDOp *m);
+
+  void op_read(class MOSDOp *m);//, PG *pg);
+  void op_stat(class MOSDOp *m);//, PG *pg);
+  void op_modify(class MOSDOp *m, PG *pg);
+  void op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru);
+
+  // for replication
+  void handle_op_reply(class MOSDOpReply *m);
+
+  void force_remount();
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osd/OSDMap.h b/branches/sage/cephmds2/osd/OSDMap.h
new file mode 100644
index 0000000000000..2c00eea9cdbdc
--- /dev/null
+++ b/branches/sage/cephmds2/osd/OSDMap.h
@@ -0,0 +1,515 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __OSDMAP_H
+#define __OSDMAP_H
+
+/*
+ * describe properties of the OSD cluster.
+ *   disks, disk groups, total # osds,
+ *
+ */
+#include "config.h"
+#include "include/types.h"
+#include "msg/Message.h"
+#include "common/Mutex.h"
+#include "common/Clock.h"
+
+#include "crush/crush.h"
+using namespace crush;
+
+#include <vector>
+#include <list>
+#include <set>
+#include <map>
+using namespace std;
+
+
+/*
+ * some system constants
+ */
+
+// from LSB to MSB,
+#define PG_PS_BITS         16  // max bits for placement seed/group portion of PG
+#define PG_REP_BITS        6   // up to 64 replicas   
+#define PG_TYPE_BITS       2
+#define PG_PS_MASK         ((1LL<<PG_PS_BITS)-1)
+
+#define PG_TYPE_RAND     1   // default: distribution randomly
+#define PG_TYPE_STARTOSD 2   // place primary on a specific OSD (named by the pg_bits)
+
+// pg roles
+#define PG_ROLE_STRAY   -1
+#define PG_ROLE_HEAD     0
+#define PG_ROLE_ACKER    1
+#define PG_ROLE_MIDDLE   2  // der.. misnomer
+//#define PG_ROLE_TAIL     2
+
+
+
+/** OSDMap
+ */
+class OSDMap {
+
+public:
+  class Incremental {  // delta consumed by OSDMap::apply_incremental()
+  public:
+    epoch_t epoch;   // new epoch; we are a diff from epoch-1 to epoch
+    epoch_t mon_epoch;  // monitor epoch (election iteration)
+    utime_t ctime;
+    map<int,entity_inst_t> new_up;
+    map<int,entity_inst_t> new_down;
+    list<int> new_in;
+    list<int> new_out;
+    map<int,float> new_overload;  // updated overload value
+    list<int>      old_overload;  // no longer overload
+    void encode(bufferlist& bl) {  // appends every field to bl; decode() must read them back in the same order
+      bl.append((char*)&epoch, sizeof(epoch));
+      bl.append((char*)&mon_epoch, sizeof(mon_epoch));
+      bl.append((char*)&ctime, sizeof(ctime));
+      ::_encode(new_up, bl);
+      ::_encode(new_down, bl);
+      ::_encode(new_in, bl);
+      ::_encode(new_out, bl);
+      ::_encode(new_overload, bl);
+      ::_encode(old_overload, bl);  // previously omitted, so overload removals never reached the receiver
+    }
+    void decode(bufferlist& bl, int& off) {  // off: read position in bl, advanced past everything consumed
+      bl.copy(off, sizeof(epoch), (char*)&epoch);
+      off += sizeof(epoch);
+      bl.copy(off, sizeof(mon_epoch), (char*)&mon_epoch);
+      off += sizeof(mon_epoch);
+      bl.copy(off, sizeof(ctime), (char*)&ctime);
+      off += sizeof(ctime);
+      ::_decode(new_up, bl, off);
+      ::_decode(new_down, bl, off);
+      ::_decode(new_in, bl, off);
+      ::_decode(new_out, bl, off);
+      ::_decode(new_overload, bl, off);
+      ::_decode(old_overload, bl, off);  // NOTE: buffers written by the older encode() lack this trailing field
+    }
+    Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {}  // e: the epoch this incremental produces
+  };
+
+private:
+  epoch_t   epoch;       // what epoch of the osd cluster descriptor is this
+  epoch_t   mon_epoch;  // monitor epoch (election iteration)
+  utime_t   ctime;       // epoch start time
+  int       pg_bits;     // placement group bits 
+  int       localized_pg_bits;  // bits for localized pgs
+
+  set<int>  osds;        // all osds
+  set<int>  down_osds;   // list of down disks
+  set<int>  out_osds;    // list of unmapped disks
+  map<int,float> overload_osds; 
+  map<int,entity_inst_t> osd_inst;
+
+ public:
+  Crush     crush;       // hierarchical map
+
+  friend class OSDMonitor;
+  friend class MDS;
+
+ public:
+  OSDMap() : epoch(0), mon_epoch(0), pg_bits(5), localized_pg_bits(3) {}
+
+  // map info
+  epoch_t get_epoch() const { return epoch; }
+  void inc_epoch() { epoch++; }
+
+  int get_pg_bits() const { return pg_bits; }
+  void set_pg_bits(int b) { pg_bits = b; }
+  int get_localized_pg_bits() const { return localized_pg_bits; }
+
+  const utime_t& get_ctime() const { return ctime; }
+
+  bool is_mkfs() const { return epoch == 1; }
+  //void set_mkfs() { assert(epoch == 1); }
+
+  /***** cluster state *****/
+  int num_osds() { return osds.size(); }
+  void get_all_osds(set<int>& ls) { ls = osds; }
+
+  const set<int>& get_osds() { return osds; }
+  const set<int>& get_down_osds() { return down_osds; }
+  const set<int>& get_out_osds() { return out_osds; }
+  const map<int,float>& get_overload_osds() { return overload_osds; }
+  
+  bool is_down(int osd) { return down_osds.count(osd); }
+  bool is_up(int osd) { return !is_down(osd); }
+  bool is_out(int osd) { return out_osds.count(osd); }
+  bool is_in(int osd) { return !is_out(osd); }
+  
+  const entity_inst_t& get_inst(int osd) {
+    assert(osd_inst.count(osd));
+    return osd_inst[osd];
+  }
+  bool get_inst(int osd, entity_inst_t& inst) { 
+    if (osd_inst.count(osd)) {
+      inst = osd_inst[osd];
+      return true;
+    } 
+    return false;
+  }
+  
+  void mark_down(int o) { down_osds.insert(o); }
+  void mark_up(int o) { down_osds.erase(o); }
+  void mark_out(int o) { out_osds.insert(o); }
+  void mark_in(int o) { out_osds.erase(o); }
+
+
+  void apply_incremental(Incremental &inc) {  // advance this map by exactly one epoch using inc
+    assert(inc.epoch == epoch+1);  // inc must describe the very next epoch
+    epoch++;
+    mon_epoch = inc.mon_epoch;
+    ctime = inc.ctime;
+
+    for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
+         i != inc.new_up.end(); 
+         i++) {
+      assert(down_osds.count(i->first));  // only an osd currently marked down may come up
+      down_osds.erase(i->first);
+      assert(osd_inst.count(i->first) == 0);  // a down osd has no instance recorded
+      osd_inst[i->first] = i->second;
+      //cout << "epoch " << epoch << " up osd" << i->first << endl;
+    }
+    for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
+         i != inc.new_down.end();
+         i++) {
+      assert(down_osds.count(i->first) == 0);  // must currently be up
+      down_osds.insert(i->first);
+      assert(osd_inst.count(i->first) == 0 ||
+             osd_inst[i->first] == i->second);  // a known instance must match the one reported down
+      osd_inst.erase(i->first);
+      //cout << "epoch " << epoch << " down osd" << i->first << endl;
+    }
+    for (list<int>::iterator i = inc.new_in.begin();
+         i != inc.new_in.end();
+         i++) {
+      assert(out_osds.count(*i));  // must currently be out
+      out_osds.erase(*i);
+      //cout << "epoch " << epoch << " in osd" << *i << endl;
+    }
+    for (list<int>::iterator i = inc.new_out.begin();
+         i != inc.new_out.end();
+         i++) {
+      assert(out_osds.count(*i) == 0);  // must currently be in
+      out_osds.insert(*i);
+      //cout << "epoch " << epoch << " out osd" << *i << endl;
+    }
+    for (map<int,float>::iterator i = inc.new_overload.begin();
+         i != inc.new_overload.end();
+         i++) {
+      overload_osds[i->first] = i->second;  // insert or overwrite
+    }
+    for (list<int>::iterator i = inc.old_overload.begin();
+         i != inc.old_overload.end();
+         i++) {
+      assert(overload_osds.count(*i));  // can only clear an osd that is marked overloaded
+      overload_osds.erase(*i);
+    }
+  }
+
+  // serialize, unserialize
+  void encode(bufferlist& blist) {  // appends the whole map to blist; decode() reads the same fields in the same order
+    blist.append((char*)&epoch, sizeof(epoch));
+    blist.append((char*)&mon_epoch, sizeof(mon_epoch));
+    blist.append((char*)&ctime, sizeof(ctime));
+    blist.append((char*)&pg_bits, sizeof(pg_bits));  // NOTE(review): localized_pg_bits is not written - confirm that is intended
+    
+    _encode(osds, blist);
+    _encode(down_osds, blist);
+    _encode(out_osds, blist);
+    _encode(overload_osds, blist);
+    _encode(osd_inst, blist);
+    
+    crush._encode(blist);
+  }
+  
+  void decode(bufferlist& blist) {  // overwrites this map from blist, reading from offset 0 in encode() order
+    int off = 0;
+    blist.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    blist.copy(off, sizeof(mon_epoch), (char*)&mon_epoch);
+    off += sizeof(mon_epoch);
+    blist.copy(off, sizeof(ctime), (char*)&ctime);
+    off += sizeof(ctime);
+    blist.copy(off, sizeof(pg_bits), (char*)&pg_bits);
+    off += sizeof(pg_bits);  // NOTE(review): localized_pg_bits is not read, so it keeps its previous value - confirm
+    
+    _decode(osds, blist, off);
+    _decode(down_osds, blist, off);
+    _decode(out_osds, blist, off);
+    _decode(overload_osds, blist, off);
+    _decode(osd_inst, blist, off);
+    
+    crush._decode(blist, off);
+  }
+ 
+
+
+
+  /****   mapping facilities   ****/
+
+  // oid -> pg: derive the placement seed from oid per the layout policy (policy 0 = use g_conf.osd_object_layout)
+  pg_t object_to_pg(object_t oid, FileLayout& layout) {
+    static crush::Hash H(777);
+        
+    int policy = layout.object_layout;
+    if (policy == 0) 
+      policy = g_conf.osd_object_layout;
+
+    int type = PG_TYPE_RAND;  // only consumed by the commented-out construction below
+    ps_t ps = 0;              // was left uninitialized when policy matched no case
+
+    switch (policy) {
+    case OBJECT_LAYOUT_LINEAR:
+      {
+        //const object_t ono = oid.bno;
+        //const inodeno_t ino = oid >> OID_ONO_BITS;
+        ps = (oid.bno + oid.ino) & PG_PS_MASK;
+        ps &= ((1ULL<<pg_bits)-1ULL);
+      }
+      break;
+      
+    case OBJECT_LAYOUT_HASHINO:
+      {
+        //const object_t ono = oid & ((1ULL << OID_ONO_BITS)-1ULL);
+        //const inodeno_t ino = oid >> OID_ONO_BITS;
+        ps = (oid.bno + H(oid.ino)) & PG_PS_MASK;
+        ps &= ((1ULL<<pg_bits)-1ULL);
+      }
+      break;
+
+    case OBJECT_LAYOUT_HASH:
+      {
+        ps = H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ) & PG_PS_MASK;
+        ps &= ((1ULL<<pg_bits)-1ULL);
+      }
+      break;
+
+    case OBJECT_LAYOUT_STARTOSD:
+      {
+        ps = layout.osd;  // not masked by pg_bits, unlike the other policies
+        type = PG_TYPE_STARTOSD;
+      }
+      break;
+    default: assert(0);  // unknown layout policy; same handling as the layout switch in pg_to_osds()
+    }
+    // construct final PG
+    /*pg_t pg = type;
+    pg = (pg << PG_REP_BITS) | (pg_t)layout.num_rep;
+    pg = (pg << PG_PS_BITS) | ps;
+    */
+    //cout << "pg " << hex << pg << dec << endl;
+    return pg_t(ps, 0, layout.num_rep);
+  }
+
+  // (ps, nrep) -> pg
+  pg_t ps_nrep_to_pg(ps_t ps, int nrep) {
+    /*return ((pg_t)ps & ((1ULL<<pg_bits)-1ULL)) 
+      | ((pg_t)nrep << PG_PS_BITS)
+      | ((pg_t)PG_TYPE_RAND << (PG_PS_BITS+PG_REP_BITS));
+    */
+    return pg_t(ps, 0, nrep, 0);
+  }
+  pg_t ps_osd_nrep_to_pg(ps_t ps, int osd, int nrep) {
+    /*return ((pg_t)osd)
+      | ((pg_t)nrep << PG_PS_BITS)
+      | ((pg_t)PG_TYPE_STARTOSD << (PG_PS_BITS+PG_REP_BITS));
+    */
+    return pg_t(ps, osd+1, nrep, 0);
+  }
+
+  // pg -> nrep
+  int pg_to_nrep(pg_t pg) {
+    return pg.u.fields.nrep;
+    //return (pg >> PG_PS_BITS) & ((1ULL << PG_REP_BITS)-1);
+  }
+
+  // pg -> ps
+  int pg_to_ps(pg_t pg) {
+    //return pg & PG_PS_MASK;
+    return pg.u.fields.ps;
+  }
+
+  // pg -> (osd list); osds is not cleared here, and the return value is osds.size()
+  int pg_to_osds(pg_t pg,
+                 vector<int>& osds) {       // list of osd addr's (this parameter hides the member set 'osds')
+    pg_t ps = pg_to_ps(pg);  // NOTE(review): pg_to_ps() returns int; pg_t looks like the wrong type here - confirm
+    int num_rep = pg_to_nrep(pg);
+    assert(num_rep > 0);
+    
+    // map to osds[]
+    switch (g_conf.osd_pg_layout) {
+    case PG_LAYOUT_CRUSH:
+      {
+	int forcefeed = -1;
+	if (pg.u.fields.preferred > 0 &&     // preferred holds osd+1; 0 means no preference
+	    out_osds.count(pg.u.fields.preferred-1) == 0) 
+	  forcefeed = pg.u.fields.preferred-1;   // only pass the preferred osd along if it is not out
+	crush.do_rule(crush.rules[num_rep],     // FIXME rule thing.
+		      ps, 
+		      osds,
+		      out_osds, overload_osds,
+		      forcefeed);
+      }
+      break;
+      
+    case PG_LAYOUT_LINEAR:
+      for (int i=0; i<num_rep; i++) 
+	osds.push_back( (i + ps*num_rep) % g_conf.num_osd );
+      break;
+      
+    case PG_LAYOUT_HYBRID:
+      {
+	static crush::Hash H(777);
+	int h = H(ps);
+	for (int i=0; i<num_rep; i++) 
+	  osds.push_back( (h+i) % g_conf.num_osd );  // num_rep consecutive osds starting at the hash
+      }
+      break;
+      
+    case PG_LAYOUT_HASH:
+      {
+	static crush::Hash H(777);
+	for (int i=0; i<num_rep; i++) {
+	  int t = 1;
+	  int osd = 0;
+	  while (t++) {   // rehash with a new t until the osd differs from osds[0..i-1]
+	    osd = H(i, ps, t) % g_conf.num_osd;   // NOTE(review): looks unbounded if num_rep > g_conf.num_osd - confirm
+	    int j = 0;
+	    for (; j<i; j++) 
+	      if (osds[j] == osd) break;
+	    if (j == i) break;   // no collision with earlier picks
+	  }
+	  osds.push_back(osd);
+	}      
+      }
+      break;
+      
+    default:
+      assert(0);
+    }
+  
+    if (pg.u.fields.preferred > 0 &&
+	g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) {   // CRUSH already got the preference via forcefeed
+      int osd = pg.u.fields.preferred-1;
+
+      // already in there?
+      if (osds.empty()) {
+        osds.push_back(osd);
+      } else {
+        assert(num_rep > 0);
+        for (int i=1; i<num_rep; i++)
+          if (osds[i] == osd) {
+            // swap with position 0
+            osds[i] = osds[0];
+          }
+        osds[0] = osd;   // preferred osd always ends up first; if it was not listed, the old osds[0] is dropped
+      }
+
+      if (is_out(osd))
+        osds.erase(osds.begin());  // oops, but it's out!
+    }
+    
+    return osds.size();
+  }
+
+  // pg -> (up osd list): the pg_to_osds() mapping with every down osd dropped
+  int pg_to_acting_osds(pg_t pg,
+                        vector<int>& osds) {         // out: replaced with the up osds, mapping order kept
+    // full mapping first; it may still name osds that are down
+    vector<int> mapped;
+    pg_to_osds(pg, mapped);
+
+    osds.clear();
+    for (vector<int>::const_iterator p = mapped.begin(); p != mapped.end(); ++p) {
+      if (!is_down(*p))
+        osds.push_back(*p);
+    }
+    return osds.size();
+  }
+
+
+
+  // pg -> primary osd: head of the pg_to_osds() list, or -1 when that list is empty
+  int get_pg_primary(pg_t pg) {
+    vector<int> mapped;
+    const int n_mapped = pg_to_osds(pg, mapped);
+    if (n_mapped == 0)
+      return -1;  // we fail!
+    return mapped[0];
+  }
+
+  // pg -> acting primary osd: first up osd of the mapping, or -1 if none is up
+  int get_pg_acting_primary(pg_t pg) {
+    vector<int> up;
+    const int n_up = pg_to_acting_osds(pg, up);
+    if (n_up <= 0)
+      return -1;  // we fail!
+    return up.front();
+  }
+  int get_pg_acting_tail(pg_t pg) {  // last up osd of the mapping, or -1 if none is up
+    vector<int> up;
+    const int n_up = pg_to_acting_osds(pg, up);
+    if (n_up <= 0)
+      return -1;  // we fail!
+    return up.back();
+  }
+
+
+  /* index of osd within the first nrep entries of acting: 0 primary, -1 for none. nrep 0 (or > size) means all. */
+  int calc_pg_rank(int osd, vector<int>& acting, int nrep=0) {
+    if (!nrep || nrep > (int)acting.size()) nrep = acting.size();  // clamp: never index past the end of acting
+    for (int i=0; i<nrep; i++) 
+      if (acting[i] == osd) return i;
+    return -1;
+  }
+  int calc_pg_role(int osd, vector<int>& acting, int nrep=0) {  // maps calc_pg_rank() onto a PG_ROLE_* value
+    if (!nrep) nrep = acting.size();
+    const int rank = calc_pg_rank(osd, acting, nrep);
+
+    // absent -> stray, first -> head, second -> acker, anything later -> middle
+    if (rank < 0) return PG_ROLE_STRAY;
+    if (rank == 0) return PG_ROLE_HEAD;
+    return (rank == 1) ? PG_ROLE_ACKER : PG_ROLE_MIDDLE;
+  }
+  
+  int get_pg_role(pg_t pg, int osd) {  // PG_ROLE_* of osd in the pg_to_osds() list, which does not filter out down osds
+    vector<int> group;
+    int nrep = pg_to_osds(pg, group);
+    return calc_pg_role(osd, group, nrep);
+  }
+
+  /* position of osd among the pg's up osds: -1 (stray), 0 (primary), 1,2,3,... (replica) */
+  int get_pg_acting_rank(pg_t pg, int osd) {
+    vector<int> up;
+    const int n_up = pg_to_acting_osds(pg, up);
+    return calc_pg_rank(osd, up, n_up);
+  }
+  /* role among the up osds: PG_ROLE_STRAY (-1), PG_ROLE_HEAD (0), PG_ROLE_ACKER (1), PG_ROLE_MIDDLE (2) */
+  int get_pg_acting_role(pg_t pg, int osd) {
+    vector<int> group;
+    int nrep = pg_to_acting_osds(pg, group);
+    return calc_pg_role(osd, group, nrep);
+  }
+
+
+
+
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/osd/ObjectStore.cc b/branches/sage/cephmds2/osd/ObjectStore.cc
new file mode 100644
index 0000000000000..82af869e93775
--- /dev/null
+++ b/branches/sage/cephmds2/osd/ObjectStore.cc
@@ -0,0 +1,149 @@
+
+#include "ObjectStore.h"
+
+#include "config.h"
+#include "common/Clock.h"
+
+
+object_t ObjectStore::age_get_oid() {
+    // reuse the oldest freed oid if one is queued; otherwise issue age_cur_oid and advance it
+    if (age_free_oids.empty())
+      return age_cur_oid++;
+    object_t recycled = age_free_oids.front();
+    age_free_oids.pop_front();
+    return recycled;
+  }
+
+  ssize_t ObjectStore::age_pick_size() {  // random object size derived from file_size_distn
+    ssize_t max = file_size_distn.sample() * 1024;  // presumably KB -> bytes; confirm against the values added in age()
+    return max/2 + (rand() % 100) * max/200 + 1;  // max/2 plus 0..99 two-hundredths of max, plus 1 so the result is never 0
+  }
+
+  void ObjectStore::age_fill(float pc, utime_t until) {
+    // Create objects until the used fraction of the store reaches pc or the clock passes 'until'.
+    bufferptr bp(1024*1024);
+    bp.zero();
+    bufferlist bl;
+    bl.push_back(bp);
+    while (1) {
+      if (g_clock.now() > until) break;
+
+      struct statfs st;
+      statfs(&st);
+      float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;  // used fraction, 0..1
+      if (a >= pc) {
+        dout(10) << "age_fill at " << a << " / " << pc << " stopping" << endl;
+        break;
+      }
+
+      object_t oid = age_get_oid();
+
+      int b = rand() % 10;  // remember the object in a random bucket so age_empty() can find it
+      age_objects[b].push_back(oid);
+
+      ssize_t s = age_pick_size();
+
+      dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl;
+
+      off_t off = 0;
+      while (s) {  // write the object in chunks of at most 1MB, all taken from the zeroed buffer
+        ssize_t t = MIN(s, 1024*1024);
+        write(oid, t, off, bl, false);
+        off += t;
+        s -= t;
+      }
+    }
+  }
+
+  void ObjectStore::age_empty(float pc) {  // remove aged objects until the used fraction is <= pc or none are left
+    unsigned left = 0;  // objects still available for removal
+    for (unsigned i=0; i<age_objects.size(); i++) left += age_objects[i].size();
+    int nper = 20;  // force a sync after this many loop passes
+    int n = nper;
+    while (left) {  // was while (1): spun forever once every bucket was empty with usage still above pc
+      struct statfs st;
+      statfs(&st);
+      float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
+      if (a <= pc) {
+        dout(10) << "age_empty at " << a << " / " << pc << " stopping" << endl;
+        break;
+      }
+
+      int b = rand() % 10;
+      n--;
+      if (n == 0 || age_objects[b].empty()) {
+        dout(10) << "age_empty sync" << endl;
+        sync();
+        n = nper;
+        continue;
+      }
+      object_t oid = age_objects[b].front();
+      age_objects[b].pop_front();
+      left--;
+      dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << endl;
+
+      remove(oid);
+      age_free_oids.push_back(oid);  // make the oid available to age_get_oid() again
+    }
+  }
+
+
+  void ObjectStore::age(int time,            // added to the seconds field of now() to form the deadline
+                        float high_water,    // fill to this used fraction (age_fill compares it with used/total blocks)
+          float low_water,     // then empty to this used fraction
+          int count,         // this many times
+          float final_water,   // and end here ( <= low_water)
+          int fake_size_mb) { 
+    utime_t until = g_clock.now();
+    until.sec_ref() += time;
+
+    while (age_objects.size() < 10) age_objects.push_back( list<object_t>() );  // age_fill/age_empty index buckets 0..9
+
+    if (fake_size_mb) {
+      int fake_bl = fake_size_mb * 256;  // presumably MB -> 4KB blocks (256 per MB); confirm the store's block size
+      struct statfs st;
+      statfs(&st);
+      float f = (float)fake_bl / (float)st.f_blocks;
+      high_water = (float)high_water * f;  // scale the water marks by fake size / real size
+      low_water = (float)low_water * f;
+      final_water = (float)final_water * f;
+      dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl;
+    }
+
+    // init size distn (once)
+    if (!did_distn) {
+      did_distn = true;
+      age_cur_oid = 1;
+      file_size_distn.add(1, 19.0758125+0.65434375);
+      file_size_distn.add(512, 35.6566);
+      file_size_distn.add(1024, 27.7271875);
+      file_size_distn.add(2*1024, 16.63503125);
+      //file_size_distn.add(4*1024, 106.82384375);
+      //file_size_distn.add(8*1024, 81.493375);
+      //file_size_distn.add(16*1024, 14.13553125);
+      //file_size_distn.add(32*1024, 2.176);
+      //file_size_distn.add(256*1024, 0.655938);
+      //file_size_distn.add(512*1024, 0.1480625);
+      //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
+      file_size_distn.normalize();
+    }
+
+    // clear
+    for (int i=0; i<10; i++)
+      age_objects[i].clear();  // only the bookkeeping lists are cleared here; nothing is removed from the store
+
+    for (int c=1; c<=count; c++) {
+      if (g_clock.now() > until) break;
+
+      dout(1) << "age " << c << "/" << count << " filling to " << high_water << endl;
+      age_fill(high_water, until);
+      if (c == count) {
+        dout(1) << "age final empty to " << final_water << endl;
+        age_empty(final_water);    
+      } else {
+        dout(1) << "age " << c << "/" << count << " emptying to " << low_water << endl;
+        age_empty(low_water);
+      }
+    }
+    dout(1) << "age finished" << endl;
+  }  
+  
diff --git a/branches/sage/cephmds2/osd/ObjectStore.h b/branches/sage/cephmds2/osd/ObjectStore.h
new file mode 100644
index 0000000000000..21fbd867974ed
--- /dev/null
+++ b/branches/sage/cephmds2/osd/ObjectStore.h
@@ -0,0 +1,479 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __OBJECTSTORE_H
+#define __OBJECTSTORE_H
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "include/buffer.h"
+
+#include "include/Distribution.h"
+
+#include <sys/stat.h>
+#include <sys/vfs.h>    /* or <sys/statfs.h> */
+
+#include <list>
+using namespace std;
+
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a):(b))
+#endif
+
+/*
+ * low-level interface to the local OSD file system
+ */
+
+
+
+class ObjectStore {
+public:
+
+
+  class FragmentationStat {   // plain stats holder; the type taken by _get_frag_stat()
+  public:
+    int total;                // NOTE(review): units (blocks vs bytes) not visible here -- confirm
+    int num_extent;
+    int avg_extent;
+    map<int,int> extent_dist;          // powers of two
+    map<int,int> extent_dist_sum;          // powers of two
+
+    float avg_extent_per_object;
+    int avg_extent_jump;  // avg distance between consecutive extents
+
+    int total_free;
+    int num_free_extent;
+    int avg_free_extent;
+    map<int,int> free_extent_dist;     // powers of two
+    map<int,int> free_extent_dist_sum;     // powers of two
+  };
+  
+  
+
+  /*********************************
+   * transaction
+   */
+  class Transaction {   // an ordered batch of ops; replayed by ObjectStore::apply_transaction
+  public:
+    static const int OP_READ =          1;  // oid, offset, len, pbl
+    static const int OP_STAT =          2;  // oid, pstat
+    static const int OP_GETATTR =       3;  // oid, attrname, pattrval
+    static const int OP_GETATTRS =      4;  // oid, pattrset
+
+    static const int OP_WRITE =        10;  // oid, offset, len, bl
+    static const int OP_TRUNCATE =     11;  // oid, len
+    static const int OP_REMOVE =       13;  // oid
+    static const int OP_SETATTR =      14;  // oid, attrname, attrval
+    static const int OP_SETATTRS =     15;  // oid, attrset
+    static const int OP_RMATTR =       16;  // oid, attrname
+    static const int OP_CLONE =        17;  // oid, newoid
+
+    static const int OP_MKCOLL =       20;  // cid
+    static const int OP_RMCOLL =       21;  // cid
+    static const int OP_COLL_ADD =     22;  // cid, oid
+    static const int OP_COLL_REMOVE =  23;  // cid, oid
+    static const int OP_COLL_SETATTR = 24;  // cid, attrname, attrval
+    static const int OP_COLL_RMATTR =  25;  // cid, attrname
+    // op codes in call order, plus one FIFO per argument kind; each call appends only to the lists it uses.
+    list<int> ops;
+    list<bufferlist> bls;
+    list<object_t> oids;
+    list<coll_t>   cids;
+    list<off_t>    offsets;
+    list<size_t>   lengths;
+    list<const char*> attrnames;      // pointers only: the name strings are not copied
+    //list< pair<const void*,int> > attrvals;
+    list<bufferlist>  attrbls;        // attr values, copied in by setattr/collection_setattr
+
+    list<bufferlist*> pbls;           // caller-owned output buffers for read()
+    list<struct stat*> psts;          // caller-owned output structs for stat()
+    list< pair<void*,int*> > pattrvals;
+    list< map<string,bufferptr>* > pattrsets;   // shared by getattrs() and setattrs()
+    // NOTE(review): 'int op = OP_X' presumably avoids binding push_back's reference to an in-class static const -- confirm.
+    void read(object_t oid, off_t off, size_t len, bufferlist *pbl) {
+      int op = OP_READ;
+      ops.push_back(op);
+      oids.push_back(oid);
+      offsets.push_back(off);
+      lengths.push_back(len);
+      pbls.push_back(pbl);
+    }
+    void stat(object_t oid, struct stat *st) {
+      int op = OP_STAT;
+      ops.push_back(op);
+      oids.push_back(oid);
+      psts.push_back(st);
+    }
+    void getattr(object_t oid, const char* name, void* val, int *plen) {
+      int op = OP_GETATTR;
+      ops.push_back(op);
+      oids.push_back(oid);
+      attrnames.push_back(name);
+      pattrvals.push_back(pair<void*,int*>(val,plen));
+    }
+    void getattrs(object_t oid, map<string,bufferptr>& aset) {
+      int op = OP_GETATTRS;
+      ops.push_back(op);
+      oids.push_back(oid);
+      pattrsets.push_back(&aset);     // stores the address of the caller's map
+    }
+
+    void write(object_t oid, off_t off, size_t len, bufferlist& bl) {
+      int op = OP_WRITE;
+      ops.push_back(op);
+      oids.push_back(oid);
+      offsets.push_back(off);
+      lengths.push_back(len);
+      bls.push_back(bl);
+    }
+    void truncate(object_t oid, off_t off) {
+      int op = OP_TRUNCATE;
+      ops.push_back(op);
+      oids.push_back(oid);
+      offsets.push_back(off);         // the new length travels in the offsets list
+    }
+    void remove(object_t oid) {
+      int op = OP_REMOVE;
+      ops.push_back(op);
+      oids.push_back(oid);
+    }
+    void setattr(object_t oid, const char* name, const void* val, int len) {
+      int op = OP_SETATTR;
+      ops.push_back(op);
+      oids.push_back(oid);
+      attrnames.push_back(name);
+      //attrvals.push_back(pair<const void*,int>(val,len));
+      bufferlist bl;
+      bl.append((char*)val,len);
+      attrbls.push_back(bl);
+    }
+    void setattrs(object_t oid, map<string,bufferptr>& attrset) {
+      int op = OP_SETATTRS;
+      ops.push_back(op);
+      oids.push_back(oid);
+      pattrsets.push_back(&attrset);  // stores the address of the caller's map, not a copy
+    }
+    void rmattr(object_t oid, const char* name) {
+      int op = OP_RMATTR;
+      ops.push_back(op);
+      oids.push_back(oid);
+      attrnames.push_back(name);
+    }
+    void clone(object_t oid, object_t noid) {
+      int op = OP_CLONE;
+      ops.push_back(op);
+      oids.push_back(oid);            // source first,
+      oids.push_back(noid);           // then the new oid
+    }
+    void create_collection(coll_t cid) {
+      int op = OP_MKCOLL;
+      ops.push_back(op);
+      cids.push_back(cid);
+    }
+    void remove_collection(coll_t cid) {
+      int op = OP_RMCOLL;
+      ops.push_back(op);
+      cids.push_back(cid);
+    }
+    void collection_add(coll_t cid, object_t oid) {
+      int op = OP_COLL_ADD;
+      ops.push_back(op);
+      cids.push_back(cid);
+      oids.push_back(oid);
+    }
+    void collection_remove(coll_t cid, object_t oid) {
+      int op = OP_COLL_REMOVE;
+      ops.push_back(op);
+      cids.push_back(cid);
+      oids.push_back(oid);
+    }
+    void collection_setattr(coll_t cid, const char* name, const void* val, int len) {
+      int op = OP_COLL_SETATTR;
+      ops.push_back(op);
+      cids.push_back(cid);
+      attrnames.push_back(name);
+      //attrvals.push_back(pair<const void*,int>(val,len));
+      bufferlist bl;
+      bl.append((char*)val, len);
+      attrbls.push_back(bl);
+    }
+    void collection_rmattr(coll_t cid, const char* name) {
+      int op = OP_COLL_RMATTR;
+      ops.push_back(op);
+      cids.push_back(cid);
+      attrnames.push_back(name);
+    }
+
+    // etc.
+  };
+
+
+
+  /* default replay for naive ObjectStores that do not do atomic transactions natively.
+   * it is not atomic.  pops t's argument lists as it goes (t.ops is left intact); always returns 0.
+   */
+  virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) {
+    // non-atomic implementation
+    for (list<int>::iterator p = t.ops.begin();
+         p != t.ops.end();
+         p++) {
+      switch (*p) {
+      case Transaction::OP_READ:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          off_t offset = t.offsets.front(); t.offsets.pop_front();
+          size_t len = t.lengths.front(); t.lengths.pop_front();
+          bufferlist *pbl = t.pbls.front(); t.pbls.pop_front();
+          read(oid, offset, len, *pbl);
+        }
+        break;
+      case Transaction::OP_STAT:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          struct stat *st = t.psts.front(); t.psts.pop_front();
+          stat(oid, st);
+        }
+        break;
+      case Transaction::OP_GETATTR:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+          pair<void*,int*> pattrval = t.pattrvals.front(); t.pattrvals.pop_front();
+          *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second);  // *plen: size in, getattr's result out
+        }
+        break;
+      case Transaction::OP_GETATTRS:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          map<string,bufferptr> *pset = t.pattrsets.front(); t.pattrsets.pop_front();
+          getattrs(oid, *pset);
+        }
+        break;
+
+      case Transaction::OP_WRITE:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          off_t offset = t.offsets.front(); t.offsets.pop_front();
+          size_t len = t.lengths.front(); t.lengths.pop_front();
+          bufferlist bl = t.bls.front(); t.bls.pop_front();
+          write(oid, offset, len, bl, 0);  // per-op onsafe is 0; see the single sync(onsafe) at the end
+        }
+        break;
+
+      case Transaction::OP_TRUNCATE:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          off_t len = t.offsets.front(); t.offsets.pop_front();  // Transaction::truncate queues the length in offsets
+          truncate(oid, len, 0);
+        }
+        break;
+
+      case Transaction::OP_REMOVE:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          remove(oid, 0);
+        }
+        break;
+
+      case Transaction::OP_SETATTR:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+          //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+          bufferlist bl;
+          bl.claim( t.attrbls.front() );
+          t.attrbls.pop_front();
+          setattr(oid, attrname, bl.c_str(), bl.length(), 0);
+        }
+        break;
+      case Transaction::OP_SETATTRS:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          map<string,bufferptr> *pattrset = t.pattrsets.front(); t.pattrsets.pop_front();
+          setattrs(oid, *pattrset, 0);
+        }
+        break;
+
+      case Transaction::OP_RMATTR:
+        {
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+          rmattr(oid, attrname, 0);
+        }
+        break;
+
+      case Transaction::OP_CLONE:
+	{
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          object_t noid = t.oids.front(); t.oids.pop_front();
+	  clone(oid, noid);
+	}
+	break;
+
+      case Transaction::OP_MKCOLL:
+        {
+          coll_t cid = t.cids.front(); t.cids.pop_front();
+          create_collection(cid, 0);
+        }
+        break;
+
+      case Transaction::OP_RMCOLL:
+        {
+          coll_t cid = t.cids.front(); t.cids.pop_front();
+          destroy_collection(cid, 0);
+        }
+        break;
+
+      case Transaction::OP_COLL_ADD:
+        {
+          coll_t cid = t.cids.front(); t.cids.pop_front();
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          collection_add(cid, oid, 0);
+        }
+        break;
+
+      case Transaction::OP_COLL_REMOVE:
+        {
+          coll_t cid = t.cids.front(); t.cids.pop_front();
+          object_t oid = t.oids.front(); t.oids.pop_front();
+          collection_remove(cid, oid, 0);
+        }
+        break;
+
+      case Transaction::OP_COLL_SETATTR:
+        {
+          coll_t cid = t.cids.front(); t.cids.pop_front();
+          const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+          //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+          bufferlist bl;
+          bl.claim( t.attrbls.front() );
+          t.attrbls.pop_front();
+          collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0);
+        }
+        break;
+
+      case Transaction::OP_COLL_RMATTR:
+        {
+          coll_t cid = t.cids.front(); t.cids.pop_front();
+          const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+          collection_rmattr(cid, attrname, 0);
+        }
+        break;
+
+
+      default:
+        cerr << "bad op " << *p << endl;
+        assert(0);
+      }
+    }
+
+    if (onsafe) sync(onsafe);  // one sync for the whole batch; individual op return values were discarded above
+
+    return 0;  // FIXME count errors
+  }
+
+  /*********************************************/
+
+
+
+ public:
+  ObjectStore() {}
+  virtual ~ObjectStore() {}
+
+  // mgmt (pure virtual)
+  virtual int mount() = 0;
+  virtual int umount() = 0;
+  virtual int mkfs() = 0;  // wipe
+
+  virtual int statfs(struct statfs *buf) = 0;
+
+  // objects (pure virtual up to and including write)
+  virtual int pick_object_revision_lt(object_t& oid) = 0;
+
+  virtual bool exists(object_t oid) = 0;                   // useful?
+  virtual int stat(object_t oid, struct stat *st) = 0;     // struct stat?
+
+  virtual int remove(object_t oid,
+                     Context *onsafe=0) = 0;
+
+  virtual int truncate(object_t oid, off_t size,
+                       Context *onsafe=0) = 0;
+  
+  virtual int read(object_t oid, 
+                   off_t offset, size_t len,
+                   bufferlist& bl) = 0;
+
+  /*virtual int write(object_t oid,
+                    off_t offset, size_t len, 
+                    bufferlist& bl,
+                    bool fsync=true) = 0;     
+  */
+  virtual int write(object_t oid, 
+                    off_t offset, size_t len,
+                    bufferlist& bl,   // note: unlike remove/truncate, onsafe below has no default
+                    Context *onsafe) = 0;//{ return -1; }
+  // attrs: not pure virtual -- the default bodies below do nothing and return 0
+  virtual int setattr(object_t oid, const char *name,
+                      const void *value, size_t size,
+                      Context *onsafe=0) {return 0;} //= 0;
+  virtual int setattrs(object_t oid, map<string,bufferptr>& aset,
+                      Context *onsafe=0) {return 0;} //= 0;
+  virtual int getattr(object_t oid, const char *name,
+                      void *value, size_t size) {return 0;} //= 0;
+  virtual int getattrs(object_t oid, map<string,bufferptr>& aset) {return 0;};
+
+  virtual int rmattr(object_t oid, const char *name,
+                     Context *onsafe=0) {return 0;}
+
+  virtual int clone(object_t oid, object_t noid) {
+    return -1;   // default: always returns -1
+  }
+
+  virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0;
+  
+  // collections: not pure virtual -- the defaults do nothing and return 0 (false for collection_exists)
+  virtual int list_collections(list<coll_t>& ls) {return 0;}//= 0;
+  virtual int create_collection(coll_t c,
+                                Context *onsafe=0) {return 0;}//= 0;
+  virtual int destroy_collection(coll_t c,
+                                 Context *onsafe=0) {return 0;}//= 0;
+  virtual bool collection_exists(coll_t c) {return 0;}
+  virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0;
+  virtual int collection_add(coll_t c, object_t o,
+                             Context *onsafe=0) {return 0;}//= 0;
+  virtual int collection_remove(coll_t c, object_t o,
+                                Context *onsafe=0) {return 0;}// = 0;
+  virtual int collection_list(coll_t c, list<object_t>& o) {return 0;}//= 0;
+
+  virtual int collection_setattr(coll_t cid, const char *name,
+                                 const void *value, size_t size,
+                                 Context *onsafe=0) {return 0;} //= 0;
+  virtual int collection_rmattr(coll_t cid, const char *name,
+                                Context *onsafe=0) {return 0;} //= 0;
+  virtual int collection_getattr(coll_t cid, const char *name,
+                                 void *value, size_t size) {return 0;} //= 0;
+  virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0;
+  
+  virtual void sync(Context *onsync) {};   // default: empty body, onsync is never invoked
+  virtual void sync() {};                  // default: empty body
+  
+  
+  virtual void _fake_writes(bool b) {};    // default: empty body
+
+  virtual void _get_frag_stat(FragmentationStat& st) {};   // default: leaves st untouched
+  
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/osd/PG.cc b/branches/sage/cephmds2/osd/PG.cc
new file mode 100644
index 0000000000000..4dee6f03bd166
--- /dev/null
+++ b/branches/sage/cephmds2/osd/PG.cc
@@ -0,0 +1,1312 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "PG.h"
+#include "config.h"
+#include "OSD.h"
+
+#include "common/Timer.h"
+
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " "
+
+
+/******* PGLog ********/
+
+// Copy from 'other' the entries newer than v, so that this log covers (v, other.top].
+void PG::Log::copy_after(const Log &other, eversion_t v) 
+{
+  assert(v >= other.bottom);
+  top = bottom = other.top;
+  // walk newest to oldest, prepending, so the copied entries keep their order
+  list<Entry>::const_reverse_iterator src = other.log.rbegin();
+  for ( ; src != other.log.rend() && !(src->version == v); ++src) {
+    assert(src->version > v);
+    log.push_front(*src);
+  }
+  bottom = v;
+}
+
+bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor) 
+{
+  assert(split >= other.bottom);
+  assert(floor >= other.bottom);
+  assert(floor <= split);
+  top = bottom = other.top;
+  
+  /* runs on replica.  split is the primary's log.top.  floor is how much they want.
+     split tells us if the primary is divergent, e.g.:
+     -> i am A, B is primary, split is 2'6, floor is 2'2.
+A     B     C
+2'2         2'2
+2'3   2'3   2'3
+2'4   2'4   2'4
+3'5 | 2'5   2'5
+3'6 | 2'6
+3'7 |
+3'8 |
+3'9 |
+      -> i return full backlog.
+     returns false (after calling clear()) on divergence, true once copied down to floor.
+  */
+  list<Entry>::const_reverse_iterator src = other.log.rbegin();
+  while (src != other.log.rend()) {
+    // is primary divergent?  same version number, but a newer epoch in 'other',
+    // e.g. my 3'6 vs their 2'6 split
+    const bool same_seq = (src->version.version == split.version);
+    if (same_seq && src->version.epoch > split.epoch) {
+      clear();
+      return false;   // divergent!
+    }
+    if (src->version == floor) break;
+    assert(src->version > floor);
+
+    log.push_front(*src);   // e.g. my 2'23 > '12
+    src++;
+  }
+  bottom = floor;
+  return true;
+}
+
+// Copy 'other' minus any backlog: a log without a backlog is assigned wholesale;
+// otherwise take top/bottom and only the entries newer than other.bottom.
+void PG::Log::copy_non_backlog(const Log &other)
+{
+  if (!other.backlog) {
+    *this = other;
+    return;
+  }
+  top = other.top;
+  bottom = other.bottom;
+  // newest to oldest; stop at the first entry that is not above bottom.
+  // (this->backlog and any entries already held are not reset on this path.)
+  list<Entry>::const_reverse_iterator src = other.log.rbegin();
+  for ( ; src != other.log.rend() && src->version > bottom; src++)
+    log.push_front(*src);
+}
+
+
+
+// Trim entries with version <= s off the front of the log and out of the index,
+// then raise bottom to s if it was lower.  The transaction t is not used here.
+void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) 
+{
+  // with a backlog present, trim at least up to the current bottom
+  if (backlog && s < bottom)
+    s = bottom;
+
+  for (;;) {
+    if (log.empty()) break;
+    Entry &oldest = log.front();
+    if (oldest.version > s) break;
+    // the entry about to go must not be the one complete_to/requested_to point at
+    assert(complete_to != log.begin());
+    assert(requested_to != log.begin());
+
+    unindex(oldest);    // remove from index,
+    log.pop_front();    // then from log
+  }
+  
+  // clear the backlog flag; raise bottom?
+  backlog = false;
+  if (bottom < s) bottom = s;
+}
+
+
+// Pop entries newer than last_update off the tail of the log, unindexing each one.
+void PG::IndexedLog::trim_write_ahead(eversion_t last_update) 
+{
+  for (;;) {
+    if (log.empty()) break;
+    Entry &newest = log.back();
+    if (!(newest.version > last_update)) break;
+    unindex(newest);    // remove from index
+    log.pop_back();     // remove from log
+  }
+}
+
+// Throw away log entries that lie beyond info.last_update, if there are any.
+void PG::trim_write_ahead()
+{
+  if (!(info.last_update < log.top)) {
+    assert(info.last_update == log.top);
+    dout(10) << "trim_write_ahead last_update=top=" << info.last_update << endl;
+    return;
+  }
+  dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << endl;
+  log.trim_write_ahead(info.last_update);
+}
+
+// Take in a replica's log and missing set (from osd 'from') while this PG is not active.
+void PG::proc_replica_log(Log &olog, Missing& omissing, int from)
+{
+  dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << endl;
+  assert(!is_active());
+
+  if (!have_master_log) {
+    // i'm building master log.  note peer's missing.
+    peer_missing[from] = omissing;
+    // merge log into our own log
+    merge_log(olog, omissing, from);
+    proc_missing(olog, omissing, from);
+  } else {
+    // i'm just building missing lists.
+    peer_missing[from] = omissing;
+
+    // iterate over peer log. in reverse.  lu ends up as the peer's last_update
+    // once the divergent entries found at its tail are discounted.
+    list<Log::Entry>::reverse_iterator pp = olog.log.rbegin();
+    eversion_t lu = peer_info[from].last_update;
+    while (pp != olog.log.rend()) {
+      // generate_backlog() asserts !log.backlog, so build the backlog at most once;
+      // after that the check below treats objects still absent from our index as new.
+      if (!log.backlog && !log.objects.count(pp->oid)) {
+        dout(10) << " divergent " << *pp << " not in our log, generating backlog" << endl;
+        generate_backlog();
+      }
+      
+      if (!log.objects.count(pp->oid)) {
+        dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << endl;
+        ++pp;
+        continue;
+      } 
+
+      if (log.objects[pp->oid]->version == pp->version) {
+        break;  // we're no longer divergent.
+      }
+
+      if (log.objects[pp->oid]->version > pp->version) {
+        dout(10) << " divergent " << *pp
+                 << " superceded by " << *log.objects[pp->oid]
+                 << ", ignoring" << endl;
+      } else {
+        dout(10) << " divergent " << *pp << ", adding to missing" << endl;
+        peer_missing[from].add(pp->oid, pp->version);
+      }
+
+      ++pp;
+      if (pp != olog.log.rend())
+        lu = pp->version;
+      else
+        lu = olog.bottom;
+    }    
+
+    if (lu < peer_info[from].last_update) {
+      dout(10) << " peer osd" << from << " last_update now " << lu << endl;
+      peer_info[from].last_update = lu;
+      if (lu < oldest_update) {
+        dout(10) << " oldest_update now " << lu << endl;
+        oldest_update = lu;
+      }
+    }
+
+    proc_missing(olog, peer_missing[from], from);
+  }
+}
+
+void PG::merge_log(Log &olog, Missing &omissing, int fromosd)
+{
+  dout(10) << "merge_log " << olog << " from osd" << fromosd
+           << " into " << log << endl;
+
+  // Fold olog (received from fromosd) into our log, updating 'missing' and info as we go.
+  // Either we adopt olog wholesale (our log is empty, or olog+backlog lies entirely
+  // above our top), or we extend our log at the bottom and/or the top with olog's entries.
+  // NOTE: olog is consumed -- its entry list is swapped or spliced into ours.
+  
+  if (log.empty() ||
+      (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog) 
+
+    // swap and index
+    log.log.swap(olog.log);
+    log.index();
+
+    // find split point (old log.top) in new log
+    // add new items to missing along the way.
+    for (list<Log::Entry>::reverse_iterator p = log.log.rbegin();
+         p != log.log.rend();
+         p++) {
+      if (p->version <= log.top) {
+        // ok, p is at split point.
+
+        // was our old log divergent?
+        if (log.top > p->version) { 
+          dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << endl;
+          if (p->version < oldest_update)
+            oldest_update = p->version;
+          
+          while (!olog.log.empty() && 
+                 olog.log.rbegin()->version > p->version) {
+            Log::Entry &oe = *olog.log.rbegin();  // old entry (possibly divergent)
+            if (log.objects.count(oe.oid)) {
+              if (log.objects[oe.oid]->version < oe.version) {
+                dout(10) << "merge_log  divergent entry " << oe
+                         << " not superceded by " << *log.objects[oe.oid]
+                         << ", adding to missing" << endl;
+                missing.add(oe.oid, oe.version);
+              } else {
+                dout(10) << "merge_log  divergent entry " << oe
+                         << " superceded by " << *log.objects[oe.oid] 
+                         << ", ignoring" << endl;
+              }
+            } else {
+              dout(10) << "merge_log  divergent entry " << oe << ", adding to missing" << endl;
+              missing.add(oe.oid, oe.version);
+            }
+            olog.log.pop_back();  // discard divergent entry
+          }
+        }
+        break;
+      }
+
+      if (p->is_delete()) {
+        dout(10) << "merge_log merging " << *p << ", not missing" << endl;
+        missing.rm(p->oid, p->version);
+      } else {
+        dout(10) << "merge_log merging " << *p << ", now missing" << endl;
+        missing.add(p->oid, p->version);
+      }
+    }
+
+    info.last_update = log.top = olog.top;
+    info.log_bottom = log.bottom = olog.bottom;
+    info.log_backlog = log.backlog = olog.backlog;
+  } 
+
+  else {
+    // i can merge the two logs!
+
+    // extend on bottom?
+    // FIXME: what if we have backlog, but they have lower bottom?
+    if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) {
+      dout(10) << "merge_log extending bottom to " << olog.bottom
+               << (olog.backlog ? " +backlog":"")
+             << endl;
+      
+      // ok
+      list<Log::Entry>::iterator from = olog.log.begin();
+      list<Log::Entry>::iterator to;
+      for (to = from;
+           to != olog.log.end();
+           to++) {
+        if (to->version > log.bottom) break;
+        
+        // update our index while we're here
+        log.index(*to);
+        
+        dout(15) << *to << endl;
+        
+        // new missing object?
+        if (to->version > info.last_complete) {
+          if (to->is_update()) 
+            missing.add(to->oid, to->version);
+          else 
+          missing.rm(to->oid, to->version);
+        }
+      }
+      assert(to != olog.log.end());
+      
+      // splice into our log.
+      log.log.splice(log.log.begin(),
+                     olog.log, from, to);
+      
+      info.log_bottom = log.bottom = olog.bottom;
+      info.log_backlog = log.backlog = olog.backlog;
+    }
+    
+    // extend on top?
+    if (olog.top > log.top &&
+        olog.bottom <= log.top) {
+      dout(10) << "merge_log extending top to " << olog.top << endl;
+      
+      list<Log::Entry>::iterator to = olog.log.end();
+      list<Log::Entry>::iterator from = olog.log.end();
+      while (1) {
+        if (from == olog.log.begin()) break;
+        from--;
+        //dout(0) << "? " << *from << endl;
+        if (from->version < log.top) {  // NOTE(review): '<' leaves an entry equal to log.top inside [from,to) -- intended?
+          from++;
+          break;
+        }
+        
+        log.index(*from);
+        dout(10) << "merge_log " << *from << endl;
+        
+        // add to missing
+        if (from->is_update()) {
+          missing.add(from->oid, from->version);
+        } else
+          missing.rm(from->oid, from->version);
+      }
+      
+      // remove divergent items
+      while (1) {  // NOTE(review): dereferences log.log.rbegin() and 'from' unchecked -- assumes both valid; confirm
+        Log::Entry *oldtail = &(*log.log.rbegin());
+        if (oldtail->version.version+1 == from->version.version) break;
+
+        // divergent!
+        assert(oldtail->version.version >= from->version.version);
+        
+        if (log.objects[oldtail->oid]->version == oldtail->version) {
+          // and significant.
+          dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << endl;
+          //missing.add(oldtail->oid);
+          assert(0);
+        } else {
+          dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << endl;
+          assert(missing.is_missing(oldtail->oid));
+        }
+        log.log.pop_back();
+      }
+
+      // splice
+      log.log.splice(log.log.end(), 
+                     olog.log, from, to);
+      
+      info.last_update = log.top = olog.top;
+    }
+  }
+  
+  dout(10) << "merge_log result " << log << " " << missing << endl;
+  //log.print(cout);
+
+}
+
+// For every object in our missing set, use the peer's log and missing set to decide
+// whether osd 'fromosd' (or a location it knows of) can supply it; record in missing.loc.
+void PG::proc_missing(Log &olog, Missing &omissing, int fromosd)
+{
+  map<object_t,eversion_t>::iterator cur = missing.missing.begin();
+  for ( ; cur != missing.missing.end(); cur++) {
+    const object_t &oid = cur->first;
+    eversion_t &need = cur->second;
+    if (omissing.is_missing(oid)) {
+      // peer lacks it too (and must be missing the same version)
+      assert(omissing.is_missing(oid, need));
+      if (omissing.loc.count(oid)) {
+        dout(10) << "proc_missing missing " << oid << " " << need
+                 << " on osd" << omissing.loc[oid] << endl;
+        missing.loc[oid] = omissing.loc[oid];
+      } else {
+        dout(10) << "proc_missing missing " << oid << " " << need
+                 << " also LOST on source, osd" << fromosd << endl;
+      }
+    } else if (need <= olog.top) {
+      dout(10) << "proc_missing missing " << oid << " " << need
+               << " on source, osd" << fromosd << endl;
+      missing.loc[oid] = fromosd;
+    } else {
+      dout(10) << "proc_missing " << oid << " " << need
+               << " > olog.top " << olog.top << ", not found...." << endl;
+    }
+  }
+  dout(10) << "proc_missing missing " << missing.missing << endl;
+}
+
+
+
+// Build a backlog: add a MODIFY entry for every object listed in this PG's collection
+// that the log does not already mention, keyed by the object's "version" attribute.
+void PG::generate_backlog()
+{
+  dout(10) << "generate_backlog to " << log << endl;
+  assert(!log.backlog);
+  log.backlog = true;
+
+  list<object_t> stored;
+  osd->store->collection_list(info.pgid, stored);
+  
+  // collect the unlogged objects in a map keyed by version
+  int local = 0;
+  map<eversion_t,Log::Entry> found;
+  list<object_t>::iterator obj = stored.begin();
+  while (obj != stored.end()) {
+    local++;
+    if (!log.logged_object(*obj)) {   // otherwise: already have it logged.
+      Log::Entry e;
+      e.op = Log::Entry::MODIFY;           // FIXME when we do smarter op codes!
+      e.oid = *obj;
+      // NOTE(review): getattr's return value is not checked here
+      osd->store->getattr(*obj, "version", &e.version, sizeof(e.version));
+      found[e.version] = e;
+      dout(10) << "generate_backlog found " << e << endl;
+    }
+    obj++;
+  }
+
+  // prepend in reverse key order, so the lowest key ends up at the very front
+  map<eversion_t,Log::Entry>::reverse_iterator r = found.rbegin();
+  for ( ; r != found.rend(); r++) {
+    log.log.push_front(r->second);
+    log.index(log.log.front());    // index
+  }
+
+  dout(10) << local << " local objects, "
+           << found.size() << " objects added to backlog, " 
+           << log.objects.size() << " in pg" << endl;
+
+  //log.print(cout);
+}
+
+// Discard the backlog: clear the flag and trim every entry at or below log.bottom.
+void PG::drop_backlog()
+{
+  dout(10) << "drop_backlog for " << log << endl;
+
+  assert(log.backlog);
+  log.backlog = false;
+  
+  for (;;) {
+    if (log.log.empty()) break;
+    Log::Entry &oldest = log.log.front();
+    if (oldest.version > log.bottom) break;
+    dout(15) << "drop_backlog trimming " << oldest.version << endl;
+    log.unindex(oldest);
+    log.log.pop_front();
+  }
+}
+
+
+
+
+
+// Stream *this, then each log entry in order, one per line; returns 'out'.
+ostream& PG::Log::print(ostream& out) const 
+{
+  out << *this << endl;
+  list<Entry>::const_iterator e = log.begin();
+  while (e != log.end())
+    out << *(e++) << endl;
+  return out;
+}
+
+
+
+
+
+/******* PG ***********/
+// Rebuild prior_set: the current acting set from index 1 on, plus any OSD that was in
+// this PG's acting set in an epoch since last_epoch_started_any and is up in the current map.
+void PG::build_prior()
+{
+  prior_set.clear();
+  
+  // current (index 0 is skipped)
+  for (unsigned i=1; i<acting.size(); i++)
+    prior_set.insert(acting[i]);
+
+  // and prior map(s), if OSDs are still up
+  epoch_t epoch = MAX(1, last_epoch_started_any);
+  while (epoch < osd->osdmap->get_epoch()) {
+    OSDMap omap;
+    osd->get_map(epoch, omap);
+    
+    vector<int> then_acting;   // acting set as of 'epoch'
+    omap.pg_to_acting_osds(get_pgid(), then_acting);
+    
+    for (unsigned i=0; i<then_acting.size(); i++) {
+      const int o = then_acting[i];
+      if (osd->osdmap->is_up(o) && o != osd->whoami)   // is up now, and is not me
+        prior_set.insert(o);
+    }
+    epoch++;
+  }
+
+  dout(10) << "build_prior built " << prior_set << endl;
+}
+
+// Raise last_epoch_started_any to the newest last_epoch_started reported by any peer
+// (asserting that this is an increase), then rebuild the prior set.
+void PG::adjust_prior()
+{
+  assert(!prior_set.empty());
+
+  epoch_t newest = 0;
+  map<int,Info>::iterator pi = peer_info.begin();
+  for ( ; pi != peer_info.end(); pi++) {
+    if (pi->second.last_epoch_started > newest)
+      newest = pi->second.last_epoch_started;
+  }
+
+  dout(10) << "adjust_prior last_epoch_started_any " 
+           << last_epoch_started_any << " -> " << newest << endl;
+  assert(newest > last_epoch_started_any);
+  last_epoch_started_any = newest;
+
+  // rebuild prior set
+  build_prior();
+}
+
+
+// Reset the peering state kept here and restart last_epoch_started_any from our own info.
+void PG::clear_primary_state()
+{
+  dout(10) << "clear_primary_state" << endl;
+  have_master_log = false;
+  last_epoch_started_any = info.last_epoch_started;
+
+  // osd sets
+  clean_set.clear();
+  stray_set.clear();
+  prior_set.clear();
+  // what we asked of, and learned from, each peer
+  peer_missing.clear();
+  peer_info.clear();
+  peer_log_requested.clear();
+  peer_info_requested.clear();
+}
+
+/**
+ * peer
+ * advance peering as far as the state gathered so far allows.  any
+ * queries that need to be sent are added to query_map (indexed by osd,
+ * then pgid); the function returns early whenever it must wait for a
+ * reply and is expected to be called again later.  phases, in order:
+ *  - make sure we have a PG::Info from everyone in prior_set
+ *  - decide whether the pg crashed (no member of the acting set from
+ *    last_epoch_started_any is up in every later epoch)
+ *  - find who has the newest last_update; if it is not us, ask them for
+ *    their log (or backlog) and return
+ *  - ask non-empty acting[1..] replicas for their full log+missing
+ *  - if objects are still lost, ask every known peer for a backlog
+ *  - otherwise schedule replay (if crashed) or activate
+ *
+ * @param t          transaction handed on to activate()
+ * @param query_map  out: queries to send, osd -> pgid -> Query
+ */
+void PG::peer(ObjectStore::Transaction& t, 
+              map< int, map<pg_t,Query> >& query_map)
+{
+  dout(10) << "peer.  acting is " << acting 
+           << ", prior_set is " << prior_set << endl;
+
+
+  /** GET ALL PG::Info *********/
+
+  // -- query info from everyone in prior_set.
+  bool missing_info = false;
+  for (set<int>::iterator it = prior_set.begin();
+       it != prior_set.end();
+       it++) {
+    if (peer_info.count(*it)) {
+      dout(10) << " have info from osd" << *it 
+               << ": " << peer_info[*it]
+               << endl;      
+      continue;
+    }
+    missing_info = true;
+
+    // already asked; don't send a duplicate query
+    if (peer_info_requested.count(*it)) {
+      dout(10) << " waiting for osd" << *it << endl;
+      continue;
+    }
+    
+    dout(10) << " querying info from osd" << *it << endl;
+    query_map[*it][info.pgid] = Query(Query::INFO, info.history);
+    peer_info_requested.insert(*it);
+  }
+  if (missing_info) return;
+
+  
+  // -- ok, we have all (prior_set) info.  (and maybe others.)
+
+  // did we crash?
+  dout(10) << " last_epoch_started_any " << last_epoch_started_any << endl;
+  if (last_epoch_started_any) {
+    OSDMap omap;
+    osd->get_map(last_epoch_started_any, omap);
+    
+    // start with the last active set of replicas
+    // (note: this local 'acting' shadows the member of the same name)
+    set<int> last_started;
+    vector<int> acting;
+    omap.pg_to_acting_osds(get_pgid(), acting);
+    for (unsigned i=0; i<acting.size(); i++)
+      last_started.insert(acting[i]);
+
+    // make sure at least one of them is still up
+    // (walk every later map and keep only the osds that are up in each)
+    for (epoch_t e = last_epoch_started_any+1;
+         e <= osd->osdmap->get_epoch();
+         e++) {
+      OSDMap omap;
+      osd->get_map(e, omap);
+      
+      set<int> still_up;
+
+      for (set<int>::iterator i = last_started.begin();
+           i != last_started.end();
+           i++) {
+        //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << endl;
+        if (omap.is_up(*i))
+          still_up.insert(*i);
+      }
+
+      last_started.swap(still_up);
+      //dout(10) << " still active as of epoch " << e << ": " << last_started << endl;
+    }
+    
+    if (last_started.empty()) {
+      dout(10) << " crashed since epoch " << last_epoch_started_any << endl;
+      state_set(STATE_CRASHED);
+    } else {
+      dout(10) << " still active from last started: " << last_started << endl;
+    }
+  } else if (osd->osdmap->get_epoch() > 1) {
+    // never started anywhere we know of, yet the map is past epoch 1
+    dout(10) << " crashed since epoch " << last_epoch_started_any << endl;
+    state_set(STATE_CRASHED);
+  }    
+
+  // NOTE(review): peers_complete_thru is only recomputed further down, so
+  // this prints whatever value was left from earlier -- confirm intended.
+  dout(10) << " peers_complete_thru " << peers_complete_thru << endl;
+
+
+
+
+  /** CREATE THE MASTER PG::Log *********/
+
+  // who (of all priors and active) has the latest PG version?
+  eversion_t newest_update = info.last_update;
+  int        newest_update_osd = osd->whoami;
+  
+  oldest_update = info.last_update;  // only of acting (current) osd set.
+  peers_complete_thru = info.last_complete;
+  
+  for (map<int,Info>::iterator it = peer_info.begin();
+       it != peer_info.end();
+       it++) {
+    if (it->second.last_update > newest_update) {
+      newest_update = it->second.last_update;
+      newest_update_osd = it->first;
+    }
+    if (is_acting(it->first)) {
+      if (it->second.last_update < oldest_update) 
+        oldest_update = it->second.last_update;
+      if (it->second.last_complete < peers_complete_thru)
+        peers_complete_thru = it->second.last_complete;
+    }    
+  }
+
+  // gather log(+missing) from that person!
+  if (newest_update_osd != osd->whoami) {
+    if (peer_log_requested.count(newest_update_osd) ||
+        peer_summary_requested.count(newest_update_osd)) {
+      dout(10) << " newest update on osd" << newest_update_osd
+               << " v " << newest_update 
+               << ", already queried" 
+               << endl;
+    } else {
+      // we'd like it back to oldest_update, but will settle for log_bottom
+      eversion_t since = MAX(peer_info[newest_update_osd].log_bottom,
+                             oldest_update);
+      if (peer_info[newest_update_osd].log_bottom < log.top) {
+        // their log reaches back below our top: an incremental log query is enough
+        dout(10) << " newest update on osd" << newest_update_osd
+                 << " v " << newest_update 
+                 << ", querying since " << since
+                 << endl;
+        query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history);
+        peer_log_requested.insert(newest_update_osd);
+      } else {
+        dout(10) << " newest update on osd" << newest_update_osd
+                 << " v " << newest_update 
+                 << ", querying entire summary/backlog"
+                 << endl;
+        assert((peer_info[newest_update_osd].last_complete >= 
+                peer_info[newest_update_osd].log_bottom) ||
+               peer_info[newest_update_osd].log_backlog);  // or else we're in trouble.
+        query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history);
+        peer_summary_requested.insert(newest_update_osd);
+      }
+    }
+    // someone else is newer than us; nothing more to do until they answer
+    return;
+  } else {
+    dout(10) << " newest_update " << info.last_update << " (me)" << endl;
+  }
+
+  dout(10) << " oldest_update " << oldest_update << endl;
+
+  have_master_log = true;
+
+
+  // -- do i need to generate backlog for any of my peers?
+  if (oldest_update < log.bottom && !log.backlog) {
+    dout(10) << "generating backlog for some peers, bottom " 
+             << log.bottom << " > " << oldest_update
+             << endl;
+    generate_backlog();
+  }
+
+
+  /** COLLECT MISSING+LOG FROM PEERS **********/
+  /*
+    we also detect divergent replicas here by pulling the full log
+    from everyone.  
+  */  
+
+  // gather missing from peers
+  // (acting[0] is skipped; empty peers have nothing to report)
+  for (unsigned i=1; i<acting.size(); i++) {
+    int peer = acting[i];
+    if (peer_info[peer].is_empty()) continue;
+    if (peer_log_requested.count(peer) ||
+        peer_summary_requested.count(peer)) continue;
+
+    dout(10) << " pulling log+missing from osd" << peer
+             << endl;
+    query_map[peer][info.pgid] = Query(Query::FULLLOG, info.history);
+    peer_log_requested.insert(peer);
+  }
+
+  // did we get them all?
+  bool have_missing = true;
+  for (unsigned i=1; i<acting.size(); i++) {
+    int peer = acting[i];
+    if (peer_info[peer].is_empty()) continue;
+    if (peer_missing.count(peer)) continue;
+    
+    dout(10) << " waiting for log+missing from osd" << peer << endl;
+    have_missing = false;
+  }
+  if (!have_missing) return;
+
+  dout(10) << " peers_complete_thru " << peers_complete_thru << endl;
+
+  
+  // -- ok.  and have i located all pg contents?
+  if (missing.num_lost() > 0) {
+    dout(10) << "there are still " << missing.num_lost() << " lost objects" << endl;
+
+    // *****
+    // FIXME: i don't think this actually accomplishes anything!
+    // *****
+
+    // ok, let's get more summaries!
+    bool waiting = false;
+    for (map<int,Info>::iterator it = peer_info.begin();
+         it != peer_info.end();
+         it++) {
+      int peer = it->first;
+
+      if (peer_summary_requested.count(peer)) {
+        dout(10) << " already requested summary/backlog from osd" << peer << endl;
+        waiting = true;
+        continue;
+      }
+
+      dout(10) << " requesting summary/backlog from osd" << peer << endl;      
+      query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history);
+      peer_summary_requested.insert(peer);
+      waiting = true;
+    }
+    
+    // only reached with waiting == false when peer_info is empty
+    if (!waiting) {
+      dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << endl;
+    }
+    return;
+  }
+
+  // sanity check
+  assert(missing.num_lost() == 0);
+  assert(info.last_complete >= log.bottom || log.backlog);
+
+
+  // -- crash recovery?
+  if (is_crashed()) {
+    // hold off activation for the replay window; the timer event carries
+    // the current epoch along with the pgid
+    dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << endl;
+    state_set(STATE_REPLAY);
+    g_timer.add_event_after(g_conf.osd_replay_window,
+                            new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch()));
+  } 
+  else if (!is_active()) {
+    // -- ok, activate!
+    activate(t);
+  }
+}
+
+
+/**
+ * activate
+ * mark the pg active and bring it into service:
+ *  - clear stray/crashed/replay state and stamp last_epoch_started
+ *  - persist info and log, and remove local objects the log says are gone
+ *  - initialise the recovery pointers (complete_to / requested_to) and
+ *    kick off recovery if we are not complete
+ *  - if primary (and past epoch 1), send each acting replica the log it
+ *    needs, track what each will be missing, and mark the pg clean if
+ *    everyone already is
+ *  - requeue replayed ops ahead of ops that were waiting for active
+ *
+ * must not be called on a pg that is already active.
+ *
+ * @param t  transaction that receives the info/log writes and removals
+ */
+void PG::activate(ObjectStore::Transaction& t)
+{
+  assert(!is_active());
+
+  // twiddle pg state
+  state_set(STATE_ACTIVE);
+  state_clear(STATE_STRAY);
+  if (is_crashed()) {
+    assert(is_replay());
+    state_clear(STATE_CRASHED);
+    state_clear(STATE_REPLAY);
+  }
+  info.last_epoch_started = osd->osdmap->get_epoch();
+
+  if (role == 0) {    // primary state
+    peers_complete_thru = 0;  // we don't know (yet)!
+  }
+
+  assert(info.last_complete >= log.bottom || log.backlog);
+
+  // write pg info
+  t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info));
+  
+  // write log
+  write_log(t);
+
+  // clean up stray objects
+  clean_up_local(t); 
+
+  // init complete pointer
+  if (info.last_complete == info.last_update) {
+    dout(10) << "activate - complete" << endl;
+    // nothing to recover: both recovery pointers sit at the end of the log.
+    // (this used to read "complete_to == ...", a comparison with no effect,
+    // which left complete_to unset.)
+    log.complete_to = log.log.end();
+    log.requested_to = log.log.end();
+  } 
+  //else if (is_primary()) {
+  else if (true) {
+    dout(10) << "activate - not complete, " << missing << ", starting recovery" << endl;
+    
+    // init complete_to
+    // (advance to the first entry at or past last_complete; one must exist)
+    log.complete_to = log.log.begin();
+    while (log.complete_to->version < info.last_complete) {
+      log.complete_to++;
+      assert(log.complete_to != log.log.end());
+    }
+    
+    // start recovery
+    log.requested_to = log.complete_to;
+    do_recovery();
+  } else {
+    // unreachable while the branch above is hard-wired to true
+    dout(10) << "activate - not complete, " << missing << endl;
+  }
+
+
+  // if primary..
+  if (role == 0 &&
+      osd->osdmap->get_epoch() > 1) {
+    // who is clean?
+    clean_set.clear();
+    if (info.is_clean()) 
+      clean_set.insert(osd->whoami);
+    
+    // start up replicas
+    for (unsigned i=1; i<acting.size(); i++) {
+      int peer = acting[i];
+      assert(peer_info.count(peer));
+      
+      MOSDPGLog *m = new MOSDPGLog(osd->osdmap->get_epoch(), 
+                                   info.pgid);
+      m->info = info;
+      
+      if (peer_info[peer].last_update == info.last_update) {
+        // empty log
+      } 
+      else if (peer_info[peer].last_update < log.bottom) {
+        // summary/backlog
+        assert(log.backlog);
+        m->log = log;
+      } 
+      else {
+        // incremental log
+        assert(peer_info[peer].last_update < info.last_update);
+        m->log.copy_after(log, peer_info[peer].last_update);
+      }
+
+      // update local version of peer's missing list!
+      // (everything we send that is newer than the peer's last_update)
+      {
+        eversion_t plu = peer_info[peer].last_update;
+        Missing& pm = peer_missing[peer];
+        for (list<Log::Entry>::iterator p = m->log.log.begin();
+             p != m->log.log.end();
+             p++) 
+          if (p->version > plu)
+            pm.add(p->oid, p->version);
+      }
+      
+      dout(10) << "activate sending " << m->log << " " << m->missing
+               << " to osd" << peer << endl;
+      //m->log.print(cout);
+      osd->messenger->send_message(m, MSG_ADDR_OSD(peer), osd->osdmap->get_inst(peer));
+
+      // update our missing
+      if (peer_missing[peer].num_missing() == 0) {
+        dout(10) << "activate peer osd" << peer << " already clean, " << peer_info[peer] << endl;
+        assert(peer_info[peer].last_complete == info.last_update);
+        clean_set.insert(peer);
+      } else {
+        dout(10) << "activate peer osd" << peer << " " << peer_info[peer]
+                 << " missing " << peer_missing[peer] << endl;
+      }
+            
+    }
+
+    // discard unneeded peering state
+    //peer_log.clear(); // actually, do this carefully, in case peer() is called again.
+    
+    // all clean?
+    if (is_all_clean()) {
+      state_set(STATE_CLEAN);
+      dout(10) << "activate all replicas clean" << endl;
+      clean_replicas();    
+    }
+  }
+
+  
+  // replay (queue them _before_ other waiting ops!)
+  if (!replay_queue.empty()) {
+    eversion_t c = info.last_update;   // last version accounted for so far
+    list<Message*> replay;
+    for (map<eversion_t,MOSDOp*>::iterator p = replay_queue.begin();
+         p != replay_queue.end();
+         p++) {
+      if (p->first <= info.last_update) {
+        // already applied; it is still requeued
+        dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << endl;
+        replay.push_back(p->second);
+        continue;
+      }
+      if (p->first.version != c.version+1) {
+        // gap in the replayed versions: report how many were skipped
+        // (next expected is c.version+1; the old expression was negated)
+        dout(10) << "activate replay " << p->first
+                 << " skipping " << p->first.version - (c.version+1)
+                 << " ops"
+                 << endl;      
+      }
+      dout(10) << "activate replay " << p->first << " " << *p->second << endl;
+      replay.push_back(p->second);
+      c = p->first;
+    }
+    replay_queue.clear();
+    osd->take_waiters(replay);
+  }
+
+  // waiters
+  osd->take_waiters(waiting_for_active);
+}
+
+/** clean_up_local
+ * queue removal of objects that we hold locally but that the log says
+ * should not exist.  only the newest log entry per object is considered.
+ * without a backlog we can only honor deletes recorded in the log; with
+ * a backlog the log lists the whole pg, so anything in the collection
+ * that the log does not mention is removed as a stray too.
+ */
+void PG::clean_up_local(ObjectStore::Transaction& t)
+{
+  dout(10) << "clean_up_local" << endl;
+
+  assert(info.last_update >= log.bottom);  // otherwise we need some help!
+
+  // oids whose newest log entry has already been looked at
+  set<object_t> seen;
+
+  if (!log.backlog) {
+    // just scan the log, newest entry first.
+    for (list<Log::Entry>::reverse_iterator e = log.log.rbegin();
+         e != log.log.rend();
+         ++e) {
+      if (!seen.insert(e->oid).second)
+        continue;   // a newer entry for this oid was already handled
+      if (!e->is_delete())
+        continue;   // keep old(+missing) objects, just for kicks.
+
+      dout(10) << " deleting " << e->oid
+               << " when " << e->version << endl;
+      t.remove(e->oid);
+    }
+    return;
+  }
+
+  // be thorough: begin with everything currently in the collection ...
+  list<object_t> listed;
+  osd->store->collection_list(info.pgid, listed);
+  set<object_t> unaccounted(listed.begin(), listed.end());
+
+  // ... and cross off whatever the log knows about, newest entry first.
+  for (list<Log::Entry>::reverse_iterator e = log.log.rbegin();
+       e != log.log.rend();
+       ++e) {
+    if (!seen.insert(e->oid).second)
+      continue;
+
+    // erase() reports whether we were actually storing the object
+    const bool stored = unaccounted.erase(e->oid) > 0;
+    if (stored && e->is_delete()) {
+      dout(10) << " deleting " << e->oid
+               << " when " << e->version << endl;
+      t.remove(e->oid);
+    }
+    // non-deletes: just leave old objects.. they're missing or whatever
+  }
+
+  // whatever is left is not mentioned by the log at all
+  for (set<object_t>::iterator stray = unaccounted.begin();
+       stray != unaccounted.end();
+       ++stray) {
+    dout(10) << " deleting stray " << *stray << endl;
+    t.remove(*stray);
+  }
+}
+
+
+
+/**
+ * cancel_recovery
+ * drop all in-progress pulls for this pg and forget where we believed
+ * missing objects could be fetched from.
+ */
+void PG::cancel_recovery()
+{
+  // hand this pg's outstanding pulls back to the osd-wide counter
+  osd->num_pulling -= objects_pulling.size();
+  objects_pulling.clear();
+
+  // forget about where missing items are
+  missing.loc.clear();
+}
+
+/**
+ * do_recovery
+ * try to start (at most) one more object pull for this pg.
+ *
+ * returns true if a pull was just issued, or if the osd is already at
+ * osd_max_pull while this pg has pulls outstanding (nothing new is
+ * started in that case).
+ * returns false once requested_to has reached the end of the log: either
+ * pulls are still outstanding, or recovery is complete.  on completion
+ * the primary adds itself to clean_set (and may mark the pg clean),
+ * while a replica sends its info to the primary.
+ */
+bool PG::do_recovery()
+{
+  // NOTE(review): negative debug level; presumably leftover forced
+  // debugging output -- confirm.
+  dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, "
+           << osd->num_pulling << "/" << g_conf.osd_max_pull << " total"
+           << endl;
+  dout(10) << "do_recovery " << missing << endl;
+
+  // can we slow down on this PG?
+  // (only throttle if this pg already has something in flight)
+  if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) {
+    dout(-10) << "do_recovery already pulling max, waiting" << endl;
+    return true;
+  }
+
+  // look at log!
+  Log::Entry *latest = 0;
+
+  while (log.requested_to != log.log.end()) {
+    // always act on the newest indexed entry for this oid, not the one
+    // requested_to happens to point at
+    assert(log.objects.count(log.requested_to->oid));
+    latest = log.objects[log.requested_to->oid];
+    assert(latest);
+
+    dout(10) << "do_recovery "
+             << *log.requested_to
+             << (objects_pulling.count(latest->oid) ? " (pulling)":"")
+             << endl;
+
+    // requested_to is deliberately not advanced when we pull, so the
+    // same entry is looked at again on the next call
+    if (latest->is_update() &&
+        !objects_pulling.count(latest->oid) &&
+        missing.is_missing(latest->oid)) {
+      osd->pull(this, latest->oid);
+      return true;
+    }
+    
+    log.requested_to++;
+  }
+
+  if (!objects_pulling.empty()) {
+    dout(7) << "do_recovery requested everything, still waiting" << endl;
+    return false;
+  }
+
+  // done?
+  assert(missing.num_missing() == 0);
+  assert(info.last_complete == info.last_update);
+  
+  if (is_primary()) {
+    // i am primary
+    dout(7) << "do_recovery complete, cleaning strays" << endl;
+    clean_set.insert(osd->whoami);
+    if (is_all_clean()) {
+      state_set(PG::STATE_CLEAN);
+      clean_replicas();
+    }
+  } else {
+    // tell primary
+    dout(7) << "do_recovery complete, telling primary" << endl;
+    list<PG::Info> ls;
+    ls.push_back(info);
+    osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(),
+                                                  ls),
+                                 MSG_ADDR_OSD(get_primary()), osd->osdmap->get_inst(get_primary()));
+  }
+
+  return false;
+}
+
+/**
+ * do_peer_recovery
+ * push one object to peers that are missing it.  finds the first member
+ * of the acting set with a non-empty missing list, pushes the object
+ * with the lowest missing version to it, and also pushes that same
+ * object to every later acting member that is missing it.  at most one
+ * object is handled per call; does nothing if no peer is missing
+ * anything.
+ */
+void PG::do_peer_recovery()
+{
+  dout(10) << "do_peer_recovery" << endl;
+
+  for (unsigned i=0; i<acting.size(); i++) {
+    int peer = acting[i];
+    if (peer_missing.count(peer) == 0 ||
+        peer_missing[peer].num_missing() == 0) 
+      continue;
+    
+    // oldest first!  (rmissing is ordered by version)
+    object_t oid = peer_missing[peer].rmissing.begin()->second;
+
+    osd->push(this, oid, peer);
+
+    // do other peers need it too?
+    // (separate index and name, so the outer loop's variables are not
+    // reused or shadowed)
+    for (unsigned j=i+1; j<acting.size(); j++) {
+      int other = acting[j];
+      if (peer_missing.count(other) &&
+          peer_missing[other].is_missing(oid))
+        osd->push(this, oid, other);
+    }
+
+    return;
+  }
+  
+  // nothing to do!
+}
+
+
+
+/**
+ * clean_replicas
+ * send an MOSDPGRemove naming this pg to every osd in stray_set, then
+ * empty stray_set.
+ */
+void PG::clean_replicas()
+{
+  dout(10) << "clean_replicas.  strays are " << stray_set << endl;
+  
+  set<int>::iterator stray = stray_set.begin();
+  while (stray != stray_set.end()) {
+    const int who = *stray;
+    ++stray;
+
+    dout(10) << "sending PGRemove to osd" << who << endl;
+
+    // the removal message carries a set of pgids; ours is the only member
+    set<pg_t> doomed;
+    doomed.insert(info.pgid);
+    MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), doomed);
+    osd->messenger->send_message(m, MSG_ADDR_OSD(who), osd->osdmap->get_inst(who));
+  }
+
+  stray_set.clear();
+}
+
+
+
+/**
+ * write_log
+ * queue a complete rewrite of the on-disk log from the in-memory log.
+ * entries are copied as raw bytes, back to back, starting at offset 0;
+ * block_map is rebuilt to record the version of any entry that begins
+ * exactly on a 4096-byte boundary.  the log bounds and the pg info are
+ * stored as collection attributes in the same transaction.
+ */
+void PG::write_log(ObjectStore::Transaction& t)
+{
+  // a rewritten log always starts at 0 with a fresh block map
+  ondisklog.block_map.clear();
+  ondisklog.bottom = 0;
+
+  // serialize every entry
+  bufferlist bl;
+  list<Log::Entry>::iterator e = log.log.begin();
+  while (e != log.log.end()) {
+    if ((bl.length() % 4096) == 0)
+      ondisklog.block_map[bl.length()] = e->version;
+    bl.append((char*)&(*e), sizeof(*e));
+    ++e;
+  }
+  ondisklog.top = bl.length();
+  
+  // replace the log object, then record bounds and info
+  object_t logoid(1, info.pgid);
+  t.remove(logoid);
+  t.write(logoid, 0, bl.length(), bl);
+
+  t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+  t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+  t.collection_setattr(info.pgid, "info", &info, sizeof(info)); 
+}
+
+/**
+ * trim_ondisklog_to
+ * advance ondisklog.bottom to the start of the last indexed block whose
+ * successor does not begin with a version greater than v (the last
+ * block in block_map always qualifies), and drop the block_map entries
+ * in front of it.  only whole indexed blocks are trimmed; if that block
+ * is already the first one (or nothing is indexed) this is a no-op.  the
+ * updated bounds are stored as collection attributes via t.
+ *
+ * @param t  transaction that receives the attribute updates
+ * @param v  version to trim to
+ */
+void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) 
+{
+  dout(15) << "  trim_ondisk_log_to v " << v << endl;
+
+  // nothing indexed means nothing to trim.  bail out here so that p below
+  // is never end() when it gets dereferenced.
+  if (ondisklog.block_map.empty())
+    return;
+
+  // walk forward to the last block whose successor is not past v
+  map<off_t,eversion_t>::iterator p = ondisklog.block_map.begin();
+  while (p != ondisklog.block_map.end()) {
+    dout(15) << "    " << p->first << " -> " << p->second << endl;
+    p++;
+    if (p == ondisklog.block_map.end() ||
+        p->second > v) {  // too far!
+      p--;                // back up
+      break;
+    }
+  }
+  dout(15) << "  * " << p->first << " -> " << p->second << endl;
+  if (p == ondisklog.block_map.begin()) 
+    return;  // can't trim anything!
+  
+  // we can trim!
+  off_t trim = p->first;
+  dout(10) << "  trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+
+  ondisklog.bottom = trim;
+  
+  // adjust block_map
+  // (erasing other map elements leaves p valid)
+  while (p != ondisklog.block_map.begin()) 
+    ondisklog.block_map.erase(ondisklog.block_map.begin());
+  
+  t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+  t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+}
+
+
+/**
+ * append_log
+ * queue a write of one log entry (raw bytes) at the current end of the
+ * on-disk log, advance ondisklog.top, and, if trim_to is past
+ * log.bottom, trim both the in-memory log and the on-disk log.  the
+ * in-memory log itself is not appended to here.
+ *
+ * @param t         transaction that receives the write and attr updates
+ * @param logentry  entry to append
+ * @param trim_to   trim the log up to this version if it is > log.bottom
+ */
+void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, 
+                    eversion_t trim_to)
+{
+  // write entry on disk
+  bufferlist bl;
+  bl.append( (char*)&logentry, sizeof(logentry) );
+  // NOTE(review): with padding on, each appended entry occupies 4096
+  // bytes, but read_log() steps through the object sizeof(entry) at a
+  // time and write_log() packs entries unpadded -- confirm the formats
+  // are meant to mix.
+  if (g_conf.osd_pad_pg_log) {  // pad to 4k, until i fix ebofs reallocation crap.  FIXME.
+    bufferptr bp(4096 - sizeof(logentry));
+    bl.push_back(bp);
+  }
+  t.write( object_t(1,info.pgid), ondisklog.top, bl.length(), bl );
+  
+  // update block map?
+  // (only entries that begin exactly on a 4k boundary are indexed)
+  if (ondisklog.top % 4096 == 0) 
+    ondisklog.block_map[ondisklog.top] = logentry.version;
+  
+  ondisklog.top += bl.length();
+  t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+  
+  // trim?
+  if (trim_to > log.bottom) {
+    dout(10) << " trimming " << log << " to " << trim_to << endl;
+    log.trim(t, trim_to);
+    // keep the info summary in step with the trimmed log
+    info.log_bottom = log.bottom;
+    info.log_backlog = log.backlog;
+    trim_ondisklog_to(t, trim_to);
+  }
+  dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+}
+
+/**
+ * read_log
+ * load the on-disk log into the in-memory log and rebuild 'missing'.
+ * the log bounds come from the collection attrs, log.bottom/backlog from
+ * info, and log.top from info.last_update.  entries at or below
+ * log.bottom are skipped unless the log is a backlog.  an object is
+ * added to 'missing' when its newest logged (non-delete) version is
+ * newer than info.last_complete and the stored "version" attr is absent
+ * or older.
+ *
+ * @param store  object store to read the log object and attrs from
+ */
+void PG::read_log(ObjectStore *store)
+{
+  // load bounds
+  // (both stay 0 if the attrs cannot be read; return values are ignored)
+  ondisklog.bottom = ondisklog.top = 0;
+  store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+  store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+  
+  log.backlog = info.log_backlog;
+  log.bottom = info.log_bottom;
+  
+  if (ondisklog.top > 0) {
+    // read
+    bufferlist bl;
+    store->read(object_t(1,info.pgid), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl);
+    
+    // NOTE(review): entries are assumed to be packed sizeof(e) apart;
+    // append_log() pads each entry to 4096 bytes when osd_pad_pg_log is
+    // set -- confirm this stride is right in that mode.
+    PG::Log::Entry e;
+    off_t pos = ondisklog.bottom;
+    while (pos < ondisklog.top) {
+      bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e);
+      if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom
+        if (pos % 4096 == 0)
+          ondisklog.block_map[pos] = e.version;
+        log.log.push_back(e);
+      }
+      
+      pos += sizeof(e);
+    }
+  }
+  log.top = info.last_update;
+  log.index();
+
+  // build missing
+  // (newest entry first; only the newest entry per object is considered)
+  set<object_t> did;
+  for (list<Log::Entry>::reverse_iterator i = log.log.rbegin();
+       i != log.log.rend();
+       i++) {
+    if (i->version <= info.last_complete) break;
+    if (did.count(i->oid)) continue;
+    did.insert(i->oid);
+
+    if (i->is_delete()) continue;
+
+    // note: reads via osd->store, not the 'store' argument
+    eversion_t v;
+    int r = osd->store->getattr(i->oid, "version", &v, sizeof(v));
+    if (r < 0 || v < i->version) 
+      missing.add(i->oid, i->version);
+  }
+}
+
diff --git a/branches/sage/cephmds2/osd/PG.h b/branches/sage/cephmds2/osd/PG.h
new file mode 100644
index 0000000000000..f8a040346e88e
--- /dev/null
+++ b/branches/sage/cephmds2/osd/PG.h
@@ -0,0 +1,735 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __PG_H
+#define __PG_H
+
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+#include "OSDMap.h"
+#include "ObjectStore.h"
+#include "msg/Messenger.h"
+#include "messages/MOSDOpReply.h"
+
+#include "include/types.h"
+
+#include <list>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+class OSD;
+
+/**
+ * reqid_t - uniquely identifies a request by the address of the caller
+ * that issued it plus that caller's tid.
+ */
+class reqid_t {
+public:
+  msg_addr_t addr;   // who issued the request
+  tid_t      tid;    // the issuer's id for it
+
+  // default-constructed addr, tid 0
+  reqid_t()
+    : tid(0)
+  {}
+
+  // request 'id' as issued by 'who'
+  reqid_t(const msg_addr_t& who, tid_t id)
+    : addr(who),
+      tid(id)
+  {}
+};
+
+// print as <addr>.<tid>
+inline ostream& operator<<(ostream& out, const reqid_t& r) {
+  out << r.addr << "." << r.tid;
+  return out;
+}
+
+// equal iff both the address and the tid match
+inline bool operator==(const reqid_t& l, const reqid_t& r) {
+  if (!(l.addr == r.addr))
+    return false;
+  return l.tid == r.tid;
+}
+
+// different iff the address or the tid differs
+inline bool operator!=(const reqid_t& l, const reqid_t& r) {
+  if (l.addr != r.addr)
+    return true;
+  return l.tid != r.tid;
+}
+
+namespace __gnu_cxx {
+  // hash for reqid_t so it can key hash_map/hash_set: the hash of the
+  // caller address xor'd with the hash of the tid.
+  template<> struct hash<reqid_t> {
+    size_t operator()(const reqid_t &r) const { 
+      static hash<unsigned long> addr_hash;
+      static hash<__uint64_t>    tid_hash;
+      const size_t from_addr = addr_hash(r.addr._addr);
+      const size_t from_tid = tid_hash(r.tid);
+      return from_addr ^ from_tid;
+    }
+  };
+}
+
+/** PG - Replica Placement Group
+ *
+ */
+
+class PG {
+public:
+  
+  /*
+   * PG::Info - summary of PG statistics.
+   *
+   * some notes: 
+   *  - last_complete implies we have all objects that existed as of that
+   *    stamp, OR a newer object, OR have already applied a later delete.
+   *  - if last_complete >= log.bottom, then we know pg contents thru log.top.
+   *    otherwise, we have no idea what the pg is supposed to contain.
+   *  - PG.cc stores this struct by copying its raw bytes
+   *    (collection_setattr(..., "info", &info, sizeof(info))), so the
+   *    member order and sizes are effectively part of the stored format.
+   */
+  struct Info {
+    pg_t pgid;
+    eversion_t last_update;    // last object version applied to store.
+    eversion_t last_complete;  // last version pg was complete through.
+
+    eversion_t log_bottom;     // oldest log entry.
+    bool       log_backlog;    // do we store a complete log?
+
+    epoch_t last_epoch_started;  // last epoch started.
+    epoch_t last_epoch_finished; // last epoch finished.
+
+    // how far back (in epochs) parts of the pg's mapping have been stable
+    struct History {
+      epoch_t same_since;          // same acting set since
+      epoch_t same_primary_since;  // same primary at least back through this epoch.
+      epoch_t same_acker_since;    // same acker at least back through this epoch.
+      History() : same_since(0), same_primary_since(0), same_acker_since(0) {}
+    } history;
+    
+    // the eversion_t members are left to their own default constructor
+    Info(pg_t p=0) : pgid(p), 
+                     log_backlog(false),
+                     last_epoch_started(0), last_epoch_finished(0) {}
+    // clean: complete through everything we have applied
+    bool is_clean() const { return last_update == last_complete; }
+    // empty: nothing has ever been applied (version counter still 0)
+    bool is_empty() const { return last_update.version == 0; }
+  };
+  
+  
+  /** 
+   * Query - used to ask a peer for information about a pg.
+   *
+   * note: if version=0, type=LOG, then we just provide our full log.
+   *   only if type=BACKLOG do we generate a backlog and provide that too.
+   *
+   * split and floor are only set for LOG queries; the two constructors
+   * assert that the matching form is used for each type.
+   */
+  struct Query {
+    const static int INFO = 0;
+    const static int LOG = 1;
+    const static int BACKLOG = 2;
+    const static int FULLLOG = 3;
+
+    int type;                 // one of the constants above; -1 if unset
+    // LOG only.  PG::peer() passes its own log.top as split and the oldest
+    // version it wants as floor.
+    // NOTE(review): exact semantics are defined by the responder (not
+    // visible here) -- confirm.
+    eversion_t split, floor;
+    Info::History history;    // the sender's view of the pg history
+
+    Query() : type(-1) {}
+    // for INFO, BACKLOG and FULLLOG
+    Query(int t, Info::History& h) : 
+      type(t), history(h) { assert(t != LOG); }
+    // for LOG
+    Query(int t, eversion_t s, eversion_t f, Info::History& h) : 
+      type(t), split(s), floor(f), history(h) { assert(t == LOG); }
+  };
+  
+  
+  /*
+   * Missing - summary of missing objects.
+   *  kept in memory, as a supplement to Log.
+   *  also used to pass missing info in messages.
+   *
+   *  missing and rmissing are two views of the same set (by oid and by
+   *  version) and are kept in step by every mutator.  loc records, for
+   *  some of the missing objects, where we think a copy can be found;
+   *  objects with no loc entry count as "lost".
+   */
+  class Missing {
+  public:
+    map<object_t, eversion_t> missing;   // oid -> v
+    map<eversion_t, object_t> rmissing;  // v -> oid
+
+    map<object_t, int>       loc;       // where i think i can get them.
+
+    // missing objects we have no location for
+    int num_lost() const { return missing.size() - loc.size(); }
+    int num_missing() const { return missing.size(); }
+
+    // is any version of oid missing?
+    bool is_missing(object_t oid) {
+      return missing.count(oid);
+    }
+    // is oid missing at a version no newer than v?
+    bool is_missing(object_t oid, eversion_t v) {
+      return missing.count(oid) && missing[oid] <= v;
+    }
+    // mark oid missing at the default (zero) version
+    void add(object_t oid) {
+      eversion_t z;
+      add(oid,z);
+    }
+    // mark oid missing at version v, unless a newer version is already
+    // recorded as missing
+    void add(object_t oid, eversion_t v) {
+      if (missing.count(oid)) {
+        if (missing[oid] > v) return;   // already missing newer.
+        rmissing.erase(missing[oid]);
+      }
+      missing[oid] = v;
+      rmissing[v] = oid;
+    }
+    // forget oid if the version we were missing is older than 'when'
+    void rm(object_t oid, eversion_t when) {
+      if (missing.count(oid) && missing[oid] < when) {
+        rmissing.erase(missing[oid]);
+        missing.erase(oid);
+        loc.erase(oid);
+      }        
+    }
+    // we obtained oid at version v (must be at least the missing version)
+    void got(object_t oid, eversion_t v) {
+      assert(missing.count(oid));
+      assert(missing[oid] <= v);
+      loc.erase(oid);
+      rmissing.erase(missing[oid]);
+      missing.erase(oid);
+    }
+    // we obtained oid (version not checked)
+    void got(object_t oid) {
+      assert(missing.count(oid));
+      loc.erase(oid);
+      rmissing.erase(missing[oid]);
+      missing.erase(oid);
+    }
+
+    // rmissing is not encoded; _decode() rebuilds it from missing
+    void _encode(bufferlist& blist) {
+      ::_encode(missing, blist);
+      ::_encode(loc, blist);
+    }
+    void _decode(bufferlist& blist, int& off) {
+      ::_decode(missing, blist, off);
+      ::_decode(loc, blist, off);
+
+      // rebuild the reverse index from scratch so that decoding into an
+      // object that was already in use leaves no stale v -> oid entries
+      rmissing.clear();
+      for (map<object_t,eversion_t>::iterator it = missing.begin();
+           it != missing.end();
+           it++) 
+        rmissing[it->second] = it->first;
+    }
+  };
+
+
+  /*
+   * Log - incremental log of recent pg changes.
+   *  also, serves as a recovery queue.
+   *
+   * when backlog is true, 
+   *  objects with versions <= bottom are in log.
+   *  we do not have any deletion info before that time, however.
+   *  log is a "summary" in that it contains all objects in the PG.
+   */
+  class Log {
+  public:
+    /** top, bottom
+     *    top - newest entry (update|delete)
+     * bottom - entry previous to oldest (update|delete) for which we have
+     *          complete negative information.  
+     * i.e. we can infer pg contents for any store whose last_update >= bottom.
+     */
+    eversion_t top;       // newest entry (update|delete)
+    eversion_t bottom;    // version prior to oldest (update|delete) 
+
+    /** backlog - true if log is a complete summary of pg contents.  
+     * updated will include all items in pg, but deleted will not include
+     * negative entries for items deleted prior to 'bottom'.
+     */
+    bool      backlog;
+    
+    /** Entry
+     * one logged operation on one object: what was done (op), to which
+     * object (oid), the version it produced, and the request that caused it.
+     * PG.cc writes and reads entries as raw bytes (sizeof(Entry)), so
+     * the member layout is effectively part of the on-disk log format.
+     */
+    class Entry {
+    public:
+      const static int LOST = 0;
+      const static int MODIFY = 1;
+      const static int CLONE = 2;  
+      const static int DELETE = 3;
+
+      int        op;   // one of LOST, MODIFY, CLONE, DELETE
+      object_t   oid;
+      eversion_t version;
+      objectrev_t rev;  // not set by either constructor
+      
+      reqid_t    reqid;  // caller+tid to uniquely identify request
+      
+      Entry() : op(0) {}
+      Entry(int _op, object_t _oid, const eversion_t& v, 
+            const msg_addr_t& a, tid_t t) : 
+        op(_op), oid(_oid), version(v), reqid(a,t) {}
+      
+      bool is_delete() const { return op == DELETE; }
+      bool is_clone() const { return op == CLONE; }
+      bool is_modify() const { return op == MODIFY; }
+      // an update leaves the object existing (clone or modify)
+      bool is_update() const { return is_clone() || is_modify(); }
+    };
+
+    list<Entry> log;  // the actual log.
+
+    Log() : backlog(false) {}
+
+    // reset to an empty, non-backlog log
+    void clear() {
+      eversion_t z;
+      top = bottom = z;
+      backlog = false;
+      log.clear();
+    }
+    // empty means top has never been set (epoch and version both 0);
+    // the entry list itself is not consulted
+    bool empty() const {
+      return top.version == 0 && top.epoch == 0;
+    }
+
+    // wire format: raw top, bottom, backlog, then the entry list
+    void _encode(bufferlist& blist) const {
+      blist.append((char*)&top, sizeof(top));
+      blist.append((char*)&bottom, sizeof(bottom));
+      blist.append((char*)&backlog, sizeof(backlog));
+      ::_encode(log, blist);
+    }
+    // inverse of _encode; off is advanced past what was consumed
+    void _decode(bufferlist& blist, int& off) {
+      blist.copy(off, sizeof(top), (char*)&top);
+      off += sizeof(top);
+      blist.copy(off, sizeof(bottom), (char*)&bottom);
+      off += sizeof(bottom);
+      blist.copy(off, sizeof(backlog), (char*)&backlog);
+      off += sizeof(backlog);
+
+      ::_decode(log, blist, off);
+    }
+
+    void copy_after(const Log &other, eversion_t v);
+    bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor);
+    void copy_non_backlog(const Log &other);
+    ostream& print(ostream& out) const;
+  };
+
+  /**
+   * IndexedLog - adds in-memory index of the log, by oid.
+   * plus some methods to manipulate it all.
+   *
+   * 'objects' holds raw pointers to entries, so it is only valid while
+   * the entries it points at stay alive at the same address.
+   */
+  class IndexedLog : public Log {
+  public:
+    hash_map<object_t,Entry*> objects;  // ptrs into log.  be careful!
+    hash_set<reqid_t>         caller_ops;
+
+    // recovery pointers
+    list<Entry>::iterator requested_to; // not inclusive of referenced item
+    list<Entry>::iterator complete_to;  // not inclusive of referenced item
+    
+    /****/
+    IndexedLog() {}
+
+    // deliberately unusable: asserts immediately if ever called
+    void clear() {
+      assert(0);
+      unindex();
+      Log::clear();
+    }
+
+    // is there any indexed entry for oid?
+    bool logged_object(object_t oid) {
+      return objects.count(oid);
+    }
+    // has request r been logged?
+    bool logged_req(reqid_t &r) {
+      return caller_ops.count(r);
+    }
+
+    // rebuild both indexes from the whole log.  entries are visited in
+    // list order, so for each oid the last entry in the list wins.
+    void index() {
+      objects.clear();
+      caller_ops.clear();
+      for (list<Entry>::iterator i = log.begin();
+           i != log.end();
+           i++) {
+        objects[i->oid] = &(*i);
+        caller_ops.insert(i->reqid);
+      }
+    }
+
+    // index a single entry; it replaces the current one for its oid only
+    // if it is newer.  stores &e.
+    // NOTE(review): presumably e must be an element of 'log', otherwise
+    // the stored pointer dangles -- confirm against callers.
+    void index(Entry& e) {
+      if (objects.count(e.oid) == 0 || 
+          objects[e.oid]->version < e.version)
+        objects[e.oid] = &e;
+      caller_ops.insert(e.reqid);
+    }
+    // drop both indexes entirely
+    void unindex() {
+      objects.clear();
+      caller_ops.clear();
+    }
+    // drop one entry from the indexes; the oid is only removed if e is
+    // the entry currently indexed for it (matched by version)
+    void unindex(Entry& e) {
+      // NOTE: this only works if we remove from the _bottom_ of the log!
+      assert(objects.count(e.oid));
+      if (objects[e.oid]->version == e.version)
+        objects.erase(e.oid);
+      caller_ops.erase(e.reqid);
+    }
+
+
+    // accessors
+    // return the indexed entry for oid if it is an update, else 0
+    Entry *is_updated(object_t oid) {
+      if (objects.count(oid) && objects[oid]->is_update()) return objects[oid];
+      return 0;
+    }
+    // return the indexed entry for oid if it is a delete, else 0
+    Entry *is_deleted(object_t oid) {
+      if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid];
+      return 0;
+    }
+    
+    // actors
+    // append e (a copy is stored), advance top, and index the stored copy.
+    // e must be newer than the current top.
+    void add(Entry& e) {
+      // add to log
+      log.push_back(e);
+      assert(e.version > top);
+      assert(top.version == 0 || e.version.version > top.version);
+      top = e.version;
+
+      // to our index
+      objects[e.oid] = &(log.back());
+      caller_ops.insert(e.reqid);
+    }
+
+    void trim(ObjectStore::Transaction &t, eversion_t s);
+    void trim_write_ahead(eversion_t last_update);
+  };
+  
+
+  /**
+   * OndiskLog - some info about how we store the log on disk.
+   */
+  class OndiskLog {
+  public:
+    // ok
+    off_t bottom;                     // first byte of log. 
+    off_t top;                        // byte following end of log.
+    map<off_t,eversion_t> block_map;  // block -> first stamp logged there
+
+    OndiskLog() : bottom(0), top(0) {}
+
+    bool trim_to(eversion_t v, ObjectStore::Transaction& t);
+  };
+
+
+  /***
+   */
+
+  class RepOpGather {
+  public:
+    class MOSDOp *op;
+    tid_t rep_tid;
+
+    ObjectStore::Transaction t;
+    bool applied;
+
+    set<int>  waitfor_ack;
+    set<int>  waitfor_commit;
+    
+    utime_t   start;
+
+    bool sent_ack, sent_commit;
+    
+    set<int>         osds;
+    eversion_t       new_version;
+
+    eversion_t       pg_local_last_complete;
+    map<int,eversion_t> pg_complete_thru;
+    
+    RepOpGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc) :
+      op(o), rep_tid(rt),
+      applied(false),
+      sent_ack(false), sent_commit(false),
+      new_version(nv), 
+      pg_local_last_complete(lc) { }
+
+    bool can_send_ack() { 
+      return !sent_ack && !sent_commit &&
+        waitfor_ack.empty(); 
+    }
+    bool can_send_commit() { 
+      return !sent_commit &&
+        waitfor_ack.empty() && waitfor_commit.empty(); 
+    }
+    bool can_delete() { 
+      return waitfor_ack.empty() && waitfor_commit.empty(); 
+    }
+  };
+
+
+  /*** PG ****/
+public:
+  // any
+  static const int STATE_ACTIVE = 1; // i am active.  (primary: replicas too)
+  
+  // primary
+  static const int STATE_CLEAN =  2;  // peers are complete, clean of stray replicas.
+  static const int STATE_CRASHED = 4; // all replicas went down.
+  static const int STATE_REPLAY = 8;  // crashed, waiting for replay
+ 
+  // non-primary
+  static const int STATE_STRAY =  16; // i must notify the primary i exist.
+
+
+ protected:
+  OSD *osd;
+
+public:
+  // pg state
+  Info        info;
+  IndexedLog  log;
+  OndiskLog   ondisklog;
+  Missing     missing;
+  utime_t     last_heartbeat;  // 
+
+protected:
+  int         role;    // 0 = primary, 1 = replica, -1=none.
+  int         state;   // see bit defns above
+
+  // primary state
+ public:
+  vector<int> acting;
+  epoch_t     last_epoch_started_any;
+  eversion_t  last_complete_commit;
+
+  // [primary only] content recovery state
+  eversion_t  peers_complete_thru;
+  bool        have_master_log;
+ protected:
+  set<int>    prior_set;   // current+prior OSDs, as defined by last_epoch_started_any.
+  set<int>    stray_set;   // non-acting osds that have PG data.
+  set<int>    clean_set;   // current OSDs that are clean
+  eversion_t  oldest_update; // lowest (valid) last_update in active set
+  map<int,Info>        peer_info;   // info from peers (stray or prior)
+  set<int>             peer_info_requested;
+  map<int, Missing>    peer_missing;
+  set<int>             peer_log_requested;  // logs i've requested (and start stamps)
+  set<int>             peer_summary_requested;
+  friend class OSD;
+
+
+  // [primary|tail]
+  // old way
+  map<tid_t, class OSDReplicaOp*> replica_ops;
+  map<int, set<tid_t> >           replica_tids_by_osd; // osd -> (tid,...)
+
+  // new way
+  map<tid_t, RepOpGather*>          repop_gather;
+  map<tid_t, list<class Message*> > waiting_for_repop;
+
+  
+  // [primary|replica]
+  // pg waiters
+  list<class Message*>            waiting_for_active;
+  hash_map<object_t, 
+           list<class Message*> > waiting_for_missing_object;   
+  map<eversion_t,class MOSDOp*>   replay_queue;
+  
+  // recovery
+  map<object_t, eversion_t> objects_pulling;  // which objects are currently being pulled
+  
+public:
+  void clear_primary_state();
+
+ public:
+  bool is_acting(int osd) const { 
+    for (unsigned i=0; i<acting.size(); i++)
+      if (acting[i] == osd) return true;
+    return false;
+  }
+  bool is_prior(int osd) const { return prior_set.count(osd); }
+  bool is_stray(int osd) const { return stray_set.count(osd); }
+  
+  bool is_all_clean() const { return clean_set.size() == acting.size(); }
+
+  void build_prior();
+  void adjust_prior();  // based on new peer_info.last_epoch_started_any
+
+  // Recompute peers_complete_thru as the lowest last_complete among this PG's
+  // own info and its acting replicas.  Returns true only if the value advanced.
+  bool adjust_peers_complete_thru() {
+    eversion_t t = info.last_complete;
+    for (unsigned i=1; i<acting.size(); i++)       // slot 0 is the primary (see get_primary())
+      if (peer_info[acting[i]].last_complete < t)  // peer_info is keyed by osd id, not by slot
+        t = peer_info[acting[i]].last_complete;
+    if (!(t > peers_complete_thru)) return false;
+    peers_complete_thru = t;
+    return true;
+  }
+
+  void proc_replica_log(Log &olog, Missing& omissing, int from);
+  void merge_log(Log &olog, Missing& omissing, int from);
+  void proc_missing(Log &olog, Missing &omissing, int fromosd);
+  
+  void generate_backlog();
+  void drop_backlog();
+  
+  void trim_write_ahead();
+
+  void peer(ObjectStore::Transaction& t, map< int, map<pg_t,Query> >& query_map);
+
+  void activate(ObjectStore::Transaction& t);
+
+  void cancel_recovery();
+  bool do_recovery();
+  void do_peer_recovery();
+
+  void clean_replicas();
+
+  off_t get_log_write_pos() {
+    return 0;
+  }
+
+ public:  
+  PG(OSD *o, pg_t p) : 
+    osd(o), 
+    info(p),
+    role(0),
+    state(0),
+    last_epoch_started_any(0),
+    last_complete_commit(0),
+    peers_complete_thru(0),
+    have_master_log(true)
+  { }
+  
+  pg_t       get_pgid() const { return info.pgid; }
+  int        get_nrep() const { return acting.size(); }
+
+  int        get_primary() { return acting.empty() ? -1:acting[0]; }
+  //int        get_tail() { return acting.empty() ? -1:acting[ acting.size()-1 ]; }
+  //int        get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? get_primary():get_tail(); }
+  int        get_acker() { 
+    if (g_conf.osd_rep == OSD_REP_PRIMARY ||
+	acting.size() <= 1) 
+      return get_primary();
+    return acting[1];
+  }
+  
+  int        get_role() const { return role; }
+  void       set_role(int r) { role = r; }
+
+  bool       is_primary() const { return role == PG_ROLE_HEAD; }
+  bool       is_acker() const { return role == PG_ROLE_ACKER; }
+  bool       is_head() const { return role == PG_ROLE_HEAD; }
+  bool       is_middle() const { return role == PG_ROLE_MIDDLE; }
+  bool       is_residual() const { return role == PG_ROLE_STRAY; }
+  
+  //int  get_state() const { return state; }
+  bool state_test(int m) const { return (state & m) != 0; }
+  void state_set(int m) { state |= m; }
+  void state_clear(int m) { state &= ~m; }
+
+  bool is_complete() const { return info.last_complete == info.last_update; }
+
+  bool       is_active() const { return state_test(STATE_ACTIVE); }
+  bool       is_crashed() const { return state_test(STATE_CRASHED); }
+  bool       is_replay() const { return state_test(STATE_REPLAY); }
+  //bool       is_complete()    { return state_test(STATE_COMPLETE); }
+  bool       is_clean() const { return state_test(STATE_CLEAN); }
+  bool       is_stray() const { return state_test(STATE_STRAY); }
+
+  bool  is_empty() const { return info.last_update == 0; }
+
+  int num_active_ops() const {
+    return objects_pulling.size();
+  }
+
+
+  // pg on-disk content
+  void clean_up_local(ObjectStore::Transaction& t);
+
+  // pg on-disk state
+  void write_log(ObjectStore::Transaction& t);
+  void append_log(ObjectStore::Transaction& t, 
+                  PG::Log::Entry& logentry, 
+                  eversion_t trim_to);
+  void read_log(ObjectStore *store);
+  void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v);
+
+
+  
+};
+
+
+
+inline ostream& operator<<(ostream& out, const PG::Info::History& h) 
+{
+  return out << h.same_since << "/" << h.same_primary_since << "/" << h.same_acker_since;
+}
+
+inline ostream& operator<<(ostream& out, const PG::Info& pgi) 
+{
+  out << "pginfo(" << hex << pgi.pgid << dec;
+  if (pgi.is_empty())
+    out << " empty";
+  else
+    out << " v " << pgi.last_update << "/" << pgi.last_complete
+        << " (" << pgi.log_bottom << "," << pgi.last_update << "]"
+        << (pgi.log_backlog ? "+backlog":"");
+  out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished
+      << " " << pgi.history
+      << ")";
+  return out;
+}
+
+inline ostream& operator<<(ostream& out, const PG::Log::Entry& e)
+{
+  return out << " " << e.version 
+             << (e.is_delete() ? " - ":
+		 (e.is_clone() ? " c ":
+		  (e.is_modify() ? " m ":
+		   " ? ")))
+             << e.oid << " by " << e.reqid;
+}
+
+inline ostream& operator<<(ostream& out, const PG::Log& log) 
+{
+  out << "log(" << log.bottom << "," << log.top << "]";
+  if (log.backlog) out << "+backlog";
+  return out;
+}
+
+inline ostream& operator<<(ostream& out, const PG::Missing& missing) 
+{
+  out << "missing(" << missing.num_missing();
+  if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
+  out << ")";
+  return out;
+}
+
+// Debug dump of a PG: info, role, log-consistency warnings, state flags and missing counts.
+inline ostream& operator<<(ostream& out, const PG& pg)
+{
+  out << "pg[" << pg.info 
+      << " r=" << pg.get_role();
+
+  if (pg.log.bottom != pg.info.log_bottom)
+    out << " (info mismatch, " << pg.log << ")";
+
+  if (pg.log.log.empty()) {
+    // should it be?  with no entries, top and bottom versions are expected to agree.
+    if (pg.log.top.version - pg.log.bottom.version != 0) {
+      out << " (log bound mismatch, empty)";
+    }
+  } else {  // first entry must directly follow bottom (unless backlog); last entry must equal top
+    if (((pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) &&
+         !pg.log.backlog) ||
+        (pg.log.log.rbegin()->version.version != pg.log.top.version)) {
+      out << " (log bound mismatch, actual=["
+          << pg.log.log.begin()->version << ","
+          << pg.log.log.rbegin()->version << "])";
+    }
+  }
+
+  if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru;
+  if (!pg.have_master_log) out << " !hml";
+  if (pg.is_active()) out << " active";
+  if (pg.is_crashed()) out << " crashed";
+  if (pg.is_replay()) out << " replay";
+  if (pg.is_clean()) out << " clean";
+  if (pg.is_stray()) out << " stray";
+  //out << " (" << pg.log.bottom << "," << pg.log.top << "]";
+  if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing();
+  if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost();
+  out << "]";
+
+  return out;
+}
+
+
+inline ostream& operator<<(ostream& out, PG::RepOpGather& repop)
+{
+  out << "repop(" << &repop << " rep_tid=" << repop.rep_tid 
+      << " wfack=" << repop.waitfor_ack
+      << " wfcommit=" << repop.waitfor_commit;
+  out << " pct=" << repop.pg_complete_thru;
+  out << " op=" << *(repop.op);
+  out << " repop=" << &repop;
+  out << ")";
+  return out;
+}
+
+
+#endif
diff --git a/branches/sage/cephmds2/osd/rush.cc b/branches/sage/cephmds2/osd/rush.cc
new file mode 100644
index 0000000000000..aebca7ac1a351
--- /dev/null
+++ b/branches/sage/cephmds2/osd/rush.cc
@@ -0,0 +1,230 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+//
+//
+//    rush.cc
+//
+// $Id$
+//
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <cassert>
+#include "rush.h"
+
+
+// Scramble n with an xor and a multiplicative constant; GetServersByKey uses it to seed RushRNG.
+static unsigned int
+myhash (unsigned int n)
+{
+  // 2654435761 is what 884811920 * 3 + 1 wraps to; as plain int arithmetic that product overflows.
+  return (n ^ 0xdead1234) * 2654435761U;
+}
+
+Rush::Rush ()
+{
+  nClusters = 0;
+  totalServers = 0;
+}
+
+//----------------------------------------------------------------------
+//
+//    Rush::AddCluster
+//
+//    Add a cluster.  The number of servers in the cluster and
+//    the weight of each server is passed.  The current number of
+//    clusters is returned.
+//
+//----------------------------------------------------------------------
+int
+Rush::AddCluster (int nServers, double weight)
+{
+  // The per-cluster tables are fixed-size arrays; refuse to write past their end.
+  assert (nClusters < RUSH_MAX_CLUSTERS);
+  clusterSize[nClusters] = nServers;
+  clusterWeight[nClusters] = weight;
+  if (nClusters == 0) {
+    serversInPrevious[0] = 0;
+    totalWeightBefore[0] = 0.0;
+  } else {
+    serversInPrevious[nClusters] = serversInPrevious[nClusters-1] + clusterSize[nClusters-1];
+    totalWeightBefore[nClusters] = totalWeightBefore[nClusters-1] +
+      (double)clusterSize[nClusters-1] * clusterWeight[nClusters-1];
+  }
+  nClusters += 1;
+  totalServers += nServers;
+#if 0
+  for (int i = 0; i < nClusters; i++) {
+    fprintf (stderr, "size=%-3d prev=%-3d weight=%-6.2f prevWeight=%-8.2f\n",
+        clusterSize[i], serversInPrevious[i], clusterWeight[i],
+        totalWeightBefore[i]);
+  }
+#endif
+  return (nClusters);
+}
+
+
+//----------------------------------------------------------------------
+//
+//    Rush::GetServersByKey
+//
+//    This function returns a list of servers on which an object
+//    should be placed.  The servers array must be large enough to
+//    contain the list.
+//
+//----------------------------------------------------------------------
+void
+Rush::GetServersByKey (int key, int nReplicas, int servers[])
+{
+  int    replicasLeft = nReplicas;
+  int    cluster;
+  int    mustAssign, numberAssigned;
+  int    i, toDraw;
+  int    *srv = servers;
+  double    myWeight;
+  RushRNG    rng;
+
+  // There may not be more replicas than servers!
+  assert (nReplicas <= totalServers);
+  
+  for (cluster = nClusters-1; (cluster >= 0) && (replicasLeft > 0); cluster--) {
+    if (serversInPrevious[cluster] < replicasLeft) {
+      mustAssign = replicasLeft - serversInPrevious[cluster];
+    } else {
+      mustAssign = 0;
+    }
+    toDraw = replicasLeft - mustAssign;
+    if (toDraw > (clusterSize[cluster] - mustAssign)) {
+      toDraw = clusterSize[cluster] - mustAssign;
+    }
+    myWeight = (double)clusterSize[cluster] * clusterWeight[cluster];
+    rng.Seed (myhash (key)^cluster, cluster^0xb90738);
+    numberAssigned = mustAssign +
+      rng.HyperGeometricWeighted (toDraw, myWeight,
+                                  totalWeightBefore[cluster] + myWeight,
+                                  clusterWeight[cluster]);
+    if (numberAssigned > 0) {
+      rng.Seed (myhash (key)^cluster ^ 11, cluster^0xfea937);
+      rng.DrawKofN (srv, numberAssigned, clusterSize[cluster]);
+      for (i = 0; i < numberAssigned; i++) {
+        srv[i] += serversInPrevious[cluster];
+      }
+      replicasLeft -= numberAssigned;
+      srv += numberAssigned;
+    }
+  }
+}
+
+
+
+//----------------------------------------------------------------------
+//
+//    RushRNG::HyperGeometricWeighted
+//
+//    Use an iterative method to generate a hypergeometric random
+//    variable.  This approach guarantees that, if the number of draws
+//    is reduced, the number of successes must be as well as long as
+//    the seed for the RNG is the same.
+//
+//----------------------------------------------------------------------
+int
+RushRNG::HyperGeometricWeighted (int nDraws, double yesWeighted,
+                 double totalWeighted, double weightOne)
+{
+  int    positives = 0, i;
+  double    curRand;
+
+  // If the weight is too small (or is negative), choose zero objects.
+  if (weightOne <= 1e-9 || nDraws == 0) {
+    return (0);
+  }
+
+  // Draw nDraws items from the "bag".  For each positive, subtract off
+  // the weight of an object from the weight of positives remaining.  For
+  // each draw, subtract off the weight of an object from the total weight
+  // remaining.
+  for (i = 0; i < nDraws; i++) {
+    curRand = RandomDouble ();
+    if (curRand < (yesWeighted / totalWeighted)) {
+      positives += 1;
+      yesWeighted -= weightOne;
+    }
+    totalWeighted -= weightOne;
+  }
+  return (positives);
+}
+
+//----------------------------------------------------------------------
+//
+//    RushRNG::DrawKofN
+//
+//----------------------------------------------------------------------
+void
+RushRNG::DrawKofN (int vals[], int nToDraw, int setSize)
+{
+  int    deck[setSize];   // NOTE(review): variable-length array, a compiler extension in C++
+  int    i, pick;
+  // Fill vals[0..nToDraw) with distinct values from [0, setSize), chosen with this RNG.
+  assert(nToDraw <= setSize);
+
+  for (i = 0; i < setSize; i++) {
+    deck[i] = i;
+  }
+  // Partial shuffle: every drawn card is replaced by the last still-undrawn one.
+  for (i = 0; i < nToDraw; i++) {
+    pick = (int)(RandomDouble () * (double)(setSize - i));
+    if (pick >= setSize-i) pick = setSize-i-1;  // defensive clamp to the last undrawn slot
+    //    assert(i >= 0 && i < nToDraw);
+    //    assert(pick >= 0 && pick < setSize);
+    vals[i] = deck[pick];
+    deck[pick] = deck[setSize-i-1];
+  }
+}
+
+#define    SEED_X 521288629
+#define    SEED_Y 362436069
+RushRNG::RushRNG ()
+{
+  Seed (0, 0);
+}
+
+void
+RushRNG::Seed (unsigned int seed1, unsigned int seed2)
+{
+  state1 = ((seed1 == 0) ? SEED_X : seed1);
+  state2 = ((seed2 == 0) ? SEED_Y : seed2);
+}
+
+unsigned int
+RushRNG::RandomInt ()
+{
+  const unsigned int a = 18000;
+  const unsigned int b = 18879;
+  unsigned int    rndValue;
+
+  state1 = a * (state1 & 0xffff) + (state1 >> 16);
+  state2 = b * (state2 & 0xffff) + (state2 >> 16);
+  rndValue = (state1 << 16) + (state2 & 0xffff);
+  return (rndValue);
+}
+
+double
+RushRNG::RandomDouble ()
+{
+  double    v;
+
+  v = (double)RandomInt() / (65536.0*65536.0);
+  return (v);
+}
diff --git a/branches/sage/cephmds2/osd/rush.h b/branches/sage/cephmds2/osd/rush.h
new file mode 100644
index 0000000000000..3d880a32415e0
--- /dev/null
+++ b/branches/sage/cephmds2/osd/rush.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+//
+//
+//    rush.h
+//
+//    Classes and definitions for the RUSH algorithm.
+//
+// $Id$
+//
+//
+
+#ifndef    _rush_h_
+#define    _rush_h_
+
+#define    RUSH_MAX_CLUSTERS    100
+
+class RushRNG {
+public:
+  unsigned int    RandomInt ();
+  double RandomDouble ();
+  void    Seed (unsigned int a, unsigned int b);
+  int    HyperGeometricWeighted (int nDraws, double yesWeighted,
+                double totalWeighted, double weightOne);
+  void    DrawKofN (int vals[], int nToDraw, int setSize);
+  RushRNG();
+private:
+  unsigned int state1, state2;
+};
+
+class Rush {
+public:
+  void    GetServersByKey (int key, int nReplicas, int servers[]);
+  int    AddCluster (int nServers, double weight);
+  int    Clusters () {return (nClusters);}
+  int    Servers () {return (totalServers);}
+  Rush ();
+private:
+  int    DrawKofN (int *servers, int n, int clusterSize, RushRNG *g);
+  int    nClusters;
+  int    totalServers;
+  int    clusterSize[RUSH_MAX_CLUSTERS];
+  int    serversInPrevious[RUSH_MAX_CLUSTERS];
+  double clusterWeight[RUSH_MAX_CLUSTERS];
+  double totalWeightBefore[RUSH_MAX_CLUSTERS];
+};
+
+#endif    /* _rush_h_ */
diff --git a/branches/sage/cephmds2/osd/tp.cc b/branches/sage/cephmds2/osd/tp.cc
new file mode 100644
index 0000000000000..c8171895beef0
--- /dev/null
+++ b/branches/sage/cephmds2/osd/tp.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+#include "common/Mutex.h"
+#include "common/ThreadPool.h"
+// #include <thread.h>
+
+class Op {
+  int i;
+
+public:
+ 
+  Op(int i)
+  {
+    this->i = i;
+  }
+
+  int get()
+  {
+    return i;
+  }
+};
+
+void foop(class TP *t, class Op *o);
+
+class TP {
+public:
+  // Work function invoked through foop(): print the calling thread id and the op's value.
+  void foo(Op *o)
+  {
+    cout << "Thread "<< pthread_self() << ": " << o->get() << "\n";
+    usleep(1);
+    
+    //  sched_yield();
+  }
+  // Queue 100 ops on a ThreadPool built with 10 (presumably the thread count), wait 1s, tear down.
+  int main(int argc, char **argv)
+  {
+    ThreadPool<TP,Op> *t = new ThreadPool<TP,Op>(10, (void (*)(TP*, Op*))foop, this);
+    
+    for(int i = 0; i < 100; i++) {
+      Op *o = new Op(i); 
+      t->put_op(o);   // NOTE(review): o is never deleted here; confirm who owns queued ops
+    }
+    
+    sleep(1);
+    
+    delete(t);
+    
+    return 0;
+  }
+};
+// Free-function trampoline handed to ThreadPool; forwards to TP::foo.
+void foop(class TP *t, class Op *o) {
+  t->foo(o);
+}
+// Program entry point.  The second parameter of main() must be char** (argument vector).
+int main(int argc, char **argv) {
+  TP t;
+
+  return t.main(argc,argv);
+}
+
diff --git a/branches/sage/cephmds2/osdc/Blinker.h b/branches/sage/cephmds2/osdc/Blinker.h
new file mode 100644
index 0000000000000..231fe47fb1e31
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Blinker.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __BLINKER_H
+#define __BLINKER_H
+
+class Blinker {
+
+ public:
+  // Operation records.  Op holds the opcode; its ctor and constants must be visible to subclasses.
+  class Op {
+  public:
+    int op;
+    static const int LOOKUP = 1;
+    static const int INSERT = 2;
+    static const int REMOVE = 3;
+    static const int CLEAR = 4;
+    Op(int o) : op(o) {}
+  };
+  
+  class OpLookup : public Op {
+  public:
+    bufferptr key;
+    OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {}
+  };
+
+  class OpInsert : public Op {
+  public:
+    bufferptr key;
+    bufferlist val;
+    OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {}
+  };
+
+  class OpRemove : public Op {
+  public:
+    bufferptr key;
+    OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {}
+  };
+
+  class OpClear : public Op {
+  public:
+    OpClear() : Op(Op::CLEAR) {}
+  };
+
+private:
+  Objecter *objecter;
+
+  // in-flight operations.
+
+
+  // cache information about tree structure.
+  
+
+
+public:
+  // public interface
+
+  // simple accessors
+  void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish);
+
+  // simple modifiers
+  void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe);
+  void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe);
+  void clear(inode_t& inode, Context *onack, Context *onsafe);
+
+  // these are dangerous: the table may be large.
+  void listkeys(inode_t& inode, list<bufferptr>* pkeys, Context *onfinish);
+  void listvals(inode_t& inode, list<bufferptr>* pkeys, list<bufferlist>* pvals, Context *onfinish);
+
+  // fetch *at least* key, but also anything else that is convenient. 
+  // include lexical bounds for which this is a complete result.
+  //  (if *start and *end are empty, it's the entire table)
+  void prefetch(inode_t& inode, bufferptr& key, 
+		list<bufferptr>* pkeys, list<bufferlist>* pvals, 
+		bufferptr *start, bufferptr *end,
+		Context *onfinish);
+
+  
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osdc/Filer.cc b/branches/sage/cephmds2/osdc/Filer.cc
new file mode 100644
index 0000000000000..47094a3056836
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Filer.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include <assert.h>
+
+#include "Filer.h"
+#include "osd/OSDMap.h"
+
+//#include "messages/MOSDRead.h"
+//#include "messages/MOSDReadReply.h"
+//#include "messages/MOSDWrite.h"
+//#include "messages/MOSDWriteReply.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+
+#include "msg/Messenger.h"
+
+#include "include/Context.h"
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".filer "
+
+
+class Filer::C_Probe : public Context {
+public:
+  Filer *filer;
+  Probe *probe;
+  object_t oid;
+  off_t size;
+  C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), size(0) {}
+  void finish(int r) {
+    filer->_probed(probe, oid, size);    
+  }  
+};
+
+int Filer::probe_fwd(inode_t& inode,
+		     off_t start_from,
+		     off_t *end,
+		     Context *onfinish) 
+{
+  dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << endl;
+
+  Probe *probe = new Probe(inode, start_from, end, onfinish);
+
+  // period (bytes before we jump unto a new set of object(s))
+  off_t period = inode.layout.period();
+
+  // start with 1+ periods.
+  probe->probing_len = period;
+  if (start_from % period) 
+    probe->probing_len += period - (start_from % period);
+  
+  _probe(probe);
+  return 0;
+}
+
+// Issue a stat for every object covering probe->from ~ probe->probing_len.
+void Filer::_probe(Probe *probe)
+{
+  dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << endl;
+  // map range onto objects.  file_to_extents() appends, so drop the previous round's extents first.
+  probe->probing.clear();
+  file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing);
+  
+  for (list<ObjectExtent>::iterator p = probe->probing.begin();
+       p != probe->probing.end(); p++) {
+    dout(10) << "_probe  probing " << p->oid << endl;
+    C_Probe *c = new C_Probe(this, probe, p->oid);  // its finish() calls _probed() with c->size
+    probe->ops[p->oid] = objecter->stat(p->oid, &c->size, c);
+  }
+}
+
+void Filer::_probed(Probe *probe, object_t oid, off_t size)
+{
+  dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << endl;
+
+  probe->known[oid] = size;
+  assert(probe->ops.count(oid));
+  probe->ops.erase(oid);
+
+  if (!probe->ops.empty()) 
+    return;  // waiting for more!
+
+  // analyze!
+  off_t end = 0;
+  for (list<ObjectExtent>::iterator p = probe->probing.begin();
+       p != probe->probing.end();
+       p++) {
+    off_t shouldbe = p->length+p->start;
+    dout(10) << "_probed  " << probe->inode.ino << " object " << hex << p->oid << dec
+	     << " should be " << shouldbe
+	     << ", actual is " << probe->known[p->oid]
+	     << endl;
+
+    if (probe->known[p->oid] < 0) { end = -1; break; } // error!
+
+    assert(probe->known[p->oid] <= shouldbe);
+    if (shouldbe == probe->known[p->oid]) continue;  // keep going
+   
+    // aha, we found the end!
+    // calc offset into buffer_extent to get distance from probe->from.
+    off_t oleft = probe->known[p->oid] - p->start;
+    for (map<size_t,size_t>::iterator i = p->buffer_extents.begin();
+	 i != p->buffer_extents.end();
+	 i++) {
+      if (oleft <= (off_t)i->second) {
+	end = probe->from + i->first + oleft;
+	dout(10) << "_probed  end is in buffer_extent " << i->first << "~" << i->second << " off " << oleft 
+		 << ", from was " << probe->from << ", end is " << end 
+		 << endl;
+	break;
+      }
+      oleft -= i->second;
+    }
+    break;
+  }
+
+  if (end == 0) {
+    // keep probing!
+    dout(10) << "_probed didn't find end, probing further" << endl;
+    off_t period = probe->inode.layout.object_size * probe->inode.layout.stripe_count;
+    probe->from += probe->probing_len;
+    probe->probing_len = period;
+    _probe(probe);
+    return;
+  }
+
+  if (end < 0) {
+    dout(10) << "_probed encountered an error while probing" << endl;
+    *probe->end = -1;
+  } else {
+    // hooray!
+    dout(10) << "_probed found end at " << end << endl;
+    *probe->end = end;
+  }
+
+  // done!  finish and clean up.
+  probe->onfinish->finish(end > 0 ? 0:-1);
+  delete probe->onfinish;
+  delete probe;
+}
+
+
+void Filer::file_to_extents(inode_t inode,
+                            off_t offset, size_t len,
+                            list<ObjectExtent>& extents,
+			    objectrev_t rev) 
+{
+  dout(10) << "file_to_extents " << offset << "~" << len 
+           << " on " << hex << inode.ino << dec
+           << endl;
+
+  /* we want only one extent per object!
+   * this means that each extent we read may map into different bits of the 
+   * final read buffer.. hence OSDExtent.buffer_extents
+   */
+  map< object_t, ObjectExtent > object_extents;
+  
+  assert(inode.layout.object_size >= inode.layout.stripe_size);
+  off_t stripes_per_object = inode.layout.object_size / inode.layout.stripe_size;
+  dout(20) << " stripes_per_object " << stripes_per_object << endl;
+
+  off_t cur = offset;
+  off_t left = len;
+  while (left > 0) {
+    // layout into objects
+    off_t blockno = cur / inode.layout.stripe_size;
+    off_t stripeno = blockno / inode.layout.stripe_count;
+    off_t stripepos = blockno % inode.layout.stripe_count;
+    off_t objectsetno = stripeno / stripes_per_object;
+    off_t objectno = objectsetno * inode.layout.stripe_count + stripepos;
+    
+    // find oid, extent
+    ObjectExtent *ex = 0;
+    object_t oid( inode.ino, objectno );
+    if (object_extents.count(oid)) 
+      ex = &object_extents[oid];
+    else {
+      ex = &object_extents[oid];
+      ex->oid = oid;
+      ex->rev = rev;
+      ex->pgid = objecter->osdmap->object_to_pg( oid, inode.layout );
+    }
+    
+    // map range into object
+    off_t block_start = (stripeno % stripes_per_object)*inode.layout.stripe_size;
+    off_t block_off = cur % inode.layout.stripe_size;
+    off_t max = inode.layout.stripe_size - block_off;
+    
+    off_t x_offset = block_start + block_off;
+    off_t x_len;
+    if (left > max)
+      x_len = max;
+    else
+      x_len = left;
+    
+    if (ex->start + (off_t)ex->length == x_offset) {
+      // add to extent
+      ex->length += x_len;
+    } else {
+      // new extent
+      assert(ex->length == 0);
+      assert(ex->start == 0);
+      ex->start = x_offset;
+      ex->length = x_len;
+    }
+    ex->buffer_extents[cur-offset] = x_len;
+        
+    dout(15) << "file_to_extents  " << *ex << " in " << ex->pgid << endl;
+    //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... left " << left << endl;
+    
+    left -= x_len;
+    cur += x_len;
+  }
+  
+  // make final list
+  for (map<object_t, ObjectExtent>::iterator it = object_extents.begin();
+       it != object_extents.end();
+       it++) {
+    extents.push_back(it->second);
+  }
+}
diff --git a/branches/sage/cephmds2/osdc/Filer.h b/branches/sage/cephmds2/osdc/Filer.h
new file mode 100644
index 0000000000000..161bfec304531
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Filer.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef __FILER_H
+#define __FILER_H
+
+/*** Filer
+ *
+ * stripe file ranges onto objects.
+ * build list<ObjectExtent> for the objecter or objectcacher.
+ *
+ * also, provide convenience methods that call objecter for you.
+ *
+ * "files" are identified by ino. 
+ */
+
+#include <set>
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+#include "include/types.h"
+
+#include "osd/OSDMap.h"
+#include "Objecter.h"
+
+class Context;
+class Messenger;
+class OSDMap;
+
+
+/**** Filer interface ***/
+
+/*
+ * Filer
+ *
+ * Maps (inode, offset, length) file ranges onto lists of ObjectExtents
+ * and provides inline wrappers that build such a list and pass the
+ * resulting operation on to the Objecter.
+ */
+class Filer {
+  Objecter   *objecter;
+  
+  // probes
+  // (bookkeeping handed to _probe()/_probed(); presumably one instance
+  //  per probe_fwd() call -- confirm against Filer.cc)
+  struct Probe {
+    inode_t inode;
+    off_t from;
+    off_t *end;
+    Context *onfinish;
+    
+    list<ObjectExtent> probing;
+    off_t probing_len;
+    
+    map<object_t, off_t> known;
+    map<object_t, tid_t> ops;
+
+    Probe(inode_t &i, off_t f, off_t *e, Context *c)
+      : inode(i),
+        from(f),
+        end(e),
+        onfinish(c),
+        probing_len(0)
+    {}
+  };
+  
+  class C_Probe;
+  //friend class C_Probe;  
+
+  void _probe(Probe *p);
+  void _probed(Probe *p, object_t oid, off_t size);
+
+ public:
+  Filer(Objecter *o) : objecter(o) {}
+  ~Filer() {}
+
+  // simply reports what the objecter reports
+  bool is_active() {
+    return objecter->is_active(); // || (oc && oc->is_active());
+  }
+
+  /*** async file interface ***/
+
+  /* Every wrapper below maps the byte range onto object extents, submits
+   * the operation to the objecter, and returns 0 when the objecter call
+   * returned a positive value, -1 otherwise.
+   */
+
+  // read the range into *bl
+  int read(inode_t& inode,
+           off_t offset, 
+           size_t len, 
+           bufferlist *bl,   // ptr to data
+           Context *onfinish) {
+    Objecter::OSDRead *op = new Objecter::OSDRead(bl);
+    file_to_extents(inode, offset, len, op->extents);
+    if (objecter->readx(op, onfinish) > 0)
+      return 0;
+    return -1;
+  }
+
+  // write bl to the range; rev is passed through to the extent mapping
+  int write(inode_t& inode,
+            off_t offset, 
+            size_t len, 
+            bufferlist& bl,
+            int flags, 
+            Context *onack,
+            Context *oncommit,
+            objectrev_t rev=0) {
+    Objecter::OSDWrite *op = new Objecter::OSDWrite(bl);
+    file_to_extents(inode, offset, len, op->extents, rev);
+    if (objecter->modifyx(op, onack, oncommit) > 0)
+      return 0;
+    return -1;
+  }
+
+  // submit an OSD_OP_ZERO modify for the range
+  int zero(inode_t& inode,
+           off_t offset,
+           size_t len,
+           Context *onack,
+           Context *oncommit) {
+    Objecter::OSDModify *op = new Objecter::OSDModify(OSD_OP_ZERO);
+    file_to_extents(inode, offset, len, op->extents);
+    if (objecter->modifyx(op, onack, oncommit) > 0)
+      return 0;
+    return -1;
+  }
+
+  // submit an OSD_OP_DELETE modify for the range
+  int remove(inode_t& inode,
+             off_t offset,
+             size_t len,
+             Context *onack,
+             Context *oncommit) {
+    Objecter::OSDModify *op = new Objecter::OSDModify(OSD_OP_DELETE);
+    file_to_extents(inode, offset, len, op->extents);
+    if (objecter->modifyx(op, onack, oncommit) > 0)
+      return 0;
+    return -1;
+  }
+
+  // defined out of line
+  int probe_fwd(inode_t& inode,
+                off_t start_from,
+                off_t *end,
+                Context *onfinish);
+
+
+  /***** mapping *****/
+
+  /* map (ino, ono) to an object name
+     (to be used on any osd in the proper replica group) */
+  /*object_t file_to_object(inodeno_t ino,
+                          size_t    _ono) {  
+    __uint64_t ono = _ono;
+    assert(ino < (1ULL<<OID_INO_BITS));       // legal ino can't be too big
+    assert(ono < (1ULL<<OID_ONO_BITS));
+    return ono + (ino << OID_ONO_BITS);
+  }
+  */
+
+
+  /* map (ino, offset, len) to a (list of) OSDExtents 
+     (byte ranges in objects on (primary) osds);
+     results are appended to extents */
+  void file_to_extents(inode_t inode,
+                       off_t offset,
+                       size_t len,
+                       list<ObjectExtent>& extents,
+                       objectrev_t rev=0);
+  
+};
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/osdc/Journaler.cc b/branches/sage/cephmds2/osdc/Journaler.cc
new file mode 100644
index 0000000000000..1bee1542bf906
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Journaler.cc
@@ -0,0 +1,601 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "Journaler.h"
+
+#include "include/Context.h"
+#include "common/Logger.h"
+#include "msg/Messenger.h"
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".journaler "
+#define derr(x)  if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".journaler "
+
+
+
+/*
+ * Put the journal into the blank, active state: every position pointer
+ * is set to inode.layout.period().
+ */
+void Journaler::reset()
+{
+  dout(1) << "reset to blank journal" << endl;
+  state = STATE_ACTIVE;
+
+  const off_t start = inode.layout.period();
+
+  // trimmer
+  trimmed_pos = start;
+  trimming_pos = start;
+  expire_pos = start;
+
+  // reader
+  received_pos = start;
+  requested_pos = start;
+  read_pos = start;
+
+  // writer
+  ack_pos = start;
+  flush_pos = start;
+  write_pos = start;
+}
+
+
+/***************** HEADER *******************/
+
+// Print a header as "loghead(trim T, expire E, read R, write W)".
+ostream& operator<<(ostream& out, Journaler::Header &h) 
+{
+  out << "loghead(trim " << h.trimmed_pos;
+  out << ", expire " << h.expire_pos;
+  out << ", read " << h.read_pos;
+  out << ", write " << h.write_pos;
+  out << ")";
+  return out;
+}
+
+// Completion for the header read issued by recover(): passes the result
+// code and the buffer that was read into on to _finish_read_head().
+class Journaler::C_ReadHead : public Context {
+  Journaler *journaler;
+public:
+  bufferlist bl;   // the read is directed into this buffer
+  C_ReadHead(Journaler *j) : journaler(j) {}
+  void finish(int r) {
+    journaler->_finish_read_head(r, bl);
+  }
+};
+
+// Completion for the probe issued by _finish_read_head(): passes the
+// result code and the probed end offset on to _finish_probe_end().
+class Journaler::C_ProbeEnd : public Context {
+  Journaler *journaler;
+public:
+  off_t end;   // probe_fwd() is given a pointer to this; starts out as -1
+  C_ProbeEnd(Journaler *j) : journaler(j), end(-1) {}
+  void finish(int r) {
+    journaler->_finish_probe_end(r, end);
+  }
+};
+
+/*
+ * Begin recovering journal state by reading the header.
+ *
+ * onread (may be null) is queued on waitfor_recover.  If a recovery is
+ * already under way nothing further is started.  Must not be called
+ * once the journal is active.
+ */
+void Journaler::recover(Context *onread) 
+{
+  assert(state != STATE_ACTIVE);
+
+  if (onread)
+    waitfor_recover.push_back(onread);
+  
+  if (state == STATE_UNDEF) {
+    // first caller: kick off the header read
+    dout(1) << "read_head" << endl;
+    state = STATE_READHEAD;
+    C_ReadHead *fin = new C_ReadHead(this);
+    filer.read(inode, 0, sizeof(Header), &fin->bl, fin);
+    return;
+  }
+
+  dout(1) << "recover - already recoverying" << endl;
+}
+
+/*
+ * Completion of the header read started by recover().
+ *
+ * An empty result is taken to mean an empty log: the journal goes
+ * active and the recover waiters are completed.  Otherwise the header
+ * is unpacked into the position pointers and a forward probe is started
+ * from the header's write_pos to find the real end of the log.
+ */
+void Journaler::_finish_read_head(int r, bufferlist& bl)
+{
+  assert(state == STATE_READHEAD);
+
+  if (bl.length() == 0) {
+    dout(1) << "_finish_read_head r=" << r << " read 0 bytes, assuming empty log" << endl;    
+    state = STATE_ACTIVE;
+    list<Context*> ls;
+    ls.swap(waitfor_recover);
+    finish_contexts(ls, 0);
+    return;
+  } 
+
+  // unpack header
+  Header h;
+  assert(bl.length() == sizeof(h));
+  bl.copy(0, sizeof(h), (char*)&h);
+  // report h.write_pos: the member write_pos has not been loaded from the
+  // header yet, and h.write_pos is where the probe below actually starts.
+  dout(1) << "_finish_read_head " << h << ".  probing for end of log (from " << h.write_pos << ")..." << endl;
+
+  write_pos = flush_pos = ack_pos = h.write_pos;
+  read_pos = requested_pos = received_pos = h.read_pos;
+  expire_pos = h.expire_pos;
+  trimmed_pos = trimming_pos = h.trimmed_pos;
+
+  // probe the log
+  state = STATE_PROBING;
+  C_ProbeEnd *fin = new C_ProbeEnd(this);
+  filer.probe_fwd(inode, h.write_pos, &fin->end, fin);
+}
+
+/*
+ * Completion of the end-of-log probe.  `end` becomes the write
+ * position, the journal is marked active, and everyone waiting on
+ * recover() is completed with result 0.
+ */
+void Journaler::_finish_probe_end(int r, off_t end)
+{
+  assert(r >= 0);
+  assert(end >= write_pos);
+  assert(state == STATE_PROBING);
+
+  dout(1) << "_finish_probe_end write_pos = " << end 
+	  << " (header had " << write_pos << "). recovered."
+	  << endl;
+  
+  write_pos = flush_pos = ack_pos = end;
+
+  // Recovery is complete: make the transition explicit instead of relying
+  // on STATE_PROBING and STATE_ACTIVE having the same numeric value.
+  state = STATE_ACTIVE;
+  
+  // done.
+  list<Context*> ls;
+  ls.swap(waitfor_recover);
+  finish_contexts(ls, 0);
+}
+
+
+// WRITING
+
+// Completion for the header write issued by write_head().  Keeps a copy
+// of the header that was written so _finish_write_head() can record it.
+class Journaler::C_WriteHead : public Context {
+public:
+  Journaler *ls;
+  Header h;
+  Context *oncommit;
+  C_WriteHead(Journaler *journaler, Header& written, Context *fin)
+    : ls(journaler),
+      h(written),
+      oncommit(fin)
+  {}
+  void finish(int r) {
+    ls->_finish_write_head(h, oncommit);
+  }
+};
+
+/*
+ * Write the current positions to the header at offset 0.
+ * oncommit (may be null) is handed to the C_WriteHead completion.
+ * Note that the header's write_pos is taken from ack_pos.
+ */
+void Journaler::write_head(Context *oncommit)
+{
+  assert(state == STATE_ACTIVE);
+
+  // snapshot positions into last_written
+  Header &head = last_written;
+  head.trimmed_pos = trimmed_pos;
+  head.expire_pos = expire_pos;
+  head.read_pos = read_pos;
+  head.write_pos = ack_pos; //write_pos;
+  dout(10) << "write_head " << last_written << endl;
+  
+  last_wrote_head = g_clock.now();
+
+  bufferlist bl;
+  bl.append((char*)&last_written, sizeof(last_written));
+
+  // no onack callback; the completion is passed as oncommit
+  Context *fin = new C_WriteHead(this, last_written, oncommit);
+  filer.write(inode, 0, bl.length(), bl, 0, 0, fin);
+}
+
+/*
+ * Header write completed: remember it as last_committed, complete the
+ * caller's callback (if any), then call trim().
+ */
+void Journaler::_finish_write_head(Header &wrote, Context *oncommit)
+{
+  dout(10) << "_finish_write_head " << wrote << endl;
+  last_committed = wrote;
+
+  if (oncommit != 0) {
+    oncommit->finish(0);
+    delete oncommit;
+  }
+
+  // trim() works from last_committed.expire_pos, which was just updated
+  trim();  // trim?
+}
+
+
+/***************** WRITING *******************/
+
+// Completion for a write submitted by flush(); carries the offset at
+// which that write started.
+class Journaler::C_Flush : public Context {
+  Journaler *journaler;
+  off_t start;
+public:
+  C_Flush(Journaler *j, off_t s) : journaler(j), start(s) {}
+  void finish(int r) {
+    journaler->_finish_flush(r, start);
+  }
+};
+
+/*
+ * Completion of the write that flush() submitted at offset `start`.
+ * Records latency when a logger is attached, retires the pending
+ * flush, advances ack_pos, and completes the waitfor_flush waiters
+ * whose offset is now <= ack_pos.
+ */
+void Journaler::_finish_flush(int r, off_t start)
+{
+  assert(r>=0);
+
+  assert(start >= ack_pos);
+  assert(start < flush_pos);
+  assert(pending_flush.count(start));
+
+  // calc latency?
+  if (logger) {
+    utime_t elapsed = g_clock.now();
+    elapsed -= pending_flush[start];
+    logger->finc("lsum", elapsed);
+    logger->inc("lnum");
+  }
+
+  pending_flush.erase(start);
+
+  // ack_pos becomes the start of the oldest flush still pending, or
+  // flush_pos when nothing is pending
+  ack_pos = pending_flush.empty() ? flush_pos : pending_flush.begin()->first;
+
+  dout(10) << "_finish_flush from " << start
+	   << ", pending_flush now " << pending_flush 
+	   << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos
+	   << endl;
+
+  // kick waiters <= ack_pos
+  // (begin() is re-evaluated after each batch of callbacks has run)
+  while (!waitfor_flush.empty() &&
+         waitfor_flush.begin()->first <= ack_pos) {
+    finish_contexts(waitfor_flush.begin()->second);
+    waitfor_flush.erase(waitfor_flush.begin());
+  }
+}
+
+
+/*
+ * Append one entry (a size_t length prefix followed by the payload in
+ * bl) to the write buffer.
+ *
+ * When g_conf.journaler_allow_split_entries is off and the entry would
+ * cross a stripe_size boundary, the gap up to the next boundary is
+ * zero-filled and flushed first.  If onsync is given, the entry is
+ * flushed immediately with onsync as the flush waiter.
+ *
+ * Returns the write position after the entry.
+ */
+off_t Journaler::append_entry(bufferlist& bl, Context *onsync)
+{
+  size_t entry_len = bl.length();
+
+  const int stripe = inode.layout.stripe_size;
+  if (!g_conf.journaler_allow_split_entries &&
+      write_pos / stripe != (write_pos + bl.length() + sizeof(entry_len)) / stripe) {
+    // we would span a stripe boundary: move write_pos up to the boundary.
+    const off_t old_pos = write_pos;
+    write_pos += stripe;
+    write_pos -= (write_pos % stripe);
+    
+    // pad with zeros.
+    bufferptr pad(write_pos - old_pos);
+    pad.zero();
+    assert(pad.length() >= 4);
+    write_buf.push_back(pad);
+    
+    // now flush.
+    flush();
+    
+    dout(12) << "append_entry skipped " << (write_pos-old_pos) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << endl;
+  }
+	
+  dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(size_t)) << endl;
+  
+  // append length prefix, then payload
+  write_buf.append((char*)&entry_len, sizeof(entry_len));
+  write_buf.append(bl);
+  write_pos += sizeof(entry_len) + entry_len;
+
+  // flush now?
+  if (onsync) 
+    flush(onsync);
+
+  return write_pos;
+}
+
+
+/*
+ * Submit everything buffered in write_buf (flush_pos .. write_pos) as
+ * one write.
+ *
+ * onsync (may be null) is completed once ack_pos has reached the
+ * write position current at the time of this call:
+ *  - nothing buffered and nothing in flight: completed immediately;
+ *  - nothing buffered but earlier flushes still unacked: queued on
+ *    waitfor_flush at flush_pos, so it is not completed before the data
+ *    it is waiting for has been acked;
+ *  - otherwise: queued on waitfor_flush at the new flush position.
+ *
+ * Also rewrites the header if more than 30 seconds have passed since
+ * the last header write.
+ */
+void Journaler::flush(Context *onsync)
+{
+  if (write_pos == flush_pos) {
+    assert(write_buf.length() == 0);
+    dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl;
+
+    if (onsync) {
+      if (ack_pos < flush_pos) {
+        // previously submitted writes are still in flight; _finish_flush
+        // will kick this waiter once ack_pos catches up.
+        waitfor_flush[flush_pos].push_back(onsync);
+      } else {
+        onsync->finish(0);
+        delete onsync;
+      }
+    }
+    return;
+  }
+
+  unsigned len = write_pos - flush_pos;
+  assert(len == write_buf.length());
+  dout(10) << "flush flushing " << flush_pos << "~" << len << endl;
+
+  // submit write for anything pending
+  filer.write(inode, flush_pos, len, write_buf, 0,
+	      new C_Flush(this, flush_pos), 0);  // flush _start_ pos to _finish_flush
+  pending_flush[flush_pos] = g_clock.now();
+  
+  // adjust pointers
+  flush_pos = write_pos;
+  write_buf.clear();  
+
+  dout(10) << "flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl;
+
+  // queue waiter (at _new_ write_pos; will go when reached by ack_pos)
+  if (onsync) 
+    waitfor_flush[write_pos].push_back(onsync);
+
+  // write head?
+  if (last_wrote_head.sec() + 30 < g_clock.now().sec()) {
+    write_head();
+  }
+}
+
+
+
+/***************** READING *******************/
+
+
+// Completion for a read submitted by _issue_read().
+class Journaler::C_Read : public Context {
+  Journaler *journaler;
+public:
+  C_Read(Journaler *j) : journaler(j) {}
+  void finish(int r) {
+    journaler->_finish_read(r);
+  }
+};
+
+// Queued as a flush waiter by _issue_read() when the reader has caught
+// up with ack_pos; re-runs is_readable() for its side effect of starting
+// a read.  The return value is ignored.
+class Journaler::C_RetryRead : public Context {
+  Journaler *journaler;
+public:
+  C_RetryRead(Journaler *j) : journaler(j) {}
+  void finish(int r) {
+    journaler->is_readable();  // this'll kickstart.
+  }
+};
+
+/*
+ * Read completion: move what arrived in reading_buf onto read_buf and,
+ * if that makes the next entry readable, complete a wait_for_readable()
+ * waiter and/or a parked read_entry().  Always finishes with a prefetch
+ * check.
+ */
+void Journaler::_finish_read(int r)
+{
+  assert(r>=0);
+
+  dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << endl;
+  received_pos += reading_buf.length();
+  read_buf.claim_append(reading_buf);
+  assert(received_pos <= requested_pos);
+  dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length() 
+	   << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos
+	   << endl;
+  
+  if (is_readable()) { // NOTE: this check may read more
+    dout(10) << "_finish_read now readable" << endl;
+
+    // wait_for_readable() waiter: clear the slot before calling back
+    if (on_readable) {
+      Context *waiter = on_readable;
+      on_readable = 0;
+      waiter->finish(0);
+      delete waiter;
+    }
+
+    // parked read_entry(): fill the caller's bufferlist, then call back
+    if (read_bl) {
+      bool got = try_read_entry(*read_bl);
+      assert(got);  // this should have worked.
+
+      // clear state before the callback runs
+      Context *fin = on_read_finish;
+      read_bl = 0;
+      on_read_finish = 0;
+      
+      fin->finish(0);
+      delete fin;
+    }
+  }
+  
+  // prefetch?
+  _prefetch();
+}
+
+/* _issue_read(len)
+ *  Request up to len more bytes starting at requested_pos.  Does
+ *  nothing if a read is already outstanding.  Never reads past ack_pos;
+ *  if the reader is already at ack_pos, a retry is queued as a flush
+ *  waiter instead (flushing first if nothing is in flight).
+ *
+ * NOTE: this could be slightly smarter... we could allow
+ * multiple reads to be in progress.  e.g., if we prefetch, but
+ * then discover we need even more for an especially large entry.
+ * i don't think that circumstance will arise particularly often.
+ */
+void Journaler::_issue_read(off_t len)
+{
+  if (_is_reading()) {
+    dout(10) << "_issue_read " << len << " waiting, already reading " 
+	     << received_pos << "~" << (requested_pos-received_pos) << endl;
+    return;
+  } 
+  assert(requested_pos == received_pos);
+
+  // stuck at ack_pos?
+  assert(requested_pos <= ack_pos);
+  if (requested_pos == ack_pos) {
+    dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << endl;
+    assert(write_pos > requested_pos);
+    if (flush_pos == ack_pos) {
+      flush();
+    }
+    assert(flush_pos > ack_pos);
+    Context *retry = new C_RetryRead(this);
+    waitfor_flush[flush_pos].push_back(retry);
+    return;
+  }
+
+  // don't read too much
+  if (requested_pos + len > ack_pos) {
+    len = ack_pos - requested_pos;
+    dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << endl;
+  }
+
+  // go.
+  const off_t from = requested_pos;
+  dout(10) << "_issue_read reading " << from << "~" << len 
+	   << ", read pointers " << read_pos << "/" << received_pos << "/" << (from+len)
+	   << endl;
+  
+  filer.read(inode, from, len, &reading_buf, 
+	     new C_Read(this));
+  requested_pos += len;
+}
+
+/*
+ * Issue a read of fetch_len bytes when the amount requested beyond
+ * read_pos has dropped to prefetch_from or less, no read is in
+ * progress, and write_pos is still ahead of requested_pos.
+ */
+void Journaler::_prefetch()
+{
+  const off_t left = requested_pos - read_pos;
+
+  if (left > prefetch_from)
+    return;                      // still plenty requested
+  if (_is_reading())
+    return;                      // a read is already outstanding
+  if (write_pos <= requested_pos)
+    return;                      // nothing more to read
+
+  dout(10) << "_prefetch only " << left << " < " << prefetch_from
+	   << ", prefetching " << endl;
+  _issue_read(fetch_len);
+}
+
+
+/*
+ * Read the next entry into *bl and complete onfinish.  If an entry is
+ * already buffered this happens before returning; otherwise bl and
+ * onfinish are parked and _finish_read() completes them later.  Only
+ * one such read may be outstanding.
+ */
+void Journaler::read_entry(bufferlist *bl, Context *onfinish)
+{
+  // only one read at a time!
+  assert(read_bl == 0);
+  assert(on_read_finish == 0);
+  
+  if (!is_readable()) {
+    dout(10) << "read_entry at " << read_pos << ", read_buf is " 
+	     << read_pos << "~" << read_buf.length() 
+	     << ", not readable now" << endl;
+
+    bl->clear();
+
+    // park the request
+    // (is_readable() will have already initiated a read, if it was possible)
+    read_bl = bl;
+    on_read_finish = onfinish;
+    return;
+  }
+
+  dout(10) << "read_entry at " << read_pos << ", read_buf is " 
+	   << read_pos << "~" << read_buf.length() 
+	   << ", readable now" << endl;
+
+  // nice, just do it now.
+  bool got = try_read_entry(*bl);
+  assert(got);
+    
+  // callback
+  onfinish->finish(0);
+  delete onfinish;    
+}
+
+
+/* is_readable()
+ *  return true if a complete entry (length prefix plus payload) is
+ *  sitting in read_buf.
+ *  otherwise return false, after possibly starting a read for more
+ *  data, or -- when everything up to write_pos has been received and
+ *  the tail is still incomplete -- pulling the write pointers back to
+ *  read_pos.
+ */
+bool Journaler::is_readable() 
+{
+  // anything to read?
+  if (read_pos == write_pos) return false;
+
+  // peek at the length prefix, if we have that much
+  size_t entry_len = 0;
+  if (read_buf.length() >= sizeof(entry_len)) {
+    read_buf.copy(0, sizeof(entry_len), (char*)&entry_len);
+
+    // entry and payload?
+    if (read_buf.length() >= sizeof(entry_len) + entry_len) 
+      return true;  // yep, next entry is ready.
+  }
+
+  // darn it!
+
+  // partial fragment at the end?
+  if (received_pos == write_pos) {
+    dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << endl;
+    write_pos = flush_pos = ack_pos = read_pos;
+    assert(write_buf.length() == 0);
+
+    // truncate?
+    // FIXME: how much?
+    
+    return false;
+  } 
+
+  // a read is already on its way
+  if (_is_reading())
+    return false;
+
+  // start reading some more; if the entry length is known, make sure the
+  // fetch is big enough to cover the rest of the entry
+  if (entry_len)
+    fetch_len = MAX(fetch_len, sizeof(entry_len)+entry_len-read_buf.length()); 
+  _issue_read(fetch_len);
+
+  return false;
+}
+
+
+/* try_read_entry(bl)
+ *  if the next entry is ready, move its payload into bl (which must be
+ *  empty), advance read_pos past it, and return true.
+ *  otherwise return false.  (is_readable() may have started a fetch.)
+ */
+bool Journaler::try_read_entry(bufferlist& bl)
+{
+  if (!is_readable()) {  // this may start a read. 
+    dout(10) << "try_read_entry at " << read_pos << " not readable" << endl;
+    return false;
+  }
+  
+  // length prefix
+  size_t entry_len;
+  assert(read_buf.length() >= sizeof(entry_len));
+  read_buf.copy(0, sizeof(entry_len), (char*)&entry_len);
+  assert(read_buf.length() >= sizeof(entry_len) + entry_len);
+
+  const size_t total = sizeof(entry_len) + entry_len;
+  dout(10) << "try_read_entry at " << read_pos << " reading " 
+	   << read_pos << "~" << total << endl;
+
+  // drop the prefix, then hand the payload to the caller
+  assert(bl.length() == 0);
+  read_buf.splice(0, sizeof(entry_len));
+  read_buf.splice(0, entry_len, &bl);
+  read_pos += total;
+
+  // prefetch?
+  _prefetch();
+  return true;
+}
+
+/*
+ * Register onreadable to be completed by _finish_read() once the next
+ * entry becomes readable.  Must only be called while no entry is
+ * readable and no other waiter is registered.
+ */
+void Journaler::wait_for_readable(Context *onreadable)
+{
+  dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << endl;
+
+  // is_readable() has a side effect (it may kick off the read that will
+  // eventually wake this waiter), so it must run even when asserts are
+  // compiled out.
+  bool readable = is_readable();
+  assert(!readable);
+  (void)readable;
+
+  assert(on_readable == 0);
+  on_readable = onreadable;
+}
+
+
+
+
+/***************** TRIMMING *******************/
+
+
+// Completion for the remove submitted by trim(); carries the offset the
+// trim extends to.
+class Journaler::C_Trim : public Context {
+  Journaler *journaler;
+  off_t to;
+public:
+  C_Trim(Journaler *j, off_t t) : journaler(j), to(t) {}
+  void finish(int r) {
+    journaler->_trim_finish(r, to);
+  }
+};
+
+/*
+ * Remove journal data from trimming_pos up to the last committed
+ * expire_pos rounded down to a multiple of layout.period().  Does
+ * nothing when that target is 0 or equals trimming_pos.
+ */
+void Journaler::trim()
+{
+  // round the committed expire position down to a period boundary
+  const off_t expire = last_committed.expire_pos;
+  const off_t trim_to = expire - (expire % inode.layout.period());
+  dout(10) << "trim last_commited head was " << last_committed
+	   << ", can trim to " << trim_to
+	   << endl;
+
+  const bool nothing_new = (trim_to == 0 || trim_to == trimming_pos);
+  if (nothing_new) {
+    dout(10) << "trim already trimmed/trimming to " 
+	     << trimmed_pos << "/" << trimming_pos << endl;
+    return;
+  }
+  
+  // trim
+  assert(trim_to <= write_pos);
+  assert(trim_to > trimming_pos);
+  dout(10) << "trim trimming to " << trim_to 
+	   << ", trimmed/trimming/expire are " 
+	   << trimmed_pos << "/" << trimming_pos << "/" << expire_pos
+	   << endl;
+  
+  // no onack callback; the completion is passed as oncommit
+  Context *fin = new C_Trim(this, trim_to);
+  filer.remove(inode, trimming_pos, trim_to-trimming_pos, 0, fin);
+  trimming_pos = trim_to;  
+}
+
+/*
+ * Completion of a trim: advance trimmed_pos to `to` and complete the
+ * waitfor_trim waiters whose offset is now <= trimmed_pos.
+ */
+void Journaler::_trim_finish(int r, off_t to)
+{
+  dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos
+	   << ", trimmed/trimming/expire now "
+	   << to << "/" << trimming_pos << "/" << expire_pos
+	   << endl;
+  assert(r >= 0);
+  
+  assert(to <= trimming_pos);
+  assert(to > trimmed_pos);
+  trimmed_pos = to;
+
+  // finishers?
+  while (!waitfor_trim.empty()) {
+    if (waitfor_trim.begin()->first > trimmed_pos)
+      break;
+    finish_contexts(waitfor_trim.begin()->second, 0);
+    waitfor_trim.erase(waitfor_trim.begin());
+  }
+}
+
+
+// eof.
diff --git a/branches/sage/cephmds2/osdc/Journaler.h b/branches/sage/cephmds2/osdc/Journaler.h
new file mode 100644
index 0000000000000..0b8d7061330e8
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Journaler.h
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+/* Journaler
+ *
+ * This class stripes a serial log over objects on the store.  Four logical pointers:
+ *
+ *  write_pos - where we're writing new entries
+ *   read_pos - where we're reading old entries
+ * expire_pos - what is deemed "old" by user
+ *   trimmed_pos - where we're expiring old items
+ *
+ *  trimmed_pos <= expire_pos <= read_pos <= write_pos.
+ *
+ * Often, read_pos <= write_pos (as with MDS log).  During recovery, write_pos is undefined
+ * until the end of the log is discovered.
+ *
+ * A "head" struct at the beginning of the log is used to store metadata at
+ * regular intervals.  The basic invariants include:
+ *
+ *   head.read_pos   <= read_pos   -- the head may "lag", since it's updated lazily.
+ *   head.write_pos  <= write_pos
+ *   head.expire_pos <= expire_pos
+ *   head.trimmed_pos   <= trimmed_pos
+ *
+ * More significantly,
+ *
+ *   head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log
+ *                                  as last recorded, before it is trimmed.  trimming will
+ *                                  block until a sufficiently current expire_pos is committed.
+ *
+ * To recover log state, we simply start at the last write_pos in the head, and probe the
+ * object sequence sizes until we reach the end.  
+ *
+ * Head struct is stored in the first object.  Actual journal starts after layout.period() bytes.
+ *
+ */
+
+#ifndef __JOURNALER_H
+#define __JOURNALER_H
+
+#include "Objecter.h"
+#include "Filer.h"
+
+#include <list>
+#include <map>
+
+class Context;
+class Logger;
+
+/*
+ * Journaler: a serial log striped over objects via a Filer, with
+ * separate write, read and trim positions and a small header stored at
+ * offset 0 of the log "file".
+ */
+class Journaler {
+
+  // this goes at the head of the log "file".
+  // last_written is what write_head() most recently submitted;
+  // last_committed is what _finish_write_head() most recently saw complete.
+  struct Header {
+    off_t trimmed_pos;
+    off_t expire_pos;
+    off_t read_pos;
+    off_t write_pos;
+    Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {}
+  } last_written, last_committed;
+
+  friend ostream& operator<<(ostream& out, Header &h);
+
+
+  // me
+  inode_t inode;
+  Objecter *objecter;
+  Filer filer;
+
+  Logger *logger;   // optional; flush latency is reported here when set
+
+  // my state
+  // NOTE(review): STATE_PROBING and STATE_ACTIVE have the same value (2).
+  // As a result is_active() is already true while probing and recover()
+  // asserts if called during a probe.  Giving them distinct values also
+  // requires that the probe completion set state to STATE_ACTIVE explicitly.
+  static const int STATE_UNDEF = 0;
+  static const int STATE_READHEAD = 1;
+  static const int STATE_PROBING = 2;
+  static const int STATE_ACTIVE = 2;
+
+  int state;
+
+  // header
+  utime_t last_wrote_head;   // when write_head() last ran
+  void _finish_write_head(Header &wrote, Context *oncommit);
+  class C_WriteHead;
+  friend class C_WriteHead;
+
+  list<Context*> waitfor_recover;   // completed when recovery finishes
+  void _finish_read_head(int r, bufferlist& bl);
+  void _finish_probe_end(int r, off_t end);
+  class C_ReadHead;
+  friend class C_ReadHead;
+  class C_ProbeEnd;
+  friend class C_ProbeEnd;
+
+
+
+  // writer
+  off_t write_pos;       // logical write position, where next entry will go
+  off_t flush_pos;       // where we will flush. if write_pos>flush_pos, we're buffering writes.
+  off_t ack_pos;         // what has been acked.
+  bufferlist write_buf;  // write buffer.  flush_pos + write_buf.length() == write_pos.
+
+  std::map<off_t, utime_t> pending_flush;  // start offsets and times for pending flushes
+  std::map<off_t, std::list<Context*> > waitfor_flush; // when flushed through given offset
+
+  void _finish_flush(int r, off_t start);
+  class C_Flush;
+  friend class C_Flush;
+
+  // reader
+  off_t read_pos;      // logical read position, where next entry starts.
+  off_t requested_pos; // what we've requested from OSD.
+  off_t received_pos;  // what we've received from OSD.
+  bufferlist read_buf; // read buffer.  read_pos + read_buf.length() == received_pos.
+  bufferlist reading_buf; // what i'm reading into
+
+  off_t fetch_len;     // how much to read at a time
+  off_t prefetch_from; // how far from end do we read next chunk
+
+  // for read_entry() in-progress read
+  bufferlist *read_bl;
+  Context    *on_read_finish;
+  // for wait_for_readable()
+  Context    *on_readable;
+
+  // true while a requested read has not been fully received
+  bool _is_reading() {
+    return received_pos < requested_pos;
+  }
+  void _finish_read(int r);     // we just read some (read completion callback)
+  void _issue_read(off_t len);  // read some more
+  void _prefetch();             // maybe read ahead
+  class C_Read;
+  friend class C_Read;
+  class C_RetryRead;
+  friend class C_RetryRead;
+
+  // trimmer
+  off_t expire_pos;    // what we're allowed to trim to
+  off_t trimming_pos;      // what we've requested to trim through
+  off_t trimmed_pos;   // what has been trimmed
+  map<off_t, list<Context*> > waitfor_trim;
+
+  void _trim_finish(int r, off_t to);
+  class C_Trim;
+  friend class C_Trim;
+
+public:
+  /*
+   * fl  - fetch_len; 0 selects object_size * stripe_count
+   * pff - prefetch_from; 0 selects half of fetch_len
+   */
+  Journaler(inode_t& inode_, Objecter *obj, Logger *l, off_t fl=0, off_t pff=0) : 
+    inode(inode_), objecter(obj), filer(objecter), logger(l),
+    state(STATE_UNDEF),
+    write_pos(0), flush_pos(0), ack_pos(0),
+    read_pos(0), requested_pos(0), received_pos(0),
+    fetch_len(fl), prefetch_from(pff),
+    read_bl(0), on_read_finish(0), on_readable(0),
+    expire_pos(0), trimming_pos(0), trimmed_pos(0) 
+  {
+    // prefetch intelligently.
+    // (watch out, this is big if you use big objects or weird striping)
+    if (fetch_len == 0)
+      fetch_len = inode.layout.object_size*inode.layout.stripe_count;      
+    if (prefetch_from == 0)
+      prefetch_from = fetch_len / 2;
+  }
+
+  // me
+  //void open(Context *onopen);
+  //void claim(Context *onclaim, msg_addr_t from);
+
+  /* reset 
+   *  NOTE: we assume the caller knows/has ensured that any objects 
+   * in our sequence do not exist.. e.g. after a MKFS.  this is _not_
+   * an "erase" method.
+   */
+  void reset();
+  void recover(Context *onfinish);
+  void write_head(Context *onsave=0);
+
+  bool is_active() { return state == STATE_ACTIVE; }
+
+  off_t get_write_pos() const { return write_pos; }
+  off_t get_read_pos() const { return read_pos; }
+  off_t get_expire_pos() const { return expire_pos; }
+  off_t get_trimmed_pos() const { return trimmed_pos; }
+
+  // write
+  off_t append_entry(bufferlist& bl, Context *onsync = 0);
+  void flush(Context *onsync = 0);
+
+  // read
+  // reposition the reader at p and drop anything buffered
+  void set_read_pos(off_t p) { 
+    assert(requested_pos == received_pos);  // we can't cope w/ in-progress read right now.
+    assert(read_bl == 0); // ...
+    read_pos = p;
+    requested_pos = p;
+    received_pos = p;
+    read_buf.clear();
+  }
+  bool is_readable();
+  bool try_read_entry(bufferlist& bl);
+  void wait_for_readable(Context *onfinish);
+  void read_entry(bufferlist* bl, Context *onfinish);
+  
+  // trim
+  void set_expire_pos(off_t ep) { expire_pos = ep; }
+  void trim();
+  //bool is_trimmable() { return trimming_pos < expire_pos; }
+  //void trim(off_t trim_to=0, Context *c=0);
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/osdc/ObjectCacher.cc b/branches/sage/cephmds2/osdc/ObjectCacher.cc
new file mode 100644
index 0000000000000..e2520f595096d
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/ObjectCacher.cc
@@ -0,0 +1,1472 @@
+
+#include "msg/Messenger.h"
+#include "ObjectCacher.h"
+#include "Objecter.h"
+
+
+
+/*** ObjectCacher::BufferHead ***/
+
+
+/*** ObjectCacher::Object ***/
+
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << oc->objecter->messenger->get_myaddr() << ".objectcacher.object(" << oid << ") "
+
+
+/*
+ * split bh in two at object offset off.
+ * - bh keeps [bh->start(), off); a newly allocated bh with the same state
+ *   and last-write info takes [off, old end) and is added to the cacher.
+ * - any buffered data is divided between the two halves.
+ * - read waiters registered at or beyond off move to the right half.
+ * returns the new right-hand bh.
+ */
+ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *bh, off_t off)
+{
+  dout(20) << "split " << *bh << " at " << off << endl;
+
+  // split off right
+  ObjectCacher::BufferHead *right = new BufferHead(this);
+  right->last_write_tid = bh->last_write_tid;
+  right->last_write = bh->last_write;   // keep the age too, so both halves flush alike
+  right->set_state(bh->get_state());
+
+  off_t newleftlen = off - bh->start();
+  right->set_start( off );
+  right->set_length( bh->length() - newleftlen );
+
+  // shorten left
+  oc->bh_stat_sub(bh);
+  bh->set_length( newleftlen );
+  oc->bh_stat_add(bh);
+
+  // add right
+  oc->bh_add(this, right);
+
+  // split buffers too
+  bufferlist bl;
+  bl.claim(bh->bl);
+  if (bl.length()) {
+    assert(bl.length() == (bh->length() + right->length()));
+    right->bl.substr_of(bl, bh->length(), right->length());
+    bh->bl.substr_of(bl, 0, bh->length());
+  }
+
+  // move read waiters.
+  // start from lower_bound so that _every_ entry at or past the split
+  // point is handed over, including the very first one in the map (a
+  // backwards walk that stops at begin() never looks at that entry).
+  map<off_t, list<Context*> >::iterator first_moved =
+    bh->waitfor_read.lower_bound(right->start());
+  for (map<off_t, list<Context*> >::iterator p = first_moved;
+       p != bh->waitfor_read.end();
+       p++) {
+    dout(0) << "split  moving waiters at byte " << p->first << " to right bh" << endl;
+    right->waitfor_read[p->first].swap( p->second );
+  }
+  bh->waitfor_read.erase(first_moved, bh->waitfor_read.end());
+
+  dout(20) << "split    left is " << *bh << endl;
+  dout(20) << "split   right is " << *right << endl;
+  return right;
+}
+
+
+/*
+ * fold right into left.  the two must be adjacent (left ends where right
+ * starts) and in the same state.  left grows by right's length and takes
+ * over right's data and read waiters; right is removed from the cacher
+ * and deleted.
+ */
+void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right)
+{
+  assert(left->end() == right->start());
+  assert(left->get_state() == right->get_state());
+
+  dout(10) << "merge_left " << *left << " + " << *right << endl;
+
+  // unhook right, then grow left (stats are re-counted around the resize)
+  oc->bh_remove(this, right);
+  oc->bh_stat_sub(left);
+  left->set_length( left->length() + right->length());
+  oc->bh_stat_add(left);
+
+  // data: right's bytes go on the end of left's
+  left->bl.claim_append(right->bl);
+
+  // version
+  // note: this is sorta busted, but should only be used for dirty buffers
+  left->last_write_tid =  MAX( left->last_write_tid, right->last_write_tid );
+  left->last_write = MAX( left->last_write, right->last_write );
+
+  // waiters: right's lists go in front of whatever left already has
+  // queued at the same offset.
+  map<off_t, list<Context*> >::iterator w = right->waitfor_read.begin();
+  while (w != right->waitfor_read.end()) {
+    list<Context*>& dst = left->waitfor_read[w->first];
+    dst.splice( dst.begin(), w->second );
+    w++;
+  }
+
+  // hose right
+  delete right;
+
+  dout(10) << "merge_left result " << *left << endl;
+}
+
+/* buggy possibly, but more importantly, unnecessary.
+void ObjectCacher::Object::merge_right(BufferHead *left, BufferHead *right)
+{
+  assert(left->end() == right->start());
+  assert(left->get_state() == right->get_state());
+
+  dout(10) << "merge_right " << *left << " + " << *right << endl;
+  oc->bh_remove(this, left);
+  oc->bh_stat_sub(right);
+  data.erase(right->start());
+  right->set_start( left->start() );
+  data[right->start()] = right;
+  right->set_length( left->length() + right->length());
+  oc->bh_stat_add(right);
+
+  // data
+  bufferlist nbl;
+  nbl.claim(left->bl);
+  nbl.claim_append(right->bl);
+  right->bl.claim(nbl);
+  
+  // version 
+  // note: this is sorta busted, but should only be used for dirty buffers
+  right->last_write_tid =  MAX( left->last_write_tid, right->last_write_tid );
+
+  // waiters
+  map<off_t,list<Context*> > old;
+  old.swap(right->waitfor_read);
+
+  // take left's waiters
+  right->waitfor_read.swap(left->waitfor_read);
+
+  // shift old waiters
+  for (map<off_t, list<Context*> >::iterator p = old.begin();
+       p != old.end();
+       p++) 
+    right->waitfor_read[p->first + left->length()].swap( p->second );
+  
+  // hose left
+  delete left;
+
+  dout(10) << "merge_right result " << *right << endl;
+}
+*/
+
+/*
+ * map the byte ranges of rd that live in this object onto buffer_heads.
+ * - hits:    ranges backed by a readable (clean, dirty or tx) bh
+ * - rx:      ranges whose bh is currently rx (not readable yet)
+ * - missing: ranges with no bh at all; a new bh is created and added
+ *            to the cacher for each of them
+ * all three maps are keyed by the object offset where the range begins.
+ * always returns 0.
+ */
+int ObjectCacher::Object::map_read(Objecter::OSDRead *rd,
+                                   map<off_t, BufferHead*>& hits,
+                                   map<off_t, BufferHead*>& missing,
+                                   map<off_t, BufferHead*>& rx)
+{
+  list<ObjectExtent>::iterator ex_it = rd->extents.begin();
+  for ( ; ex_it != rd->extents.end(); ex_it++) {
+
+    // extents that belong to other objects are not ours to map.
+    if (ex_it->oid != oid) continue;
+
+    dout(10) << "map_read " << ex_it->oid 
+             << " " << ex_it->start << "~" << ex_it->length << endl;
+
+    off_t cur = ex_it->start;
+    off_t left = ex_it->length;
+
+    // first bh that starts at or after the extent...
+    map<off_t, BufferHead*>::iterator p = data.lower_bound(ex_it->start);
+
+    // ...unless the one before it reaches into our range.
+    if (p != data.begin() && 
+        (p == data.end() || p->first > cur)) {
+      map<off_t, BufferHead*>::iterator prev = p;
+      prev--;
+      if (prev->first + prev->second->length() > cur)
+        p = prev;   // overlaps, start there.
+    }
+
+    while (left > 0) {
+      if (p == data.end()) {
+        // nothing cached from here on: the rest is one big miss.
+        BufferHead *n = new BufferHead(this);
+        n->set_start( cur );
+        n->set_length( left );
+        oc->bh_add(this, n);
+        missing[cur] = n;
+        dout(20) << "map_read miss " << left << " left, " << *n << endl;
+        cur += left;
+        left = 0;
+        assert(cur == ex_it->start + (off_t)ex_it->length);
+        break;  // no more.
+      }
+
+      if (p->first > cur) {
+        // hole in front of the next bh: miss up to it (or to the end
+        // of the extent, whichever comes first).
+        off_t next = p->first;
+        BufferHead *n = new BufferHead(this);
+        n->set_start( cur );
+        n->set_length( MIN(next - cur, left) );
+        oc->bh_add(this,n);
+        missing[cur] = n;
+        off_t step = MIN(left, n->length());
+        cur += step;
+        left -= step;
+        dout(20) << "map_read gap " << *n << endl;
+        continue;    // more?
+      }
+
+      // p covers cur: we have it (or part of it).
+      BufferHead *e = p->second;
+      if (e->is_clean() ||
+          e->is_dirty() ||
+          e->is_tx()) {
+        hits[cur] = e;     // readable!
+        dout(20) << "map_read hit " << *e << endl;
+      } 
+      else if (e->is_rx()) {
+        rx[cur] = e;       // missing, not readable.
+        dout(20) << "map_read rx " << *e << endl;
+      }
+      else {
+        assert(0);
+      }
+
+      off_t lenfromcur = MIN(e->end() - cur, left);
+      cur += lenfromcur;
+      left -= lenfromcur;
+      p++;
+    }
+  }
+  return(0);
+}
+
+/*
+ * map a range of extents on an object's buffer cache.
+ * - combine any bh's we're writing into one
+ * - break up bufferheads that don't fall completely within the range
+ * //no! - return a bh that includes the write.  may also include other dirty data to left and/or right.
+ *
+ * only extents of wr whose oid matches this object are considered.
+ * returns the single bh covering the written range (asserted non-null).
+ *
+ * NOTE(review): merge_left() asserts that both bh's are in the same state;
+ * nothing here makes the states equal before merging -- confirm callers
+ * never hit that with mixed states.
+ */
+ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr)
+{
+  BufferHead *final = 0;
+
+  for (list<ObjectExtent>::iterator ex_it = wr->extents.begin();
+       ex_it != wr->extents.end();
+       ex_it++) {
+
+    if (ex_it->oid != oid) continue;
+
+    dout(10) << "map_write oex " << ex_it->oid
+             << " " << ex_it->start << "~" << ex_it->length << endl;
+
+    map<off_t, BufferHead*>::iterator p = data.lower_bound(ex_it->start);
+    // p->first >= start
+
+    off_t cur = ex_it->start;
+    off_t left = ex_it->length;
+
+    if (p != data.begin() && 
+        (p == data.end() || p->first > cur)) {
+      p--;     // might overlap or butt up!
+      if (p->first + p->second->length() <= cur) 
+        p++;   // doesn't overlap.
+    }    
+
+    while (left > 0) {
+      off_t max = left;
+
+      // at end ?
+      if (p == data.end()) {
+        if (final == NULL) {
+          final = new BufferHead(this);
+          final->set_start( cur );
+          final->set_length( max );
+          oc->bh_add(this, final);
+          dout(10) << "map_write adding trailing bh " << *final << endl;
+        } else {
+          // final is already accounted for in the cacher's stats;
+          // re-count it around the resize, like split()/merge_left() do.
+          oc->bh_stat_sub(final);
+          final->set_length( final->length() + max );
+          oc->bh_stat_add(final);
+        }
+        left -= max;
+        cur += max;
+        continue;
+      }
+
+      dout(10) << "p is " << *p->second << endl;
+
+      if (p->first <= cur) {
+        BufferHead *bh = p->second;
+        dout(10) << "map_write bh " << *bh << " intersected" << endl;
+
+        if (p->first < cur) {
+          // the write starts in the middle of bh.
+          assert(final == 0);
+          bool want_middle = (cur + max < bh->end());
+          final = split(bh, cur);   // take the right half...
+          p++;
+          assert(p->second == final);
+          if (want_middle)
+            split(final, cur+max);  // ...and cut off what lies past the write.
+          p++;                      // step past final.
+        } else {
+          // bh starts exactly at cur.
+          if (bh->length() > max) {
+            // we want left bit (one splice)
+            split(bh, cur + max);
+          }
+          // step past bh _before_ merging: merge_left() removes bh from
+          // the map and deletes it, which would leave p dangling.
+          p++;
+          if (final) 
+            merge_left(final,bh);
+          else
+            final = bh;
+        }
+
+        // keep going.
+        off_t lenfromcur = final->end() - cur;
+        cur += lenfromcur;
+        left -= lenfromcur;
+        continue; 
+      } else {
+        // gap!
+        off_t next = p->first;
+        off_t glen = MIN(next - cur, max);
+        dout(10) << "map_write gap " << cur << "~" << glen << endl;
+        if (final) {
+          // see above: keep the stats in step with the new length.
+          oc->bh_stat_sub(final);
+          final->set_length( final->length() + glen );
+          oc->bh_stat_add(final);
+        } else {
+          final = new BufferHead(this);
+          final->set_start( cur );
+          final->set_length( glen );
+          oc->bh_add(this, final);
+        }
+
+        cur += glen;
+        left -= glen;
+        continue;    // more?
+      }
+    }
+  }
+
+  // set version
+  assert(final);
+  dout(10) << "map_write final is " << *final << endl;
+
+  return final;
+}
+
+
+
+/*** ObjectCacher ***/
+
+#undef dout
+#define  dout(l)    if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << objecter->messenger->get_myaddr() << ".objectcacher "
+
+
+/* private */
+
+/*
+ * start a read for bh: mark it rx and ask the objecter for exactly the
+ * bh's range.  the data lands in the bufferlist embedded in the
+ * C_ReadFinish context, which is also the completion callback.
+ */
+void ObjectCacher::bh_read(BufferHead *bh)
+{
+  dout(7) << "bh_read on " << *bh << endl;
+
+  mark_rx(bh);
+
+  // finisher
+  C_ReadFinish *fin = 
+    new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length());
+
+  // go
+  objecter->read(bh->ob->get_oid(), 
+                 bh->start(), bh->length(), 
+                 &fin->bl,
+                 fin);
+}
+
+/*
+ * a read of [start, start+length) on oid has completed; bl holds the data.
+ * - every rx bh found in that range gets its slice of bl and is marked clean.
+ * - bh's that are no longer rx are skipped.
+ * - contexts waiting on the bh's we filled are completed once all bh's
+ *   have been updated.
+ * if the object is no longer cached the result is dropped.
+ */
+void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl)
+{
+  //lock.Lock();
+  dout(7) << "bh_read_finish " 
+          << oid 
+          << " " << start << "~" << length
+          << endl;
+
+  if (objects.count(oid) == 0) {
+    dout(7) << "bh_read_finish no object cache" << endl;
+  } else {
+    Object *ob = objects[oid];
+
+    // waiters are collected here and only completed after the walk over
+    // ob->data is done: a completion may re-enter the cacher and add or
+    // trim bh's, which must not happen underneath our iterator.
+    list<Context*> ls;
+
+    // apply to bh's!
+    off_t opos = start;
+    map<off_t, BufferHead*>::iterator p = ob->data.lower_bound(opos);
+
+    while (p != ob->data.end() &&
+           opos < start+(off_t)length) {
+      BufferHead *bh = p->second;
+
+      if (bh->start() > opos) {
+        dout(1) << "weirdness: gap when applying read results, " 
+                << opos << "~" << bh->start() - opos 
+                << endl;
+        opos = bh->start();
+        continue;
+      }
+
+      if (!bh->is_rx()) {
+        dout(10) << "bh_read_finish skipping non-rx " << *bh << endl;
+        opos = bh->end();
+        p++;
+        continue;
+      }
+
+      assert(opos >= bh->start());
+      assert(bh->start() == opos);   // we don't merge rx bh's... yet!
+      assert(bh->length() <= start+(off_t)length-opos);
+
+      // bl starts at object offset 'start', so this bh's bytes sit at
+      // (opos - start) within it.  (opos - bh->start() is always 0 here,
+      // which is only right for a bh at the very front of the read.)
+      bh->bl.substr_of(bl,
+                       opos-start,
+                       bh->length());
+      mark_clean(bh);
+      dout(10) << "bh_read_finish read " << *bh << endl;
+
+      opos = bh->end();
+      p++;
+
+      // finishers?
+      for (map<off_t, list<Context*> >::iterator w = bh->waitfor_read.begin();
+           w != bh->waitfor_read.end();
+           w++)
+        ls.splice(ls.end(), w->second);
+      bh->waitfor_read.clear();
+    }
+
+    // called with lock held.
+    finish_contexts(ls);
+  }
+  //lock.Unlock();
+}
+
+
+/*
+ * write bh's range out through the objecter.  the tid handed back by the
+ * objecter is recorded on both completion contexts, on the object and on
+ * the bh; finally the bh is marked tx.
+ */
+void ObjectCacher::bh_write(BufferHead *bh)
+{
+  dout(7) << "bh_write " << *bh << endl;
+
+  // finishers: one for the ack, one for the commit
+  C_WriteAck *onack = 
+    new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length());
+  C_WriteCommit *oncommit = 
+    new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length());
+
+  // go
+  tid_t tid = objecter->write(bh->ob->get_oid(), 
+                              bh->start(), bh->length(), 
+                              bh->bl,
+                              onack, oncommit);
+
+  // remember which write this was, everywhere it matters
+  bh->last_write_tid = tid;
+  bh->ob->last_write_tid = tid;
+  onack->tid = tid;
+  oncommit->tid = tid;
+
+  mark_tx(bh);
+}
+
+/*
+ * a lock/unlock op with the given tid was acked for each object in oids.
+ * - if tid is the object's most recent op, its lock_state leaves the
+ *   transitional state (…ING) for the resulting stable one, and the
+ *   matching rd/wr waiters are collected.
+ * - an older tid is only logged as obsolete.
+ * - in both cases, contexts waiting for an ack on this tid are completed.
+ */
+void ObjectCacher::lock_ack(list<object_t>& oids, tid_t tid)
+{
+  for (list<object_t>::iterator i = oids.begin();
+       i != oids.end();
+       i++) {
+    object_t oid = *i;
+
+    if (objects.count(oid) == 0) {
+      dout(7) << "lock_ack no object cache" << endl;
+      assert(0);
+    } 
+
+    Object *ob = objects[oid];
+
+    list<Context*> ls;
+    bool closeable = false;
+
+    assert(tid <= ob->last_write_tid);
+    if (ob->last_write_tid == tid) {
+      dout(10) << "lock_ack " << *ob
+               << " tid " << tid << endl;
+
+      switch (ob->lock_state) {
+      case Object::LOCK_RDUNLOCKING: 
+      case Object::LOCK_WRUNLOCKING: 
+        ob->lock_state = Object::LOCK_NONE; 
+        break;
+      case Object::LOCK_RDLOCKING: 
+      case Object::LOCK_DOWNGRADING: 
+        ob->lock_state = Object::LOCK_RDLOCK; 
+        ls.splice(ls.begin(), ob->waitfor_rd);
+        break;
+      case Object::LOCK_UPGRADING: 
+      case Object::LOCK_WRLOCKING: 
+        ob->lock_state = Object::LOCK_WRLOCK; 
+        ls.splice(ls.begin(), ob->waitfor_wr);
+        ls.splice(ls.begin(), ob->waitfor_rd);
+        break;
+
+      default:
+        assert(0);
+      }
+
+      ob->last_ack_tid = tid;
+
+      // decide now, but close only once we are done touching ob below.
+      closeable = ob->can_close();
+    } else {
+      dout(10) << "lock_ack " << *ob 
+               << " tid " << tid << " obsolete" << endl;
+    }
+
+    // waiters?
+    if (ob->waitfor_ack.count(tid)) {
+      ls.splice(ls.end(), ob->waitfor_ack[tid]);
+      ob->waitfor_ack.erase(tid);
+    }
+
+    // NOTE(review): close_object() is not visible here; it presumably
+    // frees ob, so ob must not be dereferenced past this point.
+    if (closeable)
+      close_object(ob);
+
+    finish_contexts(ls);
+
+  }
+}
+
+/*
+ * the write tid covering [start, start+length) of oid has been acked.
+ * tx bh's in that range whose last_write_tid equals tid become clean;
+ * the object's last_ack_tid advances to tid and contexts waiting for
+ * that ack are completed.
+ */
+void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid)
+{
+  //lock.Lock();
+
+  dout(7) << "bh_write_ack " 
+          << oid 
+          << " tid " << tid
+          << " " << start << "~" << length
+          << endl;
+
+  if (objects.count(oid) == 0) {
+    dout(7) << "bh_write_ack no object cache" << endl;
+    assert(0);
+    return;
+  }
+
+  Object *ob = objects[oid];
+
+  // apply to bh's!
+  map<off_t, BufferHead*>::iterator p = ob->data.lower_bound(start);
+  while (p != ob->data.end()) {
+    BufferHead *bh = p->second;
+    p++;   // bh is all we need from here on
+
+    // past the written range?  then we're done.
+    if (bh->start() > start+(off_t)length) break;
+
+    // NOTE(review): the walk begins at lower_bound(start), so
+    // bh->start() < start looks unreachable here -- confirm intent.
+    if (bh->start() < start &&
+        bh->end() > start+(off_t)length) {
+      dout(20) << "bh_write_ack skipping " << *bh << endl;
+      continue;
+    }
+
+    // make sure bh is tx
+    if (!bh->is_tx()) {
+      dout(10) << "bh_write_ack skipping non-tx " << *bh << endl;
+      continue;
+    }
+
+    // make sure bh tid matches
+    if (bh->last_write_tid != tid) {
+      assert(bh->last_write_tid > tid);
+      dout(10) << "bh_write_ack newer tid on " << *bh << endl;
+      continue;
+    }
+
+    // ok!  mark bh clean.
+    mark_clean(bh);
+    dout(10) << "bh_write_ack clean " << *bh << endl;
+  }
+
+  // update object last_ack.
+  assert(ob->last_ack_tid < tid);
+  ob->last_ack_tid = tid;
+
+  // waiters?
+  if (ob->waitfor_ack.count(tid)) {
+    list<Context*> ls;
+    ls.splice(ls.begin(), ob->waitfor_ack[tid]);
+    ob->waitfor_ack.erase(tid);
+    finish_contexts(ls);
+  }
+  //lock.Unlock();
+}
+
+/*
+ * the write tid on oid has committed: record it as the object's
+ * last_commit_tid and complete any contexts waiting for that commit.
+ * an object that is no longer cached is only logged.
+ */
+void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid)
+{
+  //lock.Lock();
+
+  dout(7) << "bh_write_commit " 
+          << oid 
+          << " tid " << tid
+          << " " << start << "~" << length
+          << endl;
+
+  if (objects.count(oid) == 0) {
+    dout(7) << "bh_write_commit no object cache" << endl;
+    //assert(0);
+    return;
+  }
+
+  Object *ob = objects[oid];
+
+  // update last_commit.
+  ob->last_commit_tid = tid;
+
+  // waiters?
+  if (ob->waitfor_commit.count(tid) == 0)
+    return;
+
+  list<Context*> ls;
+  ls.splice(ls.begin(), ob->waitfor_commit[tid]);
+  ob->waitfor_commit.erase(tid);
+  finish_contexts(ls);
+
+  //  lock.Unlock();
+}
+
+
+/*
+ * write out dirty bh's, taking them from lru_dirty, until at least
+ * 'amount' bytes have been started (amount == 0: no byte limit).
+ * stops early when lru_dirty has nothing more to offer or the next bh
+ * was written to after this call began.
+ */
+void ObjectCacher::flush(off_t amount)
+{
+  utime_t cutoff = g_clock.now();
+  //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age;
+
+  dout(10) << "flush " << amount << endl;
+
+  off_t did = 0;
+  for (;;) {
+    // enough?
+    if (amount != 0 && did >= amount) break;
+
+    BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire();
+    if (!bh) break;
+    if (bh->last_write > cutoff) break;
+
+    did += bh->length();
+    bh_write(bh);
+  }
+}
+
+
+/*
+ * drop clean bh's, as handed out by lru_rest.lru_expire(), until the
+ * clean byte count is no larger than max.  a negative max means "use
+ * g_conf.client_oc_size".  objects left closeable are closed.
+ */
+void ObjectCacher::trim(off_t max)
+{
+  if (max < 0) 
+    max = g_conf.client_oc_size;
+
+  dout(10) << "trim  start: max " << max 
+           << "  clean " << get_stat_clean()
+           << endl;
+
+  for (;;) {
+    if (get_stat_clean() <= max) break;   // small enough.
+
+    BufferHead *victim = (BufferHead*) lru_rest.lru_expire();
+    if (!victim) break;                   // nothing left to expire.
+
+    dout(10) << "trim trimming " << *victim << endl;
+    assert(victim->is_clean());
+
+    // remember the owner before the bh goes away.
+    Object *owner = victim->ob;
+    bh_remove(owner, victim);
+    delete victim;
+
+    if (owner->can_close()) {
+      dout(10) << "trim trimming " << *owner << endl;
+      close_object(owner);
+    }
+  }
+
+  dout(10) << "trim finish: max " << max 
+           << "  clean " << get_stat_clean()
+           << endl;
+}
+
+
+
+/* public */
+
+/*
+ * read the extents of rd through the cache.
+ *
+ * returns # bytes read (if in cache).  onfinish is untouched (caller must delete it)
+ * returns 0 if doing async read
+ *
+ * for every extent the object's bh's are mapped via map_read():
+ * - ranges with no bh get a read started; for the first miss or rx bh
+ *   encountered, a C_RetryRead (carrying rd, ino and onfinish) is queued
+ *   on that bh's read waiters.
+ * - fully cached extents are sliced into stripe_map, keyed by offset in
+ *   the caller's result buffer.
+ * when nothing was missing, the slices are assembled into *rd->bl.
+ */
+int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish)
+{
+  bool success = true;
+  list<BufferHead*> hit_ls;
+  map<size_t, bufferlist> stripe_map;  // final buffer offset -> substring
+
+  for (list<ObjectExtent>::iterator ex_it = rd->extents.begin();
+       ex_it != rd->extents.end();
+       ex_it++) {
+    dout(10) << "readx " << *ex_it << endl;
+
+    // get Object cache
+    Object *o = get_object(ex_it->oid, ino);
+
+    // map extent into bufferheads
+    map<off_t, BufferHead*> hits, missing, rx;
+    o->map_read(rd, hits, missing, rx);
+
+    if (!missing.empty() || !rx.empty()) {
+      // read missing
+      for (map<off_t, BufferHead*>::iterator bh_it = missing.begin();
+           bh_it != missing.end();
+           bh_it++) {
+        bh_read(bh_it->second);
+        if (success) {
+          dout(10) << "readx missed, waiting on " << *bh_it->second 
+                   << " off " << bh_it->first << endl;
+          success = false;
+          bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) );
+        }
+      }
+
+      // bump rx
+      for (map<off_t, BufferHead*>::iterator bh_it = rx.begin();
+           bh_it != rx.end();
+           bh_it++) {
+        touch_bh(bh_it->second);        // bump in lru, so we don't lose it.
+        if (success) {
+          dout(10) << "readx missed, waiting on " << *bh_it->second 
+                   << " off " << bh_it->first << endl;
+          success = false;
+          bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) );
+        }
+      }      
+    } else {
+      assert(!hits.empty());
+
+      // make a plain list
+      for (map<off_t, BufferHead*>::iterator bh_it = hits.begin();
+           bh_it != hits.end();
+           bh_it++) {
+        dout(10) << "readx hit bh " << *bh_it->second << endl;
+        hit_ls.push_back(bh_it->second);
+      }
+
+      // create reverse map of buffer offset -> object for the eventual result.
+      // this is over a single ObjectExtent, so we know that
+      //  - the bh's are contiguous
+      //  - the buffer frags need not be (and almost certainly aren't)
+      off_t opos = ex_it->start;
+      map<off_t, BufferHead*>::iterator bh_it = hits.begin();
+      assert(bh_it->second->start() <= opos);
+      size_t bhoff = opos - bh_it->second->start();
+      map<size_t,size_t>::iterator f_it = ex_it->buffer_extents.begin();
+      size_t foff = 0;
+      while (1) {
+        BufferHead *bh = bh_it->second;
+        assert(opos == (off_t)(bh->start() + bhoff));
+
+        dout(10) << "readx rmap opos " << opos
+                 << ": " << *bh << " +" << bhoff
+                 << " frag " << f_it->first << "~" << f_it->second << " +" << foff
+                 << endl;
+
+        size_t len = MIN(f_it->second - foff,
+                         bh->length() - bhoff);
+
+        // go through a temporary and append: one buffer frag may be fed
+        // by several bh's, and a substr_of straight into the stripe_map
+        // entry would throw away what an earlier bh already put there.
+        bufferlist bit;
+        bit.substr_of(bh->bl,
+                      opos - bh->start(),
+                      len);
+        stripe_map[f_it->first].claim_append(bit);
+
+        opos += len;
+        bhoff += len;
+        foff += len;
+        if (opos == bh->end()) {
+          bh_it++;
+          bhoff = 0;
+        }
+        if (foff == f_it->second) {
+          f_it++;
+          foff = 0;
+        }
+        if (bh_it == hits.end()) break;
+        if (f_it == ex_it->buffer_extents.end()) break;
+      }
+      assert(f_it == ex_it->buffer_extents.end());
+      assert(opos == ex_it->start + (off_t)ex_it->length);
+    }
+  }
+
+  // bump hits in lru
+  for (list<BufferHead*>::iterator bhit = hit_ls.begin();
+       bhit != hit_ls.end();
+       bhit++) 
+    touch_bh(*bhit);
+
+  if (!success) return 0;  // wait!
+
+  // no misses... success!  do the read.
+  assert(!hit_ls.empty());
+  dout(10) << "readx has all buffers" << endl;
+
+  // ok, assemble into result buffer.
+  rd->bl->clear();
+  size_t pos = 0;
+  for (map<size_t,bufferlist>::iterator i = stripe_map.begin();
+       i != stripe_map.end();
+       i++) {
+    assert(pos == i->first);
+    dout(10) << "readx  adding buffer len " << i->second.length() << " at " << pos << endl;
+    pos += i->second.length();
+    rd->bl->claim_append(i->second);
+  }
+  dout(10) << "readx  result is " << rd->bl->length() << endl;
+
+  trim();
+
+  return pos;
+}
+
+
+/*
+ * buffer the write wr in the cache.
+ * for every extent: map it onto one bh (map_write), point that bh's
+ * bufferlist at the matching pieces of wr->bl, mark it dirty, and merge
+ * it with directly adjacent dirty neighbours.
+ * wr is deleted before returning.  always returns 0.
+ */
+int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino)
+{
+  utime_t now = g_clock.now();
+
+  for (list<ObjectExtent>::iterator ex_it = wr->extents.begin();
+       ex_it != wr->extents.end();
+       ex_it++) {
+    // get object cache
+    Object *o = get_object(ex_it->oid, ino);
+
+    // map it all into a single bufferhead.
+    BufferHead *bh = o->map_write(wr);
+
+    // the bh may be built from bh's that already held data for (parts of)
+    // this range.  we are about to supply every byte of it, so drop the
+    // old contents instead of appending the new data behind them.
+    bh->bl.clear();
+
+    // adjust buffer pointers (ie "copy" data into my cache)
+    // this is over a single ObjectExtent, so we know that
+    //  - there is one contiguous bh
+    //  - the buffer frags need not be (and almost certainly aren't)
+    // note: i assume striping is monotonic... no jumps backwards, ever!
+    off_t opos = ex_it->start;
+    for (map<size_t,size_t>::iterator f_it = ex_it->buffer_extents.begin();
+         f_it != ex_it->buffer_extents.end();
+         f_it++) {
+      dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << endl;
+
+      // how far into the bh this frag lands (opos never precedes the bh).
+      size_t bhoff = opos - bh->start();
+      assert(f_it->second <= bh->length() - bhoff);
+
+      bufferlist frag; 
+      frag.substr_of(wr->bl, 
+                     f_it->first, f_it->second);
+
+      bh->bl.claim_append(frag);
+      opos += f_it->second;
+    }
+
+    // it's dirty.
+    mark_dirty(bh);
+    touch_bh(bh);
+    bh->last_write = now;
+
+    // recombine with left?  only if it is dirty _and_ ends exactly where
+    // we start (merge_left asserts adjacency).
+    map<off_t,BufferHead*>::iterator p = o->data.find(bh->start());
+    if (p != o->data.begin()) {
+      p--;
+      if (p->second->is_dirty() &&
+          p->second->end() == bh->start()) {
+        o->merge_left(p->second,bh);
+        bh = p->second;
+      }
+    }
+    // right?  same conditions; bh is the left-hand side of this merge.
+    p = o->data.find(bh->start());
+    p++;
+    if (p != o->data.end() &&
+        p->second->is_dirty() &&
+        bh->end() == p->second->start()) 
+      o->merge_left(bh,p->second);
+  }
+
+  delete wr;
+
+  trim();
+  return 0;
+}
+ 
+
+// blocking wait for write.
+// while the dirty byte count exceeds g_conf.client_oc_max_dirty: poke the
+// flusher and sleep on stat_cond (using the caller's lock).  len is not
+// looked at.
+void ObjectCacher::wait_for_write(size_t len, Mutex& lock)
+{
+  for (;;) {
+    if (get_stat_dirty() <= g_conf.client_oc_max_dirty) 
+      break;   // room enough.
+
+    dout(10) << "wait_for_write waiting" << endl;
+    flusher_cond.Signal();
+
+    // count ourselves as a waiter for the duration of the sleep.
+    stat_waiter++;
+    stat_cond.Wait(lock);
+    stat_waiter--;
+
+    dout(10) << "wait_for_write woke up" << endl;
+  }
+}
+
+/*
+ * body of the flusher: runs with 'lock' held until flusher_stop is set.
+ * each round it
+ *  - flushes the excess while the dirty byte count is over
+ *    g_conf.client_oc_max_dirty, then
+ *  - writes out dirty bh's last written more than a second ago, and
+ *  - waits on flusher_cond for up to one second.
+ */
+void ObjectCacher::flusher_entry()
+{
+  dout(10) << "flusher start" << endl;
+  lock.Lock();
+  while (!flusher_stop) {
+    bool under_limit = false;
+    while (!flusher_stop && !under_limit) {
+      off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty();
+      dout(11) << "flusher "
+               << all << " / " << g_conf.client_oc_size << ":  "
+               << get_stat_tx() << " tx, "
+               << get_stat_rx() << " rx, "
+               << get_stat_clean() << " clean, "
+               << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty"
+               << endl;
+
+      if (get_stat_dirty() > g_conf.client_oc_max_dirty) {
+        // flush some dirty pages, then look again.
+        dout(10) << "flusher " 
+                 << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty,"
+                 << " flushing some dirty bhs" << endl;
+        flush(get_stat_dirty() - g_conf.client_oc_max_dirty);
+        continue;
+      }
+
+      // check tail of lru for old dirty items
+      utime_t cutoff = g_clock.now();
+      cutoff.sec_ref()--;
+      for (;;) {
+        BufferHead *bh = (BufferHead*)lru_dirty.lru_get_next_expire();
+        if (bh == 0) break;
+        if (!(bh->last_write < cutoff)) break;
+        dout(10) << "flusher flushing aged dirty bh " << *bh << endl;
+        bh_write(bh);
+      }
+      under_limit = true;
+    }
+    if (flusher_stop) break;
+    flusher_cond.WaitInterval(lock, utime_t(1,0));
+  }
+  lock.Unlock();
+  dout(10) << "flusher finish" << endl;
+}
+
+
+  
+// blocking.  atomic+sync.
+// - a read of a single extent goes straight to the objecter and we block
+//   until it completes.
+// - otherwise every object involved is rdlock'ed (in oid order), the read
+//   is done through the cache, and the locks are dropped again.
+// 'lock' is the mutex the caller holds; it is what we wait on.
+// always returns 0.
+int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock)
+{
+  dout(10) << "atomic_sync_readx " << rd
+           << " in " << ino
+           << endl;
+
+  if (rd->extents.size() == 1) {
+    // single object.
+    // just read synchronously.
+    Cond cond;
+    bool done = false;
+    objecter->readx(rd, new C_SafeCond(&lock, &cond, &done));
+
+    // block
+    while (!done) cond.Wait(lock);
+  } else {
+    // spans multiple objects, or is big.
+
+    // sort by object...
+    map<object_t,ObjectExtent> by_oid;
+    for (list<ObjectExtent>::iterator ex_it = rd->extents.begin();
+         ex_it != rd->extents.end();
+         ex_it++) 
+      by_oid[ex_it->oid] = *ex_it;
+
+    // lock
+    for (map<object_t,ObjectExtent>::iterator i = by_oid.begin();
+         i != by_oid.end();
+         i++) {
+      Object *o = get_object(i->first, ino);
+      rdlock(o);
+    }
+
+    // readx will hose rd
+    list<ObjectExtent> extents = rd->extents;
+
+    // do the read, into our cache
+    Cond cond;
+    bool done = false;
+    Context *onfinish = new C_SafeCond(&lock, &cond, &done);
+    if (readx(rd, ino, onfinish) > 0) {
+      // everything was in cache already.  readx() leaves onfinish
+      // untouched in that case, so nobody will ever signal us: don't
+      // wait, and clean up the context ourselves.
+      delete onfinish;
+    } else {
+      // block until the async read completes.
+      while (!done) cond.Wait(lock);
+    }
+
+    // release the locks
+    for (list<ObjectExtent>::iterator ex_it = extents.begin();
+         ex_it != extents.end();
+         ex_it++) {
+      assert(objects.count(ex_it->oid));
+      Object *o = objects[ex_it->oid];
+      rdunlock(o);
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * blocking write.
+ * - a single extent no longer than g_conf.client_oc_max_sync_write, on an
+ *   object we neither hold nor are acquiring a write lock for, is sent
+ *   straight to the objecter; we block until it completes.
+ * - anything else: wrlock every object involved (in oid order), write
+ *   into the cache, then wrunlock.
+ * 'lock' is the mutex the caller holds.  always returns 0.
+ */
+int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock)
+{
+  dout(10) << "atomic_sync_writex " << wr
+           << " in " << ino
+           << endl;
+
+  const bool small_single = 
+    wr->extents.size() == 1 &&
+    wr->extents.front().length <= g_conf.client_oc_max_sync_write;
+
+  if (small_single) {
+    // single object.
+
+    // make sure we aren't already locking/locked...
+    object_t oid = wr->extents.front().oid;
+    Object *o = 0;
+    if (objects.count(oid)) 
+      o = get_object(oid, ino);
+
+    const bool wr_held_or_pending = 
+      o &&
+      (o->lock_state == Object::LOCK_WRLOCK ||
+       o->lock_state == Object::LOCK_WRLOCKING ||
+       o->lock_state == Object::LOCK_UPGRADING);
+
+    if (!wr_held_or_pending) {
+      // just write synchronously.
+      dout(10) << "atomic_sync_writex " << wr
+               << " in " << ino
+               << " doing sync write"
+               << endl;
+
+      Cond cond;
+      bool done = false;
+      objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0);
+
+      // block
+      while (!done) cond.Wait(lock);
+      return 0;
+    }
+  } 
+
+  // spans multiple objects, or is big.
+  // sort by object...
+  map<object_t,ObjectExtent> by_oid;
+  list<ObjectExtent>::iterator ex_it = wr->extents.begin();
+  while (ex_it != wr->extents.end()) {
+    by_oid[ex_it->oid] = *ex_it;
+    ex_it++;
+  }
+
+  // wrlock
+  map<object_t,ObjectExtent>::iterator i = by_oid.begin();
+  while (i != by_oid.end()) {
+    Object *o = get_object(i->first, ino);
+    wrlock(o);
+    i++;
+  }
+
+  // writex will hose wr
+  list<ObjectExtent> extents = wr->extents;
+
+  // do the write, into our cache
+  writex(wr, ino);
+
+  // flush 
+  // ...and release the locks?
+  ex_it = extents.begin();
+  while (ex_it != extents.end()) {
+    assert(objects.count(ex_it->oid));
+    Object *o = objects[ex_it->oid];
+
+    wrunlock(o);
+    ex_it++;
+  }
+
+  return 0;
+}
+ 
+
+
+// locking -----------------------------
+
+/*
+ * take a read lock reference on o.
+ * if the object is unlocked (or on its way to being unlocked) an
+ * OSD_OP_RDLOCK is sent first.  blocks on the member 'lock' while the
+ * object is still in RDLOCKING or WRLOCKING.
+ */
+void ObjectCacher::rdlock(Object *o)
+{
+  // lock?
+  switch (o->lock_state) {
+  case Object::LOCK_NONE:
+  case Object::LOCK_RDUNLOCKING:
+  case Object::LOCK_WRUNLOCKING:
+    {
+      dout(10) << "rdlock rdlock " << *o << endl;
+
+      o->lock_state = Object::LOCK_RDLOCKING;
+
+      C_LockAck *ack = new C_LockAck(this, o->get_oid());
+      C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0);
+
+      // one tid, recorded on the object and on both callbacks.
+      o->last_write_tid = objecter->lock(OSD_OP_RDLOCK, o->get_oid(), ack, commit);
+      ack->tid = o->last_write_tid;
+      commit->tid = ack->tid;
+    }
+    break;
+  default:
+    break;
+  }
+
+  // stake our claim.
+  o->rdlock_ref++;  
+
+  // wait?
+  const bool pending = 
+    o->lock_state == Object::LOCK_RDLOCKING ||
+    o->lock_state == Object::LOCK_WRLOCKING;
+  if (pending) {
+    dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << endl;
+    Cond cond;
+    bool done = false;
+    o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done));
+    while (!done) cond.Wait(lock);
+  }
+
+  assert(o->lock_state == Object::LOCK_RDLOCK ||
+         o->lock_state == Object::LOCK_WRLOCK ||
+         o->lock_state == Object::LOCK_UPGRADING ||
+         o->lock_state == Object::LOCK_DOWNGRADING);
+}
+
+/*
+ * take a write lock reference on o.
+ * unless a write lock is already held or being acquired, a lock op is
+ * sent: OSD_OP_UPLOCK when we currently hold a read lock, OSD_OP_WRLOCK
+ * otherwise.  blocks on the member 'lock' until the object reaches
+ * LOCK_WRLOCK.
+ */
+void ObjectCacher::wrlock(Object *o)
+{
+  // lock?
+  const bool have_or_getting = 
+    o->lock_state == Object::LOCK_WRLOCK ||
+    o->lock_state == Object::LOCK_WRLOCKING ||
+    o->lock_state == Object::LOCK_UPGRADING;
+
+  if (!have_or_getting) {
+    dout(10) << "wrlock wrlock " << *o << endl;
+
+    // upgrade an existing read lock, or start from scratch.
+    const bool upgrade = (o->lock_state == Object::LOCK_RDLOCK);
+    o->lock_state = upgrade ? Object::LOCK_UPGRADING : Object::LOCK_WRLOCKING;
+    int op = upgrade ? OSD_OP_UPLOCK : OSD_OP_WRLOCK;
+
+    C_LockAck *ack = new C_LockAck(this, o->get_oid());
+    C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0);
+
+    // one tid, recorded on the object and on both callbacks.
+    o->last_write_tid = objecter->lock(op, o->get_oid(), ack, commit);
+    ack->tid = o->last_write_tid;
+    commit->tid = ack->tid;
+  }
+
+  // stake our claim.
+  o->wrlock_ref++;  
+
+  // wait?
+  switch (o->lock_state) {
+  case Object::LOCK_WRLOCKING:
+  case Object::LOCK_UPGRADING:
+    {
+      dout(10) << "wrlock waiting for wrlock on " << *o << endl;
+      Cond cond;
+      bool done = false;
+      o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done));
+      while (!done) cond.Wait(lock);
+    }
+    break;
+  default:
+    break;
+  }
+
+  assert(o->lock_state == Object::LOCK_WRLOCK);
+}
+
+// rdunlock: drop one read reference on o; once no read or write references remain, release clean buffers and send OSD_OP_RDUNLOCK.
+void ObjectCacher::rdunlock(Object *o)
+{
+  dout(10) << "rdunlock " << *o << endl;
+  assert(o->lock_state == Object::LOCK_RDLOCK ||
+         o->lock_state == Object::LOCK_WRLOCK ||
+         o->lock_state == Object::LOCK_UPGRADING ||
+         o->lock_state == Object::LOCK_DOWNGRADING);
+  // drop our reference; only the last holder goes on to unlock.
+  assert(o->rdlock_ref > 0);
+  o->rdlock_ref--;
+  if (o->rdlock_ref > 0 ||
+      o->wrlock_ref > 0) {
+    dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << endl;
+    return;
+  }
+
+  release(o);  // release first
+
+  o->lock_state = Object::LOCK_RDUNLOCKING;
+  // send the unlock; its tid is stored in both callbacks and as the object's last_write_tid.
+  C_LockAck *lockack = new C_LockAck(this, o->get_oid());
+  C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0);
+  commit->tid = 
+    lockack->tid = 
+    o->last_write_tid = 
+    objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), lockack, commit);
+}
+// wrunlock: drop one write reference on o; the last one flushes the object and then downgrades (readers remain) or unlocks.
+void ObjectCacher::wrunlock(Object *o)
+{
+  dout(10) << "wrunlock " << *o << endl;
+  assert(o->lock_state == Object::LOCK_WRLOCK);
+  // drop our reference; only the last writer goes on to flush and unlock.
+  assert(o->wrlock_ref > 0);
+  o->wrlock_ref--;
+  if (o->wrlock_ref > 0) {
+    dout(10) << "wrunlock " << *o << " still has wrlock refs" << endl;
+    return;
+  }
+
+  flush(o);  // flush first
+  // pick the op: keep a read lock if anyone still holds a read reference.
+  int op = 0;
+  if (o->rdlock_ref > 0) {
+    dout(10) << "wrunlock rdlock " << *o << endl;
+    op = OSD_OP_DNLOCK;
+    o->lock_state = Object::LOCK_DOWNGRADING;
+  } else {
+    dout(10) << "wrunlock wrunlock " << *o << endl;
+    op = OSD_OP_WRUNLOCK;
+    o->lock_state = Object::LOCK_WRUNLOCKING;
+  }
+  // send it; the tid is stored in both callbacks and as the object's last_write_tid.
+  C_LockAck *lockack = new C_LockAck(this, o->get_oid());
+  C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0);
+  commit->tid = 
+    lockack->tid = 
+    o->last_write_tid = 
+    objecter->lock(op, o->get_oid(), lockack, commit);
+}
+
+
+// -------------------------------------------------
+
+// set_is_cached: true if any object known for ino holds at least one BufferHead.
+bool ObjectCacher::set_is_cached(inodeno_t ino)
+{
+  hash_map<inodeno_t, set<Object*> >::iterator found = objects_by_ino.find(ino);
+  if (found == objects_by_ino.end())
+    return false;   // no objects at all for this inode
+
+  // any object with a non-empty bh map counts.
+  set<Object*>& obs = found->second;
+  for (set<Object*>::iterator it = obs.begin(); it != obs.end(); ++it) {
+    if ((*it)->data.empty())
+      continue;
+    return true;
+  }
+  return false;
+}
+// set_is_dirty_or_committing: true if any BufferHead of any object of ino is in state DIRTY or TX.
+bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino)
+{
+  hash_map<inodeno_t, set<Object*> >::iterator found = objects_by_ino.find(ino);
+  if (found == objects_by_ino.end())
+    return false;   // no objects at all for this inode
+
+  set<Object*>& obs = found->second;
+  for (set<Object*>::iterator oit = obs.begin(); oit != obs.end(); ++oit) {
+    map<off_t,BufferHead*>& bhs = (*oit)->data;
+
+    // scan this object's buffers; the first dirty or tx one settles it.
+    for (map<off_t,BufferHead*>::iterator bit = bhs.begin();
+         bit != bhs.end();
+         ++bit) {
+      BufferHead *bh = bit->second;
+      if (bh->is_tx()) return true;
+      if (bh->is_dirty()) return true;
+    }
+  }
+
+  return false;
+}
+
+
+// flush one object.  non-blocking, no callback.
+// returns true if no bh was dirty or tx (already flushed);
+// false if a bh was handed to bh_write() here or is already in state TX.
+bool ObjectCacher::flush(Object *ob)
+{
+  bool all_clean = true;
+  map<off_t,BufferHead*>::iterator it = ob->data.begin();
+  while (it != ob->data.end()) {
+    BufferHead *bh = it->second;
+    if (bh->is_dirty()) {
+      bh_write(bh);        // dirty: write it out now
+      all_clean = false;
+    } else if (bh->is_tx()) {
+      all_clean = false;   // already tx: nothing to submit, but not clean either
+    }
+    // step forward only after bh_write(), exactly as the for-loop form did
+    ++it;
+  }
+  
+  return all_clean;
+}
+
+// flush_set.  flush() every object of ino.  non-blocking, takes an optional callback.
+// returns true if already flushed (onfinish is then left untouched); otherwise false, with onfinish (if given) wrapped in a C_Gather.
+bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish)
+{
+  if (objects_by_ino.count(ino) == 0) {
+    dout(10) << "flush_set on " << ino << " dne" << endl;
+    return true;
+  }
+
+  dout(10) << "flush_set " << ino << endl;
+
+  C_Gather *gather = 0; // we'll need to wait for all objects to flush!
+  // (gather is only created when there is both something to wait for and an onfinish)
+  set<Object*>& s = objects_by_ino[ino];
+  bool safe = true;
+  for (set<Object*>::iterator i = s.begin();
+       i != s.end();
+       i++) {
+    Object *ob = *i;
+    // flush() returns false when this object had dirty or tx buffers.
+    if (!flush(ob)) {
+      // we'll need to gather...
+      if (!gather && onfinish) 
+        gather = new C_Gather(onfinish);
+      safe = false;
+      // wait for the ack of the object's most recent write tid.
+      dout(10) << "flush_set " << ino << " will wait for ack tid " 
+               << ob->last_write_tid 
+               << " on " << *ob
+               << endl;
+      if (gather)
+        ob->waitfor_ack[ob->last_write_tid].push_back(gather->new_sub());
+    }
+  }
+  // safe == no object had anything dirty or in tx.
+  if (safe) {
+    dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << endl;
+    return true;
+  }
+  return false;
+}
+
+
+// commit.  non-blocking, takes callback.
+// return true if everything is already committed (onfinish is then not used here); else false, and onfinish is wrapped in a C_Gather.
+bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish)
+{
+  assert(onfinish);  // doesn't make any sense otherwise.
+
+  if (objects_by_ino.count(ino) == 0) {
+    dout(10) << "commit_set on " << ino << " dne" << endl;
+    return true;
+  }
+
+  dout(10) << "commit_set " << ino << endl;
+
+  // make sure it's flushing.  flush_set() already walks every object of this
+  // ino, so call it once here instead of once per object inside the loop.
+  flush_set(ino);
+
+  C_Gather *gather = 0; // we'll need to wait for all objects to commit
+  set<Object*>& s = objects_by_ino[ino];
+  bool safe = true;
+  for (set<Object*>::iterator i = s.begin();
+       i != s.end();
+       i++) {
+    Object *ob = *i;
+    // written past the last commit?  then wait for the newest write tid to commit.
+    if (ob->last_write_tid > ob->last_commit_tid) {
+      dout(10) << "commit_set " << ino << " " << *ob 
+               << " will finish on commit tid " << ob->last_write_tid
+               << endl;
+      if (!gather && onfinish) gather = new C_Gather(onfinish);
+      safe = false;
+      if (gather)
+        ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() );
+    }
+  }
+
+  if (safe) {
+    dout(10) << "commit_set " << ino << " all committed" << endl;
+    return true;
+  }
+  return false;
+}
+
+// release.  remove and free every clean bh of ob; returns the total length of the bhs that were not clean (and so stay cached).
+off_t ObjectCacher::release(Object *ob)
+{
+  list<BufferHead*> clean;
+  off_t o_unclean = 0;
+  // collect first: bh_remove() erases from ob->data, so we must not remove while iterating it.
+  for (map<off_t,BufferHead*>::iterator p = ob->data.begin();
+       p != ob->data.end();
+       p++) {
+    BufferHead *bh = p->second;
+    if (bh->is_clean()) 
+      clean.push_back(bh);
+    else 
+      o_unclean += bh->length();
+  }
+
+  for (list<BufferHead*>::iterator p = clean.begin(); p != clean.end(); p++) {
+    bh_remove(ob, *p);
+    delete *p;   // bh_remove() only unlinks (object map, lru, stats); without this the bh leaks
+  }
+
+  return o_unclean;
+}
+// release_set.  release() every object of ino.
+// returns # bytes not clean (and thus not released).
+off_t ObjectCacher::release_set(inodeno_t ino)
+{
+  hash_map<inodeno_t, set<Object*> >::iterator found = objects_by_ino.find(ino);
+  if (found == objects_by_ino.end()) {
+    dout(10) << "release_set on " << ino << " dne" << endl;
+    return 0;
+  }
+
+  dout(10) << "release_set " << ino << endl;
+
+  off_t total_left = 0;
+  set<Object*>& obs = found->second;
+  for (set<Object*>::iterator it = obs.begin();
+       it != obs.end();
+       ++it) {
+    Object *ob = *it;
+    const off_t left = release(ob);
+    if (left == 0)
+      continue;   // nothing unclean left in this object
+
+    total_left += left;
+    dout(10) << "release_set " << ino << " " << *ob 
+             << " has " << left << " bytes left"
+             << endl;
+  }
+
+  // report the overall remainder, if any.
+  if (total_left) {
+    dout(10) << "release_set " << ino
+             << ", " << total_left << " bytes left" << endl;
+  }
+
+  return total_left;
+}
+
+// kick_sync_writers.  finish every context queued on waitfor_wr of ino's objects.
+void ObjectCacher::kick_sync_writers(inodeno_t ino)
+{
+  hash_map<inodeno_t, set<Object*> >::iterator found = objects_by_ino.find(ino);
+  if (found == objects_by_ino.end()) {
+    dout(10) << "kick_sync_writers on " << ino << " dne" << endl;
+    return;
+  }
+
+  dout(10) << "kick_sync_writers on " << ino << endl;
+
+  // take all the waiters off the objects first, then finish them in one batch.
+  list<Context*> waiters;
+  set<Object*>& obs = found->second;
+  for (set<Object*>::iterator it = obs.begin();
+       it != obs.end();
+       ++it) {
+    // each object's waiters go in front of those gathered so far.
+    waiters.splice(waiters.begin(), (*it)->waitfor_wr);
+  }
+
+  finish_contexts(waiters);
+}
+// kick_sync_readers.  finish every context queued on waitfor_rd of ino's objects.
+void ObjectCacher::kick_sync_readers(inodeno_t ino)
+{
+  hash_map<inodeno_t, set<Object*> >::iterator found = objects_by_ino.find(ino);
+  if (found == objects_by_ino.end()) {
+    dout(10) << "kick_sync_readers on " << ino << " dne" << endl;
+    return;
+  }
+
+  dout(10) << "kick_sync_readers on " << ino << endl;
+
+  // take all the waiters off the objects first, then finish them in one batch.
+  list<Context*> waiters;
+  set<Object*>& obs = found->second;
+  for (set<Object*>::iterator it = obs.begin();
+       it != obs.end();
+       ++it) {
+    // each object's waiters go in front of those gathered so far.
+    waiters.splice(waiters.begin(), (*it)->waitfor_rd);
+  }
+
+  finish_contexts(waiters);
+}
+
+
+
diff --git a/branches/sage/cephmds2/osdc/ObjectCacher.h b/branches/sage/cephmds2/osdc/ObjectCacher.h
new file mode 100644
index 0000000000000..27b154023209d
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/ObjectCacher.h
@@ -0,0 +1,547 @@
+#ifndef __OBJECTCACHER_H_
+#define __OBJECTCACHER_H_
+
+#include "include/types.h"
+#include "include/lru.h"
+#include "include/Context.h"
+
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "Objecter.h"
+#include "Filer.h"
+
+class Objecter;
+class Objecter::OSDRead;
+class Objecter::OSDWrite;
+
+class ObjectCacher {
+ public:
+
+  class Object;
+
+  // ******* BufferHead *********
+  class BufferHead : public LRUObject {   // one extent [start, start+length) of an Object's data, with a cache state
+  public:
+    // states
+    static const int STATE_MISSING = 0;
+    static const int STATE_CLEAN = 1;
+    static const int STATE_DIRTY = 2;
+    static const int STATE_RX = 3;
+    static const int STATE_TX = 4;
+    // (set_state() holds one ref, and so an lru pin, for as long as the state is RX or TX)
+  private:
+    // my fields
+    int state;
+    int ref;
+    struct {
+      off_t start, length;   // bh extent in object
+    } ex;
+        
+  public:
+    Object *ob;            // the object this bh was constructed for
+    bufferlist  bl;
+    tid_t last_write_tid;  // version of bh (if non-zero)
+    utime_t last_write;
+    
+    map< off_t, list<Context*> > waitfor_read;  // contexts keyed by an offset; presumably run when a read completes -- confirm in ObjectCacher.cc
+    
+  public:
+    // cons.  starts MISSING and unreferenced, with an empty extent at offset 0.
+    BufferHead(Object *o) : 
+      state(STATE_MISSING),
+      ref(0),
+      ob(o),
+      last_write_tid(0) { ex.start = ex.length = 0; }  // zero the extent so start()/length() never read indeterminate values
+  
+    // extent
+    off_t start() { return ex.start; }
+    void set_start(off_t s) { ex.start = s; }
+    off_t length() { return ex.length; }
+    void set_length(off_t l) { ex.length = l; }
+    off_t end() { return ex.start + ex.length; }   // one past the last byte
+    off_t last() { return end() - 1; }             // the last byte itself
+
+    // states
+    void set_state(int s) {
+      if (s == STATE_RX || s == STATE_TX) get();          // take the new ref first, so RX<->TX never drops to zero refs
+      if (state == STATE_RX || state == STATE_TX) put();
+      state = s;
+    }
+    int get_state() { return state; }
+    
+    bool is_missing() { return state == STATE_MISSING; }
+    bool is_dirty() { return state == STATE_DIRTY; }
+    bool is_clean() { return state == STATE_CLEAN; }
+    bool is_tx() { return state == STATE_TX; }
+    bool is_rx() { return state == STATE_RX; }
+    
+    // reference counting.  the first ref pins the bh in its lru, the last put unpins it.
+    int get() {
+      assert(ref >= 0);
+      if (ref == 0) lru_pin();
+      return ++ref;
+    }
+    int put() {
+      assert(ref > 0);
+      if (ref == 1) lru_unpin();
+      --ref;
+      return ref;
+    }
+  };
+  
+
+  // ******* Object *********
+  class Object {   // per-object cache state: its BufferHeads, write/ack/commit tids, waiters and lock state
+  private:
+    // ObjectCacher::Object fields
+    ObjectCacher *oc;
+    object_t  oid;   // this _always_ is oid.rev=0
+    inodeno_t ino;
+    objectrev_t rev; // last rev we've written
+    
+  public:
+    map<off_t, BufferHead*>     data;   // bhs keyed by their start offset (see add_bh)
+
+    tid_t last_write_tid;  // tid of the last write or lock op submitted for this object
+    tid_t last_ack_tid;    // last update acked.
+    tid_t last_commit_tid; // last update committed.
+
+    map< tid_t, list<Context*> > waitfor_ack;
+    map< tid_t, list<Context*> > waitfor_commit;
+    list<Context*> waitfor_rd;
+    list<Context*> waitfor_wr;
+
+    // lock
+    static const int LOCK_NONE = 0;
+    static const int LOCK_WRLOCKING = 1;
+    static const int LOCK_WRLOCK = 2;
+    static const int LOCK_WRUNLOCKING = 3;
+    static const int LOCK_RDLOCKING = 4;
+    static const int LOCK_RDLOCK = 5;
+    static const int LOCK_RDUNLOCKING = 6;
+    static const int LOCK_UPGRADING = 7;    // rd -> wr
+    static const int LOCK_DOWNGRADING = 8;  // wr -> rd
+    int lock_state;
+    int wrlock_ref;  // how many ppl want or are using a WRITE lock
+    int rdlock_ref;  // how many ppl want or are using a READ lock
+
+  public:
+    Object(ObjectCacher *_oc, object_t o, inodeno_t i) : 
+      oc(_oc),
+      oid(o), ino(i), rev(0),   // rev used to be left uninitialized
+      last_write_tid(0), last_ack_tid(0), last_commit_tid(0),
+      lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0)
+      {}
+
+    object_t get_oid() { return oid; }
+    inodeno_t get_ino() { return ino; }
+    // closable only with no bhs, no lock held or pending, and nobody waiting.
+    bool can_close() {
+      return data.empty() && lock_state == LOCK_NONE &&
+        waitfor_ack.empty() && waitfor_commit.empty() &&
+        waitfor_rd.empty() && waitfor_wr.empty();
+    }
+
+    // bh
+    void add_bh(BufferHead *bh) {
+      // add to my map
+      assert(data.count(bh->start()) == 0);
+      
+      if (0) {  // sanity check     FIXME DEBUG  (disabled: would assert bh overlaps neither neighbour)
+        //cout << "add_bh " << bh->start() << "~" << bh->length() << endl;
+        map<off_t,BufferHead*>::iterator p = data.lower_bound(bh->start());
+        if (p != data.end()) {
+          //cout << " after " << *p->second << endl;
+          //cout << " after starts at " << p->first << endl;
+          assert(p->first >= bh->end());
+        }
+        if (p != data.begin()) {
+          p--;
+          //cout << " before starts at " << p->second->start() 
+          //<< " and ends at " << p->second->end() << endl;
+          //cout << " before " << *p->second << endl;
+          assert(p->second->end() <= bh->start());
+        }
+      }
+
+      data[bh->start()] = bh;
+    }
+    void remove_bh(BufferHead *bh) {
+      assert(data.count(bh->start()));
+      data.erase(bh->start());   // unlinks only; the bh itself is not deleted here
+    }
+    bool is_empty() { return data.empty(); }
+
+    // mid-level
+    BufferHead *split(BufferHead *bh, off_t off);
+    void merge_left(BufferHead *left, BufferHead *right);
+    void merge_right(BufferHead *left, BufferHead *right);
+
+    int map_read(Objecter::OSDRead *rd,
+                 map<off_t, BufferHead*>& hits,
+                 map<off_t, BufferHead*>& missing,
+                 map<off_t, BufferHead*>& rx);
+    BufferHead *map_write(Objecter::OSDWrite *wr);
+    
+  };
+  
+  // ******* ObjectCacher *********
+  // ObjectCacher fields
+ public:
+  Objecter *objecter;
+  Filer filer;
+
+ private:
+  Mutex& lock;   // supplied by the creator (constructor argument), not owned here
+  
+  hash_map<object_t, Object*> objects;                   // every cached object, by oid
+  hash_map<inodeno_t, set<Object*> > objects_by_ino;     // the same objects, grouped by inode
+
+  set<BufferHead*>    dirty_bh;   // bhs currently in state DIRTY (maintained by bh_set_state)
+  LRU   lru_dirty, lru_rest;      // dirty bhs live on lru_dirty, all others on lru_rest
+
+  Cond flusher_cond;
+  bool flusher_stop;              // set by the destructor to ask the flusher to exit
+  void flusher_entry();
+  class FlusherThread : public Thread {   // thread whose body is ObjectCacher::flusher_entry()
+    ObjectCacher *oc;
+  public:
+    FlusherThread(ObjectCacher *o) : oc(o) {}
+    void *entry() {
+      oc->flusher_entry();
+      return 0;
+    }
+  } flusher_thread;
+  
+
+  // objects
+  Object *get_object(object_t oid, inodeno_t ino) {
+    // return the cached Object for oid, creating and indexing it if absent.
+    hash_map<object_t, Object*>::iterator found = objects.find(oid);
+    if (found != objects.end())
+      return found->second;
+    // not there yet: make one and register it under both oid and ino.
+    Object *fresh = new Object(this, oid, ino);
+    objects[oid] = fresh;
+    objects_by_ino[ino].insert(fresh);
+    return fresh;
+  }
+  void close_object(Object *ob) {
+    assert(ob->can_close());
+
+    // unhook from both indexes (dropping the ino entry once it is empty), then free.
+    objects.erase(ob->get_oid());
+    set<Object*>& siblings = objects_by_ino[ob->get_ino()];
+    siblings.erase(ob);
+    if (siblings.empty()) objects_by_ino.erase(ob->get_ino());
+    delete ob;
+  }
+
+  // bh stats
+  Cond  stat_cond;
+  int   stat_waiter;
+
+  off_t stat_clean;
+  off_t stat_dirty;
+  off_t stat_rx;
+  off_t stat_tx;
+  off_t stat_missing;
+
+  void bh_stat_add(BufferHead *bh) {
+    const off_t len = bh->length();   // credit bh's length to the counter for its current state
+    const int st = bh->get_state();
+    if (st == BufferHead::STATE_MISSING)    stat_missing += len;
+    else if (st == BufferHead::STATE_CLEAN) stat_clean += len;
+    else if (st == BufferHead::STATE_DIRTY) stat_dirty += len;
+    else if (st == BufferHead::STATE_TX)    stat_tx += len;
+    else if (st == BufferHead::STATE_RX)    stat_rx += len;
+    if (stat_waiter) stat_cond.Signal();   // someone is waiting on the stats
+  }
+  void bh_stat_sub(BufferHead *bh) {
+    const off_t len = bh->length();   // debit bh's length from the counter for its current state
+    const int st = bh->get_state();
+    if (st == BufferHead::STATE_MISSING)    stat_missing -= len;
+    else if (st == BufferHead::STATE_CLEAN) stat_clean -= len;
+    else if (st == BufferHead::STATE_DIRTY) stat_dirty -= len;
+    else if (st == BufferHead::STATE_TX)    stat_tx -= len;
+    else if (st == BufferHead::STATE_RX)    stat_rx -= len;
+  }
+  off_t get_stat_tx() { return stat_tx; }        // summed length of bhs counted in state TX
+  off_t get_stat_rx() { return stat_rx; }        // ... in state RX
+  off_t get_stat_dirty() { return stat_dirty; }  // ... in state DIRTY
+  off_t get_stat_clean() { return stat_clean; }  // ... in state CLEAN
+
+  void touch_bh(BufferHead *bh) {
+    // lru_touch bh on the list for its state:
+    // lru_dirty when dirty, lru_rest otherwise.
+    LRU& home = bh->is_dirty() ? lru_dirty : lru_rest;
+    home.lru_touch(bh);
+  }
+
+  // bh states
+  void bh_set_state(BufferHead *bh, int s) {   // change bh's state, keeping lru lists, dirty_bh and the stat counters in step
+    // move between lru lists?
+    if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) {
+      lru_rest.lru_remove(bh);        // becoming dirty
+      lru_dirty.lru_insert_top(bh);
+      dirty_bh.insert(bh);
+    }
+    if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) {
+      lru_dirty.lru_remove(bh);       // no longer dirty
+      lru_rest.lru_insert_mid(bh);
+      dirty_bh.erase(bh);
+    }
+    // the counters are keyed on the bh's current state, so subtract before and add after the change.
+    // set state
+    bh_stat_sub(bh);
+    bh->set_state(s);
+    bh_stat_add(bh);
+  }      
+
+  // copy_bh_state: put bh2 into whatever state bh1 is in (bh1 is only read).
+  void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { bh_set_state(bh2, bh1->get_state()); }
+
+  // mark_*: thin wrappers that move bh into the named state via bh_set_state().
+  void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }
+  void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); }
+  void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); }
+  void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); }
+  void mark_dirty(BufferHead *bh) {
+    bh_set_state(bh, BufferHead::STATE_DIRTY);
+    lru_dirty.lru_touch(bh);   // also touch it in the dirty lru, whether or not it was dirty before
+    //bh->set_dirty_stamp(g_clock.now());
+  }
+
+  void bh_add(Object *ob, BufferHead *bh) {   // link bh into ob's map, the lru for its state, and the stats
+    ob->add_bh(bh);
+    if (bh->is_dirty()) {
+      lru_dirty.lru_insert_top(bh);
+      dirty_bh.insert(bh);   // a dirty bh belongs in dirty_bh too, matching what bh_set_state() maintains
+    } else lru_rest.lru_insert_top(bh);
+    bh_stat_add(bh);
+  }
+  void bh_remove(Object *ob, BufferHead *bh) {   // unlink bh from ob's map, its lru, dirty_bh and the stats; does not delete it
+    ob->remove_bh(bh);
+    if (bh->is_dirty()) {
+      lru_dirty.lru_remove(bh);
+      dirty_bh.erase(bh);    // otherwise dirty_bh keeps a stale pointer once the caller frees the bh
+    } else lru_rest.lru_remove(bh);
+    bh_stat_sub(bh);
+  }
+
+  // io
+  void bh_read(BufferHead *bh);
+  void bh_write(BufferHead *bh);
+
+  void trim(off_t max=-1);
+  void flush(off_t amount=0);
+
+  bool flush(Object *o);
+  off_t release(Object *o);
+
+  void rdlock(Object *o);
+  void rdunlock(Object *o);
+  void wrlock(Object *o);
+  void wrunlock(Object *o);
+
+ public:
+  void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl);
+  void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t);
+  void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t);
+  void lock_ack(list<object_t>& oids, tid_t tid);
+
+  class C_ReadFinish : public Context {   // completion that hands the data in bl to bh_read_finish()
+    ObjectCacher *oc;
+    object_t oid;
+    off_t start;
+    size_t length;
+  public:
+    bufferlist bl;   // passed to bh_read_finish(); presumably filled by the read before finish() runs -- confirm against bh_read()
+    C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {}
+    void finish(int r) {   // r is not used
+      oc->bh_read_finish(oid, start, length, bl);
+    }
+  };
+
+  class C_WriteAck : public Context {   // completion that forwards (oid, start, length, tid) to bh_write_ack()
+    ObjectCacher *oc;
+    object_t oid;
+    off_t start;
+    size_t length;
+  public:
+    tid_t tid;   // starts at 0; presumably assigned by the submitter once the op's tid is known -- confirm against bh_write()
+    C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l), tid(0) {}
+    void finish(int r) {   // r is not used
+      oc->bh_write_ack(oid, start, length, tid);
+    }
+  };
+  class C_WriteCommit : public Context {   // completion that forwards (oid, start, length, tid) to bh_write_commit()
+    ObjectCacher *oc;
+    object_t oid;
+    off_t start;
+    size_t length;
+  public:
+    tid_t tid;   // starts at 0; the lock paths (rdlock() etc.) assign it right after objecter->lock() returns
+    C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l), tid(0) {}
+    void finish(int r) {   // r is not used
+      oc->bh_write_commit(oid, start, length, tid);
+    }
+  };
+
+  class C_LockAck : public Context {   // completion that forwards (oids, tid) to lock_ack()
+    ObjectCacher *oc;
+  public:
+    list<object_t> oids;   // seeded with the one oid given to the constructor
+    tid_t tid;             // starts at 0; the lock paths assign it right after objecter->lock() returns
+    C_LockAck(ObjectCacher *c, object_t o) : oc(c), tid(0) {
+      oids.push_back(o);
+    }
+    void finish(int r) {   // r is not used
+      oc->lock_ack(oids, tid);
+    }
+  };
+
+
+
+ public:
+  ObjectCacher(Objecter *o, Mutex& l) :    // keeps a reference to the caller's mutex and starts the flusher thread
+    objecter(o), filer(o), lock(l),
+    flusher_stop(false), flusher_thread(this),
+    stat_waiter(0),
+    stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) {
+    flusher_thread.create();
+  }
+  ~ObjectCacher() {   // asks the flusher to stop, then joins it
+    //lock.Lock();  // hmm.. watch out for deadlock!
+    flusher_stop = true;     // NOTE(review): set and signalled without holding lock; confirm flusher_entry() cannot miss this wakeup
+    flusher_cond.Signal();
+    //lock.Unlock();
+    flusher_thread.join();
+  }
+
+
+  class C_RetryRead : public Context {   // re-issues readx(rd, ino, onfinish) when finished
+    ObjectCacher *oc;
+    Objecter::OSDRead *rd;
+    inodeno_t ino;
+    Context *onfinish;
+  public:
+    C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c) : oc(_oc), rd(r), ino(i), onfinish(c) {}
+    void finish(int) {
+      int r = oc->readx(rd, ino, onfinish);
+      if (r > 0) {             // positive result: complete the caller's context here and free it
+        onfinish->finish(r);
+        delete onfinish;
+      }                        // NOTE(review): for r <= 0 onfinish is left alone; presumably readx keeps it for later -- confirm
+    }
+  };
+
+  // non-blocking.  async.
+  int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish);
+  int writex(Objecter::OSDWrite *wr, inodeno_t ino);
+
+  // write blocking
+  void wait_for_write(size_t len, Mutex& lock);
+  
+  // blocking.  atomic+sync.
+  int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock);
+  int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock);
+
+  bool set_is_cached(inodeno_t ino);
+  bool set_is_dirty_or_committing(inodeno_t ino);
+
+  bool flush_set(inodeno_t ino, Context *onfinish=0);
+  void flush_all(Context *onfinish=0);
+
+  bool commit_set(inodeno_t ino, Context *oncommit);
+  void commit_all(Context *oncommit=0);
+
+  off_t release_set(inodeno_t ino);  // returns # of bytes not released (ie non-clean)
+
+  void kick_sync_writers(inodeno_t ino);
+  void kick_sync_readers(inodeno_t ino);
+
+
+  // file functions
+
+  /*** async+caching (non-blocking) file interface ***/
+  int file_read(inode_t& inode, off_t offset, size_t len,
+                bufferlist *bl, Context *onfinish) {
+    // map the file range onto object extents, then hand the request to readx().
+    // the OSDRead is allocated here and passed on; readx()'s result is returned as-is.
+    Objecter::OSDRead *req = new Objecter::OSDRead(bl);
+    filer.file_to_extents(inode, offset, len, req->extents);
+    return readx(req, inode.ino, onfinish);
+  }
+
+  int file_write(inode_t& inode, off_t offset, size_t len,
+                 bufferlist& bl, objectrev_t rev=0) {
+    // map the file range onto object extents, then hand the request to writex().
+    // rev is accepted but not used in this body.
+    Objecter::OSDWrite *req = new Objecter::OSDWrite(bl);
+    filer.file_to_extents(inode, offset, len, req->extents);
+    return writex(req, inode.ino);
+  }
+
+
+
+  /*** sync+blocking file interface ***/
+  
+  int file_atomic_sync_read(inode_t& inode, off_t offset, size_t len,
+                            bufferlist *bl, Mutex &lock) {
+    // map the file range onto object extents, then call atomic_sync_readx().
+    // note: the `lock` parameter hides the member of the same name inside this body.
+    Objecter::OSDRead *req = new Objecter::OSDRead(bl);
+    filer.file_to_extents(inode, offset, len, req->extents);
+    return atomic_sync_readx(req, inode.ino, lock);
+  }
+
+  int file_atomic_sync_write(inode_t& inode, off_t offset, size_t len,
+                             bufferlist& bl, Mutex &lock,
+                             objectrev_t rev=0) {
+    // map the file range onto object extents, then call atomic_sync_writex().
+    // rev is accepted but not used in this body; `lock` hides the member of the same name.
+    Objecter::OSDWrite *req = new Objecter::OSDWrite(bl);
+    filer.file_to_extents(inode, offset, len, req->extents);
+    return atomic_sync_writex(req, inode.ino, lock);
+  }
+
+};
+
+// Print a BufferHead as bh[start~length (bl length) v last_write_tid <state>].
+inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh)
+{
+  out << "bh[" << bh.start() << "~" << bh.length() << " (" << bh.bl.length() << ")";
+  out << " v " << bh.last_write_tid;
+  switch (bh.get_state()) {   // state is a single value, so at most one tag is printed
+  case ObjectCacher::BufferHead::STATE_TX: out << " tx"; break;
+  case ObjectCacher::BufferHead::STATE_RX: out << " rx"; break;
+  case ObjectCacher::BufferHead::STATE_DIRTY: out << " dirty"; break;
+  case ObjectCacher::BufferHead::STATE_CLEAN: out << " clean"; break;
+  case ObjectCacher::BufferHead::STATE_MISSING: out << " missing"; break;
+  }
+  out << "]";
+  return out;
+}
+// Print an Object as object[oid ino N wr write/ack/commit tids <lock state>]; oid and ino are printed in hex.
+inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob)
+{
+  out << "object["
+      << hex << ob.get_oid() << " ino " << ob.get_ino() << dec
+      << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid;
+  switch (ob.lock_state) {   // LOCK_NONE prints nothing
+  case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break;
+  case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break;
+  case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break;
+  case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break;
+  case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break;
+  case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break;
+  case ObjectCacher::Object::LOCK_UPGRADING: out << " upgrading"; break;      // previously not printed
+  case ObjectCacher::Object::LOCK_DOWNGRADING: out << " downgrading"; break;  // previously not printed
+  }
+  out << "]";
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/osdc/Objecter.cc b/branches/sage/cephmds2/osdc/Objecter.cc
new file mode 100644
index 0000000000000..5e56781a20569
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Objecter.cc
@@ -0,0 +1,831 @@
+
+#include "Objecter.h"
+#include "osd/OSDMap.h"
+#include "mon/MonMap.h"
+
+#include "msg/Messenger.h"
+#include "msg/Message.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+
+#include "messages/MOSDFailure.h"
+
+#include <errno.h>
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myaddr() << ".objecter "
+#define derr(x)  if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myaddr() << ".objecter "
+
+
+// messages ------------------------------
+// dispatch: route an incoming message to its handler by type; unknown types are logged and hit assert(0).
+void Objecter::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+  case MSG_OSD_OPREPLY:
+    handle_osd_op_reply((MOSDOpReply*)m);
+    break;
+    
+  case MSG_OSD_MAP:
+    handle_osd_map((MOSDMap*)m);   // deletes m when done
+    break;
+
+  default:
+    dout(1) << "don't know message type " << m->get_type() << endl;
+    assert(0);   // NOTE(review): m is not deleted on this path
+  }
+}
+// handle_osd_map: bring osdmap up to the newest epoch carried by m, one epoch at a time, then kick requests on pgs that changed.  Deletes m.
+void Objecter::handle_osd_map(MOSDMap *m)
+{
+  assert(osdmap); 
+  // nothing newer than what we already have?
+  if (m->get_last() <= osdmap->get_epoch()) {
+    dout(3) << "handle_osd_map ignoring epochs [" 
+            << m->get_first() << "," << m->get_last() 
+            << "] <= " << osdmap->get_epoch() << endl;
+  } 
+  else {
+    dout(3) << "handle_osd_map got epochs [" 
+            << m->get_first() << "," << m->get_last() 
+            << "] > " << osdmap->get_epoch()
+            << endl;
+
+    set<pg_t> changed_pgs;
+    // walk forward from our epoch + 1; each epoch is taken from an incremental if present, else a full map.
+    for (epoch_t e = osdmap->get_epoch() + 1;
+         e <= m->get_last();
+         e++) {
+      if (m->incremental_maps.count(e)) {
+        dout(3) << "handle_osd_map decoding incremental epoch " << e << endl;
+        OSDMap::Incremental inc;
+        int off = 0;
+        inc.decode(m->incremental_maps[e], off);
+        osdmap->apply_incremental(inc);
+    
+        // notify messenger
+        for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
+             i != inc.new_down.end();
+             i++) 
+          messenger->mark_down(MSG_ADDR_OSD(i->first), i->second);
+        for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
+             i != inc.new_up.end();
+             i++) 
+          messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
+        
+      }
+      else if (m->maps.count(e)) {
+        dout(3) << "handle_osd_map decoding full epoch " << e << endl;
+        osdmap->decode(m->maps[e]);
+      }
+      else {
+        dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl;
+        int mon = monmap->pick_mon();
+        messenger->send_message(new MOSDGetMap(osdmap->get_epoch()),   // sends our current epoch (the log line above prints epoch+1)
+                                MSG_ADDR_MON(mon), monmap->get_inst(mon));
+        break;   // gap in the message: stop here, later epochs are not applied
+      }
+      
+      // scan pgs for changes
+      scan_pgs(changed_pgs);
+        
+      assert(e == osdmap->get_epoch());
+    }
+
+    // kick requests who might be timing out on the wrong osds
+    if (!changed_pgs.empty())
+      kick_requests(changed_pgs);
+  }
+  
+  delete m;
+}
+// scan_pgs: recompute the acting osds of every pg in pg_map against the current osdmap; pgs that changed significantly are added to changed_pgs.
+void Objecter::scan_pgs(set<pg_t>& changed_pgs)
+{
+  dout(10) << "scan_pgs" << endl;
+
+  for (hash_map<pg_t,PG>::iterator i = pg_map.begin();
+       i != pg_map.end();
+       i++) {
+    pg_t pgid = i->first;
+    PG& pg = i->second;
+    
+    // calc new.
+    vector<int> other;
+    osdmap->pg_to_acting_osds(pgid, other);
+
+    if (other == pg.acting) 
+      continue; // no change.
+    // store the new set; from here on `other` holds the OLD acting set and pg.acting the NEW one.
+    other.swap(pg.acting);
+    // whether the change matters depends on the configured replication mode.
+    if (g_conf.osd_rep == OSD_REP_PRIMARY) {
+      // same primary?
+      if (!other.empty() &&
+          !pg.acting.empty() &&
+          other[0] == pg.acting[0]) 
+        continue;
+    }
+    else if (g_conf.osd_rep == OSD_REP_SPLAY) {
+      // same primary and acker?
+      if (!other.empty() &&
+          !pg.acting.empty() &&
+          other[0] == pg.acting[0] &&
+          other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0])   // compares element 1, or element 0 when the set has a single osd
+        continue;
+    }
+    else if (g_conf.osd_rep == OSD_REP_CHAIN) {
+      // any change is significant.
+    }
+    
+    // changed significantly.
+    dout(10) << "scan_pgs pg " << pgid 
+             << " (" << pg.active_tids << ")"
+             << " " << other << " -> " << pg.acting
+             << endl;
+    changed_pgs.insert(pgid);
+  }
+}
+// kick_requests: for each changed pg, take its active tids, close the pg, and resubmit each op (modify, read or stat) found under that tid.
+void Objecter::kick_requests(set<pg_t>& changed_pgs) 
+{
+  dout(10) << "kick_requests in pgs " << changed_pgs << endl;
+
+  for (set<pg_t>::iterator i = changed_pgs.begin();
+       i != changed_pgs.end();
+       i++) {
+    pg_t pgid = *i;
+    PG& pg = pg_map[pgid];
+
+    // resubmit ops!
+    set<tid_t> tids;
+    tids.swap( pg.active_tids );   // take the tids before closing; pg is not used after close_pg()
+    close_pg( pgid );  // will pbly reopen, unless it's just commits we're missing
+    
+    for (set<tid_t>::iterator p = tids.begin();
+         p != tids.end();
+         p++) {
+      tid_t tid = *p;
+      
+      if (op_modify.count(tid)) {
+        OSDModify *wr = op_modify[tid];
+        op_modify.erase(tid);
+        
+        // WRITE
+        if (wr->tid_version.count(tid)) {
+          if (wr->op == OSD_OP_WRITE &&
+              !g_conf.objecter_buffer_uncommitted) {
+            dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl;   // nothing is resubmitted on this path
+          } else {
+            dout(0) << "kick_requests missing commit, replay write " << tid
+                    << " v " << wr->tid_version[tid] << endl;
+            modifyx_submit(wr, wr->waitfor_commit[tid], tid);
+          }
+        } 
+        else if (wr->waitfor_ack.count(tid)) {
+          dout(0) << "kick_requests missing ack, resub write " << tid << endl;
+          modifyx_submit(wr, wr->waitfor_ack[tid], tid);
+        }
+      }
+
+      else if (op_read.count(tid)) {
+        // READ
+        OSDRead *rd = op_read[tid];
+        op_read.erase(tid);
+        dout(0) << "kick_requests resub read " << tid << endl;
+
+        // resubmit
+        readx_submit(rd, rd->ops[tid]);
+        rd->ops.erase(tid);   // drop the entry for the old tid after resubmitting
+      }
+
+	  else if (op_stat.count(tid)) {
+		OSDStat *st = op_stat[tid];
+		op_stat.erase(tid);
+		
+		dout(0) << "kick_requests resub stat " << tid << endl;
+		
+        // resubmit
+        stat_submit(st);
+	  }
+	  
+      else 
+        assert(0);   // tid was active in the pg but is in none of the op maps
+    }         
+  }         
+}
+
+
+// handle_osd_op_reply: route an op reply to the read, stat or modify reply handler according to its op code.
+void Objecter::handle_osd_op_reply(MOSDOpReply *m)
+{
+  // read or modify?
+  switch (m->get_op()) {
+  case OSD_OP_READ:
+    handle_osd_read_reply(m);
+    break;
+
+  case OSD_OP_STAT:
+	handle_osd_stat_reply(m);
+	break;
+    
+  case OSD_OP_WRNOOP:
+  case OSD_OP_WRITE:
+  case OSD_OP_ZERO:
+  case OSD_OP_DELETE:
+  case OSD_OP_WRUNLOCK:
+  case OSD_OP_WRLOCK:
+  case OSD_OP_RDLOCK:
+  case OSD_OP_RDUNLOCK:
+  case OSD_OP_UPLOCK:
+  case OSD_OP_DNLOCK:
+    handle_osd_modify_reply(m);   // all mutating ops and all lock ops share the modify path
+    break;
+
+  default:
+    assert(0);   // NOTE(review): m is not deleted on this path
+  }
+}
+
+
+
+// stat -----------------------------------
+
+/**
+ * Start a stat of a single object.
+ *
+ * @param oid       object to stat
+ * @param size      stored in the pending op; the stat reply handler writes
+ *                  the result through this pointer
+ * @param onfinish  completion callback stored in the pending op (may be 0)
+ * @param rev       object revision recorded on the extent
+ * @return the tid returned by stat_submit()
+ */
+tid_t Objecter::stat(object_t oid, off_t *size, Context *onfinish,
+                     objectrev_t rev)
+{
+  // describe the (zero-length) extent first, then attach it to the op
+  ObjectExtent target(oid, 0, 0);
+  target.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+  target.rev = rev;
+
+  OSDStat *pending = new OSDStat(size);
+  pending->onfinish = onfinish;
+  pending->extents.push_back(target);
+
+  return stat_submit(pending);
+}
+
+/**
+ * Submit (or resubmit) a stat op under a newly allocated tid.
+ *
+ * The op is always recorded in op_stat and in the pg's active_tids, even
+ * when the pg has no acker right now (acker() < 0) and nothing is sent.
+ *
+ * @param st  pending stat; its first extent names the object and pg
+ * @return    the tid assigned to this submission
+ */
+tid_t Objecter::stat_submit(OSDStat *st) 
+{
+  // find OSD
+  ObjectExtent &ex = st->extents.front();
+  PG &pg = get_pg( ex.pgid );
+  const int acker = pg.acker();
+
+  // build the request
+  // NOTE(review): ex.rev is not copied into the message here, unlike the
+  // set_rev() call in modifyx_submit -- confirm that is intended.
+  last_tid++;
+  MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(),
+                         ex.oid, ex.pgid, osdmap->get_epoch(), 
+                         OSD_OP_STAT);
+  dout(10) << "stat_submit " << st << " tid " << last_tid
+           << " oid " << ex.oid
+           << " pg " << ex.pgid
+           << " osd" << acker
+           << endl;
+
+  if (acker >= 0) 
+    messenger->send_message(m, MSG_ADDR_OSD(acker), osdmap->get_inst(acker));
+  else
+    delete m;  // no acker to send to; free the message instead of leaking it
+  
+  // add to gather set
+  st->tid = last_tid;
+  op_stat[last_tid] = st;    
+
+  pg.active_tids.insert(last_tid);
+
+  return last_tid;
+}
+
+/**
+ * Process the reply to a stat op.
+ *
+ * A reply whose tid is not in op_stat is logged as a stray and dropped.
+ * On -EAGAIN the stat is resubmitted through stat_submit() and the caller
+ * is not notified yet.  Otherwise *st->size receives the object size (or -1
+ * when the result is negative) and onfinish, if set, is finished with the
+ * raw result and deleted.  The reply message is deleted on every path.
+ */
+void Objecter::handle_osd_stat_reply(MOSDOpReply *m)
+{
+  // get pio
+  tid_t tid = m->get_tid();
+
+  // not (or no longer) one of our pending stats
+  if (op_stat.count(tid) == 0) {
+    dout(7) << "handle_osd_stat_reply " << tid << " ... stray" << endl;
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_osd_stat_reply " << tid 
+		  << " r=" << m->get_result()
+		  << " size=" << m->get_object_size()
+		  << endl;
+  OSDStat *st = op_stat[ tid ];
+  op_stat.erase( tid );
+
+  // remove from osd/tid maps
+  PG& pg = get_pg( m->get_pg() );
+  assert(pg.active_tids.count(tid));
+  pg.active_tids.erase(tid);
+  // drop the pg entry once nothing is pending on it (pg is not used below)
+  if (pg.active_tids.empty()) close_pg( m->get_pg() );
+  
+  // success?
+  if (m->get_result() == -EAGAIN) {
+    dout(7) << " got -EAGAIN, resubmitting" << endl;
+    stat_submit(st);
+    delete m;
+    return;
+  }
+  //assert(m->get_result() >= 0);
+
+  // ok!
+  // any negative result is reported to the caller as size -1
+  if (m->get_result() < 0) {
+	*st->size = -1;
+  } else {
+	*st->size = m->get_object_size();
+  }
+
+  // finish, clean up
+  // grab the callback before the op that holds it is deleted
+  Context *onfinish = st->onfinish;
+
+  // done
+  delete st;
+  if (onfinish) {
+	onfinish->finish(m->get_result());
+	delete onfinish;
+  }
+
+  delete m;
+}
+
+
+// read -----------------------------------
+
+
+/**
+ * Read a single extent of one object into *bl.
+ *
+ * Wraps the extent in an OSDRead and hands it to readx().
+ *
+ * @param oid       object to read
+ * @param off,len   byte range within the object
+ * @param bl        destination bufferlist (cleared by the OSDRead ctor)
+ * @param onfinish  completion callback passed on to readx()
+ * @param rev       object revision recorded on the extent
+ * @return last_tid after submission, as returned by readx()
+ */
+tid_t Objecter::read(object_t oid, off_t off, size_t len, bufferlist *bl, 
+                     Context *onfinish, 
+                     objectrev_t rev)
+{
+  ObjectExtent wanted(oid, off, len);
+  wanted.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+  wanted.rev = rev;
+
+  OSDRead *op = new OSDRead(bl);
+  op->extents.push_back(wanted);
+
+  // readx() reports last_tid once the extent has been submitted
+  return readx(op, onfinish);
+}
+
+
+/**
+ * Submit every extent of a prepared OSDRead as its own OSD read op.
+ *
+ * @param rd        read descriptor whose extents are already filled in
+ * @param onfinish  completion callback, stored on rd
+ * @return last_tid after the final submission
+ */
+tid_t Objecter::readx(OSDRead *rd, Context *onfinish)
+{
+  rd->onfinish = onfinish;
+
+  // one OSD op per object extent
+  list<ObjectExtent>::iterator ex = rd->extents.begin();
+  while (ex != rd->extents.end()) {
+    readx_submit(rd, *ex);
+    ++ex;
+  }
+
+  return last_tid;
+}
+
+/**
+ * Submit (or resubmit) the read of one object extent under a new tid.
+ *
+ * The op is always recorded in rd->ops, op_read and the pg's active_tids,
+ * even when the pg has no acker right now (acker() < 0) and nothing is sent.
+ *
+ * @param rd  the read this extent belongs to
+ * @param ex  extent to read; copied into rd->ops under the new tid
+ * @return    the tid assigned to this submission
+ */
+tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) 
+{
+  // find OSD
+  PG &pg = get_pg( ex.pgid );
+  const int acker = pg.acker();
+
+  // build the request
+  // NOTE(review): ex.rev is not copied into the message here, unlike the
+  // set_rev() call in modifyx_submit -- confirm that is intended.
+  last_tid++;
+  MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(),
+                         ex.oid, ex.pgid, osdmap->get_epoch(), 
+                         OSD_OP_READ);
+  m->set_length(ex.length);
+  m->set_offset(ex.start);
+  dout(10) << "readx_submit " << rd << " tid " << last_tid
+           << " oid " << ex.oid << " " << ex.start << "~" << ex.length
+           << " (" << ex.buffer_extents.size() << " buffer fragments)" 
+           << " pg " << ex.pgid
+           << " osd" << acker
+           << endl;
+
+  if (acker >= 0) 
+    messenger->send_message(m, MSG_ADDR_OSD(acker), osdmap->get_inst(acker));
+  else
+    delete m;  // no acker to send to; free the message instead of leaking it
+    
+  // add to gather set
+  rd->ops[last_tid] = ex;
+  op_read[last_tid] = rd;    
+
+  pg.active_tids.insert(last_tid);
+
+  return last_tid;
+}
+
+
+/**
+ * Process the reply to one read op of a (possibly multi-extent) OSDRead.
+ *
+ * A reply whose tid is not in op_read is logged as a stray and dropped.
+ * On -EAGAIN the same extent is resubmitted and nothing else happens.
+ * Otherwise the returned data is either stashed in rd->read_data (more ops
+ * still outstanding) or, for the last outstanding op, assembled into rd->bl:
+ * each object's data is mapped back onto its buffer extents, short reads are
+ * zero-padded, and padding past the last byte actually read is dropped.
+ * onfinish (if set) is then finished with the number of bytes read.
+ * The reply message is deleted on every path.
+ */
+void Objecter::handle_osd_read_reply(MOSDOpReply *m) 
+{
+  // get pio
+  tid_t tid = m->get_tid();
+
+  if (op_read.count(tid) == 0) {
+    dout(7) << "handle_osd_read_reply " << tid << " ... stray" << endl;
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_osd_read_reply " << tid << endl;
+  OSDRead *rd = op_read[ tid ];
+  op_read.erase( tid );
+
+  // remove from osd/tid maps
+  PG& pg = get_pg( m->get_pg() );
+  assert(pg.active_tids.count(tid));
+  pg.active_tids.erase(tid);
+  // drop the pg entry once nothing is pending on it (pg is not used below)
+  if (pg.active_tids.empty()) close_pg( m->get_pg() );
+  
+  // success?
+  if (m->get_result() == -EAGAIN) {
+    dout(7) << " got -EAGAIN, resubmitting" << endl;
+    // Resubmit while the extent is still in rd->ops, and only then drop the
+    // old tid (same order as the read resubmit in kick_requests).  Erasing
+    // first made rd->ops[tid] create an empty extent: the retry read nothing
+    // and the leftover entry kept rd->ops from ever becoming empty.
+    readx_submit(rd, rd->ops[tid]);
+    rd->ops.erase(tid);
+    delete m;
+    return;
+  }
+  //assert(m->get_result() >= 0);
+
+  // our op finished
+  rd->ops.erase(tid);
+
+  // what buffer offset are we?
+  dout(7) << " got frag from " << m->get_oid() << " "
+          << m->get_offset() << "~" << m->get_length()
+          << ", still have " << rd->ops.size() << " more ops" << endl;
+  
+  if (rd->ops.empty()) {
+    // all done
+    size_t bytes_read = 0;
+    
+    if (rd->read_data.size()) {
+      dout(15) << " assembling frags" << endl;
+
+      /** FIXME This doesn't handle holes efficiently.
+       * It allocates zero buffers to fill whole buffer, and
+       * then discards trailing ones at the end.
+       *
+       * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over
+       * the heap. 
+       */
+
+      // we have other fragments, assemble them all... blech!
+      rd->read_data[m->get_oid()] = new bufferlist;
+      rd->read_data[m->get_oid()]->claim( m->get_data() );
+
+      // map extents back into buffer
+      map<off_t, bufferlist*> by_off;  // buffer offset -> bufferlist
+
+      // for each object extent...
+      for (list<ObjectExtent>::iterator eit = rd->extents.begin();
+           eit != rd->extents.end();
+           eit++) {
+        bufferlist *ox_buf = rd->read_data[eit->oid];
+        unsigned ox_len = ox_buf->length();   // bytes the OSD returned
+        unsigned ox_off = 0;                  // position within this extent
+        assert(ox_len <= eit->length);           
+
+        // for each buffer extent we're mapping into...
+        for (map<size_t,size_t>::iterator bit = eit->buffer_extents.begin();
+             bit != eit->buffer_extents.end();
+             bit++) {
+          dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << endl;
+          by_off[bit->first] = new bufferlist;
+
+          if (ox_off + bit->second <= ox_len) {
+            // we got the whole bx
+            by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second);
+            if (bytes_read < bit->first + bit->second) 
+              bytes_read = bit->first + bit->second;
+          } else if (ox_off + bit->second > ox_len && ox_off < ox_len) {
+            // we got part of this bx
+            by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off));
+            if (bytes_read < bit->first + ox_len-ox_off) 
+              bytes_read = bit->first + ox_len-ox_off;
+
+            // zero end of bx
+            dout(21) << "  adding some zeros to the end " << ox_off + bit->second-ox_len << endl;
+            bufferptr z(ox_off + bit->second - ox_len);
+            z.zero();
+            by_off[bit->first]->append( z );
+          } else {
+            // we got none of this bx.  zero whole thing.
+            assert(ox_off >= ox_len);
+            dout(21) << "  adding all zeros for this bit " << bit->second << endl;
+            bufferptr z(bit->second);
+            z.zero();
+            by_off[bit->first]->append( z );
+          }
+          ox_off += bit->second;
+        }
+        assert(ox_off == eit->length);
+      }
+
+      // sort and string bits together
+      // (by_off is ordered by buffer offset; pieces that start at or past
+      // bytes_read are pure zero padding and are not appended)
+      for (map<off_t, bufferlist*>::iterator it = by_off.begin();
+           it != by_off.end();
+           it++) {
+        assert(it->second->length());
+        if (it->first < (off_t)bytes_read) {
+          dout(21) << "  concat buffer frag off " << it->first << " len " << it->second->length() << endl;
+          rd->bl->claim_append(*(it->second));
+        } else {
+          dout(21) << "  NO concat zero buffer frag off " << it->first << " len " << it->second->length() << endl;          
+        }
+        delete it->second;
+      }
+
+      // trim trailing zeros?
+      if (rd->bl->length() > bytes_read) {
+        dout(10) << " trimming off trailing zeros . bytes_read=" << bytes_read 
+                 << " len=" << rd->bl->length() << endl;
+        rd->bl->splice(bytes_read, rd->bl->length() - bytes_read);
+        assert(bytes_read == rd->bl->length());
+      }
+      
+      // hose p->read_data bufferlist*'s
+      for (map<object_t, bufferlist*>::iterator it = rd->read_data.begin();
+           it != rd->read_data.end();
+           it++) {
+        delete it->second;
+      }
+    } else {
+      dout(15) << "  only one frag" << endl;
+
+      // only one fragment, easy
+      rd->bl->claim( m->get_data() );
+      bytes_read = rd->bl->length();
+    }
+
+    // finish, clean up
+    // grab the callback before the op that holds it is deleted
+    Context *onfinish = rd->onfinish;
+
+    dout(7) << " " << bytes_read << " bytes " 
+            << rd->bl->length()
+            << endl;
+    
+    // done
+    delete rd;
+    if (onfinish) {
+      onfinish->finish(bytes_read);// > 0 ? bytes_read:m->get_result());
+      delete onfinish;
+    }
+  } else {
+    // store my bufferlist for later assembling
+    rd->read_data[m->get_oid()] = new bufferlist;
+    rd->read_data[m->get_oid()]->claim( m->get_data() );
+  }
+
+  delete m;
+}
+
+
+
+// write ------------------------------------
+
+/**
+ * Write one contiguous buffer to a single extent of one object.
+ *
+ * @param oid       object to write
+ * @param off,len   byte range within the object
+ * @param bl        data; copied into the OSDWrite
+ * @param onack     callback passed on to modifyx()
+ * @param oncommit  callback passed on to modifyx()
+ * @param rev       object revision recorded on the extent
+ * @return last_tid after submission, as returned by modifyx()
+ */
+tid_t Objecter::write(object_t oid, off_t off, size_t len, bufferlist &bl, 
+                      Context *onack, Context *oncommit,
+                      objectrev_t rev)
+{
+  ObjectExtent dest(oid, off, len);
+  dest.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+  dest.buffer_extents[0] = len;   // buffer bytes [0, len) map onto this extent
+  dest.rev = rev;
+
+  OSDWrite *op = new OSDWrite(bl);
+  op->extents.push_back(dest);
+
+  return modifyx(op, onack, oncommit);
+}
+
+
+// zero
+
+/**
+ * Issue an OSD_OP_ZERO over a single extent of one object.
+ *
+ * @param oid       object to modify
+ * @param off,len   byte range within the object
+ * @param onack     callback passed on to modifyx()
+ * @param oncommit  callback passed on to modifyx()
+ * @param rev       object revision recorded on the extent
+ * @return last_tid after submission, as returned by modifyx()
+ */
+tid_t Objecter::zero(object_t oid, off_t off, size_t len,  
+                     Context *onack, Context *oncommit,
+                     objectrev_t rev)
+{
+  ObjectExtent range(oid, off, len);
+  range.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+  range.rev = rev;
+
+  OSDModify *op = new OSDModify(OSD_OP_ZERO);
+  op->extents.push_back(range);
+
+  return modifyx(op, onack, oncommit);
+}
+
+
+// lock ops
+
+/**
+ * Issue a lock-family op against one object.
+ *
+ * @param op        op code, passed straight to the OSDModify
+ * @param oid       object to operate on (sent as a zero-length extent)
+ * @param onack     callback passed on to modifyx()
+ * @param oncommit  callback passed on to modifyx()
+ * @return last_tid after submission, as returned by modifyx()
+ */
+tid_t Objecter::lock(int op, object_t oid, 
+                     Context *onack, Context *oncommit)
+{
+  ObjectExtent whole(oid, 0, 0);
+  whole.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+
+  OSDModify *req = new OSDModify(op);
+  req->extents.push_back(whole);
+
+  return modifyx(req, onack, oncommit);
+}
+
+
+
+// generic modify -----------------------------------
+
+/**
+ * Submit every extent of a prepared modify op as its own OSD op.
+ *
+ * @param wr        modify descriptor whose extents are already filled in
+ * @param onack     callback stored on wr
+ * @param oncommit  callback stored on wr
+ * @return last_tid after the final submission
+ */
+tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit)
+{
+  wr->onack = onack;
+  wr->oncommit = oncommit;
+
+  // one OSD op per object extent
+  list<ObjectExtent>::iterator ex = wr->extents.begin();
+  while (ex != wr->extents.end()) {
+    modifyx_submit(wr, *ex);
+    ++ex;
+  }
+
+  return last_tid;
+}
+
+
+/**
+ * Submit (or replay) one extent of a modify op to the pg's primary.
+ *
+ * The op is always recorded in wr->waitfor_ack, wr->waitfor_commit,
+ * op_modify and the pg's active_tids, even when the pg has no primary right
+ * now (primary() < 0) and nothing is sent.
+ *
+ * @param wr      the modify this extent belongs to
+ * @param ex      extent to operate on
+ * @param usetid  tid to reuse when > 0; otherwise a new tid is allocated
+ * @return        the tid used for this submission
+ */
+tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid)
+{
+  // find
+  PG &pg = get_pg( ex.pgid );
+  const int primary = pg.primary();
+    
+  // pick the tid
+  tid_t tid;
+  if (usetid > 0) 
+    tid = usetid;
+  else
+    tid = ++last_tid;
+
+  MOSDOp *m = new MOSDOp(tid, messenger->get_myaddr(),
+                         ex.oid, ex.pgid, osdmap->get_epoch(),
+                         wr->op);
+  m->set_length(ex.length);
+  m->set_offset(ex.start);
+  m->set_rev(ex.rev);
+
+  if (wr->tid_version.count(tid)) 
+    m->set_version(wr->tid_version[tid]);  // we're replaying this op!
+    
+  // what type of op?
+  switch (wr->op) {
+  case OSD_OP_WRITE:
+    {
+      // map buffer segments into this extent
+      // (may be fragmented bc of striping)
+      bufferlist cur;
+      for (map<size_t,size_t>::iterator bit = ex.buffer_extents.begin();
+           bit != ex.buffer_extents.end();
+           bit++) {
+        bufferlist thisbit;
+        thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second);
+        cur.claim_append(thisbit);
+      }
+      assert(cur.length() == ex.length);
+      m->set_data(cur);//.claim(cur);
+    }
+    break;
+  }
+
+  // add to gather set
+  wr->waitfor_ack[tid] = ex;
+  wr->waitfor_commit[tid] = ex;
+  op_modify[tid] = wr;
+  pg.active_tids.insert(tid);
+  
+  // NOTE(review): these are bumped on replays (usetid > 0) as well --
+  // confirm the resubmitting caller accounts for that.
+  ++num_unacked;
+  ++num_uncommitted;
+
+  // send
+  dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid
+           << "  oid " << ex.oid
+           << " " << ex.start << "~" << ex.length 
+           << " pg " << ex.pgid 
+           << " osd" << primary
+           << endl;
+  if (primary >= 0)
+    messenger->send_message(m, MSG_ADDR_OSD(primary), osdmap->get_inst(primary));
+  else
+    delete m;  // no primary to send to; free the message instead of leaking it
+  
+  dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl;
+  
+  return tid;
+}
+
+
+
+/**
+ * Process an ack or commit reply for a modify-style op.
+ *
+ * A reply whose tid is not in op_modify is logged as a stray and dropped,
+ * as is any reply whose source is not the pg's current acker.  A negative
+ * result trips an assert.
+ *
+ * commit: the tid is removed from the pg, op_modify and both wait sets;
+ *         once no commits are outstanding, both callbacks are collected
+ *         and the op is deleted.
+ * ack:    the tid leaves waitfor_ack and its version is remembered in
+ *         tid_version; once no acks are outstanding, onack is collected
+ *         (and cleared so it fires only once).
+ *
+ * Collected callbacks are finished with 0 and deleted, then the reply is
+ * deleted.
+ */
+void Objecter::handle_osd_modify_reply(MOSDOpReply *m)
+{
+  // get pio
+  tid_t tid = m->get_tid();
+
+  if (op_modify.count(tid) == 0) {
+    dout(7) << "handle_osd_modify_reply " << tid 
+            << (m->get_commit() ? " commit":" ack")
+            << " ... stray" << endl;
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_osd_modify_reply " << tid 
+          << (m->get_commit() ? " commit":" ack")
+          << " v " << m->get_version()
+          << endl;
+  OSDModify *wr = op_modify[ tid ];
+
+  // callbacks to run at the end; filled in below when a phase completes
+  Context *onack = 0;
+  Context *oncommit = 0;
+
+  PG &pg = get_pg( m->get_pg() );
+
+  // ignore?
+  if (pg.acker() != m->get_source().num()) {
+    dout(7) << " ignoring ack|commit from non-acker" << endl;
+    delete m;
+    return;
+  }
+
+  assert(m->get_result() >= 0);
+
+  // ack or commit?
+  if (m->get_commit()) {
+    //dout(15) << " handle_osd_write_reply commit on " << tid << endl;
+    // if an ack already recorded a version for this tid, the commit must match it
+    assert(wr->tid_version.count(tid) == 0 ||
+           m->get_version() == wr->tid_version[tid]);
+
+    // remove from tid/osd maps
+    assert(pg.active_tids.count(tid));
+    pg.active_tids.erase(tid);
+    if (pg.active_tids.empty()) close_pg( m->get_pg() );
+
+    // commit.
+    op_modify.erase( tid );
+    // NOTE(review): num_unacked is not decremented here even if tid was
+    // still in waitfor_ack -- confirm an ack always precedes the commit.
+    wr->waitfor_ack.erase(tid);
+    wr->waitfor_commit.erase(tid);
+
+    num_uncommitted--;
+
+    if (wr->waitfor_commit.empty()) {
+      // wr->onack is already 0 here if the ack phase fired it earlier
+      onack = wr->onack;
+      oncommit = wr->oncommit;
+      delete wr;
+    }
+  } else {
+    // ack.
+    //dout(15) << " handle_osd_write_reply ack on " << tid << endl;
+    assert(wr->waitfor_ack.count(tid));
+    wr->waitfor_ack.erase(tid);
+    
+    num_unacked--;
+
+    // a version recorded by an earlier ack that differs from this one is
+    // only warned about; the new version replaces it below
+    if (wr->tid_version.count(tid) &&
+        wr->tid_version[tid].version != m->get_version().version) {
+      dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid 
+                << " did not achieve previous ordering" << endl;
+    }
+    wr->tid_version[tid] = m->get_version();
+    
+    if (wr->waitfor_ack.empty()) {
+      onack = wr->onack;
+      wr->onack = 0;  // only do callback once
+      
+      // buffer uncommitted?
+      if (!g_conf.objecter_buffer_uncommitted &&
+          wr->op == OSD_OP_WRITE) {
+        // discard buffer!
+        ((OSDWrite*)wr)->bl.clear();
+      }
+    }
+  }
+  
+  // do callbacks
+  if (onack) {
+    onack->finish(0);
+    delete onack;
+  }
+  if (oncommit) {
+    oncommit->finish(0);
+    delete oncommit;
+  }
+
+  delete m;
+}
+
+
+
+/**
+ * Messenger callback for a message that could not be delivered.
+ *
+ *  - destination was a monitor: resend the same message to another monitor
+ *  - destination was an OSD:    drop the message and report the failure to
+ *                               a monitor
+ *  - anything else:             drop the message
+ */
+void Objecter::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+{
+  if (dest.is_mon()) {
+    // try a new mon
+    int mon = monmap->pick_mon(true);
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst 
+            << ", resending to mon" << mon 
+            << endl;
+    messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+    return;
+  }
+
+  if (!dest.is_osd()) {
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst 
+            << ", dropping" << endl;
+    delete m;
+    return;
+  }
+
+  // an osd: tell a monitor about it, and give up on this message
+  int mon = monmap->pick_mon();
+  dout(0) << "ms_handle_failure " << dest << " inst " << inst 
+          << ", dropping and reporting to mon" << mon 
+          << endl;
+  MOSDFailure *report = new MOSDFailure(dest, inst, osdmap->get_epoch());
+  messenger->send_message(report, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+  delete m;
+}
diff --git a/branches/sage/cephmds2/osdc/Objecter.h b/branches/sage/cephmds2/osdc/Objecter.h
new file mode 100644
index 0000000000000..72e637789f988
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Objecter.h
@@ -0,0 +1,191 @@
+#ifndef __OBJECTER_H
+#define __OBJECTER_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+#include "osd/OSDMap.h"
+#include "messages/MOSDOp.h"
+
+#include <list>
+#include <map>
+#include <ext/hash_map>
+using namespace std;
+using namespace __gnu_cxx;
+
+class Context;
+class Messenger;
+class OSDMap;
+class MonMap;
+class Message;
+
+/**
+ * Objecter: tracks operations this node has in flight to the OSDs.
+ *
+ * Pending stats, reads and modifies are indexed by tid (op_stat, op_read,
+ * op_modify) and, per placement group, in pg_map, so they can be found
+ * again when a reply arrives or the set of OSDs serving a pg changes.
+ */
+class Objecter {
+ public:  
+  Messenger *messenger;
+  MonMap    *monmap;
+  OSDMap    *osdmap;
+  
+ private:
+  tid_t last_tid;        // most recently allocated transaction id
+  int num_unacked;       // modify ops submitted and not yet acked
+  int num_uncommitted;   // modify ops submitted and not yet committed
+
+  /*** track pending operations ***/
+ public:
+  // common base: the object extents an operation covers
+  class OSDOp {
+  public:
+    list<ObjectExtent> extents;
+    virtual ~OSDOp() {}
+  };
+
+  // read
+  class OSDRead : public OSDOp {
+  public:
+    bufferlist *bl;                        // where the assembled data goes
+    Context *onfinish;
+    map<tid_t, ObjectExtent> ops;          // outstanding per-extent reads
+    map<object_t, bufferlist*> read_data;  // bits of data as they come back
+
+    // note: b is dereferenced (cleared) here, so it must not be null
+    OSDRead(bufferlist *b) : bl(b), onfinish(0) {
+      bl->clear();
+    }
+  };
+
+  // stat
+  class OSDStat : public OSDOp {
+  public:
+    tid_t tid;
+    off_t *size;  // where the size goes.
+    Context *onfinish;
+    OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { }
+  };
+
+  // generic modify
+  class OSDModify : public OSDOp {
+  public:
+    int op;
+    // extents is inherited from OSDOp.  (It used to be redeclared here,
+    // which hid the base member: the same op seen through an OSDOp* had a
+    // different, empty extent list.)
+    Context *onack;
+    Context *oncommit;
+    map<tid_t, ObjectExtent> waitfor_ack;     // submitted, not yet acked
+    map<tid_t, eversion_t>   tid_version;     // version reported by the ack
+    map<tid_t, ObjectExtent> waitfor_commit;  // submitted, not yet committed
+
+    OSDModify(int o) : op(o), onack(0), oncommit(0) {}
+  };
+  
+  // write (includes the bufferlist)
+  class OSDWrite : public OSDModify {
+  public:
+    bufferlist bl;
+    OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {}
+  };
+
+  
+
+ private:
+  // pending ops
+  hash_map<tid_t,OSDStat*>   op_stat;
+  hash_map<tid_t,OSDRead*>   op_read;
+  hash_map<tid_t,OSDModify*> op_modify;
+
+  /**
+   * track pending ops by pg
+   *  ...so we can cope with failures, map changes
+   */
+  class PG {
+  public:
+    vector<int> acting;      // filled in by osdmap->pg_to_acting_osds()
+    set<tid_t>  active_tids; // active ops
+    
+    PG() {}
+    
+    // primary - where i write
+    // (-1 when the acting set is empty)
+    int primary() {
+      if (acting.empty()) return -1;
+      return acting[0];
+    }
+    // acker - where i read, and receive acks from
+    // (-1 when the acting set is empty; acting[0] under OSD_REP_PRIMARY,
+    //  otherwise acting[1] if there is one, else acting[0])
+    int acker() {
+      if (acting.empty()) return -1;
+      if (g_conf.osd_rep == OSD_REP_PRIMARY)
+        return acting[0];
+      else
+        return acting[acting.size() > 1 ? 1:0];
+    }
+  };
+
+  hash_map<pg_t,PG> pg_map;
+  
+  
+  // look up the PG record, creating it (and computing its acting set) on
+  // first use
+  PG &get_pg(pg_t pgid) {
+    if (!pg_map.count(pgid)) 
+      osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting);
+    return pg_map[pgid];
+  }
+  // forget a PG record; it must exist and have no active ops
+  void close_pg(pg_t pgid) {
+    assert(pg_map.count(pgid));
+    assert(pg_map[pgid].active_tids.empty());
+    pg_map.erase(pgid);
+  }
+  void scan_pgs(set<pg_t>& changed_pgs);
+  void kick_requests(set<pg_t>& changed_pgs);
+    
+
+ public:
+  Objecter(Messenger *m, MonMap *mm, OSDMap *om) : 
+    messenger(m), monmap(mm), osdmap(om),
+    last_tid(0),
+    num_unacked(0), num_uncommitted(0)
+    {}
+  ~Objecter() {
+    // clean up op_*
+    // ***
+  }
+
+  // messages
+ public:
+  void dispatch(Message *m);
+  void handle_osd_op_reply(class MOSDOpReply *m);
+  void handle_osd_stat_reply(class MOSDOpReply *m);
+  void handle_osd_read_reply(class MOSDOpReply *m);
+  void handle_osd_modify_reply(class MOSDOpReply *m);
+  void handle_osd_lock_reply(class MOSDOpReply *m);
+  void handle_osd_map(class MOSDMap *m);
+
+ private:
+  tid_t readx_submit(OSDRead *rd, ObjectExtent& ex);
+  tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0);
+  tid_t stat_submit(OSDStat *st);
+
+  // public interface
+ public:
+  // true while any op is pending.  Stats count too: they carry a completion
+  // callback and a caller-owned result pointer just like reads do, and were
+  // previously left out of this check.
+  bool is_active() {
+    return !(op_read.empty() && op_modify.empty() && op_stat.empty());
+  }
+
+  // med level
+  tid_t readx(OSDRead *read, Context *onfinish);
+  tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit);
+  //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit);
+
+  // even lazier
+  tid_t read(object_t oid, off_t off, size_t len, bufferlist *bl, 
+             Context *onfinish, 
+             objectrev_t rev=0);
+  tid_t write(object_t oid, off_t off, size_t len, bufferlist &bl, 
+              Context *onack, Context *oncommit, 
+              objectrev_t rev=0);
+  tid_t zero(object_t oid, off_t off, size_t len,  
+             Context *onack, Context *oncommit, 
+             objectrev_t rev=0);
+  tid_t stat(object_t oid, off_t *size, Context *onfinish, 
+             objectrev_t rev=0);  
+
+  tid_t lock(int op, object_t oid, Context *onack, Context *oncommit);
+
+
+  // messenger callback for an undeliverable message
+  void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/script/add_header.pl b/branches/sage/cephmds2/script/add_header.pl
new file mode 100755
index 0000000000000..f5891cc668c45
--- /dev/null
+++ b/branches/sage/cephmds2/script/add_header.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+
+# usage: add_header.pl <file>
+#
+# Prepends the standard Ceph header (emacs modeline + license notice) to
+# <file>, unless the file already contains the header's title line.
+
+use strict;
+my $fn = shift @ARGV;
+die "usage: add_header.pl <file>\n" unless defined $fn;
+
+# Read the file ourselves rather than via `cat $fn`: the shell form broke on
+# names with spaces or metacharacters and silently yielded '' on failure.
+open(my $in, '<', $fn) or die "can't read $fn: $!\n";
+my $f = do { local $/; <$in> };
+close $in;
+$f = '' unless defined $f;   # empty file
+
+my $header = '// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+';
+
+unless ($f =~ /Ceph - scalable distributed file system/) {
+	# write the new contents next to the original, then swap it into place
+	open(my $out, '>', "$fn.new") or die "can't write $fn.new: $!\n";
+	print $out $header;
+	print $out $f;
+	close $out or die "error writing $fn.new: $!\n";
+	rename("$fn.new", $fn) or die "can't rename $fn.new to $fn: $!\n";
+}
+
diff --git a/branches/sage/cephmds2/script/adjusttabs.pl b/branches/sage/cephmds2/script/adjusttabs.pl
new file mode 100755
index 0000000000000..66edff2ac6c02
--- /dev/null
+++ b/branches/sage/cephmds2/script/adjusttabs.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl
+
+# usage: adjusttabs.pl <tablen> <file>
+#
+# Rewrites <file> in place with every tab expanded to spaces.  Tabs are
+# expanded at the width named by the file's emacs modeline
+# ("-*- ... tab-width:N", single digit), or 4 until such a line is seen, and
+# the modeline is updated to say tab-width:<tablen>.
+
+my $tablen = shift @ARGV;
+my $fn = shift @ARGV;
+die "usage: adjusttabs.pl <tablen> <file>\n"
+	unless (defined($tablen) && defined($fn));
+
+# Stop if the input can't be read: carrying on would replace the file with
+# an empty one.
+open(I, $fn) or die "can't read $fn: $!\n";
+my $f;
+my $oldtab = ' ' x 4;
+while (<I>) {
+	if (my ($oldlen) = /\-\*\- .*tab-width:(\d)/) {
+		print "old length was $oldlen\n";
+		$oldtab = ' ' x $oldlen;
+		s/tab-width:\d/tab-width:$tablen/;
+	}
+	s/\t/$oldtab/g;
+	$f .= $_;
+}
+close I;
+
+# write the new contents next to the original, then swap it into place
+open(O, ">$fn.new") or die "can't write $fn.new: $!\n";
+print O $f;
+close O or die "error writing $fn.new: $!\n";
+
+rename("$fn.new", $fn) or die "can't rename $fn.new to $fn: $!\n";
diff --git a/branches/sage/cephmds2/script/clean_osd_cow.sh b/branches/sage/cephmds2/script/clean_osd_cow.sh
new file mode 100755
index 0000000000000..1e443c95e7ebc
--- /dev/null
+++ b/branches/sage/cephmds2/script/clean_osd_cow.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+rm osddata/*/*\.*
diff --git a/branches/sage/cephmds2/script/clean_trace.pl b/branches/sage/cephmds2/script/clean_trace.pl
new file mode 100755
index 0000000000000..cb02ff7abe7c2
--- /dev/null
+++ b/branches/sage/cephmds2/script/clean_trace.pl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl
+
+# Filter stdin / the named files: for each line containing "trace: ", print
+# whatever follows its first occurrence (newline included).  Other lines are
+# dropped.
+while (my $line = <>) {
+	print $1 if $line =~ /trace: (.*)\z/s;
+}
diff --git a/branches/sage/cephmds2/script/comb.pl b/branches/sage/cephmds2/script/comb.pl
new file mode 100755
index 0000000000000..88a4bb72a7970
--- /dev/null
+++ b/branches/sage/cephmds2/script/comb.pl
@@ -0,0 +1,113 @@
+#!/usr/bin/perl
+
+# usage: comb.pl <xaxis> <var>... - <dir>... [- <filter>...]
+#
+# Combines per-run summary files into one tab-separated table.
+#   <xaxis>   name whose numeric value is pulled out of each directory path
+#             ("<xaxis>=<digits>") and used as the row key
+#   <var>     "<what>.<field>": read <dir>/sum.<what> and take <field> from
+#             its 'avgval' row, or, for a field written "sum=<name>", take
+#             <name> from its 'sum' row
+#   <dir>     run directories (arguments that are not directories are ignored)
+#   <filter>  comma-separated tags; a directory is used only if the last
+#             component of its path, split on commas, contains every tag.
+#             With no filter, '.' is used and every directory matches.
+
+use strict;
+
+my $xaxis = shift @ARGV;
+# variables: everything up to the first '-'
+my @vars;
+while (@ARGV) {
+	$_ = shift @ARGV;
+	last if ($_ eq '-');
+	push(@vars, $_);
+}
+# directories: everything up to the next '-'
+my @dirs;
+while (@ARGV) {
+	$_ = shift @ARGV;
+	last if ($_ eq '-');
+	push(@dirs, $_) if -d $_;
+}
+# whatever is left are filters
+my @filt = @ARGV;
+push( @filt, '.' ) unless @filt;
+
+print "#xaxis $xaxis
+#vars @vars
+#dirs @dirs
+#filt @filt
+";
+
+# Load one summary file into $s->{row key}->{column name}.
+# The first line supplies the column names (its first token is discarded);
+# on each later line the first token is the row key.
+sub load_sum {
+	my $fn = shift @_;
+
+	open(I, "$fn");
+	my $k = <I>;
+	chomp($k);
+	my @k = split(/\s+/,$k);
+	shift @k;
+
+	my $s;
+	while (<I>) {
+		chomp;
+		s/^\#//;
+		next unless $_;
+		my @l = split(/\s+/,$_);
+		my $k = shift @l;
+		for my $f (@k) {
+			$s->{$k}->{$f} = shift @l;
+		}
+
+		# clnode latency?
+		# (for file names containing "cl", derive mean latencies as
+		# <x>sum / <x>num wherever the count is positive)
+		if ($fn =~ /cl/) {
+			$s->{$k}->{'wrlat'} = $s->{$k}->{'wrlsum'} / $s->{$k}->{'wrlnum'} if $s->{$k}->{'wrlnum'} > 0;
+			$s->{$k}->{'rlat'} = $s->{$k}->{'rlsum'} / $s->{$k}->{'rlnum'} if $s->{$k}->{'rlnum'} > 0;
+			$s->{$k}->{'lat'} = $s->{$k}->{'lsum'} / $s->{$k}->{'lnum'} if $s->{$k}->{'lnum'} > 0;
+			$s->{$k}->{'latw'} = $s->{$k}->{'lwsum'} / $s->{$k}->{'lwnum'} if $s->{$k}->{'lwnum'} > 0;
+			$s->{$k}->{'latr'} = $s->{$k}->{'lrsum'} / $s->{$k}->{'lrnum'} if $s->{$k}->{'lrnum'} > 0;
+			$s->{$k}->{'statlat'} = $s->{$k}->{'lstatsum'} / $s->{$k}->{'lstatnum'} if $s->{$k}->{'lstatnum'} > 0;
+			$s->{$k}->{'dirlat'} = $s->{$k}->{'ldirsum'} / $s->{$k}->{'ldirnum'} if $s->{$k}->{'ldirnum'} > 0;
+		}
+	}		
+	return $s;
+}
+
+
+my %res;     # x value -> list of collected values, in column order
+my @key;     # column headings, in first-seen order
+my %didkey;  # headings already pushed onto @key
+for my $f (@filt) {
+	my @reg = split(/,/, $f);
+	#print "reg @reg\n";
+   	for my $d (@dirs) {
+		if ($f ne '.') {
+			# keep this dir only if its name carries every tag in the filter
+			my $r = (split(/\//,$d))[-1];
+			my @db = split(/,/, $r);
+			#print "db @db\n";
+			my $ok = 1;
+			for my $r (@reg) {
+				
+				$ok = 0 unless grep {$_ eq $r} @db;
+			}
+			next unless $ok;
+		}
+		#next if ($f ne '.' && $d !~ /$reg/);			
+		#print "$d\n";
+		my ($x) = $d =~ /$xaxis=(\d+)/;
+		
+		for my $v (@vars) {
+			# split "what.field" at the last dot
+			my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/;
+			#print "$what $field .. $v  .. $f.$field\n";
+			my $s = &load_sum("$d/sum.$what");
+			
+			#print "\t$v";
+			if ($field =~ /^sum=/) {
+				#warn "SUM field $field\n";
+				# $' is the text after "sum=", i.e. the column name
+				push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'});
+			} else {
+				#warn "avg field $field\n";
+				push( @{$res{$x}}, $s->{'avgval'}->{$field} );
+			}
+
+			push( @key, "$f.$field" ) unless $didkey{"$f.$field"};
+			$didkey{"$f.$field"} = 1;
+
+			# disabled (the leading "0 &&"): would add a deviation column
+			if (0 && exists $s->{'avgvaldevt'}) {
+				push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} );
+				push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"};
+				$didkey{"$f.$field.dev"} = 1;
+			}
+		}
+	}
+}
+
+# header row, then one row per x value in numeric order
+print join("\t", "#", @key) . "\n";
+for my $x (sort {$a <=> $b} keys %res) {
+	print join("\t", $x, @{$res{$x}}) . "\n";
+}
diff --git a/branches/sage/cephmds2/script/find_auth_pins.pl b/branches/sage/cephmds2/script/find_auth_pins.pl
new file mode 100755
index 0000000000000..c02c12922ed7b
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_auth_pins.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+
+# Scan a log (stdin or named files) for auth_pin / auth_unpin lines and
+# report every object whose pin count has not returned to zero by the end,
+# together with the numbered log lines that touched it.
+
+my %pin;      # object -> current pin count
+my %hist;     # object -> accumulated "lineno: line" history
+my $l = 1;    # current input line number
+my @pins;     # objects currently pinned, in first-pinned order
+while (<>) {
+
+	#cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1
+
+	# nested adjustments are only recorded for objects we are already tracking
+	if (/adjust_nested_auth_pins/) {
+		my ($what) = /\[(\w+ \d+) /;
+		$hist{$what} .= "$l: $_"
+			if defined $pin{$what};
+	}
+
+	# cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0
+
+	if (/auth_pin /) {
+		my ($what) = /\[(\w+ \d+) /;
+#		print "add_waiter $c $what\n";
+		$pin{$what}++;
+		$hist{$what} .= "$l: $_";
+		push( @pins, $what ) unless grep {$_ eq $what} @pins;
+	}
+
+	# cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0
+
+	if (/auth_unpin/) {
+		my ($what) = /\[(\w+ \d+) /;# / on (.*\])/;
+		$pin{$what}--;
+		$hist{$what} .= "$l: $_";
+		# count back at zero: forget the object and its history
+		unless ($pin{$what}) {
+			delete $hist{$what};
+			delete $pin{$what};
+			@pins = grep {$_ ne $what} @pins;
+		}
+	}
+	$l++;
+}
+
+# whatever is left was pinned and never fully unpinned
+for my $what (@pins) {
+	print "---- count $pin{$what} on $what
+$hist{$what}
+";
+}
diff --git a/branches/sage/cephmds2/script/find_bufferleaks.pl b/branches/sage/cephmds2/script/find_bufferleaks.pl
new file mode 100755
index 0000000000000..152515d5e788e
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_bufferleaks.pl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl
+
+# Scan a buffer debug log (stdin or named files) and report buffers,
+# bufferlists and malloc'ed data pointers that were created but never
+# destroyed.  Each tracked line starts with an event tag ("buffer.cons",
+# "buffer.des", ...) and carries the object's 0x... address.
+
+use strict;
+my %buffers;      # live buffers
+my %bufferlists;  # live bufferlists
+my %ref;          # buffer -> reference count (get minus put)
+my %mal;          # live malloc'ed data pointers
+my $l = 1;        # current input line number
+while (<>) {
+	#print "$l: $_";
+
+	if (/^buffer\.cons /) {
+		my ($x) = /(0x\S+)/;
+		$buffers{$x} = 1;
+	}
+	if (/^buffer\.des /) {
+		my ($x) = /(0x\S+)/;
+		die "des without cons at $l: $_" unless $buffers{$x};
+		delete $buffers{$x};
+		die "des with ref>0 at $l: $_" unless $ref{$x} == 0;
+		delete $ref{$x};
+	}
+
+	if (/^bufferlist\.cons /) {
+		my ($x) = /(0x\S+)/;
+		$bufferlists{$x} = 1;
+	}
+	if (/^bufferlist\.des /) {
+		my ($x) = /(0x\S+)/;
+		warn "des without cons at $l: $_" unless $bufferlists{$x};
+		delete $bufferlists{$x};
+	}
+
+
+	if (/^buffer\.malloc /) {
+		my ($x) = /(0x\S+)/;
+		$mal{$x} = 1;
+	}
+	if (/^buffer\.free /) {
+		my ($x) = /(0x\S+)/;
+		die "free without malloc at $l: $_" unless $mal{$x};
+		delete $mal{$x};
+	}
+
+	if (/^buffer\.get /) {
+		my ($x) = /(0x\S+)/;
+		$ref{$x}++;
+	}
+	# The decrement used to test for "buffer.get" a second time, so every get
+	# cancelled itself and the count never left zero.
+	# NOTE(review): assumes reference drops are logged as "buffer.put" --
+	# confirm against the buffer debug output.
+	if (/^buffer\.put /) {
+		my ($x) = /(0x\S+)/;
+		$ref{$x}--;
+	}
+
+$l++;
+}
+
+for my $x (keys %bufferlists) {
+	print "leaked bufferlist $x\n";
+}
+
+for my $x (keys %buffers) {
+	print "leaked buffer $x ref $ref{$x}\n";
+}
+
+for my $x (keys %mal) {
+	print "leaked buffer dataptr $x ref $ref{$x}\n";
+}
diff --git a/branches/sage/cephmds2/script/find_lost_bdev_ops.pl b/branches/sage/cephmds2/script/find_lost_bdev_ops.pl
new file mode 100755
index 0000000000000..ac1793b42dfac
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_lost_bdev_ops.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+
+# Scan a block-device log (stdin or named files) and report every bio that
+# was submitted but neither finished nor successfully cancelled.  Bios are
+# identified by the 0x... address that ends the "bio(...)" description.
+
+use strict;
+my %op;   # bio address -> line number of its submission
+
+my $line = 0;
+while (<>) {
+	#print $line . $_ if /0x8d4f6a0/;
+	chomp;
+	$line++;
+
+	#bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0)
+	if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) {
+		$op{$bio} = $line;
+	}
+
+	# cancel
+	#bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8)
+	# Capture first, then test for FAILED.  The old single expression
+	# "my ($bio) = /.../ && !(/FAILED/)" parsed as an assignment of the
+	# boolean "match && !FAILED", so $bio never held the address and
+	# cancelled bios were never removed.
+	if (my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/) {
+		delete $op{$bio} unless /FAILED/;
+	}
+	
+	# finish
+	#bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8)
+	if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) {
+		delete $op{$bio};
+	}
+	
+}
+
+for my $bio (keys %op) {
+	print "---- lost bio $bio\n";
+}
diff --git a/branches/sage/cephmds2/script/find_lost_commit.pl b/branches/sage/cephmds2/script/find_lost_commit.pl
new file mode 100755
index 0000000000000..73934248ad5c0
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_lost_commit.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl
+
+# Scan an OSD log (stdin or named files) and report ops that were started
+# (do_op) but for which no commit or forward was ever logged.  Ops are
+# identified by the last word inside the "MOSDOp(...)" description.
+
+use strict;
+my %op;   # op id -> who it came from
+
+my $line = 0;
+while (<>) {
+	#print "$line: $_";
+	$line++;
+
+	#osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]]
+	# NOTE(review): the pattern requires "op <n>" before "oid", which the
+	# sample line above does not contain -- confirm the current log format.
+	if (my ($from, $opno, $oid, $op) = /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) {
+#		print "$op\n";
+		# only these op numbers are tracked (presumably the modifying ops --
+		# verify against the OSD_OP_* definitions)
+		if ($opno == 2 || $opno == 11 || $opno == 12 || $opno == 14 || $opno == 15) {
+			$op{$op} = $from;
+		}
+	}
+
+	# commits
+	#osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788)
+	if (my ($op) = /op_modify_commit.* (\w+)\)/) {
+		delete $op{$op};
+	}
+	#osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980)
+	if (my ($op) = /rep_modify_commit.* (\w+)\)/) {
+		delete $op{$op};
+	}
+
+	# forwarded?
+	if (my ($op) = /sending (\w+) to osd/) {
+		delete $op{$op};
+	}
+
+}
+
+# anything still here never saw a commit
+for my $op (keys %op) {
+	print "---- lost op $op $op{$op}\n";
+}
diff --git a/branches/sage/cephmds2/script/find_lost_objecter.pl b/branches/sage/cephmds2/script/find_lost_objecter.pl
new file mode 100755
index 0000000000000..a0c2089140e23
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_lost_objecter.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+# Report objecter writes (keyed "who.tid") that never received an ack and/or a commit reply.
+use strict;
+my %ack;      # "who.tid" => submit line number, until any write reply arrives
+my %commit;   # "who.tid" => submit line number, until a reply with a nonzero commit digit arrives
+
+my $line = 0;
+while (<>) {
+	#print "$line: $_";
+	$line++;
+
+	#client0.objecter writex_submit tid 21 osd0  oid 100000000000001 851424~100000
+	if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) {
+#		print "$who.$tid\n";
+		$ack{"$who.$tid"} = $line;
+		$commit{"$who.$tid"} = $line;
+	}
+
+	#client1.objecter handle_osd_write_reply 304 commit 0
+	#client1.objecter handle_osd_write_reply 777 commit 1
+	if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) {
+#		print "$who.$tid\n";
+		delete $ack{"$who.$tid"};
+		delete $commit{"$who.$tid"} if $commit;
+	}
+
+}
+
+for my $op (keys %commit) {
+	print "---- lost commit $op $commit{$op}\n";
+}
+for my $op (keys %ack) {
+	print "---- lost ack $op $ack{$op}\n";
+}
diff --git a/branches/sage/cephmds2/script/find_pathpins.pl b/branches/sage/cephmds2/script/find_pathpins.pl
new file mode 100755
index 0000000000000..e4a7d81dfb7b7
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_pathpins.pl
@@ -0,0 +1,41 @@
+#!/usr/bin/perl
+# Report dentries whose path pins are still held when the log ends.
+my %held;     # "dname dir" => number of pins not yet released
+my %trail;    # "dname dir" => pin/unpin log lines seen while held
+my @order;    # held keys, in the order they were first pinned
+my $lineno = 1;
+while (<>) {
+
+	# pin: count it, keep the line, and note the key the first time it shows up
+	if (/path_pinned /) {
+		my ($name, $dirino) = /\[dentry (\S+) .* in \[dir (\d+) /;
+		my $key = "$name $dirino";
+		#print "$lineno pin $key\n";
+		$trail{$key} .= "$lineno: $_";
+		$held{$key} += 1;
+		push @order, $key
+			unless grep { $_ eq $key } @order;
+	}
+
+	# unpin: once the count is back to zero the key is forgotten entirely
+	if (/path_unpinned/) {
+		my ($name, $dirino) = /\[dentry (\S+) .* in \[dir (\d+) /;
+		my $key = "$name $dirino";
+		#print "$lineno unpin $key\n";
+		if (--$held{$key}) {
+			$trail{$key} .= "$lineno: $_";
+		} else {
+			@order = grep { $_ ne $key } @order;
+			delete $held{$key};
+			delete $trail{$key};
+		}
+	}
+
+	$lineno++;
+}
+
+for my $key (@order) {
+	print "---- count $held{$key} on $key
+$trail{$key}
+";
+}
diff --git a/branches/sage/cephmds2/script/find_requests.pl b/branches/sage/cephmds2/script/find_requests.pl
new file mode 100755
index 0000000000000..5144896249413
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_requests.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl
+# Report requests that were started (request_start) but never finished or forwarded.
+my %waiting;  # request pointer => rest of its first request_start line (the text after the pointer)
+my %hist;     # request pointer => request_start lines logged while it is outstanding
+my @waiting;  # outstanding request pointers, in the order they were first seen
+
+my $line = 0;
+while (<>) {
+
+	#print $line . $_ if /0x8d4f6a0/;
+	$line++;
+	if (/request_start/) {
+		my ($c) = /(0x\w+)/;
+		my ($what) = $'; #'; $' holds whatever followed the pointer matched on the previous line
+		chomp $what;
+		#print "$line add_waiter $c $what\n" if /0x8d4f6a0/;
+		# keep only the first non-empty description for a pointer
+		$waiting{$c} = $what
+			if $what && !$waiting{$c};
+		$hist{$c} .= "$line: $_";
+		unless (grep {$_ eq $c} @waiting) {
+			push( @waiting, $c );
+		}
+	}
+	#if (/finish_waiting/) {
+	#	my ($c) = /(0x\w+)/;
+	#	$hist{$c} .= "$line: $_";
+	#}
+	# a finish or a forward both mean the request is no longer outstanding
+	if (/request_finish/ ||
+		/request_forward/) {
+		my ($c) = /(0x\w+)/;
+		#print "took\n" if /0x8d4f6a0/;
+		delete $waiting{$c};
+		delete $hist{$c};
+		@waiting = grep {$_ ne $c} @waiting;
+	}
+}
+
+for my $c (@waiting) {
+	print "---- lost request $c $waiting{$c}
+$hist{$c}
+";
+}
diff --git a/branches/sage/cephmds2/script/find_waiters.pl b/branches/sage/cephmds2/script/find_waiters.pl
new file mode 100755
index 0000000000000..c89d2b1a49db7
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_waiters.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+# Report waiter contexts that were added (add_waiter) but never taken (take_waiting ... took).
+my %target;   # context pointer => what it waits on, from the first add_waiter that names it
+my %trail;    # context pointer => log lines recorded since it started waiting
+my @order;    # context pointers still waiting, oldest first
+
+my $lineno = 0;
+while (<>) {
+	#print $lineno . $_ if /0x8d4f6a0/;
+	$lineno++;
+
+	if (/add_waiter/) {
+		my ($ctx) = /(0x\w+)/;
+		my ($on)  = / on (.*\])/;
+		#print "$lineno add_waiter $ctx $on\n" if /0x8d4f6a0/;
+		$target{$ctx} ||= $on if $on;
+		$trail{$ctx} .= "$lineno: $_";
+		push @order, $ctx
+			unless grep { $_ eq $ctx } @order;
+	}
+	#if (/finish_waiting/) {
+	#	my ($ctx) = /(0x\w+)/;
+	#	$trail{$ctx} .= "$lineno: $_";
+	#}
+
+	next unless /take_waiting/;
+	my ($ctx) = /(0x\w+)/;
+	# anything that is neither a skip nor a take is an unknown log format
+	die "i don't understand: $_"
+		unless /SKIPPING/ || /took/;
+	if (/SKIPPING/) {
+		#print "skipping\n" if /0x8d4f6a0/;
+		$trail{$ctx} .= "$lineno: $_";
+		next;
+	}
+	#print "took\n" if /0x8d4f6a0/;
+	delete $target{$ctx};
+	delete $trail{$ctx};
+	@order = grep { $_ ne $ctx } @order;
+}
+
+for my $ctx (@order) {
+	print "---- lost waiter $ctx $target{$ctx}
+$trail{$ctx}
+";
+}
diff --git a/branches/sage/cephmds2/script/grepblock b/branches/sage/cephmds2/script/grepblock
new file mode 100755
index 0000000000000..f5acf95732abb
--- /dev/null
+++ b/branches/sage/cephmds2/script/grepblock
@@ -0,0 +1,15 @@
+#!/usr/bin/perl
+# usage: grepblock BLOCK [file...]  -- print input lines holding a start~length extent that covers BLOCK
+use strict;
+
+my $block = shift @ARGV;   # was "shift ARGV", which does not name an array
+die unless int $block;
+
+while (<>) {
+	my $yes = 0;
+	for my $x (/(\d+\~\d+)/g) {    # /g: test every extent on the line, not only the first
+		my ($s,$l) = split(/\~/,$x);
+		$yes = 1 if ($block >= $s && $block < $s+$l);
+	}	
+	print if $yes;
+}
diff --git a/branches/sage/cephmds2/script/merge_trace_rw.pl b/branches/sage/cephmds2/script/merge_trace_rw.pl
new file mode 100644
index 0000000000000..378d629ef43f6
--- /dev/null
+++ b/branches/sage/cephmds2/script/merge_trace_rw.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl
+# Rewrite a trace so that runs of adjoining read (or write) ops become one larger op.
+use strict;
+
+my @file = <>;
+sub get_op {   # take the next op off @file: its name line plus the argument lines after it
+	my @op = shift @file;
+	while (@file && 
+		   $file[0] !~ /^[a-z]+$/) {   # a line that is not a bare lowercase word is an argument
+		push( @op, shift @file );
+	}
+	#print "op = ( @op )\n";
+	return @op;
+}
+
+my $n = 0;   # number of merges performed (only used by the commented-out debug print)
+while (@file) {
+	my ($op, @args) = &get_op;
+	while ($op eq "read\n" ||
+		   $op eq "write\n") {
+		die unless scalar(@args) == 3;
+		my ($nop, @nargs) = &get_op;
+		# merge when the next op has the same name and first argument, and its third
+		# argument equals this op's third + second (presumably handle, length, offset -- TODO confirm)
+		if ($nop eq $op 
+			&& ($args[0] == $nargs[0] )
+			&& ($args[2] + $args[1] == $nargs[2])
+			) {
+			die unless scalar(@nargs) == 3;
+			$args[1] += $nargs[1];
+			$args[1] .= "\n";   # the numeric add dropped the newline; put it back
+			die unless scalar(@args) == 3;
+			#print STDOUT "combining $n $op @args\n";
+			$n++;
+		} else {
+#			print STDERR "not combinging\n";
+			unshift( @file, $nop, @nargs );   # not mergeable: push the op back for the outer loop
+			die unless scalar(@args) == 3;
+			last;
+		}
+	}
+	print $op;
+	print join('', @args);
+}
diff --git a/branches/sage/cephmds2/script/profonly.pl b/branches/sage/cephmds2/script/profonly.pl
new file mode 100755
index 0000000000000..6a05dec473ca0
--- /dev/null
+++ b/branches/sage/cephmds2/script/profonly.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl
+# usage: profonly.pl RANK ARGS...
+# Runs ./newsyn with the gprof helper preloaded in the process whose
+# MPD_JRANK equals RANK, and plain ./newsyn.nopg in every other process.
+my ($rank, @rest) = @ARGV;
+my $args = join(' ', @rest);
+my $profiled = ($rank == $ENV{MPD_JRANK});
+$c = $profiled
+	? "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"
+	: "./newsyn.nopg $args";
+#print "$rank: $c\n";
+system $c;
diff --git a/branches/sage/cephmds2/script/runset.pl b/branches/sage/cephmds2/script/runset.pl
new file mode 100755
index 0000000000000..a1425862ceb42
--- /dev/null
+++ b/branches/sage/cephmds2/script/runset.pl
@@ -0,0 +1,380 @@
+#!/usr/bin/perl
+
+use strict;
+use Data::Dumper;
+
+=item sample input file
+
+# hi there
+{
+	# startup
+	'n' => 30,          # mpi nodes
+	'sleep' => 10,      # seconds between runs
+	'nummds' => 1,
+	'numosd' => 8,
+	'numclient' => 400,#[10, 50, 100, 200, 400],
+
+	# parameters
+	'fs' => [ 'ebofs', 'fakestore' ],
+	'until' => 150,     # --syn until $n    ... when to stop clients
+	'writefile' => 1,
+	'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ],
+	'writefile_mb' => 1000,
+
+	'custom' => '--tcp_skip_rank0 --osd_maxthreads 0',
+
+	# for final summation (script/sum.pl)
+	'start' => 30,
+	'end' => 120,
+
+	'_psub' => 'alc.tp'   # switch to psub mode!
+};
+
+=cut
+
+my $usage = "script/runset.pl [--clean] [--srun] [--nobg] jobs/some/job blah [fake]\n";
+
+# Command line: optional flags, then the job file (must live under jobs/),
+# a tag that names the output directory, and an optional third argument.
+# Flags may now appear in any order; the old fixed order still works.
+my $clean;          # --clean: remove an existing per-run directory before running
+my $use_srun;       # --srun: launch with srun instead of mpiexec
+my $nobg = '&';     # --nobg: clears the '&' printed after each command
+my $in = shift || die $usage;
+while ($in =~ /^--/) {
+	if    ($in eq '--clean') { $clean = 1; }
+	elsif ($in eq '--srun')  { $use_srun = 1; }
+	elsif ($in eq '--nobg')  { $nobg = ''; }
+	else                     { die "unknown option $in\n$usage"; }
+	$in = shift || die $usage;
+}
+my $tag = shift || die $usage;
+# Any true third argument makes the non-psub path print each command
+# without launching it.
+my $fake = shift;
+
+
+my ($job) = $in =~ /^jobs\/(.*)/;       # job path relative to jobs/
+my ($jname) = $job =~ /\/(\w+)$/;       # last path component, when there is one
+$jname ||= $job;
+die "not jobs/?" unless defined $job;
+my $out = "log/$job.$tag";              # output directory for this job + tag
+my $relout = "$job.$tag";               # same, without the log/ prefix (passed as --log_name)
+
+
+my $cwd = `/bin/pwd`;
+chomp($cwd);
+
+
+
+print "# --- job $job, tag $tag ---\n";
+
+
+# get input
+# NOTE(review): the job file is executed with eval, so it must be trusted.
+my $raw = `cat $in`;
+my $sim = eval $raw;
+unless (ref $sim) {
+	print "bad input: $in\n";
+	system "perl -c $in";               # let perl print the syntax error
+	exit 1;
+}
+
+# prep output
+system "mkdir -p $out" unless -d "$out";
+# keep a copy of the job description next to the results
+open(W, ">$out/in");
+print W $raw;
+close W;
+
+my $comb = $sim->{'comb'};              # optional settings for the combine/plot step at the end
+delete $sim->{'comb'};
+my %filters;                            # filled by run(): one key per distinct non-x parameter combination
+my @fulldirs;                           # filled by run(): every per-run output directory
+
+
+
+sub reset {   # kill leftover newsyn processes and restart mpd; its only call in run() is commented out
+	print "reset: restarting mpd in 3 seconds\n";
+	system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)";
+	print "reset: done\n";
+}
+
+
+if (`hostname` =~ /alc/ && !$use_srun) {   # on hosts named *alc*, force psub mode unless --srun was given
+	print "# this looks like alc\n";
+	$sim->{'_psub'} = 'jobs/alc.tp';
+}
+
+
+sub iterate {   # iterate($sim, $fix): expand $sim into a list of hashes, one per parameter combination
+	my $sim = shift @_;
+	my $fix = shift @_ || {};   # values already chosen by the enclosing calls
+	my $vary;                   # first key (sorted) that still needs to be varied
+	my @r;
+
+	my $this;
+	for my $k (sort keys %$sim) {
+		next if $k =~ /^_/;     # keys starting with _ are control settings, not parameters
+		if (defined $fix->{$k}) {
+			$this->{$k} = $fix->{$k};
+		}
+		elsif (ref $sim->{$k} eq 'HASH') {
+			# nothing
+		}
+		elsif (!(ref $sim->{$k})) {
+			$this->{$k} = $sim->{$k};   # plain scalar: copied as is
+		}
+		else {
+			#print ref $sim->{$k};
+			if (!(defined $vary)) {
+				$vary = $k;
+			}
+		}
+	}
+
+	if ($vary) {
+		#print "vary $vary\n";
+		# recurse once per value; the recursion sees $vary as fixed and moves on to the next key
+		for my $v (@{$sim->{$vary}}) {
+			$this->{$vary} = $v;
+			push(@r, &iterate($sim, $this));
+		}
+	} else {
+		# nothing left to vary: fill in derived values, then emit this combination
+		if ($sim->{'_dep'}) {
+			my @s = @{$sim->{'_dep'}};   # flat list of (name, expression) pairs
+			while (@s) {
+				my $dv = shift @s;
+				my $eq = shift @s;
+				# each $name in the expression becomes "$this->{'name'}" before the eval
+				$eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg;
+				$this->{$dv} = eval $eq;
+				#print "$dv : $eq -> $this->{$dv}\n";
+			}
+		}
+
+		push(@r, $this);
+	}
+	return @r;
+}
+
+
+
+sub run {   # run($h): build and launch (or, in psub mode, write out) one run for parameter hash $h
+	my $h = shift @_;
+	# Returns undef when the run was already done, was only written as a psub file, or $fake is set;
+	# otherwise returns the status of the launch command.
+	my @fn;     # "key=value" for every varied key; names the run directory
+	my @filt;   # same, minus the comb x variable
+	my @vals;   # just the values; used in the psub job name
+	for my $k (sort keys %$sim) {
+		next if $k =~ /^_/;
+		next unless ref $sim->{$k} eq 'ARRAY';
+		push(@fn, "$k=$h->{$k}");
+		push(@vals, $h->{$k});
+		next if $comb && $k eq $comb->{'x'};
+		push(@filt, "$k=$h->{$k}");
+	}
+	my $keys = join(",", @fn);
+	$keys =~ s/ /_/g;
+	my $fn = $out . '/' . $keys;
+	my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys;
+
+	push( @fulldirs, "" . $fn );
+
+	
+	# filters
+	$filters{ join(',', @filt) } = 1;
+
+
+	#system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post");
+	if (-e "$fn/.done") {
+		print "already done.\n";
+		return;
+	}
+	system "rm -r $fn" if $clean && -d "$fn";
+	system "mkdir $fn" unless -d "$fn";
+
+	my $e = './newsyn';
+	#$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs';
+	my $c = "$e";
+	$c .= " --mkfs" unless $h->{'no_mkfs'};
+	$c .= " --$h->{'fs'}";
+	$c .= " --syn until $h->{'until'}" if $h->{'until'};
+
+	$c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'};
+	$c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'};
+	$c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'};
+	$c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'};
+
+	if ($h->{'ebofs_freelist'}) {
+		system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist";
+		$c .= " --osd_age_time -1";
+	}
+	# each of these keys, when defined, is passed through as "--key value"
+	for my $k ('nummds', 'numclient', 'numosd', 'kill_after',
+			   'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits',
+			   'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife',
+			   'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr',
+			   'mds_cache_size','mds_log_max_len',
+			   'mds_local_osd',
+			   'osd_age_time','osd_age',
+			   'osd_rep',
+			   'osd_pad_pg_log','ebofs_realloc',
+			   'osd_balance_reads',
+			   'tcp_multi_out',
+			   'client_cache_stat_ttl','client_cache_readdir_ttl',
+			   'client_oc',
+			   'fake_osdmap_updates',
+			   'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms', 
+			   'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc',
+			   'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep',
+			   'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep',
+			   'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') {
+		$c .= " --$k $h->{$k}" if defined $h->{$k};
+	}
+
+	$c .= ' ' . $h->{'custom'} if $h->{'custom'};
+
+	$c .= " --log_name $relout/$keys";
+	# sh.post: summarises this run's logs with script/sum.pl
+	my $post = "#!/bin/sh
+script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd
+script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds
+script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log
+script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl
+touch $fn/.post
+";
+	open(O,">$fn/sh.post");
+	print O $post;
+	close O;
+
+	my $killmin = 1 + int ($h->{'kill_after'} / 60);   # kill_after (seconds) rounded up to minutes, for srun -t
+	
+	$c = "bash -c \"ulimit -c 0 ; $c\"";
+	#$c = "bash -c \"$c\"";
+
+	my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest";
+	my $mpiexec = "mpiexec -l -n $h->{'n'}";
+	my $launch;
+	if ($use_srun)  {
+		$launch = $srun;
+	} else {
+		$launch = $mpiexec;
+	}
+	
+	if ($sim->{'_psub'}) {
+		# template!
+		my $tp = `cat $sim->{'_psub'}`;
+		$tp =~ s/\$CWD/$cwd/g;
+		$tp =~ s/\$NAME/$name/g;
+		$tp =~ s/\$NUM/$h->{'n'}/g;
+		$tp =~ s/\$OUT/$fn\/o/g;
+		$tp =~ s/\$DONE/$fn\/.done/g;
+		$tp =~ s/\$CMD/$c/g;
+		open(O,">$out/$name");
+		print O $tp;
+		close O;
+		print "\npsub $out/$name\n";
+		return;
+	} else {
+		# run
+		my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";#
+		#my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done";
+		print "$cmd $nobg\n";
+		my $r = undef;
+		unless ($fake) {
+			if ($sim->{'_pre'}) {
+				print "pre: $launch $sim->{'_pre'}\n";
+				system "$launch $sim->{'_pre'}";
+			}
+			$r = system $cmd;
+			if ($sim->{'_post'}) {
+				print "post: $launch $sim->{'_post'}\n";
+				system "$launch $sim->{'_post'}";
+			}
+			if ($r) {
+				print "r = $r\n";
+				#&reset;
+			}
+			system "sh $fn/sh.post";
+		}
+		return $r;
+	}
+}
+
+
+
+my @r = &iterate($sim);     # every parameter combination to run
+my $n = scalar(@r);
+my $c = 1;                  # 1-based index of the current run, for the progress line
+my %r;
+my $nfailed = 0;
+for my $h (@r) {
+	my $d = `date`;
+	chomp($d);
+	$d =~ s/ P.T .*//;      # cut the date output at the " P?T " time zone, when present
+	print "# === $c/$n";
+	print " ($nfailed failed)" if $nfailed;
+	print " $d: ";
+	my $r = &run($h);
+
+	if (!(defined $r)) {
+		# already done
+	} else {
+		if ($r) {
+			$nfailed++;
+		}
+		print "sleep $h->{'sleep'}\n";
+		sleep $h->{'sleep'};
+	}
+
+	$c++;
+}
+print "$nfailed failed\n";
+
+
+my @comb;
+if ($comb) {   # optional post-processing: combine all runs with script/comb.pl and emit a plot script
+	my $x = $comb->{'x'};
+	my @vars = @{$comb->{'vars'}};
+
+	print "\n\n# post\n";
+	for my $p (@fulldirs) {
+		print "sh $p/sh.post\n";
+	}
+
+	my @filters = sort keys %filters;
+	my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c";
+	print "$cmd\n";
+	open(O,">$out/comb");
+	print O "$cmd\n";
+	close O;
+	system $cmd;
+
+	print "\n\n";
+
+	my $plot;
+	$plot .= "set data style linespoints;\n";
+	my $s = 2;              # column of $out/c used for the first variable with the first filter
+	for my $v (@vars) {
+		my $c = $s;
+		$s++;
+		my @p;
+		for my $f (@filters) {
+			my $t = $f;     # plot title: the filter text, after the optional maptitle substitutions
+			if ($comb->{'maptitle'}) {
+				for my $a (keys %{$comb->{'maptitle'}}) {
+					my $b = $comb->{'maptitle'}->{$a};
+					$t =~ s/$a/$b/;
+				}
+			}
+			push (@p, "\"$out/c\" u 1:$c t \"$t\"" );
+			$c += scalar(@vars);   # the same variable for the next filter is scalar(@vars) columns on
+		}
+		$plot .= "# $v\nplot " . join(", ", @p) . ";\n\n";
+	}
+	print $plot;
+	open(O,">$out/plot");
+	print O $plot;
+	close O;
+}
+
diff --git a/branches/sage/cephmds2/script/sum.pl b/branches/sage/cephmds2/script/sum.pl
new file mode 100755
index 0000000000000..92ef9a9b222a8
--- /dev/null
+++ b/branches/sage/cephmds2/script/sum.pl
@@ -0,0 +1,148 @@
+#!/usr/bin/perl
+# usage: sum.pl [-avg] [-start T] [-end T] files...   -- sum columns across files, per timestamp
+use strict;
+my $starttime = 1;    # rows with a time below this are skipped
+my $endtime = -1;     # rows with a time above this are skipped; values <= 0 mean no upper limit
+
+my $avgrows = 0;      # -avg: print per-timestamp averages instead of sums
+
+while ($ARGV[0] =~ /^-/) {
+	$_ = shift @ARGV;
+	if ($_ eq '-avg') {
+		$avgrows = 1;
+	}
+	elsif ($_ eq '-start') {
+		$starttime = shift @ARGV;
+	}
+	elsif ($_ eq '-end') {
+		$endtime = shift @ARGV;
+	}
+	else {
+		die "i don't understand arg $_";
+	}
+}
+my @files = @ARGV;
+# A single argument containing '*' is expanded here: every entry of its directory whose name starts with the text left after removing the first '*'.
+if (scalar(@files) == 1 && $files[0] =~ /\*/) {
+	my ($dir, $pat) = $files[0] =~ /^(.*)\/([^\/]+)$/;
+	@files = ();
+	$pat =~ s/\*//;
+#	print "dir $dir pat $pat\n";
+	opendir(D,"$dir");
+	for my $f (readdir(D)) {
+	#	print "$f\n";
+		next unless $f =~ /^$pat/;
+		push(@files, "$dir/$f");
+	}
+	closedir(D);
+	
+#	print "files = @files\n";
+}
+
+my @data;   # all lines of all input files, concatenated
+for my $f (@files) {
+	open(I,$f);           # a file that cannot be opened simply contributes no lines
+	push( @data, <I> );
+	close I;
+}
+
+my %sum;  # time -> name -> val
+my %col;  # colnum -> name   .. colnums start at 0 (time doesn't count)
+my %min;  # name -> smallest value seen
+my %max;  # name -> largest value seen
+my %avg;  # name -> running total over all accepted rows (divided later to get averages)
+my %tcount;  # time -> number of rows seen with that time
+my $files;   # largest %tcount value, used as the number of input files
+for (@data) {
+	chomp;
+	my @r = split(/\s+/,$_);
+	my $r = shift @r;   # first field: the time, or '#' on a heading line
+	
+	# column headings?
+	if ($r =~ /^\#/) {
+		my $num = 0;
+		while (my $name = shift @r) {
+			$col{$num} = $name;
+			$num++;
+		}
+		next;
+	}
+
+	next unless int $r;   # skips non-numeric lines, and also time 0
+	next if $r < $starttime;
+	next if $endtime > 0 && $r > $endtime;
+
+	$tcount{$r}++;
+	$files = $tcount{$r} if $tcount{$r} > $files;
+	#print "$r: @r\n";
+	my $i = 0;
+	while (@r) {
+		my $v = shift @r;
+		$sum{$r}->{$col{$i}} += $v; # if $v > 0;
+
+		$min{$col{$i}} = $v
+			if ($min{$col{$i}} > $v || !(defined $min{$col{$i}}));
+		$max{$col{$i}} = $v   # defined-check added: an all-negative column used to leave max unset
+			if ($max{$col{$i}} < $v || !(defined $max{$col{$i}}));
+
+		$avg{$col{$i}} += $v;
+		$i++;
+	}
+}
+
+## dump
+my @c = sort {$a <=> $b} keys %col;
+# cols
+print join("\t",'#', map { $col{$_} } @c) . "\n";
+my $n = 0;   # number of distinct timestamps printed
+for my $k (sort {$a <=> $b} keys %sum) {
+	if ($avgrows) {
+		print join("\t",$k, #map int, 
+				   map { $sum{$k}->{$col{$_}}/$tcount{$k} } @c ) . "\n";
+	} else {
+		print join("\t",$k, map { $sum{$k}->{$col{$_}} } @c ) . "\n";
+	}
+	$n++;
+}
+
+my $rows = $n || 1;   # never 0, so the divisions by $rows below are safe
+#my $files = $tcount{$starttime};
+my %avgval;           # name -> total / (rows * files)
+
+## devt
+#warn "rows $rows, files $files\n";
+my %avgvalvart;  # std dev of each col avg, over time
+for my $k (keys %avg) {
+	my $av = $avgval{$k} = $avg{$k} / ($rows*$files);
+
+	my $var = 0.0;
+	for my $t (sort {$a <=> $b} keys %sum) {
+		my $a = $sum{$t}->{$k} / $files;   # per-file average at time $t
+		$var += ($a - $av) * ($a - $av);
+	}
+	
+	$avgvalvart{$k} = $var / $rows;        # this is the variance; the square root is taken when printing
+}
+
+
+
+# summary block: one '#'-prefixed line per statistic, columns in the same order as above
+print "\n";
+print join("\t",'#', map { $col{$_} } @c) . "\n";
+print join("\t", '#minval', map { $min{$col{$_}} } @c ) . "\n";
+print join("\t", '#maxval', map { $max{$col{$_}} } @c ) . "\n";
+print join("\t", '#rows', map { $rows } @c) . "\n";
+print join("\t", '#files', map { $files } @c) . "\n";
+print join("\t", '#sum', 
+		   map { $avg{$col{$_}} } @c ) . "\n";
+print join("\t", '#avgval', #map int, 
+		   map { $avgval{$col{$_}} } @c ) . "\n";
+#		   map { ($rows*$files) ? ($_ / ($rows*$files)):0 } map { $avg{$col{$_}} } @c ) . "\n";
+
+print join("\t", '#avgvalvart',
+		   map { $avgvalvart{$col{$_}} } @c ) . "\n";
+print join("\t", '#avgvaldevt',
+		   map { sqrt($_) } map { $avgvalvart{$col{$_}} } @c ) . "\n";
+
+print join("\t", '#avgsum', #map int, 
+		   map { $_ / $rows } map { $avg{$col{$_}} } @c ) . "\n";
diff --git a/branches/sage/cephmds2/tcpfuse.cc b/branches/sage/cephmds2/tcpfuse.cc
new file mode 100644
index 0000000000000..3d7be50d377d6
--- /dev/null
+++ b/branches/sage/cephmds2/tcpfuse.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDCluster.h"
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "client/Client.h"
+#include "client/fuse.h"
+
+#include "msg/TCPMessenger.h"
+
+#include "common/Timer.h"
+       
+#include <envz.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+int main(int argc, char **argv, char *envp[]) {
+  // Entry point of tcpfuse: creates one Client on a TCPMessenger, mounts it, and runs FUSE on it.
+  //cerr << "tcpfuse starting " << myrank << "/" << world << endl;
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  // args for fuse
+  vec_to_argv(args, argc, argv);
+
+  // start up tcpmessenger
+  tcpaddr_t nsa;
+  if (tcpmessenger_findns(nsa) < 0) exit(1);   // give up when tcpmessenger_findns reports failure
+  tcpmessenger_init();
+  tcpmessenger_start();
+  tcpmessenger_start_rankserver(nsa);
+  
+  Client *client = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW));
+  client->init();
+    
+  // start up fuse
+  // use my argc, argv (make sure you pass a mount point!)
+  cout << "mounting" << endl;
+  client->mount();
+  
+  cerr << "starting fuse on pid " << getpid() << endl;
+  ceph_fuse_main(client, argc, argv);
+  cerr << "fuse finished on pid " << getpid() << endl;
+  
+  client->unmount();
+  cout << "unmounted" << endl;
+  client->shutdown();
+  
+  delete client;
+  
+  // wait for it to finish
+  tcpmessenger_wait();
+  tcpmessenger_shutdown();  // shut down the tcp messenger
+
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/tcpsyn.cc b/branches/sage/cephmds2/tcpsyn.cc
new file mode 100644
index 0000000000000..cc9f470640c36
--- /dev/null
+++ b/branches/sage/cephmds2/tcpsyn.cc
@@ -0,0 +1,292 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDCluster.h"
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "mon/Monitor.h"
+#include "client/Client.h"
+#include "client/SyntheticClient.h"
+
+#include "msg/TCPMessenger.h"
+
+#include "common/Timer.h"
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// Context whose completion only reports
+// the result code it was given on stdout.
+class C_Test : public Context {
+public:
+  void finish(int r) { cout << "C_Test->finish(" << r << ")" << endl; }
+};
+
+
+#include "msg/mpistarter.cc"
+
+utime_t tick_start;
+int tick_count = 0;
+
+// Periodic tick: logs the time since tick_start, then re-arms itself
+// g_conf.tick seconds after the previously scheduled tick.
+class C_Tick : public Context {
+public:
+  void finish(int) {
+    utime_t elapsed = g_clock.now() - tick_start;
+    dout(0) << "tick +" << g_conf.tick << " -> " << elapsed << "  (" << tick_count << ")" << endl;
+    tick_count += g_conf.tick;
+    utime_t due = tick_start;  due.sec_ref() += tick_count;
+    g_timer.add_event_at(due, new C_Tick);
+  } };
+
+// Timer callback scheduled for g_conf.kill_after:
+// announces itself on stderr and exits the process with status 1.
+class C_Die : public Context {
+public:
+  void finish(int) { cerr << "die" << endl;
+                     exit(1); }
+};
+
+class C_Debug : public Context {   // timer callback: copies the debug fields of g_debug_after_conf over g_conf
+  public:
+  void finish(int) {
+    int size = (char*)&g_conf.debug_after - (char*)&g_conf.debug;  // span in bytes; uncast pointers would count elements
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);   // covers debug up to, not including, debug_after
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+
+int main(int argc, char **argv) 
+{
+  // Entry point of tcpsyn: each MPI rank starts the monitor, MDSs, OSDs and synthetic clients assigned to it.
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+
+  parse_config_options(args);
+
+  parse_syn_options(args);
+
+  if (g_conf.kill_after) 
+    g_timer.add_event_after(g_conf.kill_after, new C_Die);
+  if (g_conf.debug_after) 
+    g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+
+  if (g_conf.tick) {
+    tick_start = g_clock.now();
+    g_timer.add_event_after(g_conf.tick, new C_Tick);
+  }
+
+  vector<char*> nargs;
+  for (unsigned i=0; i<args.size(); i++) {
+    //cout << "a " << args[i] << endl;
+    // unknown arg, pass it on.
+    nargs.push_back(args[i]);
+  }
+
+  args = nargs;
+  if (!args.empty()) {
+    for (unsigned i=0; i<args.size(); i++)
+      cerr << "stray arg " << args[i] << endl;
+  }
+  assert(args.empty());   // any argument still left at this point is treated as an error
+
+
+  // start up tcp messenger via MPI
+  pair<int,int> mpiwho = mpi_bootstrap_tcp(argc, argv);
+  int myrank = mpiwho.first;
+  int world = mpiwho.second;
+
+  int need = 0;   // number of ranks this configuration requires
+  if (g_conf.tcp_skip_rank0) need++;
+  need += NUMMDS;
+  need += NUMOSD;
+  if (NUMCLIENT) {
+    if (!g_conf.tcp_overlay_clients)
+      need += 1;
+  }
+  assert(need <= world);
+
+  if (myrank == 0)
+    cerr << "nummds " << NUMMDS << "  numosd " << NUMOSD << "  numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl;
+  
+  MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
+
+
+  char hostname[100];
+  gethostname(hostname,100);
+  int pid = getpid();
+
+  int started = 0;   // how many mds/osd/client instances this rank started
+
+  //if (myrank == 0) g_conf.debug = 20;
+  
+  // create mon
+  if (myrank == 0) {
+    Monitor *mon = new Monitor(0, new TCPMessenger(MSG_ADDR_MON(0)));
+    mon->init();
+  }
+
+  // create mds
+  MDS *mds[NUMMDS];
+  OSD *mdsosd[NUMMDS];
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != g_conf.tcp_skip_rank0+i) continue;   // mds i runs on rank tcp_skip_rank0 + i
+    TCPMessenger *m = new TCPMessenger(MSG_ADDR_MDS(i));
+    cerr << "mds" << i << " on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
+    mds[i] = new MDS(mdc, i, m);
+    mds[i]->init();
+    started++;
+
+    if (g_conf.mds_local_osd) {
+      mdsosd[i] = new OSD(i+10000, new TCPMessenger(MSG_ADDR_OSD(i+10000)));   // the mds-local osd gets id i+10000
+      mdsosd[i]->init();                                                    
+    }
+  }
+  
+  // create osd
+  OSD *osd[NUMOSD];
+  for (int i=0; i<NUMOSD; i++) {
+    if (myrank != g_conf.tcp_skip_rank0+NUMMDS + i) continue;   // osd ranks follow the mds ranks
+    TCPMessenger *m = new TCPMessenger(MSG_ADDR_OSD(i));
+    cerr << "osd" << i << " on tcprank " << tcpmessenger_get_rank() <<  " " << hostname << "." << pid << endl;
+    osd[i] = new OSD(i, m);
+    osd[i]->init();
+    started++;
+  }
+  
+  if (g_conf.tcp_overlay_clients) sleep(5);
+
+  // create client
+  int skip_osd = NUMOSD;
+  if (g_conf.tcp_overlay_clients) 
+    skip_osd = 0;        // put clients with osds too!
+  int client_nodes = world - NUMMDS - skip_osd - g_conf.tcp_skip_rank0;
+  int clients_per_node = 1;
+  if (NUMCLIENT) clients_per_node = (NUMCLIENT-1) / client_nodes + 1;   // NUMCLIENT / client_nodes, rounded up
+  set<int> clientlist;   // indices of the clients created on this rank
+  Client *client[NUMCLIENT];
+  SyntheticClient *syn[NUMCLIENT];
+  for (int i=0; i<NUMCLIENT; i++) {
+    //if (myrank != NUMMDS + NUMOSD + i % client_nodes) continue;
+    if (myrank != g_conf.tcp_skip_rank0+NUMMDS + skip_osd + i / clients_per_node) continue;
+    clientlist.insert(i);
+    client[i] = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW));//(i)) );
+
+    // logger?
+    if (client_logger == 0) {   // created once, by the first client on this rank
+      char s[80];
+      sprintf(s,"clnode.%d", myrank);
+      client_logger = new Logger(s, &client_logtype);
+
+      client_logtype.add_inc("lsum");
+      client_logtype.add_inc("lnum");
+      client_logtype.add_inc("lwsum");
+      client_logtype.add_inc("lwnum");
+      client_logtype.add_inc("lrsum");
+      client_logtype.add_inc("lrnum");
+      client_logtype.add_inc("trsum");
+      client_logtype.add_inc("trnum");
+      client_logtype.add_inc("wrlsum");
+      client_logtype.add_inc("wrlnum");
+      client_logtype.add_inc("lstatsum");
+      client_logtype.add_inc("lstatnum");
+      client_logtype.add_inc("ldirsum");
+      client_logtype.add_inc("ldirnum");
+      client_logtype.add_inc("readdir");
+      client_logtype.add_inc("stat");
+    }
+
+    client[i]->init();
+    started++;
+
+    syn[i] = new SyntheticClient(client[i]);
+  }
+
+  if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl;
+
+  int nclients = 0;
+  for (set<int>::iterator it = clientlist.begin();
+       it != clientlist.end();
+       it++) {
+    int i = *it;
+
+    //cerr << "starting synthetic client" << i << " on rank " << myrank << endl;
+    client[i]->mount();
+    syn[i]->start_thread();
+    
+    nclients++;
+  }
+  if (nclients) {
+    cerr << nclients << " clients on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
+  }
+  // wait for every local synthetic client to finish, then unmount and shut its client down
+  for (set<int>::iterator it = clientlist.begin();
+       it != clientlist.end();
+       it++) {
+    int i = *it;
+
+    //      cout << "waiting for synthetic client" << i << " to finish" << endl;
+    syn[i]->join_thread();
+    delete syn[i];
+    
+    client[i]->unmount();
+    //cout << "client" << i << " unmounted" << endl;
+    client[i]->shutdown();
+  }
+  
+
+  if (myrank && !started) {   // a rank other than 0 that started nothing
+    //dout(1) << "IDLE" << endl;
+    cerr << "idle on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl; 
+    tcpmessenger_stop_rankserver();
+  }
+
+  // wait for everything to finish
+  tcpmessenger_wait();
+
+  if (started) cerr << "tcpsyn finishing" << endl;
+  
+  tcpmessenger_shutdown(); 
+  
+  // NOTE: the per-daemon cleanup below is commented out, so the mds/osd/client objects are never deleted
+  /*
+  // cleanup
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
+    delete mds[i];
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(i),world)) continue;
+    delete osd[i];
+  }
+  for (int i=0; i<NUMCLIENT; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue;
+    delete client[i];
+  }
+  */
+  delete mdc;
+
+  
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/test/fakemds.cc b/branches/sage/cephmds2/test/fakemds.cc
new file mode 100644
index 0000000000000..b75b62d58152c
--- /dev/null
+++ b/branches/sage/cephmds2/test/fakemds.cc
@@ -0,0 +1,104 @@
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "fakeclient/FakeClient.h"
+
+#include "mds/MDCluster.h"
+#include "mds/MDCache.h"
+#include "mds/MDStore.h"
+
+#include "msg/FakeMessenger.h"
+
+#include "messages/MPing.h"
+
+using namespace std;
+
+__uint64_t ino = 1;
+
+
+
+#include "config.h"
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_fakeclient
+
+// this parses find output
+int play();
+
+int main(int oargc, char **oargv) {
+  cerr << "hi there" << endl;
+  // Builds OSDs, MDSs and fake clients on FakeMessengers, runs the message loop, then checks and cleans up.
+  int argc;
+  char **argv;
+  parse_config_options(oargc, oargv,
+                       argc, argv);
+  
+  MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
+  
+  // local config settings
+  g_conf.num_client = g_conf.num_fakeclient;  // to fool mds, hack gross
+
+  // create osds
+  OSD *osd[NUMOSD];
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)));
+    osd[i]->init();
+  }
+
+  // create mds
+  MDS *mds[NUMMDS];
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i] = new MDS(mdc, i, new FakeMessenger(MSG_ADDR_MDS(i)));
+    mds[i]->init();
+  }
+ 
+  
+  // create clients
+  FakeClient *client[NUMCLIENT];
+  for (int i=0; i<NUMCLIENT; i++) {
+    client[i] = new FakeClient(mdc, i, new FakeMessenger(MSG_ADDR_CLIENT(i)), g_conf.fakeclient_requests);
+    client[i]->init();
+  }
+  
+  // mount clients
+  for (int i=0; i<NUMCLIENT; i++) 
+    //for (int i=0; i<1; i++) 
+    client[i]->mount();
+
+  // loop
+  fakemessenger_do_loop();
+
+  //mds[0]->shutdown_start();
+  //fakemessenger_do_loop();
+
+  // passing "nocheck" as the first remaining argument skips the mdcache shutdown_pass check
+  if (argc > 1 && 
+      strcmp(argv[1], "nocheck") == 0) {
+    cerr << "---- nocheck" << endl;
+  } else {
+    cout << "---- check ----" << endl;
+    for (int i=0; i<NUMMDS; i++) 
+      mds[i]->mdcache->shutdown_pass();
+  }
+  
+  // cleanup
+  cout << "cleanup" << endl;
+  for (int i=0; i<NUMMDS; i++) {
+    delete mds[i];
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    delete osd[i];
+  }
+  for (int i=0; i<NUMCLIENT; i++) {
+    delete client[i];
+  }
+  delete mdc;
+  cout << "done." << endl;
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/test/gprof-helper.c b/branches/sage/cephmds2/test/gprof-helper.c
new file mode 100644
index 0000000000000..9f013ba5b630e
--- /dev/null
+++ b/branches/sage/cephmds2/test/gprof-helper.c
@@ -0,0 +1,120 @@
+/* gprof-helper.c -- preload library to profile pthread-enabled programs
+ *
+ * Authors: Sam Hocevar <sam at zoy dot org>
+ *          Daniel Jönsson <danieljo at fagotten dot org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the Do What The Fuck You Want To
+ *  Public License as published by Banlu Kemiyatorn. See
+ *  http://sam.zoy.org/projects/COPYING.WTFPL for more details.
+ *
+ * Compilation example:
+ * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl
+ *
+ * Usage example:
+ * LD_PRELOAD=./gprof-helper.so your_program
+ */
+
+#define _GNU_SOURCE
+#include <sys/time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <pthread.h>
+
+static void * wrapper_routine(void *);
+
+/* Original pthread function */
+static int (*pthread_create_orig)(pthread_t *__restrict,
+                                  __const pthread_attr_t *__restrict,
+                                  void *(*)(void *),
+                                  void *__restrict) = NULL;
+
+/* Library initialization function */
+void wooinit(void) __attribute__((constructor));
+
+void wooinit(void)
+{
+    const char *reason;
+
+    /* Look up the next pthread_create after this library and remember it */
+    pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create");
+    fprintf(stderr, "pthreads: using profiling hooks for gprof\n");
+    if(pthread_create_orig != NULL)
+        return;
+
+    /* Lookup failed: print dlerror()'s text, or a fallback, and give up */
+    reason = dlerror();
+    fprintf(stderr, "%s\n", reason != NULL ? reason : "pthread_create is NULL");
+    exit(EXIT_FAILURE);
+}
+
+/* Our data structure passed to the wrapper.  It lives on the stack of the
+ * thread calling pthread_create(), so the new thread must finish reading it
+ * before that call returns; done records that it has. */
+typedef struct wrapper_s
+{
+    void * (*start_routine)(void *);   /* the caller's real thread entry point */
+    void * arg;                        /* argument to pass to start_routine */
+    pthread_mutex_t lock;              /* protects done */
+    pthread_cond_t  wait;              /* signalled once done has been set */
+    int done;                          /* non-zero once the new thread has copied its data */
+    struct itimerval itimer;           /* creator's ITIMER_PROF value, applied in the new thread */
+} wrapper_t;
+
+/* The wrapper function in charge for setting the itimer value */
+static void * wrapper_routine(void * data)
+{
+    wrapper_t * w = (wrapper_t *)data;
+    /* Put user data in thread-local variables; *w is gone once done is set */
+    void * (*start_routine)(void *) = w->start_routine;
+    void * arg = w->arg;
+
+    /* Set the profile timer value */
+    setitimer(ITIMER_PROF, &w->itimer, NULL);
+    /* Tell the calling thread that we don't need its data anymore */
+    pthread_mutex_lock(&w->lock);
+    w->done = 1;
+    pthread_cond_signal(&w->wait);
+    pthread_mutex_unlock(&w->lock);
+
+    /* Call the real function */
+    return start_routine(arg);
+}
+
+/* Our wrapper function for the real pthread_create() */
+int pthread_create(pthread_t *__restrict thread,
+                   __const pthread_attr_t *__restrict attr,
+                   void * (*start_routine)(void *),
+                   void *__restrict arg)
+{
+    wrapper_t wrapper_data;
+    int i_return;
+
+    /* Initialize the wrapper structure */
+    wrapper_data.start_routine = start_routine;
+    wrapper_data.arg = arg;
+    wrapper_data.done = 0;
+    getitimer(ITIMER_PROF, &wrapper_data.itimer);
+    pthread_cond_init(&wrapper_data.wait, NULL);
+    pthread_mutex_init(&wrapper_data.lock, NULL);
+    pthread_mutex_lock(&wrapper_data.lock);
+
+    /* The real pthread_create call */
+    i_return = pthread_create_orig(thread,
+                                   attr,
+                                   &wrapper_routine,
+                                   &wrapper_data);
+    /* If the thread was successfully spawned, wait for the data to be
+     * released; done is re-checked because the wait may wake spuriously */
+    while(i_return == 0 && !wrapper_data.done)
+    {
+        pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock);
+    }
+
+    pthread_mutex_unlock(&wrapper_data.lock);
+    pthread_mutex_destroy(&wrapper_data.lock);
+    pthread_cond_destroy(&wrapper_data.wait);
+
+    return i_return;
+}
+
diff --git a/branches/sage/cephmds2/test/makedirs.cc b/branches/sage/cephmds2/test/makedirs.cc
new file mode 100644
index 0000000000000..8fd74d996ef9f
--- /dev/null
+++ b/branches/sage/cephmds2/test/makedirs.cc
@@ -0,0 +1,40 @@
+#include <sys/stat.h>
+#include <cstdio>
+#include <iostream>
+#include <string>
+using namespace std;
+
+int make_dirs(const char *basedir, int dirs, int files, int depth)
+{
+  //if (time_to_stop()) return 0;
+  // Creates basedir with `files` files in it, then `dirs` subdirectories, recursing `depth` levels.
+  // Returns -1 if basedir cannot be created (mkdir also fails if it already exists), else 0.
+  int r = mkdir(basedir, 0755);
+  if (r != 0) {
+    cout << "can't make base dir? " << basedir << endl;
+    return -1;
+  }
+
+  // children; names are built in d, bounded so a long basedir cannot overrun it
+  char d[500];
+  cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
+  for (int i=0; i<files; i++) {
+    snprintf(d, sizeof(d), "%s/file.%d", basedir, i);
+    mknod(d, 0644, 0);   // three-argument form; no file-type bits, as in testxattr.cc
+  }
+
+  if (depth == 0) return 0;
+
+  for (int i=0; i<dirs; i++) {
+    snprintf(d, sizeof(d), "%s/dir.%d", basedir, i);
+    make_dirs(d, dirs, files, depth-1);   // a failure further down is reported there but not propagated
+  }
+  
+  return 0;
+}
+
+int main()
+{
+  // populate ./blah: 10 files and 10 subdirectories per directory, 4 levels of recursion
+  const char *root = "blah";
+  make_dirs(root, 10, 10, 4);
+}
diff --git a/branches/sage/cephmds2/test/mpitest.cc b/branches/sage/cephmds2/test/mpitest.cc
new file mode 100644
index 0000000000000..bab0a79931e8e
--- /dev/null
+++ b/branches/sage/cephmds2/test/mpitest.cc
@@ -0,0 +1,111 @@
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "mds/MDCluster.h"
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "fakeclient/FakeClient.h"
+
+#include "mds/MDCache.h"
+#include "mds/MDStore.h"
+
+#include "msg/MPIMessenger.h"
+//#include "msg/CheesySerializer.h"
+
+#include "messages/MPing.h"
+
+
+__uint64_t ino = 1;
+
+
+
+#include "config.h"
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// this parses find output
+int play();
+
+int main(int argc, char **argv) {
+  cout << "mpitest starting" << endl;
+
+  int myrank = mpimessenger_init(argc, argv);
+  int world = mpimessenger_world();
+  // Every rank runs this same code.  Each OSD/MDS/client is only created (and
+  // later deleted) on the rank that MPI_DEST_TO_RANK maps its address to, so
+  // array slots belonging to other ranks are left uninitialized and never touched.
+  MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
+  
+  // create osds
+  OSD *osd[NUMOSD];
+  for (int i=0; i<NUMOSD; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(i),world)) continue;
+    osd[i] = new OSD(i, new MPIMessenger(MSG_ADDR_OSD(i)));
+    osd[i]->init();
+  }
+  
+  // create mds
+  MDS *mds[NUMMDS];
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
+    mds[i] = new MDS(mdc, i, new MPIMessenger(MSG_ADDR_MDS(i)));
+    mds[i]->init();
+  }
+  
+  // create clients
+  FakeClient *client[NUMCLIENT];
+  for (int i=0; i<NUMCLIENT; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue;
+    // NOTE(review): the msg/CheesySerializer.h include is commented out near the top of this file
+    MPIMessenger *real = new MPIMessenger(MSG_ADDR_CLIENT(i));
+    CheesySerializer *serializer = new CheesySerializer(real);
+    real->set_dispatcher(serializer);   
+    // the client is given the raw messenger, not the serializer -- NOTE(review): confirm intended
+    client[i] = new FakeClient(mdc, i, real, g_conf.fakeclient_requests);
+    client[i]->init();
+  }
+  
+  // seed initial requests
+  for (int i=0; i<NUMCLIENT; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue;
+    client[i]->issue_request();
+  }
+
+  mpimessenger_start();     // start message loop
+  mpimessenger_wait();      // wait for thread to finish
+  mpimessenger_shutdown();  // shutdown MPI
+
+  // the post-run cache check below is disabled
+  /*
+  cout << "---- check ----" << endl;
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
+    mds[i]->mdcache->shutdown_pass();
+  }
+  */
+
+  // cleanup: delete only the objects this rank created above
+  //cout << "cleanup" << endl;
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
+    delete mds[i];
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(i),world)) continue;
+    delete osd[i];
+  }
+  for (int i=0; i<NUMCLIENT; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue;
+    delete client[i];
+  }
+  delete mdc;
+  
+  //cout << "done." << endl;
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/test/mttest.cc b/branches/sage/cephmds2/test/mttest.cc
new file mode 100644
index 0000000000000..f4b8ad5a4b922
--- /dev/null
+++ b/branches/sage/cephmds2/test/mttest.cc
@@ -0,0 +1,140 @@
+// Check that MTMessenger properly dispatches replies to the correct
+// thread.  Processes with multiple threads of clients send a
+// "request" to a server, which then sends back a "reply".  The client
+// checks that it received the correct reply for its request.  The
+// request and reply are both an MClientRequest, which we used because
+// it allows us to pass an arbitrary string in the sarg field.  In the
+// request, the sarg field contains a string "rN:tN:mN" which uniquely
+// identifies a request by rank (process), thread and message.  The
+// server sends the reply with the sarg field set to "rN:tN:mN reply",
+// and the client can then verify it received the correct reply for its
+// request.
+
+#include <pthread.h>
+#include "mpi.h"
+
+#include "messages/MClientRequest.h"
+#include "msg/MTMessenger.h"
+#include "include/error.h"
+
+#define SARG_SIZE 64
+#define SERVER_RANK 0
+#define NTHREADS 11        // number of threads per rank
+#define NMESSAGES 31        // number of messages per thread
+
+static void server_loop(MTMessenger &msgr, int world_size)
+{
+    // Answer requests until every client thread of every other rank has sent
+    // all of its messages (world_size-1 since server is one of the processes).
+    int totmsg = NTHREADS * NMESSAGES * (world_size - 1);
+    int nmsg = 0;
+
+    char buf[SARG_SIZE];
+
+    while(nmsg < totmsg) {
+        MClientRequest *req = (MClientRequest*)msgr.recvreq();
+        ASSERT(req->get_type() == MSG_CLIENT_REQUEST);
+
+        //cout << "Server acknowledging " << req->get_sarg() << endl;
+        // the sarg is received data, so bound the copy rather than trust it to fit buf
+        snprintf(buf, sizeof(buf), "%s reply", req->get_sarg().c_str());
+        MClientRequest resp(0, 0);
+        resp.set_sarg(buf);
+        msgr.sendresp(req, &resp);
+
+        delete req;
+        nmsg++;
+    }
+
+    cout << "Server successful" << endl;
+}
+
+// arguments for client thread start function (see pthread_create)
+struct client_arg
+{
+    MTMessenger *msgr;   // messenger the thread sends its requests through
+    int rank;            // rank of the launching process; part of each request tag
+    int thread;          // index of the thread within its process; part of each request tag
+};
+
+static void *client_session(void *_carg)
+{
+    // per-thread parameters handed over by the creator; released with delete below
+    client_arg *me = (client_arg *)_carg;
+    char text[SARG_SIZE];
+
+    // one request/reply round trip per message
+    int round = 0;
+    while (round < NMESSAGES) {
+        // tag the request with (rank, thread, message number)
+        sprintf(text, "r%d:t%d:m%d", me->rank, me->thread, round);
+        MClientRequest request(0, 0);
+        request.set_sarg(text);
+
+        //cout << "Client sending " << request.get_sarg() << endl;
+
+        MClientRequest *reply =
+            (MClientRequest*)me->msgr->sendrecv(&request, SERVER_RANK);
+        ASSERT(reply->get_type() == MSG_CLIENT_REQUEST);
+
+        // the answer must be our own tag followed by " reply"
+        sprintf(text, "r%d:t%d:m%d reply", me->rank, me->thread, round);
+        ASSERT(strcmp(text, reply->get_sarg().c_str()) == 0);
+
+        //cout << "Client verified " << reply->get_sarg() << endl;
+
+        delete reply;
+        round++;
+    }
+
+    cout << "Client (" << me->rank << "," << me->thread
+     <<  ") successful" << endl;
+
+    delete me;
+    return NULL;
+}
+
+static void launch_clients(MTMessenger &msgr, int rank)
+{
+    pthread_t tid[NTHREADS];
+
+    // launch some number (arbitrary really) of threads
+    for (int i = 0; i < NTHREADS; i++) {
+        // allocated with new because client_session() releases it with delete
+        client_arg *carg = new client_arg;
+        ASSERT(carg);
+        carg->msgr = &msgr;
+        carg->rank = rank;
+        carg->thread = i;
+        // pthread_create returns 0 on success and a positive error number on failure
+        if (pthread_create(&tid[i], NULL, client_session, carg) != 0)
+            SYSERROR();
+    }
+
+    // we must wait for all the threads to exit before returning,
+    // otherwise we shut down MPI while the threads are still
+    // chatting.
+    for (int i = 0; i < NTHREADS; i++) {
+        void *retval;
+        // pthread_join reports failure the same way: a non-zero return value
+        if (pthread_join(tid[i], &retval) != 0)
+            SYSERROR();
+    }
+}
+
+int main(int argc, char **argv)
+{
+    // the messenger is constructed before any MPI query below
+    MTMessenger msgr(argc, argv);
+    int rank, world_size;
+    ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS);
+    ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS);
+
+    // rank SERVER_RANK answers requests; every other rank runs client threads
+    if (rank != SERVER_RANK) {
+        launch_clients(msgr, rank);
+    } else {
+        server_loop(msgr, world_size);
+    }
+    return 0;
+}
diff --git a/branches/sage/cephmds2/test/rushconfig b/branches/sage/cephmds2/test/rushconfig
new file mode 100644
index 0000000000000..40d82702ea0a5
--- /dev/null
+++ b/branches/sage/cephmds2/test/rushconfig
@@ -0,0 +1,7 @@
+6
+8 10.0
+4 20.0
+7 30.0
+9 10.0
+8 15.0
+5 11.0
diff --git a/branches/sage/cephmds2/test/rushtest.cc b/branches/sage/cephmds2/test/rushtest.cc
new file mode 100644
index 0000000000000..ecff83523e0c6
--- /dev/null
+++ b/branches/sage/cephmds2/test/rushtest.cc
@@ -0,0 +1,49 @@
+//
+// $Id$
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "../osd/rush.h"
+
+int main (int argc, char *argv[])
+{
+  // stdin: a cluster count, then one "size weight" line per cluster (see test/rushconfig)
+  Rush    rush;
+  char    buf[200];
+  int    i, j, k, numClusters = 0;
+  int    numKeys = 5;        // keys looked up after each cluster is added; argv[1] overrides
+  int    numReplicas = 4;    // servers requested per key; argv[2] overrides
+  int    curSize;
+  double    curWeight;
+  int    servers[1000];      // NOTE(review): presumably must hold numReplicas entries -- unchecked
+  if (argc > 1)
+    numKeys = atoi (argv[1]);
+  if (argc > 2)
+    numReplicas = atoi (argv[2]);
+  if (!fgets (buf, sizeof (buf) - 2, stdin) || sscanf (buf, "%d", &numClusters) != 1) {
+    fprintf (stderr, "ERROR: expected the number of clusters on the first input line\n");
+    exit (-1);
+  }
+  for (i = 0; i < numClusters; i++) {
+    if (!fgets (buf, sizeof (buf) - 2, stdin) || sscanf (buf, "%d %lf", &curSize, &curWeight) != 2) {
+      fprintf (stderr, "ERROR: expected \"size weight\" for cluster %d\n", i);
+      exit (-1);
+    }
+    rush.AddCluster (curSize, curWeight);
+    if (rush.Servers () < numReplicas) {
+      fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", numReplicas);
+      exit (-1);
+    }
+    for (j = 0; j < numKeys; j++) {
+      rush.GetServersByKey (j, numReplicas, servers);
+#if 0
+      printf ("%-3d %-6d ", i, j);
+      for (k = 0; k < numReplicas; k++) {
+    printf ("%-5d ", servers[k]);
+      }
+      putchar ('\n');
+#endif
+    }
+  }
+}
diff --git a/branches/sage/cephmds2/test/rushtest.cc~ b/branches/sage/cephmds2/test/rushtest.cc~
new file mode 100644
index 0000000000000..0b9512ccd0c3d
--- /dev/null
+++ b/branches/sage/cephmds2/test/rushtest.cc~
@@ -0,0 +1,49 @@
+//
+// $Id$
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "rush.h"
+
+int main (int argc, char *argv[])
+{
+  // stdin: a cluster count, then one "size weight" line per cluster (see test/rushconfig)
+  Rush	rush;
+  char	buf[200];
+  int	i, j, k, numClusters = 0;
+  int	numKeys = 5;		// keys looked up after each cluster is added; argv[1] overrides
+  int	numReplicas = 4;	// servers requested per key; argv[2] overrides
+  int	curSize;
+  double	curWeight;
+  int	servers[1000];		// NOTE(review): presumably must hold numReplicas entries -- unchecked
+  if (argc > 1)
+    numKeys = atoi (argv[1]);
+  if (argc > 2)
+    numReplicas = atoi (argv[2]);
+  if (!fgets (buf, sizeof (buf) - 2, stdin) || sscanf (buf, "%d", &numClusters) != 1) {
+    fprintf (stderr, "ERROR: expected the number of clusters on the first input line\n");
+    exit (-1);
+  }
+  for (i = 0; i < numClusters; i++) {
+    if (!fgets (buf, sizeof (buf) - 2, stdin) || sscanf (buf, "%d %lf", &curSize, &curWeight) != 2) {
+      fprintf (stderr, "ERROR: expected \"size weight\" for cluster %d\n", i);
+      exit (-1);
+    }
+    rush.AddCluster (curSize, curWeight);
+    if (rush.Servers () < numReplicas) {
+      fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", numReplicas);
+      exit (-1);
+    }
+    for (j = 0; j < numKeys; j++) {
+      rush.GetServersByKey (j, numReplicas, servers);
+#if 0
+      printf ("%-3d %-6d ", i, j);
+      for (k = 0; k < numReplicas; k++) {
+	printf ("%-5d ", servers[k]);
+      }
+      putchar ('\n');
+#endif
+    }
+  }
+}
diff --git a/branches/sage/cephmds2/test/testbucket.cc b/branches/sage/cephmds2/test/testbucket.cc
new file mode 100644
index 0000000000000..d8676da18faba
--- /dev/null
+++ b/branches/sage/cephmds2/test/testbucket.cc
@@ -0,0 +1,67 @@
+
+
+#include "../crush/Bucket.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+ostream& operator<<(ostream& out, vector<int>& v)
+{
+  // Renders v as "[a b c]": elements separated by single spaces, no trailing space.
+  const char *sep = "";
+  out << "[";
+  for (vector<int>::iterator p = v.begin(); p != v.end(); ++p, sep = " ")
+    out << sep << *p;
+  out << "]";
+  return out;
+}
+
+
+int main() 
+{
+  // hash object passed to every choose_r() call below
+  Hash h(73);
+  const int ndisks = 20;
+  vector<int> disks;
+  for (int id = 0; id < ndisks; ++id)
+    disks.push_back(id);
+
+  /*
+  UniformBucket ub(1, 1, 0, 10, disks);
+  ub.make_primes(h);
+  cout << "primes are " << ub.primes << endl;
+  */
+
+  // one mixed bucket holding items 0..19, each added with weight 10
+  MixedBucket mb(2, 1);
+  for (int id = 0; id < ndisks; ++id)
+    mb.add_item(id, 10);
+
+  /*
+  MixedBucket b(3, 1);
+  b.add_item(1, ub.get_weight());
+  b.add_item(2, mb.get_weight());
+  */
+  MixedBucket b= mb;
+
+  const int numrep = 3;
+  vector<int> ocount(disks.size());
+  vector<int> v(numrep);
+
+  // tally which item each (x, replica) pair selects
+  for (int x=1; x<1000000; x++) {
+    //cout << H(x) << "\t" << h(x) << endl;
+    for (int rep = 0; rep < numrep; ++rep) {
+      v[rep] = b.choose_r(x, rep, h);
+      ocount[v[rep]]++;
+    }
+    //cout << v << "\t" << endl;//ocount << endl;
+  }
+
+  for (unsigned i = 0; i < ocount.size(); ++i)
+    cout << "disk " << i << " has " << ocount[i] << endl;
+
+}
diff --git a/branches/sage/cephmds2/test/testbuffers.cc b/branches/sage/cephmds2/test/testbuffers.cc
new file mode 100644
index 0000000000000..be2298ff838d1
--- /dev/null
+++ b/branches/sage/cephmds2/test/testbuffers.cc
@@ -0,0 +1,40 @@
+
+#include <iostream>
+using namespace std;
+
+#include "include/bufferlist.h"
+
+
+int main()
+{
+  // two handles onto one buffer built from 6 bytes
+  bufferptr first = new buffer("123456",6);
+  bufferptr alias = first;
+
+  cout << "it is '" << first.c_str() << "'" << endl;
+
+  bufferptr letters = new buffer("abcdef",6);
+  cout << "p3 is " << letters << endl;
+
+  // chain the three pointers (the shared buffer appears twice) into one list
+  bufferlist chain;
+  chain.push_back(alias);
+  chain.push_back(first);
+  chain.push_back(letters);
+
+  cout << "bl is " << chain << endl;
+
+  cout << "len is " << chain.length() << endl;
+
+  // move part of the list into a second list; arguments are presumably (offset, length, dest)
+  bufferlist removed;
+  chain.splice(10,4,&removed);
+
+  cout << "took out " << removed << "leftover is " << chain << endl;
+  //cout << "len is " << chain.length() << endl;
+
+  // sub-range of what is left; arguments are presumably (source, offset, length)
+  bufferlist window;
+  window.substr_of(chain, 3, 5);
+  cout << "bl2 is " << window << endl;
+}
diff --git a/branches/sage/cephmds2/test/testcrush.cc b/branches/sage/cephmds2/test/testcrush.cc
new file mode 100644
index 0000000000000..bd432b23ee95c
--- /dev/null
+++ b/branches/sage/cephmds2/test/testcrush.cc
@@ -0,0 +1,266 @@
+
+
+#include "../crush/crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+/*
+ostream& operator<<(ostream& out, vector<int>& v)
+{
+  out << "[";
+  for (int i=0; i<v.size(); i++) {
+    if (i) out << " ";
+    out << v[i];
+  }
+  out << "]";
+  return out;
+}
+*/
+
+void make_disks(int n, int& no, vector<int>& d) 
+{
+  // Replace the contents of d with the next n consecutive disk ids,
+  // starting at no; no is advanced past the ids handed out.
+  // The loop only stops when the counter reaches zero.
+  d.clear();
+  for (; n; --n, ++no)
+    d.push_back(no);
+}
+
+
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks, int& nbuckets)
+{  // Recursively builds the bucket at level h (wid[h] children wide), registers it with c, returns it.
+  if (h == 0) {
+    // leaf level: a uniform bucket over wid[0] newly numbered disks
+    Hash hash(123);
+    vector<int> disks;
+    for (int i=0; i<wid[h]; i++)
+      disks.push_back(ndisks++);   // disk ids come from the shared counter
+    UniformBucket *b = new UniformBucket(nbuckets--, 1, 0, 10, disks);   // takes the next bucket id
+    b->make_primes(hash);  
+    c.add_bucket(b);
+    //cout << h << " uniformbucket with " << wid[h] << " disks" << endl;
+    return b;
+  } else {
+    // inner level: a mixed bucket; its id is taken before any child is built
+    MixedBucket *b = new MixedBucket(nbuckets--, h+1);
+    for (int i=0; i<wid[h]; i++) {
+      Bucket *n = make_bucket(c, wid, h-1, ndisks, nbuckets);
+      b->add_item(n->get_id(), n->get_weight());   // child is referenced by id and weight
+    }
+    c.add_bucket(b);   // registered only after all of its children
+    //cout << h << " mixedbucket with " << wid[h] << endl;
+    return b;
+  }
+}
+
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks, int& nbuckets)
+{
+  const int top = wid.size()-1;   // the last entry of wid describes the top level
+  return make_bucket(c, wid, top, ndisks, nbuckets)->get_id();   // id of the root bucket
+}
+
+
+
+int main() 
+{
+  Hash h(73232313);   // only used by the disabled if (0) experiment below
+
+  // crush
+  Crush c;
+
+  // Builds a bucket hierarchy and a placement rule, runs the rule over many inputs,
+  // and prints how evenly the chosen disks are used.
+  vector<int> disks;
+  int root = -1;        // id of the top-level bucket; set by make_hierarchy() below
+  int nbuckets = -1;    // next bucket id to hand out; make_bucket() post-decrements it
+  int ndisks = 0;       // next disk id to hand out, i.e. the number of disks created so far
+  
+  if (0) {   // disabled: hand-built uniform and mixed buckets
+    make_disks(12, ndisks, disks);
+    UniformBucket ub1(-1, 1, 0, 30, disks);
+    ub1.make_primes(h);
+    cout << "ub1 primes are " << ub1.primes << endl;
+    c.add_bucket(&ub1);
+    
+    make_disks(17, ndisks, disks);
+    UniformBucket ub2(-2, 1, 0, 30, disks);
+    ub2.make_primes(h);  
+    cout << "ub2 primes are " << ub2.primes << endl;
+    c.add_bucket(&ub2);
+    
+    make_disks(4, ndisks, disks);
+    UniformBucket ub3(-3, 1, 0, 30, disks);
+    ub3.make_primes(h);  
+    cout << "ub3 primes are " << ub3.primes << endl;
+    c.add_bucket(&ub3);
+    
+    make_disks(20, ndisks, disks);
+    MixedBucket umb1(-4, 1);
+    for (int i=0; i<20; i++)
+      umb1.add_item(disks[i], 30);
+    c.add_bucket(&umb1);
+    
+    MixedBucket b(-100, 1);
+    //b.add_item(-2, ub1.get_weight());
+    b.add_item(-4, umb1.get_weight());
+    //b.add_item(-2, ub2.get_weight());
+    //b.add_item(-3, ub3.get_weight());
+  }
+
+  if (0) {   // disabled: three-level tree of mixed buckets built by hand
+    int bucket = -1;
+    MixedBucket *root = new MixedBucket(bucket--, 2);
+
+    for (int i=0; i<5; i++) {
+      MixedBucket *b = new MixedBucket(bucket--, 1);
+
+      int n = 5;
+
+      if (1) {
+        // add n buckets of n disks
+        for (int j=0; j<n; j++) {
+          
+          MixedBucket *d = new MixedBucket(bucket--, 1);
+          
+          make_disks(n, ndisks, disks);
+          for (int k=0; k<n; k++)
+            d->add_item(disks[k], 10);
+          
+          //b->add_item(disks[j], 10);
+          c.add_bucket(d);
+          b->add_item(d->get_id(), d->get_weight());
+        }
+        
+        c.add_bucket(b);
+        root->add_item(b->get_id(), b->get_weight());
+      } else {
+        // add n*n disks
+        make_disks(n*n, ndisks, disks);
+        for (int k=0; k<n*n; k++)
+          b->add_item(disks[k], 10);
+
+        c.add_bucket(b);
+        root->add_item(b->get_id(), b->get_weight());
+      }
+    }
+
+    c.add_bucket(root);
+  }
+
+  // active setup: a generated hierarchy with five levels, each 10 children wide
+
+  if (1) {
+    vector<int> wid;
+    for (int d=0; d<5; d++)
+      wid.push_back(10);
+    root = make_hierarchy(c, wid, ndisks, nbuckets);
+  }
+  
+
+
+  // rule
+  int numrep = 1;   // replicas per placement; sizes v and drives the duplicate check below
+
+  Rule rule;
+  if (0) {
+    rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  }
+  if (1) {
+    /*
+    rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+    */
+    rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));   // NOTE(review): literal 1, not numrep
+    rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+  }
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+  
+  vector<int> ocount(ndisks);   // per-disk hit counter, reset at the start of every pass
+  cout << ndisks << " disks, " << 1-nbuckets << " buckets" << endl;   // NOTE(review): nbuckets starts at -1 and drops by one per bucket, so -1-nbuckets were created
+  cout << pg_per << " pgs per disk" << endl;
+  cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+
+
+  int place = 1000000;          // target number of placements overall
+  int times = place / numpg;    // number of passes, at least one
+  if (!times) times = 1;
+
+  cout << "looping " << times << " times" << endl;
+  
+  float tvar = 0;     // sum of the per-pass variances
+  int tvarnum = 0;    // number of passes summed
+
+  int x = 0;          // rule input; keeps increasing across passes
+  for (int t=0; t<times; t++) {
+    vector<int> v(numrep);
+    
+    for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+    for (int xx=1; xx<numpg; xx++) {   // starts at 1, so numpg-1 placements per pass
+      x++;
+
+      //cout << H(x) << "\t" << h(x) << endl;
+      c.do_rule(rule, x, v);
+      //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << "  " << v[2] << endl;
+      
+      bool bad = false;   // set when the same disk appears twice in v
+      for (int i=0; i<numrep; i++) {
+        //int d = b.choose_r(x, i, h);
+        //v[i] = d;
+        ocount[v[i]]++;
+        for (int j=i+1; j<numrep; j++) {
+          if (v[i] == v[j]) 
+            bad = true;
+        }
+      }
+      if (bad)
+        cout << "bad set " << x << ": " << v << endl;   // needs an operator<< for vector<int>; the one in this file is commented out
+      
+      //cout << v << "\t" << ocount << endl;
+    }
+    
+    /*
+      for (int i=0; i<ocount.size(); i++) {
+      cout << "disk " << i << " has " << ocount[i] << endl;
+      }
+    */
+    
+    cout << "collisions: " << c.collisions << endl;
+    cout << "r bumps: " << c.bumps << endl;
+    
+    // mean and variance of the per-disk counts for this pass
+    float avg = 0.0;
+    for (int i=0; i<ocount.size(); i++)
+      avg += ocount[i];
+    avg /= ocount.size();
+    float var = 0.0;
+    for (int i=0; i<ocount.size(); i++)
+      var += (ocount[i] - avg) * (ocount[i] - avg);
+    var /= ocount.size();
+    
+    cout << "avg " << avg << "  var " << var << "   sd " << sqrt(var) << endl;
+    
+    tvar += var;
+    tvarnum++;
+  }
+
+  tvar /= tvarnum;   // average of the per-pass variances
+
+  cout << "total variance " << tvar << endl;
+
+
+}
diff --git a/branches/sage/cephmds2/test/testfilepath.cc b/branches/sage/cephmds2/test/testfilepath.cc
new file mode 100644
index 0000000000000..ac21e106963d9
--- /dev/null
+++ b/branches/sage/cephmds2/test/testfilepath.cc
@@ -0,0 +1,22 @@
+
+#include "include/filepath.h"
+#include <iostream>
+using namespace std;
+
+int print(string s) {   // Prints how filepath parses s: the path, its depth, then each component. Returns 0.
+  filepath fp = s;
+  cout << "s = " << s << "   filepath = " << fp << endl;
+  cout << "  depth " << fp.depth() << endl;
+  for (int i=0; i<fp.depth(); i++)
+    cout << "\t" << i << " " << fp[i] << endl;
+  return 0;   // was missing: flowing off the end of a non-void function is undefined behavior
+}
+
+int main() {
+  filepath p;   // default-constructed and otherwise unused
+  const char *samples[] = { "/home/sage", "a/b/c", "/a/b/c",
+                            "/a/b/c/", "/a/b/../d" };
+  // absolute, relative, trailing-slash and ".." paths, in this order
+  for (int i=0; i<5; i++)
+    print(samples[i]);
+}
diff --git a/branches/sage/cephmds2/test/testmpi.cc b/branches/sage/cephmds2/test/testmpi.cc
new file mode 100644
index 0000000000000..3d0125992d257
--- /dev/null
+++ b/branches/sage/cephmds2/test/testmpi.cc
@@ -0,0 +1,53 @@
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+#include "messages/MPing.h"
+#include "common/Mutex.h"
+
+#include "msg/MPIMessenger.h"
+
+class Pinger : public Dispatcher {
+public:
+  Messenger *messenger;   // the messenger this object is registered with
+
+  // stores m and registers this object as its dispatcher
+  Pinger(Messenger *m) : messenger(m) {
+    messenger->set_dispatcher(this);
+  }
+
+  // incoming messages are deleted without being looked at
+  void dispatch(Message *m) { delete m; }
+};
+
+int main(int argc, char **argv) {
+  // values returned by the messenger layer: this process's rank and the world size
+  const int myrank = mpimessenger_init(argc, argv);
+  const int world = mpimessenger_world();
+
+  // messenger constructed with our rank, with a Pinger as its dispatcher
+  Pinger *p = new Pinger( new MPIMessenger(myrank) );
+
+  mpimessenger_start();
+
+  // 10000 draws of a random rank; a ping is sent for each draw that is not
+  // our own rank (such draws are skipped, not retried)
+  int remaining = 10000;
+  while (remaining-- > 0) {
+    const int target = rand() % world;
+    if (target == myrank)
+      continue;
+    //cout << "sending to " << target << endl;
+    p->messenger->send_message(new MPing(), target);
+  }
+
+  //cout << "shutting down" << endl;
+  //p->messenger->shutdown();
+
+  // wait for the messenger, then shut MPI down
+  mpimessenger_wait();
+
+  mpimessenger_shutdown();
+}
diff --git a/branches/sage/cephmds2/test/testnewbuffers.cc b/branches/sage/cephmds2/test/testnewbuffers.cc
new file mode 100644
index 0000000000000..0fea7571a4572
--- /dev/null
+++ b/branches/sage/cephmds2/test/testnewbuffers.cc
@@ -0,0 +1,91 @@
+
+#include <list>
+#include <iostream>
+using namespace std;
+
+
+#include "include/newbuffer.h"
+//#include "include/bufferlist.h"
+
+#include "common/Thread.h"
+
+
+  class Th : public Thread {   // worker that repeatedly slices and appends its own bufferlist
+  public:
+	bufferlist bl;
+	Th(bufferlist& o) : bl(o) { }   // bl is initialized as a copy of o
+	void *entry() {
+	  //cout << "start" << endl;
+	  // thrash it a bit: random non-empty sub-ranges of bl, appended to scratch lists
+	  for (int n=0; n<10000; n++) {
+		bufferlist bl2;
+		unsigned off = rand() % (bl.length() -1);   // needs bl.length() >= 2
+		unsigned len = 1 + rand() % (bl.length() - off - 1);
+		bl2.substr_of(bl, off, len);
+		bufferlist bl3;
+		bl3.append(bl);
+		bl3.append(bl2);
+		//cout << bl3 << endl;
+		bl2.clear();
+		bl3.clear();
+	  }
+	  //cout << "end" << endl;
+	  return 0;   // was missing: entry() is declared to return void*
+	}
+  };
+
+int main()
+{
+  // part 1: single-threaded checks of bufferptr/bufferlist
+  bufferptr p1 = buffer::copy("123456",7);
+  //bufferptr p1 = new buffer("123456",7);
+  bufferptr p2 = p1;
+
+  cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl;
+  cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl;
+
+  bufferptr p3 = buffer::copy("abcdef",7);
+  //bufferptr p3 = new buffer("abcdef",7);
+  
+  cout << "p3 is " << p3.c_str() << " " << p3 << endl;
+
+  bufferlist bl;
+  bl.push_back(p2);
+  bl.push_back(p1);
+  bl.push_back(p3);
+
+  cout << "bl is " << bl << endl;
+
+  bufferlist took;
+  bl.splice(10,4,&took);
+
+  cout << "took out " << took << ", leftover is " << bl << endl;
+  //cout << "len is " << bl.length() << endl;
+
+  bufferlist bl2;
+  bl2.substr_of(bl, 3, 5);
+  cout << "bl2 is " << bl2 << endl;
+
+  // part 2: 40 threads, each working on its own copy of bl
+  cout << "bl before " << bl << endl;
+
+  list<Th*> ls;
+  for (int i=0; i<40; i++) {   // counter renamed: it used to be called t too, clashing with the pointer below
+	Th *t = new Th(bl);
+	cout << "create" << endl;
+	t->create();
+	ls.push_back(t);
+  }
+
+  bl.clear();   // cleared while the threads are still running on their copies
+
+  while (!ls.empty()) {
+	cout << "join" << endl;
+	ls.front()->join();
+	delete ls.front();
+	ls.pop_front();
+  }
+
+  cout << "bl after " << bl << endl;
+
+}
diff --git a/branches/sage/cephmds2/test/testtree.cc b/branches/sage/cephmds2/test/testtree.cc
new file mode 100644
index 0000000000000..2c21bcbe52e25
--- /dev/null
+++ b/branches/sage/cephmds2/test/testtree.cc
@@ -0,0 +1,46 @@
+
+
+#include "../crush/BinaryTree.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+int main() 
+{
+  BinaryTree t;
+  // ids returned by add_node() that have not been removed again
+  vector<int> nodes;
+
+  for (int i=0; i<30; i++) {
+    cout << "adding " << i << endl;
+    int n = t.add_node(1);
+    nodes.push_back(n);
+    //cout << t << endl;
+  }
+  cout << t << endl;
+  // random churn: each round either adds a node or removes a randomly chosen live one
+  for (int k=0; k<10000; k++) {
+    if (rand() % 2) {
+      cout << "adding" << endl;
+      nodes.push_back( t.add_node(1) );
+    } else {
+      if (!nodes.empty()) {
+        //for (int i=0; i<nodes.size(); i++) {
+        int p = rand() % nodes.size();
+        int n = nodes[p];
+        assert (t.exists(n));
+        cout << "removing " << n << endl;
+        t.remove_node(n);
+        // drop entry p from our bookkeeping.  The old shift loop read
+        // nodes[j+1] one past the end on its last pass; erase() does the
+        // same left shift without the out-of-bounds read.
+        nodes.erase(nodes.begin() + p);
+      }
+    }
+    cout << t << endl;
+  }
+
+
+}
diff --git a/branches/sage/cephmds2/test/testxattr.cc b/branches/sage/cephmds2/test/testxattr.cc
new file mode 100644
index 0000000000000..210eef208a879
--- /dev/null
+++ b/branches/sage/cephmds2/test/testxattr.cc
@@ -0,0 +1,31 @@
+
+#include <iostream>
+using namespace std;
+
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/xattr.h>
+
+int main(int argc, char**argv)
+{
+  const char *path = "test";     // scratch file, created relative to the current directory
+  const char *attr = "asdf";     // name of the attribute written and then read back
+  int stored = 1, fetched = 2;   // getxattr reads into fetched, which starts out different
+  mknod(path, 0600, 0);
+  // each call's return value is printed, followed by the current errno and its text
+  cout << "setxattr " << setxattr(path, attr, &stored, sizeof(stored), 0) << endl;
+  cout << "errno " << errno << " " << strerror(errno) << endl;
+  cout << "getxattr " << getxattr(path, attr, &fetched, sizeof(fetched)) << endl;
+  cout << "errno " << errno << " " << strerror(errno) << endl;
+  cout << "a is " << stored << " and b is " << fetched << endl;
+  return 0;
+}
-- 
2.39.5