From 6b365d79e812f16f9293046cd7ec62474212d713 Mon Sep 17 00:00:00 2001 From: sageweil Date: Wed, 16 May 2007 21:53:22 +0000 Subject: [PATCH] merged branches/sage/cephmds2 into trunk/ceph git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1359 29311d96-e01e-0410-9327-a35deaab8ce9 --- trunk/ceph/Makefile | 27 +- trunk/ceph/TODO | 226 +- trunk/ceph/client/Client.cc | 669 ++- trunk/ceph/client/Client.h | 85 +- trunk/ceph/client/SyntheticClient.cc | 144 +- trunk/ceph/client/SyntheticClient.h | 16 +- trunk/ceph/client/fuse.cc | 2 +- trunk/ceph/cmonctl.cc | 91 + trunk/ceph/common/Clock.h | 105 +- trunk/ceph/common/Thread.h | 36 +- trunk/ceph/common/Timer.h | 4 - trunk/ceph/config.cc | 23 +- trunk/ceph/config.h | 3 + trunk/ceph/doc/Replication.txt | 19 - trunk/ceph/doc/anchortable.txt | 54 + trunk/ceph/doc/caching.txt | 396 +- trunk/ceph/doc/exports.txt | 72 + trunk/ceph/doc/mds_locks.txt | 66 + trunk/ceph/doc/performance.txt | 36 - trunk/ceph/ebofs/Ebofs.h | 5 - trunk/ceph/include/Context.h | 6 + trunk/ceph/include/buffer.h | 443 +- trunk/ceph/include/filepath.h | 159 +- trunk/ceph/include/frag.h | 253 + trunk/ceph/include/object.h | 3 + trunk/ceph/include/reqid.h | 64 - trunk/ceph/include/types.h | 127 +- trunk/ceph/include/utime.h | 132 + trunk/ceph/mds/Anchor.h | 82 +- trunk/ceph/mds/AnchorClient.cc | 312 +- trunk/ceph/mds/AnchorClient.h | 69 +- trunk/ceph/mds/AnchorTable.cc | 677 ++- trunk/ceph/mds/AnchorTable.h | 107 +- trunk/ceph/mds/CDentry.cc | 92 +- trunk/ceph/mds/CDentry.h | 202 +- trunk/ceph/mds/CDir.cc | 1082 +++- trunk/ceph/mds/CDir.h | 459 +- trunk/ceph/mds/CInode.cc | 412 +- trunk/ceph/mds/CInode.h | 431 +- trunk/ceph/mds/Capability.h | 6 +- trunk/ceph/mds/ClientMap.h | 156 +- trunk/ceph/mds/FileLock.h | 224 + trunk/ceph/mds/Hasher.cc | 1580 +++++ trunk/ceph/mds/Lock.h | 321 -- trunk/ceph/mds/Locker.cc | 2267 ++++---- trunk/ceph/mds/Locker.h | 106 +- trunk/ceph/mds/LogEvent.cc | 42 +- trunk/ceph/mds/LogEvent.h | 26 +- trunk/ceph/mds/MDBalancer.cc | 139 +- 
trunk/ceph/mds/MDCache.cc | 5064 ++++++++++------- trunk/ceph/mds/MDCache.h | 407 +- trunk/ceph/mds/MDLog.cc | 102 +- trunk/ceph/mds/MDLog.h | 52 +- trunk/ceph/mds/MDS.cc | 333 +- trunk/ceph/mds/MDS.h | 54 +- trunk/ceph/mds/MDSMap.h | 83 +- trunk/ceph/mds/MDStore.cc | 752 --- trunk/ceph/mds/MDStore.h | 75 - trunk/ceph/mds/Migrator.cc | 3743 ++++-------- trunk/ceph/mds/Migrator.h | 241 +- trunk/ceph/mds/Renamer.cc | 98 +- trunk/ceph/mds/ScatterLock.h | 99 + trunk/ceph/mds/Server.cc | 3598 +++++++----- trunk/ceph/mds/Server.h | 199 +- trunk/ceph/mds/SimpleLock.h | 267 + trunk/ceph/mds/events/EAnchor.h | 81 + trunk/ceph/mds/events/EAnchorClient.h | 57 + trunk/ceph/mds/events/EClientMap.h | 58 + .../mds/events/{EExportStart.h => EExport.h} | 41 +- trunk/ceph/mds/events/EExportFinish.h | 59 - trunk/ceph/mds/events/EImportFinish.h | 12 +- trunk/ceph/mds/events/EImportMap.h | 29 +- trunk/ceph/mds/events/EImportStart.h | 18 +- trunk/ceph/mds/events/EMetaBlob.h | 187 +- trunk/ceph/mds/events/EOpen.h | 52 + trunk/ceph/mds/events/EPurgeFinish.h | 11 +- trunk/ceph/mds/events/ESession.h | 63 + trunk/ceph/mds/events/ESlaveUpdate.h | 60 + trunk/ceph/mds/events/EUnlink.h | 71 - trunk/ceph/mds/journal.cc | 729 ++- trunk/ceph/mds/mdstypes.h | 324 +- .../messages/{MAnchorRequest.h => MAnchor.h} | 55 +- trunk/ceph/messages/MAnchorReply.h | 74 - trunk/ceph/messages/MCacheExpire.h | 116 +- trunk/ceph/messages/MClientFileCaps.h | 66 +- trunk/ceph/messages/MClientInodeAuthUpdate.h | 46 - trunk/ceph/messages/MClientMount.h | 15 +- trunk/ceph/messages/MClientMountAck.h | 59 - trunk/ceph/messages/MClientReconnect.h | 71 + trunk/ceph/messages/MClientReply.h | 117 +- trunk/ceph/messages/MClientRequest.h | 245 +- trunk/ceph/messages/MClientRequestForward.h | 58 + trunk/ceph/messages/MClientSession.h | 55 + .../{MClientBoot.h => MClientUnmount.h} | 16 +- trunk/ceph/messages/MDentryUnlink.h | 62 +- trunk/ceph/messages/MDirExpire.h | 50 - trunk/ceph/messages/MDirExpireReq.h | 49 - 
trunk/ceph/messages/MDirUpdate.h | 37 +- trunk/ceph/messages/MDiscover.h | 64 +- trunk/ceph/messages/MDiscoverReply.h | 103 +- trunk/ceph/messages/MExportDir.h | 29 +- trunk/ceph/messages/MExportDirAck.h | 27 +- trunk/ceph/messages/MExportDirCancel.h | 48 + trunk/ceph/messages/MExportDirDiscover.h | 40 +- trunk/ceph/messages/MExportDirDiscoverAck.h | 39 +- trunk/ceph/messages/MExportDirFinish.h | 26 +- trunk/ceph/messages/MExportDirNotify.h | 125 +- trunk/ceph/messages/MExportDirNotifyAck.h | 23 +- trunk/ceph/messages/MExportDirPrep.h | 130 +- trunk/ceph/messages/MExportDirPrepAck.h | 26 +- trunk/ceph/messages/MExportDirWarning.h | 22 +- trunk/ceph/messages/MExportDirWarningAck.h | 44 + trunk/ceph/messages/MFailure.h | 49 - trunk/ceph/messages/MFailureAck.h | 42 - trunk/ceph/messages/MHeartbeat.h | 40 +- trunk/ceph/messages/MInodeExpire.h | 50 - trunk/ceph/messages/MInodeFileCaps.h | 17 +- trunk/ceph/messages/MInodeLink.h | 67 +- trunk/ceph/messages/MInodeLinkAck.h | 9 +- trunk/ceph/messages/MInodeUnlink.h | 9 +- trunk/ceph/messages/MInodeUnlinkAck.h | 9 +- trunk/ceph/messages/MInodeUpdate.h | 61 - trunk/ceph/messages/MLock.h | 90 +- trunk/ceph/messages/MMDSCacheRejoin.h | 158 +- trunk/ceph/messages/MMDSCacheRejoinAck.h | 47 +- trunk/ceph/messages/MMDSImportMap.h | 12 +- trunk/ceph/messages/MMDSMap.h | 16 +- .../messages/{MNSLookup.h => MMonCommand.h} | 37 +- trunk/ceph/messages/MMonCommandAck.h | 45 + trunk/ceph/messages/MNSConnect.h | 45 - trunk/ceph/messages/MNSConnectAck.h | 53 - trunk/ceph/messages/MNSFailure.h | 52 - trunk/ceph/messages/MNSLookupReply.h | 44 - trunk/ceph/messages/MNSRegister.h | 59 - trunk/ceph/messages/MNSRegisterAck.h | 53 - trunk/ceph/messages/MOSDOp.h | 23 +- trunk/ceph/messages/MOSDOpReply.h | 14 +- trunk/ceph/messages/MPing.h | 9 +- trunk/ceph/messages/MPingAck.h | 9 +- trunk/ceph/messages/MRenameAck.h | 9 +- trunk/ceph/messages/MRenameNotify.h | 33 +- trunk/ceph/messages/MRenameNotifyAck.h | 9 +- trunk/ceph/messages/MRenamePrep.h | 37 
+- trunk/ceph/messages/MRenameReq.h | 33 +- trunk/ceph/messages/MRenameWarning.h | 9 +- trunk/ceph/mon/ClientMonitor.cc | 62 +- trunk/ceph/mon/ClientMonitor.h | 5 +- trunk/ceph/mon/MDSMonitor.cc | 69 +- trunk/ceph/mon/MDSMonitor.h | 4 + trunk/ceph/mon/Monitor.cc | 42 +- trunk/ceph/mon/Monitor.h | 3 + trunk/ceph/mon/OSDMonitor.h | 4 + trunk/ceph/msg/FakeMessenger.cc | 9 +- trunk/ceph/msg/MPIMessenger.cc | 608 -- trunk/ceph/msg/MPIMessenger.h | 56 - trunk/ceph/msg/MTMessenger.cc | 197 - trunk/ceph/msg/MTMessenger.h | 50 - trunk/ceph/msg/Message.cc | 178 +- trunk/ceph/msg/Message.h | 135 +- trunk/ceph/msg/NewMessenger.cc | 1714 ------ trunk/ceph/msg/NewMessenger.h | 305 - trunk/ceph/msg/NewerMessenger.cc | 1791 ------ trunk/ceph/msg/NewerMessenger.h | 343 -- trunk/ceph/msg/SimpleMessenger.cc | 30 +- trunk/ceph/msg/SimpleMessenger.h | 8 +- trunk/ceph/msg/TCPDirectory.cc | 178 - trunk/ceph/msg/TCPDirectory.h | 110 - trunk/ceph/msg/TCPMessenger.cc | 1454 ----- trunk/ceph/msg/TCPMessenger.h | 115 - trunk/ceph/msg/error.c | 77 - trunk/ceph/msg/msg_types.h | 5 +- trunk/ceph/osd/PG.h | 8 +- trunk/ceph/osd/osd_types.h | 49 +- trunk/ceph/osdc/Filer.h | 1 - trunk/ceph/osdc/Objecter.cc | 10 +- trunk/ceph/osdc/Objecter.h | 2 +- trunk/ceph/script/check_cache_dumps.pl | 56 + 177 files changed, 18722 insertions(+), 21219 deletions(-) create mode 100644 trunk/ceph/cmonctl.cc delete mode 100644 trunk/ceph/doc/Replication.txt create mode 100644 trunk/ceph/doc/anchortable.txt create mode 100644 trunk/ceph/doc/exports.txt create mode 100644 trunk/ceph/doc/mds_locks.txt delete mode 100644 trunk/ceph/doc/performance.txt create mode 100644 trunk/ceph/include/frag.h delete mode 100644 trunk/ceph/include/reqid.h create mode 100644 trunk/ceph/include/utime.h create mode 100644 trunk/ceph/mds/FileLock.h create mode 100644 trunk/ceph/mds/Hasher.cc delete mode 100644 trunk/ceph/mds/Lock.h delete mode 100644 trunk/ceph/mds/MDStore.cc delete mode 100644 trunk/ceph/mds/MDStore.h create mode 100644 
trunk/ceph/mds/ScatterLock.h create mode 100644 trunk/ceph/mds/SimpleLock.h create mode 100644 trunk/ceph/mds/events/EAnchor.h create mode 100644 trunk/ceph/mds/events/EAnchorClient.h create mode 100644 trunk/ceph/mds/events/EClientMap.h rename trunk/ceph/mds/events/{EExportStart.h => EExport.h} (57%) delete mode 100644 trunk/ceph/mds/events/EExportFinish.h create mode 100644 trunk/ceph/mds/events/EOpen.h create mode 100644 trunk/ceph/mds/events/ESession.h create mode 100644 trunk/ceph/mds/events/ESlaveUpdate.h delete mode 100644 trunk/ceph/mds/events/EUnlink.h rename trunk/ceph/messages/{MAnchorRequest.h => MAnchor.h} (50%) delete mode 100644 trunk/ceph/messages/MAnchorReply.h delete mode 100644 trunk/ceph/messages/MClientInodeAuthUpdate.h delete mode 100644 trunk/ceph/messages/MClientMountAck.h create mode 100644 trunk/ceph/messages/MClientReconnect.h create mode 100644 trunk/ceph/messages/MClientRequestForward.h create mode 100644 trunk/ceph/messages/MClientSession.h rename trunk/ceph/messages/{MClientBoot.h => MClientUnmount.h} (70%) delete mode 100644 trunk/ceph/messages/MDirExpire.h delete mode 100644 trunk/ceph/messages/MDirExpireReq.h create mode 100644 trunk/ceph/messages/MExportDirCancel.h create mode 100644 trunk/ceph/messages/MExportDirWarningAck.h delete mode 100644 trunk/ceph/messages/MFailure.h delete mode 100644 trunk/ceph/messages/MFailureAck.h delete mode 100644 trunk/ceph/messages/MInodeExpire.h delete mode 100644 trunk/ceph/messages/MInodeUpdate.h rename trunk/ceph/messages/{MNSLookup.h => MMonCommand.h} (53%) create mode 100644 trunk/ceph/messages/MMonCommandAck.h delete mode 100644 trunk/ceph/messages/MNSConnect.h delete mode 100644 trunk/ceph/messages/MNSConnectAck.h delete mode 100644 trunk/ceph/messages/MNSFailure.h delete mode 100644 trunk/ceph/messages/MNSLookupReply.h delete mode 100644 trunk/ceph/messages/MNSRegister.h delete mode 100644 trunk/ceph/messages/MNSRegisterAck.h delete mode 100644 trunk/ceph/msg/MPIMessenger.cc delete mode 
100644 trunk/ceph/msg/MPIMessenger.h delete mode 100644 trunk/ceph/msg/MTMessenger.cc delete mode 100644 trunk/ceph/msg/MTMessenger.h delete mode 100644 trunk/ceph/msg/NewMessenger.cc delete mode 100644 trunk/ceph/msg/NewMessenger.h delete mode 100644 trunk/ceph/msg/NewerMessenger.cc delete mode 100644 trunk/ceph/msg/NewerMessenger.h delete mode 100644 trunk/ceph/msg/TCPDirectory.cc delete mode 100644 trunk/ceph/msg/TCPDirectory.h delete mode 100644 trunk/ceph/msg/TCPMessenger.cc delete mode 100644 trunk/ceph/msg/TCPMessenger.h delete mode 100644 trunk/ceph/msg/error.c create mode 100755 trunk/ceph/script/check_cache_dumps.pl diff --git a/trunk/ceph/Makefile b/trunk/ceph/Makefile index e8fb11534a48b..6248f207959d4 100644 --- a/trunk/ceph/Makefile +++ b/trunk/ceph/Makefile @@ -1,24 +1,14 @@ -# mpicxx must be on your path to build newsyn. on googoo, this means -# that /usr/local/mpich2-1.0.2/bin must be on your path. - -# For now, use g++ most of the time. -# When compiling MPI stuff, specify myfile.cc instead of myfile.o so -# that ${MPICC} is invoked instead of the generic .o rule (or it'll -# use g++). This makes it less annoying to build on non-mpi hosts for -# dev work, and seems to behave just fine... change ${CC} back to -# mpicxx if you get paranoid. - -#CC = g++ -#CFLAGS = -g -fPIC -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -#LIBS = -lpthread +# mpicxx must be on your path to build newsyn. +# on googoo, this means that /usr/local/mpich2-1.0.2/bin must be on your path. +# on issdm, it's /usr/local/mpich2/bin. # Hook for extra -I options, etc. EXTRA_CFLAGS = ifeq ($(target),darwin) # For Darwin -CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS} +CFLAGS = -g -Wall -I. 
-D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS} LDINC = ar -rc else # For linux @@ -36,7 +26,7 @@ endif #for normal mpich2 machines MPICC = mpicxx -MPICFLAGS = ${CFLAGS} +MPICFLAGS = -DMPICH_IGNORE_CXX_SEEK ${CFLAGS} MPILIBS = ${LIBS} #for LLNL boxes without mpicxx @@ -57,14 +47,12 @@ MDS_OBJS= \ mds/MDCache.o\ mds/Locker.o\ mds/Migrator.o\ - mds/Renamer.o\ mds/MDBalancer.o\ mds/CDentry.o\ mds/CDir.o\ mds/CInode.o\ mds/AnchorTable.o\ mds/AnchorClient.o\ - mds/MDStore.o\ mds/LogEvent.o\ mds/IdAllocator.o\ mds/MDLog.o @@ -111,7 +99,7 @@ OSBDB_OBJS = \ OSBDB_OBJ = osbdb.o endif -TARGETS = cmon cosd cmds csyn newsyn fakesyn mkmonmap cfuse fakefuse +TARGETS = cmon cosd cmds csyn newsyn fakesyn mkmonmap cmonctl cfuse fakefuse NO_FUSE = cmon cosd cmds csyn newsyn fakesyn mkmonmap @@ -133,6 +121,9 @@ mkmonmap: mkmonmap.cc common.o cmon: cmon.cc mon.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ +cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o + ${CC} ${CFLAGS} ${LIBS} $^ -o $@ + cosd: cosd.cc osd.o ebofs.o ${OSBDB_OBJ} msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ diff --git a/trunk/ceph/TODO b/trunk/ceph/TODO index 8a64da39dfc8a..3cbaade473603 100644 --- a/trunk/ceph/TODO +++ b/trunk/ceph/TODO @@ -1,4 +1,129 @@ +sage + mds diropen +doc +- mdsmonitor beacon semantics +- cache expiration, cache invariants + - including dual expire states, transition, vs subtree grouping of expire messages +- recovery states, implicit barrier are rejoin +- journal content + - importmaps and up:resolve +- metablob version semantics + + + +mds +- finish multistage rejoin + +- more testing of failures + thrashing. + - is export prep dir open deadlock properly fixed by forge_replica_dir()? + +- locker vs node failure +- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. +- failures during recovery stages (resolve, rejoin)... make sure rejoin still works! 
+- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) +- incremental mdsmaps? +- client failure + +- dirfrag split + - make sure we are freezing _before_ we fetch to complete the dirfrag, else + we break commit()'s preconditions when it fetches an incomplete dir. + +- EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) +- dir version/committed/etc versus migration, log expires. + - DOCUMENT. + +- fix rmdir empty exported dirfrag race + - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. + - how to know full dir size (when trimming)? + - put frag size/mtime in fragmap in inode? we will need that anyway for stat on dirs + - will need to make inode discover/import_decode smart about dirfrag auth + - or, only put frag size/mtime in inode when frag is closed. otherwise, soft (journaled) state, possibly on another mds. + - need to move state from replicas to auth. simplelock doesn't currently support that. + - ScatterLock or something? hrm. + +- test open_remote_ino + +- scatterlock + - unlink, link, rename need to pre_dirty and update dir inode's mtime + - tho need to make sure that behaves when dirfrag's inode is non-auth... + +- FIXME how to journal root and stray inode content? + - in particular, i care about dirfragtree.. get it on rejoin? + - and dir sizes, if i add that... also on rejoin? + +/- properly recover lock state on rejoin... +/ - recovering mds rejoins replicas it pulled out of its journal +/ - replicas will tell it when they hold an xlock +/ - surviving mds rejoins replicas from a recovering mds +/ - will tell auth if it holds an xlock +- send_rejoin_acks + +- recovering open files + - recovery will either have inode (from EOpen), or will provide path+cap to reassert open state. 
+ - path+cap window will require some fetching of metadata from disk before doing the rejoin + - failures during migration.. what about client stale/reap stuff and misplaced WR caps? + +- inode.max_size + +- journal+recovery + - local rename + - how to notify replicas... +/ - stray purge + - stray reintegration + - remote link + - impl remote inode xlock + - ESlaveUpdate replay, resolution, etc. + - remote unlink + - rewrite to look link _link + - remote rename + - file capabilities i/o +- filelock to control directory mtime, dentry changes + - hmm, may have to change lock ordering, and Server::rdlock_path_pin_ref() +- dirfrag split/merge + - client readdir for dirfrags +- consistency points/snapshots + - dentry versions vs dirfrags... +- real chdir (directory "open") + - relative metadata ops +- statfs? + + +- fix lock caps gather ack versus ambiguous auth + + +foreign rename +- question: can we generalize foreign and local rename? +- initiated by dest. + - if we get into race with lock acquisition, drop locks and forward to new dest. +- how to do pre-auth pinning? + - is it sufficient to wait on, then grab, all local auth pins, _then_ do foreign locks? + - local auth pins can hold subtrees in freezing state, preventing exports, and additional auth_pins. + - so, wait, then grab all local auth_pins, + - then work on locks in proper order (*), + - if we detect we are missing a local auth_pin (i.e. migration race), drop all auth_pins and wait/restart + - need to more carefully look at lock dependencies to avoid deadlock... + - establish a complete full ordering on locks, based on any lock dependencies? + - is it possible to "leak" locks, e.g. get inode_hard lock, work on something else, but inode moves and we dont notice? + - pin paths for those locks? + - can we pin when we choose order, so that locks are sure to proceed? +- we can change active_requests to key of reqid (also unique), and use the same key for foreign locks + - clean up dentry_xlock_request.. 
just merge it into destroy_xlock_start, if !is_auth(). +- renamer will + - check preconditions (i.e. i am dest) + - grab all locks (avoiding deadlock) + - verify preconditions are still true, else forward/retry (actually, this already happens w/ the way we structure the lock acquisition code...) + - prepare foreign bits (using foreign request_auth_pins, locks, etc.) + - source unlink, + - anchortable update (if source is anchored), + - dest nlink-- (if dest is remote link on foreign host) + - make sure replicas have either both source+dest pinned in cache (or neither...) + - use foreign request_pins? + - log update + - do update locally + - async commit + unlock +- rejoin will need to explicitly resolve uncommitted items. + - fully implement link/unlink first, and use that as a model? monitor - finish generic paxos @@ -58,12 +183,8 @@ rados snapshots - how to get usage feedback to monitor? -- change messenger entity_inst_t - - no more rank! make it a uniquish nonce? - - clean up mds caps release in exporter - figure out client failure modes -- clean up messenger failure modes. - add connection retry. @@ -87,7 +208,9 @@ osd/rados simplemessenger - close idle connections -- retry, timeout on connection or transmission failure +- buffer sent messages until a receive is acknowledged (handshake!) + - retry, timeout on connection or transmission failure +- exponential backoff on monitor resend attempts (actually, this should go outside the messenger!) objectcacher - ocacher caps transitions vs locks @@ -152,31 +275,12 @@ mds client - fstat -- make_request: cope with mds failure - mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. - test client caps migration w/ mds exports - some heuristic behavior to consolidate caps to inode auth? -MDS TODO -- fix hashed readdir: should (optionally) do a lock on dir namespace? 
-- fix hard links - - they mostly work, but they're fragile -- sync clients on stat - - will need to ditch 10s client metadata caching before this is useful - - implement truncate -- implement hashed directories -- statfs? -- rewrite journal + recovery -- figure out online failure recovery -- more distributed fh management? -- btree directories (for efficient large directories) -- consistency points/snapshots - -- fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already?) - @@ -246,77 +350,3 @@ IMPLEMENT - - - - -==== MDS RECOVERY ==== - -- how to reliably deliver cache expire messages? - - how should proxy behave? - - exporter failure - - all cacheexpire info has been passed on up until point where export is permanent. no impact. - - importer failure - - exporter collects expire info, so that it can reverse. - - ??? - - maybe hosts should double-up expires until after export is known to have committed? ---> just send expires to both nodes. dir_auth+dir_auth2. clean up export ack/notify process. :) - -*** dar... no, separate bystander dir_auth updates from the prepare/ack/commit cycle! -- expire should go to both old and new auth -- set_dir_auth should take optional second auth, and authority() should optionally set/return a second possible auth -- does inode need it's own replica list? no! -- dirslices. - - -/- exporter recovery if importer fails during EXPORT_EXPORTING stage -- importer recovery if exporter fails - -/?- delay response to sending import_map if export in progress? -/?- finish export before sending import_map? -/- ambiguous imports on active node should include in-progress imports! -/- how to effectively trim cache after resolve but before rejoin -/ - we need to eliminate unneed non-auth metadata, without hosing potentially useful auth metadata - -- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. 
- -- failures during recovery stages (resolve, rejoin)... make sure rejoin still works! - -- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) - - -importmap only sent after exports have completed. -failures update export ack waitlists, so exports will compelte if unrelated nodes fail. -importmap can be sent regardless of import status -- pending import is just flagged ambiguous. -failure of exporter induces some cleanup on importer. importer will disambiguate when it gets an importmap on exporter recovery. -failure of importer induces cleanup on exporter. no ambiguity. - - -/- no new mds may join if cluster is in a recovery state. starting -> standby (unless failed) -/ - make sure creating -> standby, and are not included in recovery set? - - -mdsmap notes -- mds don't care about intervening states, except rejoin > active, and - that transition requires active involvement. thus, no need worry - about delivering/processing the full sequence of maps. - -blech: -- EMetablob should return 'expired' if they have - higher versions (and are thus described by a newer journal entry) - -mds -- mds falure vs clients - - clean up client op redirection - - idempotent ops - -- journal+recovery - - unlink - - open(wr cap), open+create - - file capabilities i/o - - link - - rename - -- should auth_pins really go to the root? - - FIXME: auth_pins on importer versus import beneath an authpinned region? 
- diff --git a/trunk/ceph/client/Client.cc b/trunk/ceph/client/Client.cc index 5a80d7d38bc6f..00874d206a4b8 100644 --- a/trunk/ceph/client/Client.cc +++ b/trunk/ceph/client/Client.cc @@ -31,10 +31,13 @@ using namespace std; // ceph stuff #include "Client.h" - -#include "messages/MClientBoot.h" #include "messages/MClientMount.h" -#include "messages/MClientMountAck.h" +#include "messages/MClientUnmount.h" +#include "messages/MClientSession.h" +#include "messages/MClientReconnect.h" +#include "messages/MClientRequest.h" +#include "messages/MClientRequestForward.h" +#include "messages/MClientReply.h" #include "messages/MClientFileCaps.h" #include "messages/MGenericMessage.h" @@ -120,6 +123,7 @@ Client::Client(Messenger *m, MonMap *mm) // osd interfaces osdmap = new OSDMap(); // initially blank.. see mount() objecter = new Objecter(messenger, monmap, osdmap); + objecter->set_client_incarnation(0); // client always 0, for now. objectcacher = new ObjectCacher(objecter, client_lock); filer = new Filer(objecter); } @@ -269,7 +273,6 @@ Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) dout(12) << "insert_inode " << dname << " ino " << st->inode.ino << " size " << st->inode.size << " mtime " << st->inode.mtime - << " hashed " << st->hashed << endl; if (dn) { @@ -353,20 +356,27 @@ Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) */ void Client::update_inode_dist(Inode *in, InodeStat *st) { - // dir info - in->dir_auth = st->dir_auth; - in->dir_hashed = st->hashed; - in->dir_replicated = st->replicated; + // auth + in->dir_auth = -1; + if (!st->dirfrag_auth.empty()) { // HACK FIXME ******* FIXME FIXME FIXME FIXME dirfrag_t + in->dir_auth = st->dirfrag_auth.begin()->second; + } + + // replicated + in->dir_replicated = false; + if (!st->dirfrag_rep.empty()) + in->dir_replicated = true; // FIXME - // dir replication - if (st->spec_defined) { - if (st->dist.empty() && !in->dir_contacts.empty()) + // dist + if 
(!st->dirfrag_dist.empty()) { // FIXME + set dist = st->dirfrag_dist.begin()->second; + if (dist.empty() && !in->dir_contacts.empty()) dout(9) << "lost dist spec for " << in->inode.ino - << " " << st->dist << endl; - if (!st->dist.empty() && in->dir_contacts.empty()) + << " " << dist << endl; + if (!dist.empty() && in->dir_contacts.empty()) dout(9) << "got dist spec for " << in->inode.ino - << " " << st->dist << endl; - in->dir_contacts = st->dist; + << " " << dist << endl; + in->dir_contacts = dist; } } @@ -378,7 +388,7 @@ void Client::update_inode_dist(Inode *in, InodeStat *st) Inode* Client::insert_trace(MClientReply *reply) { Inode *cur = root; - time_t now = time(NULL); + utime_t now = g_clock.real_now(); dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << endl; @@ -412,8 +422,10 @@ Inode* Client::insert_trace(MClientReply *reply) update_inode_dist(cur, *pin); // set cache ttl - if (g_conf.client_cache_stat_ttl) - cur->valid_until = now + g_conf.client_cache_stat_ttl; + if (g_conf.client_cache_stat_ttl) { + cur->valid_until = now; + cur->valid_until += g_conf.client_cache_stat_ttl; + } } return cur; @@ -459,13 +471,10 @@ Dentry *Client::lookup(filepath& path) // ------- -MClientReply *Client::make_request(MClientRequest *req, - bool auth_best, - int use_mds) // this param is purely for debug hacking +int Client::choose_target_mds(MClientRequest *req) { - // assign a unique tid - req->set_tid(++last_tid); - + int mds = 0; + // find deepest known prefix Inode *diri = root; // the deepest known containing dir Inode *item = 0; // the actual item... if we know it @@ -476,20 +485,20 @@ MClientReply *Client::make_request(MClientRequest *req, // dir? if (diri && diri->inode.mode & INODE_MODE_DIR && diri->dir) { Dir *dir = diri->dir; - + // do we have the next dentry? if (dir->dentries.count( req->get_filepath()[i] ) == 0) { - missing_dn = i; // no. - break; + missing_dn = i; // no. 
+ break; } dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << endl; - + if (i == depth-1) { // last one! - item = dir->dentries[ req->get_filepath()[i] ]->inode; - break; + item = dir->dentries[ req->get_filepath()[i] ]->inode; + break; } - + // continue.. diri = dir->dentries[ req->get_filepath()[i] ]->inode; assert(diri); @@ -498,54 +507,136 @@ MClientReply *Client::make_request(MClientRequest *req, break; } } - - // choose an mds - int mds = 0; + + // pick mds if (!diri || g_conf.client_use_random_mds) { // no root info, pick a random MDS mds = rand() % mdsmap->get_num_mds(); } else { - if (auth_best) { + if (req->auth_is_best()) { // pick the actual auth (as best we can) if (item) { - mds = item->authority(mdsmap); + mds = item->authority(mdsmap); } else if (diri->dir_hashed && missing_dn >= 0) { - mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), - mdsmap); + mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), + mdsmap); } else { - mds = diri->authority(mdsmap); + mds = diri->authority(mdsmap); } } else { // balance our traffic! if (diri->dir_hashed && missing_dn >= 0) - mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), - mdsmap); + mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), + mdsmap); else - mds = diri->pick_replica(mdsmap); + mds = diri->pick_replica(mdsmap); } } dout(20) << "mds is " << mds << endl; + + return mds; +} - // force use of a particular mds? 
- if (use_mds >= 0) mds = use_mds; +MClientReply *Client::make_request(MClientRequest *req, + int use_mds) // this param is purely for debug hacking +{ // time the call - utime_t start = g_clock.now(); + utime_t start = g_clock.real_now(); bool nojournal = false; int op = req->get_op(); if (op == MDS_OP_STAT || op == MDS_OP_LSTAT || op == MDS_OP_READDIR || - op == MDS_OP_OPEN || - op == MDS_OP_RELEASE) + op == MDS_OP_OPEN) nojournal = true; - MClientReply *reply = sendrecv(req, mds); + // -- request -- + // assign a unique tid + tid_t tid = ++last_tid; + req->set_tid(tid); + if (!mds_requests.empty()) + req->set_oldest_client_tid(mds_requests.begin()->first); + + // make note + MetaRequest request(req, tid); + mds_requests[tid] = &request; + + // encode payload now, in case we have to resend (in case of mds failure) + req->encode_payload(); + request.request_payload = req->get_payload(); + + // note idempotency + request.idempotent = req->is_idempotent(); + + // hack target mds? + if (use_mds) + request.resend_mds = use_mds; + + // set up wait cond + Cond cond; + request.caller_cond = &cond; + + while (1) { + // choose mds + int mds; + // force use of a particular mds? + if (request.resend_mds >= 0) { + mds = request.resend_mds; + request.resend_mds = -1; + dout(10) << "target resend_mds specified as mds" << mds << endl; + } else { + mds = choose_target_mds(req); + dout(10) << "chose target mds" << mds << " based on hierarchy" << endl; + } + + // open a session? + if (mds_sessions.count(mds) == 0) { + Cond cond; + if (waiting_for_session.count(mds) == 0) { + dout(10) << "opening session to mds" << mds << endl; + messenger->send_message(new MClientSession(MClientSession::OP_OPEN), + mdsmap->get_inst(mds), MDS_PORT_SERVER); + } + + // wait + waiting_for_session[mds].push_back(&cond); + while (waiting_for_session.count(mds)) { + dout(10) << "waiting for session to mds" << mds << " to open" << endl; + cond.Wait(client_lock); + } + } + + // send request. 
+ send_request(&request, mds); + + // wait for signal + dout(20) << "awaiting kick on " << &cond << endl; + cond.Wait(client_lock); + + // did we get a reply? + if (request.reply) + break; + } + + // got it! + MClientReply *reply = request.reply; + + // kick dispatcher (we've got it!) + assert(request.dispatch_cond); + request.dispatch_cond->Signal(); + dout(20) << "sendrecv kickback on tid " << tid << " " << request.dispatch_cond << endl; + + // clean up. + mds_requests.erase(tid); + + + // -- log times -- if (client_logger) { - utime_t lat = g_clock.now(); + utime_t lat = g_clock.real_now(); lat -= start; dout(20) << "lat " << lat << endl; client_logger->finc("lsum",(double)lat); @@ -574,59 +665,138 @@ MClientReply *Client::make_request(MClientRequest *req, } -MClientReply* Client::sendrecv(MClientRequest *req, int mds) +void Client::handle_client_session(MClientSession *m) { - // NEW way. - Cond cond; - tid_t tid = req->get_tid(); - mds_rpc_cond[tid] = &cond; - - messenger->send_message(req, mdsmap->get_inst(mds), MDS_PORT_SERVER); - - // wait - while (mds_rpc_reply.count(tid) == 0) { - dout(20) << "sendrecv awaiting reply kick on " << &cond << endl; - cond.Wait(client_lock); + dout(10) << "handle_client_session " << *m << endl; + int from = m->get_source().num(); + + switch (m->op) { + case MClientSession::OP_OPEN_ACK: + mds_sessions.insert(from); + break; + + case MClientSession::OP_CLOSE_ACK: + mds_sessions.erase(from); + // FIXME: kick requests (hard) so that they are redirected. or fail. 
+ break; + + default: + assert(0); } + + // kick waiting threads + for (list::iterator p = waiting_for_session[from].begin(); + p != waiting_for_session[from].end(); + ++p) + (*p)->Signal(); + waiting_for_session.erase(from); + + delete m; +} + + +void Client::send_request(MetaRequest *request, int mds) +{ + MClientRequest *r = request->request; + if (!r) { + // make a new one + dout(10) << "send_request rebuilding request " << request->tid + << " for mds" << mds << endl; + r = new MClientRequest; + r->copy_payload(request->request_payload); + r->decode_payload(); + r->set_retry_attempt(request->retry_attempt); + } + request->request = 0; + + dout(10) << "send_request " << *r << " to mds" << mds << endl; + messenger->send_message(r, mdsmap->get_inst(mds), MDS_PORT_SERVER); - // got it! - MClientReply *reply = mds_rpc_reply[tid]; - - // kick dispatcher (we've got it!) - assert(mds_rpc_dispatch_cond.count(tid)); - mds_rpc_dispatch_cond[tid]->Signal(); - dout(20) << "sendrecv kickback on tid " << tid << " " << mds_rpc_dispatch_cond[tid] << endl; - - // clean up. - mds_rpc_cond.erase(tid); - mds_rpc_reply.erase(tid); + request->mds.insert(mds); +} - return reply; +void Client::handle_client_request_forward(MClientRequestForward *fwd) +{ + tid_t tid = fwd->get_tid(); + + if (mds_requests.count(tid) == 0) { + dout(10) << "handle_client_request_forward no pending request on tid " << tid << endl; + delete fwd; + return; + } + + MetaRequest *request = mds_requests[tid]; + assert(request); + + // reset retry counter + request->retry_attempt = 0; + + if (request->idempotent && + mds_sessions.count(fwd->get_dest_mds())) { + // dest mds has a session, and request was forwarded for us. + + // note new mds set. + if (request->num_fwd < fwd->get_num_fwd()) { + // there are now exactly two mds's whose failure should trigger a resend + // of this request. 
+ request->mds.clear(); + request->mds.insert(fwd->get_source().num()); + request->mds.insert(fwd->get_dest_mds()); + request->num_fwd = fwd->get_num_fwd(); + dout(10) << "handle_client_request tid " << tid + << " fwd " << fwd->get_num_fwd() + << " to mds" << fwd->get_dest_mds() + << ", mds set now " << request->mds + << endl; + } else { + dout(10) << "handle_client_request tid " << tid + << " previously forwarded to mds" << fwd->get_dest_mds() + << ", mds still " << request->mds + << endl; + } + } else { + // request not forwarded, or dest mds has no session. + // resend. + dout(10) << "handle_client_request tid " << tid + << " fwd " << fwd->get_num_fwd() + << " to mds" << fwd->get_dest_mds() + << ", non-idempotent, resending to " << fwd->get_dest_mds() + << endl; + + request->mds.clear(); + request->num_fwd = fwd->get_num_fwd(); + request->resend_mds = fwd->get_dest_mds(); + request->caller_cond->Signal(); + } + + delete fwd; } void Client::handle_client_reply(MClientReply *reply) { tid_t tid = reply->get_tid(); + if (mds_requests.count(tid) == 0) { + dout(10) << "handle_client_reply no pending request on tid " << tid << endl; + delete reply; + return; + } + MetaRequest *request = mds_requests[tid]; + assert(request); + // store reply - mds_rpc_reply[tid] = reply; + request->reply = reply; // wake up waiter - assert(mds_rpc_cond.count(tid)); - dout(20) << "handle_client_reply kicking caller on " << mds_rpc_cond[tid] << endl; - mds_rpc_cond[tid]->Signal(); + request->caller_cond->Signal(); // wake for kick back - assert(mds_rpc_dispatch_cond.count(tid) == 0); Cond cond; - mds_rpc_dispatch_cond[tid] = &cond; - while (mds_rpc_cond.count(tid)) { + request->dispatch_cond = &cond; + while (mds_requests.count(tid)) { dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl; cond.Wait(client_lock); } - - // ok, clean up! 
- mds_rpc_dispatch_cond.erase(tid); } @@ -645,13 +815,24 @@ void Client::dispatch(Message *m) case MSG_OSD_MAP: objecter->handle_osd_map((class MOSDMap*)m); + mount_cond.Signal(); break; - // client + // mounting and mds sessions case MSG_MDS_MAP: handle_mds_map((MMDSMap*)m); break; - + case MSG_CLIENT_UNMOUNT: + handle_unmount(m); + break; + case MSG_CLIENT_SESSION: + handle_client_session((MClientSession*)m); + break; + + // requests + case MSG_CLIENT_REQUEST_FORWARD: + handle_client_request_forward((MClientRequestForward*)m); + break; case MSG_CLIENT_REPLY: handle_client_reply((MClientReply*)m); break; @@ -660,12 +841,6 @@ void Client::dispatch(Message *m) handle_file_caps((MClientFileCaps*)m); break; - case MSG_CLIENT_MOUNTACK: - handle_mount_ack((MClientMountAck*)m); - break; - case MSG_CLIENT_UNMOUNT: - handle_unmount_ack(m); - break; default: @@ -695,24 +870,94 @@ void Client::dispatch(Message *m) void Client::handle_mds_map(MMDSMap* m) { - if (mdsmap == 0) + int frommds = -1; + if (m->get_source().is_mds()) + frommds = m->get_source().num(); + + if (mdsmap == 0) mdsmap = new MDSMap; if (whoami < 0) { + // mounted! + assert(m->get_source().is_mon()); whoami = m->get_dest().num(); dout(1) << "handle_mds_map i am now " << m->get_dest() << endl; messenger->reset_myname(m->get_dest()); + + mount_cond.Signal(); // mount might be waiting for this. } dout(1) << "handle_mds_map epoch " << m->get_epoch() << endl; mdsmap->decode(m->get_encoded()); + // send reconnect? + if (frommds >= 0 && + mdsmap->get_state(frommds) == MDSMap::STATE_RECONNECT) { + send_reconnect(frommds); + } + + // kick requests? 
+ if (frommds >= 0 && + mdsmap->get_state(frommds) == MDSMap::STATE_ACTIVE) { + kick_requests(frommds); + //failed_mds.erase(from); + } + delete m; +} + +void Client::send_reconnect(int mds) +{ + dout(10) << "send_reconnect to mds" << mds << endl; - // note our inc # - objecter->set_client_incarnation(0); // fixme + MClientReconnect *m = new MClientReconnect; - mount_cond.Signal(); // mount might be waiting for this. + if (mds_sessions.count(mds)) { + // i have an open session. + for (hash_map::iterator p = inode_map.begin(); + p != inode_map.end(); + p++) { + if (p->second->caps.count(mds)) { + dout(10) << " caps on " << p->first + << " " << cap_string(p->second->caps[mds].caps) + << " wants " << cap_string(p->second->file_caps_wanted()) + << endl; + m->add_inode_caps(p->first, + p->second->caps[mds].caps, + p->second->caps[mds].seq, + p->second->file_caps_wanted(), + p->second->inode.size, + p->second->inode.mtime, p->second->inode.atime); + string path; + p->second->make_path(path); + dout(10) << " path on " << p->first << " is " << path << endl; + m->add_inode_path(p->first, path); + } + if (p->second->stale_caps.count(mds)) { + dout(10) << " clearing stale caps on " << p->first << endl; + p->second->stale_caps.erase(mds); // hrm, is this right? + } + } + } else { + dout(10) << " i had no session with this mds"; + m->closed = true; + } + + messenger->send_message(m, mdsmap->get_inst(mds), MDS_PORT_SERVER); +} + + +void Client::kick_requests(int mds) +{ + dout(10) << "kick_requests for mds" << mds << endl; + + for (map::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) + if (p->second->mds.count(mds)) { + p->second->retry_attempt++; // inc retry counter + send_request(p->second, mds); + } } @@ -745,7 +990,7 @@ void Client::handle_file_caps(MClientFileCaps *m) m->clear_payload(); // for if/when we send back to MDS // reap? 
- if (m->get_special() == MClientFileCaps::FILECAP_REAP) { + if (m->get_special() == MClientFileCaps::OP_REAP) { int other = m->get_mds(); if (in && in->stale_caps.count(other)) { @@ -774,7 +1019,7 @@ void Client::handle_file_caps(MClientFileCaps *m) assert(in); // stale? - if (m->get_special() == MClientFileCaps::FILECAP_STALE) { + if (m->get_special() == MClientFileCaps::OP_STALE) { dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << endl; // move to stale list @@ -803,7 +1048,7 @@ void Client::handle_file_caps(MClientFileCaps *m) } // release? - if (m->get_special() == MClientFileCaps::FILECAP_RELEASE) { + if (m->get_special() == MClientFileCaps::OP_RELEASE) { dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl; assert(in->caps.count(mds)); in->caps.erase(mds); @@ -945,7 +1190,7 @@ void Client::implemented_caps(MClientFileCaps *m, Inode *in) << ", acking to " << m->get_source() << endl; if (in->file_caps() == 0) { - in->file_wr_mtime = 0; + in->file_wr_mtime = utime_t(); in->file_wr_size = 0; } @@ -979,7 +1224,7 @@ void Client::release_caps(Inode *in, } if (in->file_caps() == 0) { - in->file_wr_mtime = 0; + in->file_wr_mtime = utime_t(); in->file_wr_size = 0; } } @@ -1011,31 +1256,21 @@ void Client::update_caps_wanted(Inode *in) int Client::mount() { client_lock.Lock(); - assert(!mounted); // caller is confused? + assert(!mdsmap); - // FIXME mds map update race with mount. 
- - dout(2) << "sending boot msg to monitor" << endl; - if (mdsmap) - delete mdsmap; int mon = monmap->pick_mon(); - messenger->send_message(new MClientBoot(), - monmap->get_inst(mon)); + dout(2) << "sending client_mount to mon" << mon << endl; + messenger->send_message(new MClientMount, monmap->get_inst(mon)); while (!mdsmap) mount_cond.Wait(client_lock); - dout(2) << "mounting" << endl; - MClientMount *m = new MClientMount(); + mounted = true; - int who = 0; // mdsmap->get_root(); // mount at root, for now - messenger->send_message(m, - mdsmap->get_inst(who), - MDS_PORT_SERVER); - - while (!mounted) - mount_cond.Wait(client_lock); + dout(2) << "mounted: have osdmap " << osdmap->get_epoch() + << " and mdsmap " << mdsmap->get_epoch() + << endl; client_lock.Unlock(); @@ -1052,22 +1287,6 @@ int Client::mount() return 0; } -void Client::handle_mount_ack(MClientMountAck *m) -{ - // mdsmap! - if (!mdsmap) mdsmap = new MDSMap; - mdsmap->decode(m->get_mds_map_state()); - - // we got osdmap! - osdmap->decode(m->get_osd_map_state()); - - dout(2) << "mounted" << endl; - mounted = true; - mount_cond.Signal(); - - delete m; -} - int Client::unmount() { @@ -1125,24 +1344,40 @@ int Client::unmount() } } - // send unmount! - Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT); - messenger->send_message(req, mdsmap->get_inst(0), MDS_PORT_SERVER); + // send session closes! + for (set::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) { + dout(2) << "sending client_session close to mds" << *p << endl; + messenger->send_message(new MClientSession(MClientSession::OP_CLOSE), + mdsmap->get_inst(*p), MDS_PORT_SERVER); + } + // send unmount! + int mon = monmap->pick_mon(); + dout(2) << "sending client_unmount to mon" << mon << endl; + messenger->send_message(new MClientUnmount, monmap->get_inst(mon)); + while (mounted) mount_cond.Wait(client_lock); - dout(2) << "unmounted" << endl; + dout(2) << "unmounted." 
<< endl; client_lock.Unlock(); return 0; } -void Client::handle_unmount_ack(Message* m) +void Client::handle_unmount(Message* m) { - dout(1) << "got unmount ack" << endl; + dout(1) << "handle_unmount got ack" << endl; + mounted = false; + + delete mdsmap; + mdsmap = 0; + mount_cond.Signal(); + delete m; } @@ -1163,7 +1398,7 @@ int Client::link(const char *existing, const char *newname) // sarg is target (existing file) - MClientRequest *req = new MClientRequest(MDS_OP_LINK, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_LINK, messenger->get_myinst()); req->set_path(newname); req->set_sarg(existing); @@ -1171,7 +1406,7 @@ int Client::link(const char *existing, const char *newname) req->set_caller_uid(getuid()); req->set_caller_gid(getgid()); - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); @@ -1197,7 +1432,7 @@ int Client::unlink(const char *relpath) tout << path << endl; - MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, messenger->get_myinst()); req->set_path(path); // FIXME where does FUSE maintain user information @@ -1206,7 +1441,7 @@ int Client::unlink(const char *relpath) //FIXME enforce caller uid rights? - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); if (res == 0) { // remove from local cache @@ -1243,7 +1478,7 @@ int Client::rename(const char *relfrom, const char *relto) tout << to << endl; - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_RENAME, messenger->get_myinst()); req->set_path(from); req->set_sarg(to); @@ -1253,7 +1488,7 @@ int Client::rename(const char *relfrom, const char *relto) //FIXME enforce caller uid rights? 
- MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); delete reply; @@ -1280,9 +1515,9 @@ int Client::mkdir(const char *relpath, mode_t mode) tout << mode << endl; - MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, messenger->get_myinst()); req->set_path(path); - req->set_iarg( (int)mode ); + req->args.mkdir.mode = mode; // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); @@ -1290,7 +1525,7 @@ int Client::mkdir(const char *relpath, mode_t mode) //FIXME enforce caller uid rights? - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); delete reply; @@ -1314,7 +1549,7 @@ int Client::rmdir(const char *relpath) tout << path << endl; - MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, messenger->get_myinst()); req->set_path(path); // FIXME where does FUSE maintain user information @@ -1323,7 +1558,7 @@ int Client::rmdir(const char *relpath) //FIXME enforce caller uid rights? - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); if (res == 0) { // remove from local cache @@ -1363,7 +1598,7 @@ int Client::symlink(const char *reltarget, const char *rellink) tout << link << endl; - MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, messenger->get_myinst()); req->set_path(link); req->set_sarg(target); @@ -1373,7 +1608,7 @@ int Client::symlink(const char *reltarget, const char *rellink) //FIXME enforce caller uid rights? 
- MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); //FIXME assuming trace of link, not of target delete reply; @@ -1432,7 +1667,7 @@ int Client::_lstat(const char *path, int mask, Inode **in) Dentry *dn = lookup(fpath); inode_t inode; - time_t now = time(NULL); + utime_t now = g_clock.real_now(); if (dn && now <= dn->inode->valid_until && ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) { @@ -1440,7 +1675,7 @@ int Client::_lstat(const char *path, int mask, Inode **in) dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl; if (g_conf.client_cache_stat_ttl == 0) - dn->inode->valid_until = 0; // only one stat allowed after each readdir + dn->inode->valid_until = utime_t(); // only one stat allowed after each readdir *in = dn->inode; } else { @@ -1449,8 +1684,8 @@ int Client::_lstat(const char *path, int mask, Inode **in) //req->set_caller_uid(fc->uid); //req->set_caller_gid(fc->gid); - req = new MClientRequest(MDS_OP_LSTAT, whoami); - req->set_iarg(mask); + req = new MClientRequest(MDS_OP_LSTAT, messenger->get_myinst()); + req->args.stat.mask = mask; req->set_path(fpath); MClientReply *reply = make_request(req); @@ -1482,7 +1717,7 @@ void Client::fill_stat(inode_t& inode, struct stat *st) st->st_nlink = inode.nlink; st->st_uid = inode.uid; st->st_gid = inode.gid; - st->st_ctime = inode.ctime; + st->st_ctime = MAX(inode.ctime, inode.mtime); st->st_atime = inode.atime; st->st_mtime = inode.mtime; st->st_size = inode.size; @@ -1500,7 +1735,7 @@ void Client::fill_statlite(inode_t& inode, struct statlite *st) st->st_gid = inode.gid; #ifndef DARWIN // FIXME what's going on here with darwin? 
- st->st_ctime = inode.ctime; + st->st_ctime = MAX(inode.ctime, inode.mtime); st->st_atime = inode.atime; st->st_mtime = inode.mtime; #endif @@ -1561,11 +1796,12 @@ int Client::lstatlite(const char *relpath, struct statlite *stl) tout << path << endl; // make mask - int mask = INODE_MASK_BASE | INODE_MASK_PERM; + // FIXME. + int mask = INODE_MASK_BASE | INODE_MASK_AUTH; if (S_ISVALIDSIZE(stl->st_litemask) || S_ISVALIDBLOCKS(stl->st_litemask)) mask |= INODE_MASK_SIZE; - if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_MTIME; - if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_ATIME; + if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_FILE; + if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_FILE; Inode *in = 0; int res = _lstat(path, mask, &in); @@ -1596,15 +1832,15 @@ int Client::chmod(const char *relpath, mode_t mode) tout << mode << endl; - MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, messenger->get_myinst()); req->set_path(path); - req->set_iarg( (int)mode ); + req->args.chmod.mode = mode; // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); req->set_caller_gid(getgid()); - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); delete reply; @@ -1630,10 +1866,10 @@ int Client::chown(const char *relpath, uid_t uid, gid_t gid) tout << gid << endl; - MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, messenger->get_myinst()); req->set_path(path); - req->set_iarg( (int)uid ); - req->set_iarg2( (int)gid ); + req->args.chown.uid = uid; + req->args.chown.gid = gid; // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); @@ -1641,7 +1877,7 @@ int Client::chown(const char *relpath, uid_t uid, gid_t gid) //FIXME enforce caller uid rights? 
- MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); delete reply; @@ -1668,10 +1904,12 @@ int Client::utime(const char *relpath, struct utimbuf *buf) tout << buf->modtime << endl; - MClientRequest *req = new MClientRequest(MDS_OP_UTIME, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_UTIME, messenger->get_myinst()); req->set_path(path); - req->set_targ( buf->modtime ); - req->set_targ2( buf->actime ); + req->args.utime.mtime.tv_sec = buf->modtime; + req->args.utime.mtime.tv_usec = 0; + req->args.utime.atime.tv_sec = buf->actime; + req->args.utime.atime.tv_usec = 0; // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); @@ -1679,7 +1917,7 @@ int Client::utime(const char *relpath, struct utimbuf *buf) //FIXME enforce caller uid rights? - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); delete reply; @@ -1706,9 +1944,9 @@ int Client::mknod(const char *relpath, mode_t mode) tout << mode << endl; - MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, messenger->get_myinst()); req->set_path(path); - req->set_iarg( mode ); + req->args.mknod.mode = mode; // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); @@ -1716,7 +1954,7 @@ int Client::mknod(const char *relpath, mode_t mode) //FIXME enforce caller uid rights? 
- MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); @@ -1752,7 +1990,7 @@ int Client::getdir(const char *relpath, map& contents) tout << path << endl; - MClientRequest *req = new MClientRequest(MDS_OP_READDIR, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst()); req->set_path(path); // FIXME where does FUSE maintain user information @@ -1761,7 +1999,7 @@ int Client::getdir(const char *relpath, map& contents) //FIXME enforce caller uid rights? - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); @@ -1781,16 +2019,18 @@ int Client::getdir(const char *relpath, map& contents) contents[dotdot] = diri->dn->dir->parent_inode->inode; } + // the rest? if (!reply->get_dir_in().empty()) { // only open dir if we're actually adding stuff to it! Dir *dir = diri->open_dir(); assert(dir); - time_t now = time(NULL); + utime_t now = g_clock.real_now(); list::const_iterator pdn = reply->get_dir_dn().begin(); for (list::const_iterator pin = reply->get_dir_in().begin(); pin != reply->get_dir_in().end(); ++pin, ++pdn) { + // ignore . if (*pdn == ".") continue; @@ -1800,10 +2040,14 @@ int Client::getdir(const char *relpath, map& contents) // put in cache Inode *in = this->insert_inode(dir, *pin, *pdn); - if (g_conf.client_cache_stat_ttl) - in->valid_until = now + g_conf.client_cache_stat_ttl; - else if (g_conf.client_cache_readdir_ttl) - in->valid_until = now + g_conf.client_cache_readdir_ttl; + if (g_conf.client_cache_stat_ttl) { + in->valid_until = now; + in->valid_until += g_conf.client_cache_stat_ttl; + } + else if (g_conf.client_cache_readdir_ttl) { + in->valid_until = now; + in->valid_until += g_conf.client_cache_readdir_ttl; + } // contents to caller too! 
contents[*pdn] = in->inode; @@ -1811,7 +2055,6 @@ int Client::getdir(const char *relpath, map& contents) if (dir->is_empty()) close_dir(dir); } - // FIXME: remove items in cache that weren't in my readdir? // *** @@ -2014,7 +2257,7 @@ struct dirent_lite *Client::readdirlite(DIR *dirp) /****** file i/o **********/ -int Client::open(const char *relpath, int flags) +int Client::open(const char *relpath, int flags, mode_t mode) { client_lock.Lock(); @@ -2027,33 +2270,19 @@ int Client::open(const char *relpath, int flags) tout << path << endl; tout << flags << endl; - int cmode = 0; - bool tryauth = false; - if (flags & O_LAZY) - cmode = FILE_MODE_LAZY; - else if (flags & O_WRONLY) { - cmode = FILE_MODE_W; - tryauth = true; - } else if (flags & O_RDWR) { - cmode = FILE_MODE_RW; - tryauth = true; - } else if (flags & O_APPEND) { - cmode = FILE_MODE_W; - tryauth = true; - } else - cmode = FILE_MODE_R; - // go - MClientRequest *req = new MClientRequest(MDS_OP_OPEN, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_OPEN, messenger->get_myinst()); req->set_path(path); - req->set_iarg(flags); - req->set_iarg2(cmode); + req->args.open.flags = flags; + req->args.open.mode = mode; + + int cmode = req->get_open_file_mode(); // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); req->set_caller_gid(getgid()); - MClientReply *reply = make_request(req, tryauth); // try auth if writer + MClientReply *reply = make_request(req); assert(reply); dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl; @@ -2090,13 +2319,15 @@ int Client::open(const char *relpath, int flags) assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq); if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) { + int old_caps = f->inode->caps[mds].caps; + dout(7) << "open got caps " << cap_string(new_caps) + << " (had " << cap_string(old_caps) << ")" << " for " << f->inode->ino() << " seq " << reply->get_file_caps_seq() 
<< " from mds" << mds << endl; - int old_caps = f->inode->caps[mds].caps; f->inode->caps[mds].caps = new_caps; f->inode->caps[mds].seq = reply->get_file_caps_seq(); @@ -2423,7 +2654,7 @@ int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl; // time it. - utime_t start = g_clock.now(); + utime_t start = g_clock.real_now(); // copy into fresh buffer (since our write may be resub, async) bufferptr bp = buffer::copy(buf, size); @@ -2488,7 +2719,7 @@ int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) } // time - utime_t lat = g_clock.now(); + utime_t lat = g_clock.real_now(); lat -= start; if (client_logger) { client_logger->finc("wrlsum",(double)lat); @@ -2507,7 +2738,7 @@ int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) } // mtime - in->file_wr_mtime = in->inode.mtime = g_clock.gettime(); + in->file_wr_mtime = in->inode.mtime = g_clock.real_now(); // ok! 
client_lock.Unlock(); @@ -2515,24 +2746,24 @@ int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) } -int Client::truncate(const char *file, off_t size) +int Client::truncate(const char *file, off_t length) { client_lock.Lock(); - dout(3) << "op: client->truncate(\"" << file << "\", " << size << ");" << endl; + dout(3) << "op: client->truncate(\"" << file << "\", " << length << ");" << endl; tout << "truncate" << endl; tout << file << endl; - tout << size << endl; + tout << length << endl; - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, whoami); + MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); req->set_path(file); - req->set_sizearg( size ); + req->args.truncate.length = length; // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); req->set_caller_gid(getgid()); - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); delete reply; @@ -2743,7 +2974,7 @@ void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) if (dest.is_mon()) { // resend to a different monitor. int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst + dout(0) << "ms_handle_failure " << *m << " to " << inst << ", resending to mon" << mon << endl; messenger->send_message(m, monmap->get_inst(mon)); @@ -2752,14 +2983,12 @@ void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) objecter->ms_handle_failure(m, dest, inst); } else if (dest.is_mds()) { - dout(0) << "ms_handle_failure " << dest << " inst " << inst << endl; - // help! - assert(0); + dout(0) << "ms_handle_failure " << *m << " to " << inst << endl; + //failed_mds.insert(dest.num()); } else { // client? 
- dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << endl; + dout(0) << "ms_handle_failure " << *m << " to " << inst << ", dropping" << endl; delete m; } } diff --git a/trunk/ceph/client/Client.h b/trunk/ceph/client/Client.h index 513a840d62670..ad2953f71dbd8 100644 --- a/trunk/ceph/client/Client.h +++ b/trunk/ceph/client/Client.h @@ -25,11 +25,8 @@ #include "msg/Messenger.h" #include "msg/SerialMessenger.h" -#include "messages/MClientRequest.h" #include "messages/MClientReply.h" -//#include "msgthread.h" - #include "include/types.h" #include "include/lru.h" #include "include/filepath.h" @@ -47,8 +44,10 @@ using namespace std; #include using namespace __gnu_cxx; -#define O_LAZY 01000000 +class MClientSession; +class MClientRequest; +class MClientRequestForward; class Filer; class Objecter; @@ -122,18 +121,18 @@ class InodeCap { class Inode { public: inode_t inode; // the actual inode - time_t valid_until; + utime_t valid_until; // about the dir (if this is one!) int dir_auth; - set dir_contacts; + set dir_contacts; bool dir_hashed, dir_replicated; // per-mds caps map caps; // mds -> InodeCap map stale_caps; // mds -> cap .. 
stale - time_t file_wr_mtime; // [writers] time of last write + utime_t file_wr_mtime; // [writers] time of last write off_t file_wr_size; // [writers] largest offset we've written to int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers @@ -154,6 +153,15 @@ class Inode { list waitfor_lazy; list waitfor_no_read, waitfor_no_write; + void make_path(string& p) { + if (dn) { + if (dn->dir && dn->dir->parent_inode) + dn->dir->parent_inode->make_path(p); + p += "/"; + p += dn->name; + } + } + void get() { ref++; //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl; @@ -165,9 +173,9 @@ class Inode { Inode(inode_t _inode, ObjectCacher *_oc) : inode(_inode), - valid_until(0), + valid_until(0, 0), dir_auth(-1), dir_hashed(false), dir_replicated(false), - file_wr_mtime(0), file_wr_size(0), + file_wr_mtime(0, 0), file_wr_size(0), num_open_rd(0), num_open_wr(0), num_open_lazy(0), ref(0), dir(0), dn(0), symlink(0), fc(_oc, _inode), @@ -317,11 +325,47 @@ class Client : public Dispatcher { int whoami; MonMap *monmap; - // mds fake RPC + // mds sessions + set mds_sessions; + map > waiting_for_session; + + void handle_client_session(MClientSession *m); + void send_reconnect(int mds); + + // mds requests + struct MetaRequest { + tid_t tid; + MClientRequest *request; + bufferlist request_payload; // in case i have to retry + + bool idempotent; // is request idempotent? 
+ set mds; // who i am asking + int resend_mds; // someone wants you to (re)send the request here + int num_fwd; // # of times i've been forwarded + int retry_attempt; + + MClientReply *reply; // the reply + + Cond *caller_cond; // who to take up + Cond *dispatch_cond; // who to kick back + + MetaRequest(MClientRequest *req, tid_t t) : + tid(t), request(req), + idempotent(false), resend_mds(-1), num_fwd(0), retry_attempt(0), + reply(0), + caller_cond(0), dispatch_cond(0) { } + }; tid_t last_tid; - map mds_rpc_cond; - map mds_rpc_reply; - map mds_rpc_dispatch_cond; + map mds_requests; + set failed_mds; + + MClientReply *make_request(MClientRequest *req, int use_auth=-1); + int choose_target_mds(MClientRequest *req); + void send_request(MetaRequest *request, int mds); + void kick_requests(int mds); + void handle_client_request_forward(MClientRequestForward *reply); + void handle_client_reply(MClientReply *reply); + // cluster descriptors MDSMap *mdsmap; @@ -412,6 +456,7 @@ protected: // link to dir dn->dir = dir; + //cout << "link dir " << dir->parent_inode->inode.ino << " '" << name << "' -> inode " << in->inode.ino << endl; dir->dentries[dn->name] = dn; // link to inode @@ -430,7 +475,7 @@ protected: dn->inode = 0; in->dn = 0; put_inode(in); - + // unlink from dir dn->dir->dentries.erase(dn->name); if (dn->dir->is_empty()) @@ -450,6 +495,8 @@ protected: strcpy((char*)dn->name, name.c_str()); dir->dentries[dn->name] = dn; */ + //cout << "relink dir " << dir->parent_inode->inode.ino << " '" << name << "' -> inode " << dn->inode->inode.ino << endl; + dir->dentries[name] = dn; // unlink from old dir @@ -476,11 +523,6 @@ protected: // find dentry based on filepath Dentry *lookup(filepath& path); - // make blocking mds request - MClientReply *make_request(MClientRequest *req, bool auth_best=false, int use_auth=-1); - MClientReply* sendrecv(MClientRequest *req, int mds); - void handle_client_reply(MClientReply *reply); - void fill_stat(inode_t& inode, struct stat *st); 
void fill_statlite(inode_t& inode, struct statlite *st); @@ -501,8 +543,7 @@ protected: // messaging void dispatch(Message *m); - void handle_mount_ack(class MClientMountAck*); - void handle_unmount_ack(Message*); + void handle_unmount(Message*); void handle_mds_map(class MMDSMap *m); // file caps @@ -571,7 +612,7 @@ protected: // file ops int mknod(const char *path, mode_t mode); - int open(const char *path, int mode); + int open(const char *path, int flags, mode_t mode=0); int close(fh_t fh); off_t lseek(fh_t fh, off_t offset, int whence); int read(fh_t fh, char *buf, off_t size, off_t offset=-1); diff --git a/trunk/ceph/client/SyntheticClient.cc b/trunk/ceph/client/SyntheticClient.cc index 66c1c93dab996..d6adf65cbdcf7 100644 --- a/trunk/ceph/client/SyntheticClient.cc +++ b/trunk/ceph/client/SyntheticClient.cc @@ -20,7 +20,8 @@ using namespace std; #include "SyntheticClient.h" #include "include/filepath.h" -#include "mds/MDS.h" +#include "mds/mdstypes.h" +#include "common/Logger.h" #include #include @@ -121,6 +122,16 @@ void parse_syn_options(vector& args) syn_sargs.push_back( args[++i] ); syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"thrashlinks") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_THRASHLINKS ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + + } else if (strcmp(args[i],"foo") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_FOO ); + } else if (strcmp(args[i],"until") == 0) { syn_modes.push_back( SYNCLIENT_MODE_UNTIL ); syn_iargs.push_back( atoi(args[++i]) ); @@ -221,6 +232,11 @@ int SyntheticClient::run() dout(3) << "mode " << mode << endl; switch (mode) { + case SYNCLIENT_MODE_FOO: + if (run_me()) + foo(); + break; + case SYNCLIENT_MODE_RANDOMSLEEP: { int iarg1 = iargs.front(); @@ -339,6 +355,22 @@ int SyntheticClient::run() break; + case SYNCLIENT_MODE_THRASHLINKS: + { + string sarg1 = get_sarg(0); + int 
iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + int iarg4 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "thrashlinks " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; + thrash_links(sarg1.c_str(), iarg1, iarg2, iarg3, iarg4); + } + } + break; + + + case SYNCLIENT_MODE_MAKEFILES: { int num = iargs.front(); iargs.pop_front(); @@ -1121,7 +1153,7 @@ int SyntheticClient::random_walk(int num_req) // descend? if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { string s = get_random_subdir(); - cwd.add_dentry( s ); + cwd.push_dentry( s ); dout(DBL) << "cd " << s << " -> " << cwd << endl; clear_dir(); continue; @@ -1323,3 +1355,111 @@ void SyntheticClient::make_dir_mess(const char *basedir, int n) } + + +void SyntheticClient::foo() +{ + // link fun + client->mknod("one", 0755); + client->mknod("two", 0755); + client->link("one", "three"); + client->mkdir("dir", 0755); + client->link("two", "/dir/twolink"); + client->link("dir/twolink", "four"); + + // unlink fun + client->mknod("a", 0644); + client->unlink("a"); + client->mknod("b", 0644); + client->link("b", "c"); + client->unlink("c"); + client->mkdir("d", 0755); + client->unlink("d"); + client->rmdir("d"); + + // rename fun + client->mknod("p1", 0644); + client->mknod("p2", 0644); + client->rename("p1","p2"); + client->mknod("p3", 0644); + client->rename("p3","p4"); + + // check dest dir ambiguity thing + client->mkdir("dir1", 0755); + client->mkdir("dir2", 0755); + client->rename("p2","dir1/p2"); + client->rename("dir1/p2","dir2/p2"); + client->rename("dir2/p2","/p2"); + + // check primary+remote link merging + client->link("p2","p2.l"); + client->link("p4","p4.l"); + client->rename("p2.l","p2"); + client->rename("p4","p4.l"); + + // check anchor updates + client->mknod("dir1/a", 0644); + client->link("dir1/a", "da1"); + client->link("dir1/a", "da2"); + 
client->link("da2","da3"); + client->rename("dir1/a","dir2/a"); + client->rename("dir2/a","da2"); + client->rename("da1","da2"); + client->rename("da2","da3"); + + // check directory renames + client->mkdir("dir3", 0755); + client->mknod("dir3/asdf", 0644); + client->mkdir("dir4", 0755); + client->mkdir("dir5", 0755); + client->mknod("dir5/asdf", 0644); + client->rename("dir3","dir4"); // ok + client->rename("dir4","dir5"); // fail +} + +int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth, int n) +{ + dout(1) << "thrash_links " << basedir << " " << dirs << " " << files << " " << depth + << " links " << n + << endl; + + if (time_to_stop()) return 0; + + // now link shit up + for (int i=0; ilink(file.c_str(), ln.c_str()); + } + + return 0; +} + + diff --git a/trunk/ceph/client/SyntheticClient.h b/trunk/ceph/client/SyntheticClient.h index adcf7584766e6..59300ee893dc1 100644 --- a/trunk/ceph/client/SyntheticClient.h +++ b/trunk/ceph/client/SyntheticClient.h @@ -55,6 +55,8 @@ #define SYNCLIENT_MODE_TRUNCATE 200 +#define SYNCLIENT_MODE_FOO 100 +#define SYNCLIENT_MODE_THRASHLINKS 101 @@ -101,22 +103,22 @@ class SyntheticClient { while (r--) it++; n1 = cwd; - n1.add_dentry( *it ); + n1.push_dentry( *it ); return n1.get_path().c_str(); } filepath n2; const char *get_random_sub() { assert(!contents.empty()); int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn - if (cwd.depth() && cwd.last_bit().length()) - r += cwd.last_bit().c_str()[0]; // slightly permuted + if (cwd.depth() && cwd.last_dentry().length()) + r += cwd.last_dentry().c_str()[0]; // slightly permuted r %= contents.size(); map::iterator it = contents.begin(); while (r--) it++; n2 = cwd; - n2.add_dentry( it->first ); + n2.push_dentry( it->first ); return n2.get_path().c_str(); } @@ -126,7 +128,7 @@ class SyntheticClient { sprintf(sub_s, "%s.%d", base, rand() % 100); string f = sub_s; sub = cwd; - sub.add_dentry(f); + sub.push_dentry(f); 
return sub.c_str(); } @@ -197,6 +199,10 @@ class SyntheticClient { int play_trace(Trace& t, string& prefix); void make_dir_mess(const char *basedir, int n); + void foo(); + + int thrash_links(const char *basedir, int dirs, int files, int depth, int n); + }; #endif diff --git a/trunk/ceph/client/fuse.cc b/trunk/ceph/client/fuse.cc index 2feb7472d1c7b..b142609ccda73 100644 --- a/trunk/ceph/client/fuse.cc +++ b/trunk/ceph/client/fuse.cc @@ -157,7 +157,7 @@ static int ceph_open(const char *path, struct fuse_file_info *fi) { int res; - res = client->open(path, fi->flags); + res = client->open(path, fi->flags, 0); if (res < 0) return res; fi->fh = res; return 0; // fuse wants 0 onsucess diff --git a/trunk/ceph/cmonctl.cc b/trunk/ceph/cmonctl.cc new file mode 100644 index 0000000000000..d6e9e04bdd9db --- /dev/null +++ b/trunk/ceph/cmonctl.cc @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "mon/MonMap.h" +#include "msg/SimpleMessenger.h" +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" + +#include "common/Timer.h" + +#ifndef DARWIN +#include +#endif // DARWIN + +#include +#include +#include + + +Messenger *messenger = 0; + +class Admin : public Dispatcher { + void dispatch(Message *m) { + switch (m->get_type()) { + case MSG_MON_COMMAND_ACK: + dout(0) << m->get_source() << " -> '" + << ((MMonCommandAck*)m)->rs << "' (" << ((MMonCommandAck*)m)->r << ")" + << endl; + messenger->shutdown(); + break; + } + } +} dispatcher; + +int main(int argc, char **argv, char *envp[]) { + + vector args; + argv_to_vec(argc, argv, args); + parse_config_options(args); + + // args for fuse + vec_to_argv(args, argc, argv); + + // load monmap + MonMap monmap; + int r = monmap.read(".ceph_monmap"); + assert(r >= 0); + + // build command + MMonCommand *m = new MMonCommand; + string cmd; + for (unsigned i=0; icmd.push_back(string(args[i])); + } + int mon = monmap.pick_mon(); + + dout(0) << "mon" << mon << " <- '" << cmd << "'" << endl; + + // start up network + rank.start_rank(); + messenger = rank.register_entity(entity_name_t(entity_name_t::TYPE_ADMIN)); + messenger->set_dispatcher(&dispatcher); + + // send it + messenger->send_message(m, monmap.get_inst(mon)); + + // wait for messenger to finish + rank.wait(); + + return 0; +} + diff --git a/trunk/ceph/common/Clock.h b/trunk/ceph/common/Clock.h index 106e9e9f23701..9f4e581cf2711 100644 --- a/trunk/ceph/common/Clock.h +++ b/trunk/ceph/common/Clock.h @@ -21,113 +21,10 @@ #include #include -#include #include "Mutex.h" - -// -------- -// utime_t - -class utime_t { - private: - struct timeval tv; - - struct timeval& timeval() { return tv; } - friend class Clock; - - - public: - void normalize() { - if (tv.tv_usec > 1000*1000) { - tv.tv_sec += tv.tv_usec / (1000*1000); - tv.tv_usec %= 1000*1000; - } - } - - // 
cons - utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); } - utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); } - - // accessors - time_t sec() const { return tv.tv_sec; } - long usec() const { return tv.tv_usec; } - int nsec() const { return tv.tv_usec*1000; } - - // ref accessors/modifiers - time_t& sec_ref() { return tv.tv_sec; } - // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin. - // is just casting it to long& OK? - long& usec_ref() { return (long&) tv.tv_usec; } - - // cast to double - operator double() { - return (double)sec() + ((double)usec() / 1000000.0L); - } -}; - -// arithmetic operators -inline utime_t operator+(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, - (l.usec()+r.usec())%1000000L ); -} -inline utime_t& operator+=(utime_t& l, const utime_t& r) { - l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; - l.usec_ref() += r.usec(); - l.usec_ref() %= 1000000L; - return l; -} -inline utime_t& operator+=(utime_t& l, double f) { - double fs = trunc(f); - double us = (f - fs) / (double)1000000.0; - l.sec_ref() += (long)fs; - l.usec_ref() += (long)us; - l.normalize(); - return l; -} - -inline utime_t operator-(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() - r.sec() - (l.usec()= r.usec()) - l.usec_ref() -= r.usec(); - else { - l.usec_ref() += 1000000L - r.usec(); - l.sec_ref()--; - } - return l; -} -inline utime_t& operator-=(utime_t& l, double f) { - l += -f; - return l; -} - -inline bool operator>(const utime_t& a, const utime_t& b) -{ - return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); -} -inline bool operator<(const utime_t& a, const utime_t& b) -{ - return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); -} - -// ostream -inline std::ostream& operator<<(std::ostream& out, const utime_t& t) -{ - //return out << t.sec() << "." 
<< t.usec(); - out << (long)t.sec() << "."; - out.setf(std::ios::right); - out.fill('0'); - out << std::setw(6) << t.usec(); - out.unsetf(std::ios::right); - return out; - - //return out << (long)t.sec << "." << ios::setf(ios::right) << ios::fill('0') << t.usec() << ios::usetf(); -} - +#include "include/utime.h" diff --git a/trunk/ceph/common/Thread.h b/trunk/ceph/common/Thread.h index 43c5f57f4a96c..d1ae1e7674165 100644 --- a/trunk/ceph/common/Thread.h +++ b/trunk/ceph/common/Thread.h @@ -26,9 +26,7 @@ class Thread { Thread() : thread_id(0) {} virtual ~Thread() {} - pthread_t &get_thread_id() { return thread_id; } - bool is_started() { return thread_id != 0; } - + protected: virtual void *entry() = 0; private: @@ -37,14 +35,13 @@ class Thread { } public: + pthread_t &get_thread_id() { return thread_id; } + bool is_started() { return thread_id != 0; } + bool am_self() { return (pthread_self() == thread_id); } + int create() { return pthread_create( &thread_id, NULL, _entry_func, (void*)this ); } - - bool am_self() { - return (pthread_self() == thread_id); - } - int join(void **prval = 0) { if (thread_id == 0) { cerr << "WARNING: join on thread that was never started" << endl; @@ -53,14 +50,27 @@ class Thread { } int status = pthread_join(thread_id, prval); - if (status == 0) - thread_id = 0; - else { - cout << "join status = " << status << endl; - assert(0); + if (status != 0) { + switch (status) { + case -EINVAL: + cerr << "thread " << thread_id << " join status = EINVAL" << endl; + break; + case -ESRCH: + cerr << "thread " << thread_id << " join status = ESRCH" << endl; + assert(0); + break; + case -EDEADLK: + cerr << "thread " << thread_id << " join status = EDEADLK" << endl; + break; + default: + cerr << "thread " << thread_id << " join status = " << status << endl; + } + assert(0); // none of these should happen. 
} + thread_id = 0; return status; } + }; #endif diff --git a/trunk/ceph/common/Timer.h b/trunk/ceph/common/Timer.h index 88d9929ac5ae1..80470c3615737 100644 --- a/trunk/ceph/common/Timer.h +++ b/trunk/ceph/common/Timer.h @@ -53,15 +53,11 @@ class Timer { map< utime_t, set > scheduled; // time -> (context ...) hash_map< Context*, utime_t > event_times; // event -> time - // get time of the next event - //Context* get_next_scheduled(utime_t& when); - bool get_next_due(utime_t &when); void register_timer(); // make sure i get a callback void cancel_timer(); // make sure i get a callback - //pthread_t thread_id; bool thread_stop; Mutex lock; bool timed_sleep; diff --git a/trunk/ceph/config.cc b/trunk/ceph/config.cc index 6820ffa327b9f..345f979be929d 100644 --- a/trunk/ceph/config.cc +++ b/trunk/ceph/config.cc @@ -34,19 +34,10 @@ Mutex bufferlock; Mutex _dout_lock; -FileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20, 2 ); // stripe over 1M objects, 2x replication -//FileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 ); // 128k stripes over sets of 4 +FileLayout g_OSD_FileLayout( 1<<23, 1, 1<<23, 2 ); // stripe over 8M objects, 2x replication +FileLayout g_OSD_MDDirLayout( 1<<23, 1, 1<<23, 2 ); // 8M objects, 2x replication. (a lie) +FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects, 2x replication -// ?? -//FileLayout g_OSD_MDDirLayout( 1<<8, 1<<2, 1<<19, 3 ); // this is stupid, but can bring out an ebofs table bug? -FileLayout g_OSD_MDDirLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects, 2x replication - -// stripe mds log over 128 byte bits (see mds_log_pad_entry below to match!) 
-FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects -//FileLayout g_OSD_MDLogLayout( 1<<8, 1<<2, 1<<19, 3 ); // 256 byte bits -//FileLayout g_OSD_MDLogLayout( 1<<7, 32, 1<<20, 3 ); // 128 byte stripes over 32 1M objects -//FileLayout g_OSD_MDLogLayout( 57, 32, 1<<20 ); // pathological case to test striping buffer mapping -//FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20 ); // old way // fake osd failures: osd -> time std::map g_fake_osd_down; @@ -176,6 +167,8 @@ md_config_t g_conf = { mds_log_before_reply: true, mds_log_flush_on_shutdown: true, mds_log_import_map_interval: 1024*1024, // frequency (in bytes) of EImportMap in log + mds_log_eopen_size: 100, // # open inodes per log entry + mds_bal_replicate_threshold: 2000, mds_bal_unreplicate_threshold: 0,//500, mds_bal_hash_rd: 10000, @@ -203,6 +196,8 @@ md_config_t g_conf = { mds_local_osd: false, + mds_thrash_exports: 0, + mds_dump_cache_on_map: false, // --- osd --- osd_rep: OSD_REP_PRIMARY, @@ -633,6 +628,10 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--mds_local_osd") == 0) g_conf.mds_local_osd = atoi(args[++i]); + else if (strcmp(args[i], "--mds_thrash_exports") == 0) + g_conf.mds_thrash_exports = atoi(args[++i]); + else if (strcmp(args[i], "--mds_dump_cache_on_map") == 0) + g_conf.mds_dump_cache_on_map = true; else if (strcmp(args[i], "--client_use_random_mds") == 0) g_conf.client_use_random_mds = true; diff --git a/trunk/ceph/config.h b/trunk/ceph/config.h index d88e6d6c664b2..11bc3c2827048 100644 --- a/trunk/ceph/config.h +++ b/trunk/ceph/config.h @@ -168,6 +168,7 @@ struct md_config_t { bool mds_log_before_reply; bool mds_log_flush_on_shutdown; off_t mds_log_import_map_interval; + int mds_log_eopen_size; float mds_bal_replicate_threshold; float mds_bal_unreplicate_threshold; @@ -195,6 +196,8 @@ struct md_config_t { bool mds_local_osd; + int mds_thrash_exports; + bool mds_dump_cache_on_map; // osd int osd_rep; diff --git a/trunk/ceph/doc/Replication.txt 
b/trunk/ceph/doc/Replication.txt deleted file mode 100644 index 0f8d4c9079e4d..0000000000000 --- a/trunk/ceph/doc/Replication.txt +++ /dev/null @@ -1,19 +0,0 @@ - -Primary copy replication. - -Inodes: - -- The primary's list of replicas (cached_by) is inclusive at all times. -- The primary's list never includes the local node. -- The primary's list of replicas will only include non-replicas when the relevant CacheExpire notifications are in-flight. - -- Replicas can be created in two ways: - - via a Discover + DiscoverReply - - via an export and import. (The old auth keeps a copy, and adds itself to the replica list as it exports.) - - -Directories (and their dentries): - -- The primary has an open_by list that is inclusive at all times. -- ..Never includes local node -- No per-dentry replica lists. All dentry lock operations (for unlink, etc.) are sent to all nodes in open_by list. \ No newline at end of file diff --git a/trunk/ceph/doc/anchortable.txt b/trunk/ceph/doc/anchortable.txt new file mode 100644 index 0000000000000..d9c0fefc31e08 --- /dev/null +++ b/trunk/ceph/doc/anchortable.txt @@ -0,0 +1,54 @@ + +ANCHOR TABLE PROTOCOL + +MDS sends an update PREPARE to the anchortable MDS. The prepare is +identified by the ino and operation type; only one for each type +(create, update, destroy) can be pending at any time. Both parties +may actually be the same local node, but for simplicity we treat that +situation the same. (That is, we act as if they may fail +independently, even if they can't.) + +The anchortable journals the proposed update, and responds with an +AGREE and a version number. This uniquely identifies the request. + +The MDS can then update the filesystem metadata however it sees fit. +When it is finished (and the results journaled), it sends a COMMIT to +the anchortable. The table journals the commit, frees any state from +the transaction, and sends an ACK. The initiating MDS should then +journal the ACK to complete the transaction. 
+ + +ANCHOR TABLE FAILURE + +If the AT fails before journaling the PREPARE and sending the AGREE, +the initiating MDS will simply retry the request. + +If the AT fails after journaling PREPARE but before journaling COMMIT, +it will resend AGREE to the initiating MDS. + +If the AT fails after the COMMIT, the transaction has been closed, and it +takes no action. If it receives a COMMIT for which it has no open +transaction, it will reply with ACK. + + +INITIATING MDS FAILURE + +If the MDS fails before the metadata update has been journaled, no +action is taken, since nothing is known about the previously proposed +transaction. If an AGREE message is received and there is no +corresponding PREPARE or pending-commit state, a ROLLBACK is sent to +the anchor table. + +If the MDS fails after journaling the metadata update but before +journaling the ACK, it resends COMMIT to the anchor table. If it +receives an AGREE after resending the COMMIT, it simply ignores the +AGREE. The anchortable will respond with an ACK, allowing the +initiating MDS to journal the final ACK and close out the transaction +locally. + +On journal replay, each metadata update (EMetaBlob) encountered that +includes an anchor transaction is noted in the AnchorClient by adding +it to the pending_commit list, and each journaled ACK is removed from +that list. Journal replay may encounter ACKs with no prior metadata +update; these are ignored. When recovery finishes, a COMMIT is sent +for all outstanding transactions. diff --git a/trunk/ceph/doc/caching.txt b/trunk/ceph/doc/caching.txt index 77b02480bcd6e..fe0c78331bd86 100644 --- a/trunk/ceph/doc/caching.txt +++ b/trunk/ceph/doc/caching.txt @@ -1,199 +1,301 @@ +SPANNING TREE PROPERTY + +All metadata that exists in the cache is attached directly or +indirectly to the root inode. That is, if the /usr/bin/vi inode is in +the cache, then /usr/bin, /usr, and / are too, including the inodes, +directory objects, and dentries.
+ AUTHORITY The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a serial (normally 0) to +Additionally, each replica is assigned a nonce (initial 0) to disambiguate multiple replicas of the same item (see below). - set cached_by; - map cached_by_serial; + map replicas; // maps replicating mds# to nonce The cached_by set _always_ includes all nodes that cache the -partcuarly inode, but may additionally include nodes that used to +particular object, but may additionally include nodes that used to cache it but no longer do. In those cases, an expire message should -be in transit. - - -REPLICA - -The replica maintains a notion of who it believes is the authority for -each replicated inode. There are two possibilities: - - - Ordinarily, this notion is correct. - - If the part of the file system in question was recently exported to - a new MDS, the inodes old authority is acting as a CACHEPROXY, - and will forward relevant messages on to the authority. - -When a repica is expired from cache, and expire is sent to the -authority. The expire includes the serial number issued when the -replica was originally created to disambiguate potentially concurrent -replication activity. - - -EXPORTS - -- The old authority suddenly becomes a replica. It's serial is well - defined. It also becomes a CACHEPROXY, which means its cached_by - remains defined (with an alternate meaning!). While a proxy, the - node will forward relevant messages from the replica to the - authority (but not the other way around--the authority knows all - replicas). - -- Once the export is acked, the old authority sends a - message to the replica notifying it of the new authority. As soon - as all replicas acknowedge receipt of this notice, the old authority - can cease CACHEPROXY responsibilities and become a regular replica. - At this point it's cached_by is no longer defined.
- -- Replicas always know who the authority for the inode is, OR they - know prior owner acting as a CACHEPROXY. (They don't know which it - is.) - +be in transit. That is, we have two invariants: -CACHED_BY + 1) the authority's replica set will always include all actual + replicas, and -The authority always has an inclusive list of nodes who cache an item. -As such it can confidently send updates to replicas for locking, -invalidating, etc. When a replica is expired from cache, an expire is -sent to the authority. If the serial matches, the node is removed -from the cached_by list. + 2) cache expiration notices will be reliably delivered to the + authority. +The second invariant is particularly important because the presence of +replicas will pin the metadata object in memory on the authority, +preventing it from being trimmed from the cache. Notification of +expiration of the replicas is required to allow previously replicated +objects to eventually be trimmed from the cache as well. +Each metadata object has an authority bit that indicates whether it is +authoritative or a replica. +REPLICA NONCE + +Each replicated object maintains a "nonce" value, issued by the +authority at the time the replica was created. If the authority has +already created a replica for the given MDS, the new replica will be +issued a new (incremented) nonce. This nonce is attached +to cache expirations, and allows the authority to disambiguate +expirations when multiple replicas of the same object are created and +cache expiration is coincident with replication. That is, when an +old replica is expired from the replicating MDS at the same time that +a new replica is issued by the authority and the resulting messages +cross paths, the authority can tell that it was the old replica that +was expired and effectively ignore the expiration message. The +replica is removed from the replicas map only if the nonce matches.
-SUBTREE AUTHORITY DELEGATION: imports versus hashing -Authority is generally defined recursively: an inode's authority -matches the containing directory, and a directory's authority matches -the directory inode's. Thus the authority delegation chain can be -broken/redefined in two ways: +SUBTREE PARTITION - - Imports and exports redefine the directory inode -> directory - linkage, such that the directory authority is explicitly specified - via dir.dir_auth: +Authority of the file system namespace is partitioned using a +subtree-based partitioning strategy. This strategy effectively +separates directory inodes from directory contents, such that the +directory contents are the unit of redelegation. That is, if / is +assigned to mds0 and /usr to mds1, the inode for /usr will be managed +by mds0 (it is part of the / directory), while the contents of /usr +(and everything nested beneath it) will be managed by mds1. - dir.dir_auth == -1 -> directory matches its inode - dir.dir_auth >= 0 -> directory authority is dir.dir_auth +The description for this partition exists solely in the collective +memory of the MDS cluster and in the individual MDS journals. It is +not described in the regular on-disk metadata structures. This is +related to the fact that authority delegation is a property of the +{\it directory} and not the directory's {\it inode}. - - Hashed directories redefine the directory -> inode linkage. In - non-hashed directories, inodes match their containing directory. - In hashed directories, each dentry's authority is defined by a hash - function. +Subsequently, if an MDS is authoritative for a directory inode and does +not yet have any state associated with the directory in its cache, +then it can assume that it is also authoritative for the directory. 
- inode.hash_seed == 0 -> inode matches containing directory - inode.hash_seed > 0 -> defined by hash(hash_seed, dentry) +Directory state consists of a data object that describes any cached +dentries contained in the directory, information about the +relationship between the cached contents and what appears on disk, and +any delegation of authority. That is, each CDir object has a dir_auth +element. Normally dir_auth has a value of AUTH_PARENT, meaning that +the authority for the directory is the same as the directory's inode. +When dir_auth specifies another metadata server, that directory is a +point of authority delegation and becomes a {\it subtree root}. A +CDir is a subtree root iff its dir_auth specifies an MDS id (and is not +AUTH_PARENT). -A directory's "containing_import" (bad name, FIXME) is either the -import or hashed directory that is responsible for delegating a -subtree. Note that the containing_import of a directory may be itself -because it is an import, but it cannot be itself because it is hashed. + - A dir is a subtree root iff dir_auth != AUTH_PARENT. -Thus: + - If dir_auth = AUTH_PARENT then the inode auth == dir auth, but the + converse may not be true. - - Import and export operations' manipulation of dir_auth is - completely orthogonal to hashing operations. Hashing methods can - ignore dir_auth, except when they create imports/exports (and break - the inode<->dir auth linkage). +The authority for any metadata object in the cache can be determined +by following the parent pointers toward the root until a subtree root +CDir object is reached, at which point the authority is specified by +its dir_auth. - - Hashdirs act sort of like imports in that they bound an - authoritative region. That is, either hashdirs or imports can be - the key for nested_exports. In some cases, a dir may be both an - import and a hash.
+Each MDS cache maintains a subtree data structure that describes the +subtree partition for all objects currently in the cache: - - Export_dir won't export a hashdir. This is because it's tricky - (tho not necessarily impossible) due to the way nested_exports is - used with imports versus hashdirs. + map< CDir*, set > subtrees; + - A dir will appear in the subtree map (as a key) IFF it is a subtree + root. +Each subtree root will have an entry in the map. The map value is a +set of all other subtree roots nested beneath that point. Nested +subtree roots effectively bound or prune a subtree. For example, if +we had the following partition: + mds0 / + mds1 /usr + mds0 /usr/local + mds0 /home -FREEZING - -There are two types of freezing: +The subtree map on mds0 would be - - TREE: recursively freezes everything nested beneath a directory, - until an export of edge of cache is reached. - - DIR: freezes the contents of a single directory. + / -> (/usr, /home) + /home -> () -Some notes: +and on mds1: - - Occurs on the authoritative node only. + /usr -> (/usr/local) - - Used for suspending critical operations while migrating authority - between nodes or hashing/unhashing directories. - - Freezes the contents of the cache such that items may not be added, - items cannot be auth pinned, and/or subsequently reexported. The - namespace of the affected portions of the hierarchy may not change. - The content of inodes and other orthogonal operations - (e.g. replication, inode locking and modification) are unaffected. - -Two states are defined: freezing and frozen. The freezing state is -used while waiting for auth_pins to be removed. Once all auth_pins -are gone, the state is changed to frozen. New auth_pins cannot be -added while freezing or frozen. +AMBIGUOUS DIR_AUTH + +While metadata for a subtree is being migrated between two MDS nodes, +the dir_auth for the subtree root is allowed to be ambiguous. 
That +is, it will specify both the old and new MDS ids, indicating that a +migration is in progress. + +If a replicated metadata object is expired from the cache from a +subtree whose authority is ambiguous, the cache expiration is sent to +both potential authorities. This ensures that the message will be +reliably delivered, even if either of those nodes fails. A number of +alternative strategies were considered. Sending the expiration to the +old or new authority and having it forwarded if authority has been +delegated can result in message loss if the forwarding node fails. +Pinning ambiguous metadata in cache is computationally expensive for +implementation reasons, and delaying the transmission of expiration +messages is difficult to implement because the replicating MDS must send +the final expiration messages when the subtree authority is +disambiguated, forcing it to keep certain elements of its cache in +memory. Although duplicated expirations incur a small communications +overhead, the implementation is much simpler. AUTH PINS -An auth pin keeps a given item on the authoritative node until it is -removed. The pins are tracked recursively, so that a subtree cannot -be frozen if it contains any auth pins. - -If a pin is placed on a non-authoritative item, the item is allowed to -become authoritative; the specific restriction is it cannot be frozen, -which only happens during export-type operations. - - -TYPES OF EXPORTS - -- Actual export of a subtree from one node to another -- A rename between directories on different nodes exports the renamed -_inode_. (If it is a directory, it becomes an export such that the -directory itself does not move.) -- A hash or unhash operation will migrate inodes within the directory -either to or from the directory's main authority.
+Most operations that modify metadata must allow some amount of time to +pass in order for the operation to be journaled or for communication +to take place between the object's authority and any replicas. For +this reason it must not only be pinned in the authority's metadata +cache, but also be locked such that the object's authority is not +allowed to change until the operation completes. This is accomplished +using {\it auth pins}, which increment a reference counter on the +object in question, as well as all parent metadata objects up to the +root of the subtree. As long as the pin is in place, it is impossible +for that subtree (or any fragment of it that contains one or more +pins) to be migrated to a different MDS node. Pins can be placed on +both inodes and directories. -EXPORT PROCESS +Auth pins can only exist for authoritative metadata, because they are +only created if the object is authoritative, and their presence +prevents the migration of authority. +FREEZING +More specifically, auth pins prevent a subtree from being frozen. +When a subtree is frozen, all updates to metadata are forbidden. This +includes updates to the replicas map that describes which replicas +(and nonces) exist for each object. + +In order for metadata to be migrated between MDS nodes, it must first +be frozen. The root of the subtree is initially marked as {\it +freezing}. This prevents the creation of any new auth pins within the +subtree. After all existing auth pins are removed, the subtree is +then marked as {\it frozen}, at which point all updates are +forbidden. This allows metadata state to be packaged up in a message +and transmitted to the new authority, without worrying about +intervening updates. + +If the directory at the base of a freezing or frozen subtree is not +also a subtree root (that is, it has dir_auth == AUTH_PARENT), the +directory's parent inode is auth pinned. + + - a frozen tree root dir will auth_pin its inode IFF it is auth AND + not a subtree root.
+ +This prevents a parent directory from being concurrently frozen, and a +range of resulting implementation complications relating to metadata +migration. + + +CACHE EXPIRATION FOR FROZEN SUBTREES + +Cache expiration messages that are received for a subtree that is +frozen are temporarily set aside instead of being processed. Only +when the subtree is unfrozen are the expirations either processed (if +the MDS is authoritative) or discarded (if it is not). Because either +the exporting or importing MDS can fail during the migration +process, the MDS cannot tell whether it will be authoritative or not +until the process completes. + +During a migration, the subtree will first be frozen on both the +exporter and importer, and then all other replicas will be informed of +a subtree's ambiguous authority. This ensures that all expirations +during migration will go to both parties, and nothing will be lost in +the event of a failure. + + + + +NORMAL MIGRATION + +The exporter begins by doing some checks in export_dir() to verify +that it is permissible to export the subtree at this time. In +particular, the cluster must not be degraded, the subtree root may not +be freezing or frozen, and the path must be pinned (\ie not conflicted +with a rename). If these conditions are met, the subtree root +directory is temporarily auth pinned, the subtree freeze is initiated, +and the exporter is committed to the subtree migration, barring an +intervening failure of the importer or itself. + +The MExportDiscover serves simply to ensure that the inode for the +base directory being exported is open on the destination node. It is +pinned by the importer to prevent it from being trimmed. This occurs +before the exporter completes the freeze of the subtree to ensure that +the importer is able to replicate the necessary metadata. When the +exporter receives the MDiscoverAck, it allows the freeze to proceed by +removing its temporary auth pin.
+ +The MExportPrep message then follows to populate the importer with a +spanning tree that includes all dirs, inodes, and dentries necessary +to reach any nested subtrees within the exported region. This +replicates metadata as well, but it is pushed out by the exporter, +avoiding deadlock with the regular discover and replication process. +The importer is responsible for opening the bounding directories from +any third parties authoritative for those subtrees before +acknowledging. This ensures that the importer has correct dir_auth +information about where authority is redelegated for all points nested +beneath the subtree being migrated. While processing the MExportPrep, +the importer freezes the entire subtree region to prevent any new +replication or cache expiration. + +A warning stage occurs only if the base subtree directory is open by +nodes other than the importer and exporter. If it is not, then this +implies that no metadata within or nested beneath the subtree is +replicated by any node other than the importer and exporter. If it is, +then an MExportWarning message informs any bystanders that the +authority for the region is temporarily ambiguous, and lists both the +exporter and importer as authoritative MDS nodes. In particular, +bystanders who are trimming items from their cache must send +MCacheExpire messages to both the old and new authorities. This is +necessary to ensure that the surviving authority reliably receives all +expirations even if the importer or exporter fails. While the subtree +is frozen (on both the importer and exporter), expirations will not be +immediately processed; instead, they will be queued until the region +is unfrozen and it can be determined that the node is or is not +authoritative. + +The exporter walks the subtree hierarchy and packages up an MExport +message containing all metadata and important state (\eg, information +about metadata replicas).
At the same time, the exporter's metadata +objects are flagged as non-authoritative. The MExport message sends +the actual subtree metadata to the importer. Upon receipt, the +importer inserts the data into its cache, marks all objects as +authoritative, and logs a copy of all metadata in an EImportStart +journal message. Once that has safely flushed, it replies with an +MExportAck. The exporter can now log an EExport journal entry, which +ultimately specifies that the export was a success. In the presence +of failures, it is the existence of the EExport entry only that +disambiguates authority during recovery. + +Once logged, the exporter will send an MExportNotify to any +bystanders, informing them that the authority is no longer ambiguous +and cache expirations should be sent only to the new authority (the +importer). Once these are acknowledged back to the exporter, +implicitly flushing the bystander to exporter message streams of any +stray expiration notices, the exporter unfreezes the subtree, cleans +up its migration-related state, and sends a final MExportFinish to the +importer. Upon receipt, the importer logs an EImportFinish(true) +(noting locally that the export was indeed a success), unfreezes its +subtree, processes any queued cache expirations, and cleans up its +state. + + +PARTIAL FAILURE RECOVERY + + + + +RECOVERY FROM JOURNAL -HASHING - -- All nodes discover and open directory - -- Prep message distributes subdir inode replicas for exports so that - peers can open those dirs. This is necessary because subdirs are - converted into exports or imports as needed to avoid migrating - anything except the hashed dir itself. The prep is needed for the - same reasons its important with exports: the inode authority must - always have the exported dir open so that it gets accurate dir - authority updates, and can keep the inode->dir_auth up to date. - -- MHashDir messsage distributes the directory contents.
- -- While auth is frozen_dir, we can't get_or_open_dir. Otherwise the - Prep messages won't be inclusive of all dirs, and the - imports/exports won't get set up properly. - -TODO -readdir -- subtrees stop at hashed dir. hashed dir's dir_auth follows parent - subtree, unless the dir is also an explicit import. thus a hashed - dir can also be an import dir. -bananas -apples -blueberries -green pepper -carrots -celery diff --git a/trunk/ceph/doc/exports.txt b/trunk/ceph/doc/exports.txt new file mode 100644 index 0000000000000..8e0e146bea2fe --- /dev/null +++ b/trunk/ceph/doc/exports.txt @@ -0,0 +1,72 @@ + +NORMAL MIGRATION + +The exporter begins by doing some checks in export_dir() to verify +that it is permissible to export the subtree at this time. In +particular, the cluster must not be degraded, the subtree root may not +be freezing or frozen (\ie already exporting, or nested beneath +something that is exporting), and the path must be pinned (\ie not +conflicted with a rename). If these conditions are met, the subtree +freeze is initiated, and the exporter is committed to the subtree +migration, barring an intervening failure of the importer or itself. + +The MExportDiscover serves simply to ensure that the base directory +being exported is open on the destination node. It is pinned by the +importer to prevent it from being trimmed. This occurs before the +exporter completes the freeze of the subtree to ensure that the +importer is able to replicate the necessary metadata. When the +exporter receives the MDiscoverAck, it allows the freeze to proceed. + +The MExportPrep message then follows to populate a spanning tree that +includes all dirs, inodes, and dentries necessary to reach any nested +exports within the exported region. This replicates metadata as well, +but it is pushed out by the exporter, avoiding deadlock with the +regular discover and replication process. 
The importer is responsible +for opening the bounding directories from any third parties before +acknowledging. This ensures that the importer has correct dir_auth +information about where authority is delegated for all points nested +within the subtree being migrated. While processing the MExportPrep, +the importer freezes the entire subtree region to prevent any new +replication or cache expiration. + +The warning stage occurs only if the base subtree directory is open by +nodes other than the importer and exporter. If so, then a +MExportWarning message informs any bystanders that the authority for +the region is temporarily ambiguous. In particular, bystanders who +are trimming items from their cache must send MCacheExpire messages to +both the old and new authorities. This is necessary to ensure that +the surviving authority reliably receives all expirations even if the +importer or exporter fails. While the subtree is frozen (on both the +importer and exporter), expirations will not be immediately processed; +instead, they will be queued until the region is unfrozen and it can +be determined that the node is or is not authoritative for the region. + +The MExport message sends the actual subtree metadata to the importer. +Upon receipt, the importer inserts the data into its cache, logs a +copy in the EImportStart, and replies with an ExportAck. The exporter +can now log an EExportFinish(true), which ultimately specifies that +the export was a success. In the presence of failures, it is the +existence (and value) of the EExportFinish that disambiguates +authority during recovery. + +Once logged, the exporter will send an MExportNotify to any +bystanders, informing them that the authority is no longer ambiguous +and cache expirations should be sent only to the new authority (the +importer). 
Once these are acknowledged, implicitly flushing the +bystander to exporter message streams of any stray expiration notices, +the exporter unfreezes the subtree, cleans up its state, and sends a +final MExportFinish to the importer. Upon receipt, the importer logs +an EImportFinish(true), unfreezes its subtree, and cleans up its +state. + + +PARTIAL FAILURE RECOVERY + + + +RECOVERY FROM JOURNAL + + + + + diff --git a/trunk/ceph/doc/mds_locks.txt b/trunk/ceph/doc/mds_locks.txt new file mode 100644 index 0000000000000..f41a89a9b31e5 --- /dev/null +++ b/trunk/ceph/doc/mds_locks.txt @@ -0,0 +1,66 @@ + +new names + dentry_read (not path_pins) + dentry_xlock + + inode_read + inode_xlock (not inode_write) + +locks are always tied to active_requests. + +read locks can be placed on any node. +xlocks must be applied at the authority. + +for multi-lock operations (link, unlink, rename), we must acquire xlocks on a remote node. lock requests are associated with a reqid. the authoritative node keeps track of which remote xlocks it holds. when forwarded/restarted, it can drop remote locks. + +when restarting, drop all locks. +on remote, drop locks and state, and notify main req node. +recover dist request state on rejoin: + - surviving op initiator will assert read or xlock + - recovering op initiator will restart requests. (from initiator's perspective, ops have either happened or they haven't, depending on whether the event is journaled.) + - recovering or surviving op cohort will determine lock state during rejoin, or get a commit or rollback... + - + + +--- path_pin = read lock on /some/random/path + - blocks a dentry xlock + +--- dnxlock = exclusive lock on /some/random/path + - locking: prevents subsequent path pins. + - locked: prevents dn read + - on auth + +-> grab _all_ path pins at once; hold none while waiting. +-> grab xlocks in order. + +--- auth_pin = pin to authority, on *dir, *in + - prevents freezing -> frozen.
+ - freezing blocks new auth pins, thus blocking other local auth_pins. (hangs up local export.) + - does not block remote auth_pins, because remote side is not auth (or frozen!) until after local subtree is frozen. + +-> blocking on auth_pins is dangerous. _never_ block if we are holding other auth_pins on the same node (subtree?). +-> grab _all_ auth pins at once; hold none while waiting. + +--- hard/file_wrlock = exclusive lock on inode content + - prevents inode read + - on auth + +-> grab locks in order. + + +ORDERING +- namespace(dentries) < inodes +- order dentries on (dirino, dname) +- order inodes on (ino); +- need to order both read and write locks, esp with dentries. so, if we need to lock /usr/bin/foo with read on usr and bin and xwrite on foo, we need to acquire all of those locks using the same ordering. + - on same host, we can be 'nice' and check lockability of all items, then lock all, and drop everything while waiting. (actually, is there any use to this?) + - on multiple hosts, we need to use full ordering (at least as things separate across host boundaries). and if needed lock set changes (such that the order of already acquired locks changes), we need to drop those locks and start over. + +- how do auth pins fit into all this? + - auth pin on xlocks only. no need on read locks. + - pre-grab all auth pins on a node the first time it is visited during lock acquisition. + - what if things move? if we find we are missing a needed auth pin when we revisit a host at any point, and the item is not still authpinnable, we back off and restart. (we cannot block.) + - + - if we find we are not authpinnable, drop all locks and wait. + + diff --git a/trunk/ceph/doc/performance.txt b/trunk/ceph/doc/performance.txt deleted file mode 100644 index 7ca278bd284b1..0000000000000 --- a/trunk/ceph/doc/performance.txt +++ /dev/null @@ -1,36 +0,0 @@ - - -quick performance test 2005-05-11.
fakemds, 100x100, asdf/asdf, debug 13 - -g marshalling -real 3m8.697s -user 2m53.282s -sys 0m6.291s - -real 3m3.337s -user 2m49.467s -sys 0m6.243s - - -g no marshalling -real 2m1.464s -user 1m42.680s -sys 0m8.128s - -real 1m49.469s -user 1m34.523s -sys 0m6.410s - - -O3 marshalling -real 1m29.833s -user 1m11.474s -sys 0m7.588s - -real 1m9.439s -user 0m56.071s -sys 0m5.643s - - - -O3 no marshalling -real 1m2.739s -user 0m46.578s -sys 0m7.882s - diff --git a/trunk/ceph/ebofs/Ebofs.h b/trunk/ceph/ebofs/Ebofs.h index 6d18b7a0204fa..29d2dfb0b025b 100644 --- a/trunk/ceph/ebofs/Ebofs.h +++ b/trunk/ceph/ebofs/Ebofs.h @@ -21,11 +21,6 @@ using namespace __gnu_cxx; #include "include/Context.h" #include "include/buffer.h" -template -inline ostream& operator<<(ostream& out, const pair& p) { - return out << p.first << "," << p.second; -} - #include "types.h" #include "Onode.h" #include "Cnode.h" diff --git a/trunk/ceph/include/Context.h b/trunk/ceph/include/Context.h index b7798afbc93d9..c4c9bf508cf37 100644 --- a/trunk/ceph/include/Context.h +++ b/trunk/ceph/include/Context.h @@ -56,6 +56,12 @@ inline void finish_contexts(std::list& finished, } } +class C_NoopContext : public Context { +public: + void finish(int r) { } +}; + + /* * C_Contexts - set of Contexts */ diff --git a/trunk/ceph/include/buffer.h b/trunk/ceph/include/buffer.h index fbcf4e3c130b6..111cb2a981686 100644 --- a/trunk/ceph/include/buffer.h +++ b/trunk/ceph/include/buffer.h @@ -745,383 +745,226 @@ inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { -// encoder/decode helpers +// ---------------------------------------------------------- +// new encoders -// -- basic types -- -// string -inline void _encode(const std::string& s, bufferlist& bl) -{ - bl.append(s.c_str(), s.length()+1); -} -inline void _decode(std::string& s, bufferlist& bl, int& off) -{ - s = bl.c_str() + off; - off += s.length() + 1; -} - -// bufferptr (encapsulated) -inline void _encode(bufferptr& bp, bufferlist& bl) -{ 
- size_t len = bp.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(bp); -} -inline void _decode(bufferptr& bp, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - bufferlist s; - s.substr_of(bl, off, len); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) +// raw +template +inline void _encoderaw(const T& t, bufferlist& bl) { - size_t len = s.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(s); + bl.append((char*)&t, sizeof(t)); } -inline void _decode(bufferlist& s, bufferlist& bl, int& off) +template +inline void _decoderaw(T& t, bufferlist& bl, int& off) { - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - s.substr_of(bl, off, len); - off += len; + bl.copy(off, sizeof(t), (char*)&t); + off += sizeof(t); } - #include #include +#include #include #include +#include -// set -inline void _encode(const std::set& s, bufferlist& bl) +// list +template +inline void _encode(const std::list& ls, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::set::const_iterator it = s.begin(); - it != s.end(); - it++) { - ::_encode(*it, bl); - n--; - } - assert(n==0); + __uint32_t n = ls.size(); + _encoderaw(n, bl); + for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) + _encode(*p, bl); } -inline void _decode(std::set& s, bufferlist& bl, int& off) +template +inline void _decode(std::list& ls, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(const std::list& s, bufferlist& bl) +// deque +template +inline void _encode(const std::deque& ls, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::list::const_iterator it = s.begin(); - 
it != s.end(); - it++) { - ::_encode(*it, bl); - n--; - } - assert(n==0); + __uint32_t n = ls.size(); + _encoderaw(n, bl); + for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) + _encode(*p, bl); } -inline void _decode(std::list& s, bufferlist& bl, int& off) +template +inline void _decode(std::deque& ls, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i +// set template inline void _encode(const std::set& s, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::set::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); + __uint32_t n = s.size(); + _encoderaw(n, bl); + for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) + _encode(*p, bl); } template -inline void _decode(std::set& s, bufferlist& bl, int& off) +inline void _decode(std::set& s, bufferlist& bl, int& off) { + __uint32_t n; + _decoderaw(n, bl, off); s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i +// vector template -inline void _encode(std::vector& s, bufferlist& bl) +inline void _encode(const std::vector& v, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::vector::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); + __uint32_t n = v.size(); + _encoderaw(n, bl); + for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) + _encode(*p, bl); } template -inline void _decode(std::vector& s, bufferlist& bl, int& off) +inline void _decode(std::vector& v, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - s = std::vector(n); - for (int i=0; i -template -inline void _encode(const std::list& s, bufferlist& bl) +// map +template +inline 
void _encode(const std::map& m, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::list::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; + __uint32_t n = m.size(); + _encoderaw(n, bl); + for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { + _encode(p->first, bl); + _encode(p->second, bl); } - assert(n==0); } -template -inline void _decode(std::list& s, bufferlist& bl, int& off) +template +inline void _decode(std::map& m, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(std::map& s, bufferlist& bl) +// hash_map +template +inline void _encode(const __gnu_cxx::hash_map& m, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::map::iterator it = s.begin(); - it != s.end(); - it++) { - _encode(it->first, bl); - _encode(it->second, bl); - n--; + __uint32_t n = m.size(); + _encoderaw(n, bl); + for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { + _encode(p->first, bl); + _encode(p->second, bl); } - assert(n==0); } -inline void _decode(std::map& s, bufferlist& bl, int& off) +template +inline void _decode(__gnu_cxx::hash_map& m, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) +// string +inline void _encode(const std::string& s, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - //std::cout << "n = " << n << std::endl; - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - _encode(it->second, bl); - n--; - //std::cout << "--n = " << n << " after k " << k << std::endl; - } - assert(n==0); + __uint32_t len = s.length(); + _encoderaw(len, 
bl); + bl.append(s.c_str(), len+1); } -template -inline void _decode(std::map& s, bufferlist& bl, int& off) +inline void _decode(std::string& s, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) +// bufferptr (encapsulated) +inline void _encode(bufferptr& bp, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - ::_encode(it->first, bl); - U v = it->second; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); + __uint32_t len = bp.length(); + _encoderaw(len, bl); + bl.append(bp); } -template -inline void _decode(std::map& s, bufferlist& bl, int& off) +inline void _decode(bufferptr& bp, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i> -template -inline void _encode(const std::map >& s, bufferlist& bl) +// bufferlist (encapsulated) +inline void _encode(const bufferlist& s, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map >::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - ::_encode(it->second, bl); - n--; - } - assert(n==0); + __uint32_t len = s.length(); + _encoderaw(len, bl); + bl.append(s); } -template -inline void _decode(std::map >& s, bufferlist& bl, int& off) +inline void _decode(bufferlist& s, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) +// base +template +inline void _encode(const T& t, bufferlist& bl) { - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - U v = 
it->second; - bl.append((char*)&k, sizeof(k)); - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); + _encoderaw(t, bl); } -template -inline void _decode(std::map& s, bufferlist& bl, int& off) +template +inline void _decode(T& t, bufferlist& bl, int& off) { - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i using namespace std; -#include -using namespace __gnu_cxx; - #include "buffer.h" class filepath { + /** path + * can be relative "a/b/c" or absolute "/a/b/c". + */ string path; - vector bits; - void rebuild() { + /** bits - path segemtns + * this is ['a', 'b', 'c'] for both the aboslute and relative case. + * + * NOTE: this value is LAZILY maintained... i.e. it's a cache + */ + mutable vector bits; + + void rebuild_path() { if (absolute()) path = "/"; else @@ -48,7 +54,7 @@ class filepath { path += bits[i]; } } - void parse() { + void parse_bits() const { bits.clear(); int off = 0; while (off < (int)path.length()) { @@ -74,126 +80,97 @@ class filepath { filepath(const char* s) { set_path(s); } - - bool absolute() { return path[0] == '/'; } - bool relative() { return !absolute(); } - - void set_path(const string& s) { - path = s; - parse(); - } - void set_path(const char *s) { - path = s; - parse(); + filepath(const filepath& o) { + set_path(o.get_path()); } - string& get_path() { + + // accessors + const string& get_path() const { return path; } + const char *c_str() const { + return path.c_str(); + } + int length() const { return path.length(); } + unsigned depth() const { + if (bits.empty() && path.length() > 0) parse_bits(); + return bits.size(); + } + bool empty() const { + return path.length() == 0; + } - const char *c_str() const { - return path.c_str(); + // FIXME: const-edness + bool absolute() { return path.length() && path[0] == '/'; } + bool relative() { return !absolute(); } + + const string& operator[](int i) const { + if (bits.empty() && path.length() > 0) parse_bits(); + return bits[i]; } + 
const string& last_dentry() const { + if (bits.empty() && path.length() > 0) parse_bits(); + return bits[ bits.size()-1 ]; + } filepath prefixpath(int s) const { filepath t; for (int i=0; i::iterator it = bits.begin(); - it != bits.end(); - it++) { - r.append((*it).c_str(), (*it).length()+1); - } + void pop_dentry() { + if (bits.empty() && path.length() > 0) parse_bits(); + bits.pop_back(); + rebuild_path(); + } + void push_dentry(const string& s) { + if (bits.empty() && path.length() > 0) parse_bits(); + bits.push_back(s); + if (path.length() && path[path.length()-1] != '/') + path += "/"; + path += s; } - - void _unrope(crope& r, int& off) { - clear(); - - char n; - r.copy(off, sizeof(char), (char*)&n); - off += sizeof(char); - for (int i=0; i::iterator it = bits.begin(); - it != bits.end(); - it++) { - bl.append((*it).c_str(), (*it).length()+1); - } + ::_encode(path, bl); } - void _decode(bufferlist& bl, int& off) { - clear(); - - char n; - bl.copy(off, sizeof(char), (char*)&n); - off += sizeof(char); - for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __FRAG_H +#define __FRAG_H + +#include +#include +#include "buffer.h" + +/* + * + * the goal here is to use a binary split strategy to partition a namespace. + * frag_t represents a particular fragment. bits() tells you the size of the + * fragment, and value() it's name. this is roughly analogous to an ip address + * and netmask. + * + * fragtree_t represents an entire namespace and it's partition. it essentially + * tells you where fragments are split into other fragments, and by how much + * (i.e. by how many bits, resulting in a power of 2 number of child fragments). 
+ * + * this vaguely resembles a btree, in that when a fragment becomes large or small + * we can split or merge, except that there is no guarantee of being balanced. + * presumably we are partitioning the output of a (perhaps specialized) hash + * function. + * + */ + +/** + * frag_t + * + * description of an individual fragment. that is, a particular piece + * of the overall namespace. + * + * this is conceptually analogous to an ip address and netmask. + * + * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). + * + * we write it as v/b, where v is a value and b is the number of bits. + * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, + * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. + */ + +typedef __uint32_t _frag_t; + +class frag_t { + /* encoded value. + * 8 upper bits = "bits" + * 24 lower bits = "value" + */ + _frag_t _enc; + + public: + frag_t() : _enc(0) { } + frag_t(unsigned v, unsigned b) : _enc((b << 24) + v) { } + frag_t(_frag_t e) : _enc(e) { } + + // constructors + void from_unsigned(unsigned e) { _enc = e; } + + // accessors + unsigned value() const { return _enc & 0xffffff; } + unsigned bits() const { return _enc >> 24; } + unsigned mask() const { return 0xffffffff >> (32-bits()); } + operator _frag_t() const { return _enc; } + + // tests + bool contains(unsigned v) const { + return (v & mask()) == value(); + } + bool contains(frag_t sub) const { + return (sub.bits() >= bits() && // they are more specific than us, + (sub.value() & mask()) == value()); // and they are contained by us. 
+ } + bool is_root() const { + return bits() == 0; + } + frag_t parent() const { + assert(bits() > 0); + return frag_t(value() & (mask() >> 1), bits()-1); + } + + // splitting + frag_t left_half() const { + return frag_t(value(), bits()+1); + } + frag_t right_half() const { + return frag_t(value() | (1<& fragments) const { + assert(nb > 0); + unsigned nway = 1 << (nb-1); + for (unsigned i=0; i: + // frag_t f is split by b bits. + // if child frag_t does not appear, it is not split. + std::map _splits; + + public: + // accessors + bool empty() { + return _splits.empty(); + } + int get_split(const frag_t hb) const { + std::map::const_iterator p = _splits.find(hb); + if (p == _splits.end()) + return 0; + else + return p->second; + } + void get_leaves(list& ls) const { + list q; + q.push_back(frag_t()); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + int nb = get_split(t); + if (nb) + t.split(nb, q); // queue up children + else + ls.push_back(t); // not spit, it's a leaf. + } + } + bool contains(frag_t fg) const { + list q; + q.push_back(frag_t()); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + int nb = get_split(t); + if (nb) { + if (t == fg) return false; // it's split. + t.split(nb, q); // queue up children + } else { + if (t == fg) return true; // it's there. + } + } + return false; + } + + frag_t operator[](unsigned v) const { + frag_t t; + while (1) { + assert(t.contains(v)); + int nb = get_split(t); + + // is this a leaf? + if (nb == 0) return t; // done. + + // pick appropriate child fragment. 
+ unsigned nway = 1 << (nb-1); + unsigned i; + for (i=0; i copy; + std::list q; + q.push_back(frag_t()); + + while (1) { + frag_t cur = q.front(); + q.pop_front(); + int b = get_split(cur); + if (!b) continue; + copy[cur] = b; + cur.split(b, q); + } + + assert(copy == _splits); + } + + // encoding + void _encode(bufferlist& bl) { + ::_encode(_splits, bl); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(_splits, bl, off); + } +}; + +inline ostream& operator<<(ostream& out, fragtree_t& ft) +{ + out << "fragtree_t("; + + bool first = true; + list q; + q.push_back(frag_t()); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + int nb = ft.get_split(t); + if (nb) { + if (first) + first = false; + else + out << ' '; + out << t << '%' << nb; + t.split(nb, q); // queue up children + } + } + return out << ")"; +} + +#endif diff --git a/trunk/ceph/include/object.h b/trunk/ceph/include/object.h index 5d5a87727e5ad..a225c8d245f62 100644 --- a/trunk/ceph/include/object.h +++ b/trunk/ceph/include/object.h @@ -18,6 +18,9 @@ #include using namespace std; +#include +using namespace __gnu_cxx; + typedef __uint32_t objectrev_t; diff --git a/trunk/ceph/include/reqid.h b/trunk/ceph/include/reqid.h deleted file mode 100644 index 3c71fbae69ab6..0000000000000 --- a/trunk/ceph/include/reqid.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __REQID_H -#define __REQID_H - - -#include "include/types.h" -#include "msg/msg_types.h" - -/* reqid_t - caller name + incarnation# + tid to unique identify this request - * use for metadata and osd ops. 
- */ -class reqid_t { -public: - entity_name_t name; // who - int inc; // incarnation - tid_t tid; - reqid_t() : inc(0), tid(0) {} - reqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const reqid_t& r) { - return out << r.name << "." << r.inc << ":" << r.tid; -} - -inline bool operator==(const reqid_t& l, const reqid_t& r) { - return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); -} -inline bool operator!=(const reqid_t& l, const reqid_t& r) { - return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); -} -inline bool operator<(const reqid_t& l, const reqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid < r.tid); -} -inline bool operator<=(const reqid_t& l, const reqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); -} -inline bool operator>(const reqid_t& l, const reqid_t& r) { return !(l <= r); } -inline bool operator>=(const reqid_t& l, const reqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const reqid_t &r) const { - static blobhash H; - return H((const char*)&r, sizeof(r)); - } - }; -} - - -#endif diff --git a/trunk/ceph/include/types.h b/trunk/ceph/include/types.h index 72893cb62141b..b2368e153c19b 100644 --- a/trunk/ceph/include/types.h +++ b/trunk/ceph/include/types.h @@ -22,6 +22,7 @@ extern "C" { } #include +#include #include #include #include @@ -29,10 +30,13 @@ extern "C" { #include using namespace std; -#include +#include using namespace __gnu_cxx; + #include "object.h" +#include "utime.h" + #ifndef MIN # define MIN(a,b) ((a) < (b) ? 
(a):(b)) @@ -127,6 +131,8 @@ typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for +#define O_LAZY 01000000 + /** object layout @@ -183,12 +189,13 @@ struct FileLayout { // -- inode -- +typedef __uint64_t _inodeno_t; struct inodeno_t { - __uint64_t val; - inodeno_t() : val() {} - inodeno_t(__uint64_t v) : val(v) {} + _inodeno_t val; + inodeno_t() : val(0) {} + inodeno_t(_inodeno_t v) : val(v) {} inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } - operator __uint64_t() const { return val; } + operator _inodeno_t() const { return val; } }; inline ostream& operator<<(ostream& out, inodeno_t ino) { @@ -217,51 +224,54 @@ namespace __gnu_cxx { #define FILE_MODE_RW (1|2) #define FILE_MODE_LAZY 4 -#define INODE_MASK_BASE 1 // ino, ctime, nlink -#define INODE_MASK_PERM 2 // uid, gid, mode -#define INODE_MASK_SIZE 4 // size, blksize, blocks -#define INODE_MASK_MTIME 8 // mtime -#define INODE_MASK_ATIME 16 // atime +#define INODE_MASK_BASE 1 // ino, layout, symlink value +#define INODE_MASK_AUTH 2 // uid, gid, mode +#define INODE_MASK_LINK 4 // nlink, anchored +#define INODE_MASK_FILE 8 // mtime, size. +// atime? + +#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_AUTH|INODE_MASK_LINK|INODE_MASK_FILE) -#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME) -//#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME|INODE_MASK_ATIME) +#define INODE_MASK_SIZE INODE_MASK_FILE // size, blksize, blocks +#define INODE_MASK_MTIME INODE_MASK_FILE // mtime +#define INODE_MASK_ATIME INODE_MASK_FILE // atime +#define INODE_MASK_CTIME (INODE_MASK_FILE|INODE_MASK_AUTH|INODE_MASK_LINK) // ctime struct inode_t { // base (immutable) - inodeno_t ino; // NOTE: ino _must_ come first for MDStore.cc to behave!! - time_t ctime; - - // other + inodeno_t ino; FileLayout layout; // ?immutable? 
- int nlink; // base, - // hard/perm (namespace permissions) + // affected by any inode change... + utime_t ctime; // inode change time + + // perm (namespace permissions) mode_t mode; uid_t uid; gid_t gid; - // file (data access) - off_t size; - time_t atime, mtime; // maybe atime different? "lazy"? - - int mask; + // nlink + int nlink; + bool anchored; // auth only? + // file (data access) + off_t size, max_size; + utime_t mtime; // file data modify time. + utime_t atime; // file data access time. + // special stuff + int mask; // used for client stat. hack. version_t version; // auth only - unsigned char hash_seed; // only defined for dir; 0 if not hashed. - bool anchored; // auth only version_t file_data_version; // auth only bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } - bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } - bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } + bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } + bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } }; -// client types -typedef int fh_t; // file handle // dentries @@ -272,6 +282,11 @@ typedef int fh_t; // file handle // -- io helpers -- +template +inline ostream& operator<<(ostream& out, pair v) { + return out << v.first << "," << v.second; +} + template inline ostream& operator<<(ostream& out, vector& v) { out << "["; @@ -283,6 +298,17 @@ inline ostream& operator<<(ostream& out, vector& v) { return out; } +template +inline ostream& operator<<(ostream& out, const list& ilist) { + for (typename list::const_iterator it = ilist.begin(); + it != ilist.end(); + it++) { + if (it != ilist.begin()) out << ","; + out << *it; + } + return out; +} + template inline ostream& operator<<(ostream& out, const set& iset) { for (typename set::const_iterator it = iset.begin(); @@ -321,47 +347,4 @@ inline ostream& operator<<(ostream& out, const map& m) - -// -- rope helpers -- - -// string 
-inline void _rope(string& s, crope& r) -{ - r.append(s.c_str(), s.length()+1); -} -inline void _unrope(string& s, crope& r, int& off) -{ - s = r.c_str() + off; - off += s.length() + 1; -} - -// set -inline void _rope(set& s, crope& r) -{ - int n = s.size(); - r.append((char*)&n, sizeof(n)); - for (set::iterator it = s.begin(); - it != s.end(); - it++) { - int v = *it; - r.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -inline void _unrope(set& s, crope& r, int& off) -{ - s.clear(); - int n; - r.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __UTIME_H +#define __UTIME_H + +#include + + +// -------- +// utime_t + +typedef struct timeval _utime_t; + +class utime_t { + private: + struct timeval tv; + + struct timeval& timeval() { return tv; } + friend class Clock; + + + public: + void normalize() { + if (tv.tv_usec > 1000*1000) { + tv.tv_sec += tv.tv_usec / (1000*1000); + tv.tv_usec %= 1000*1000; + } + } + + // cons + utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); } + //utime_t(time_t s) { tv.tv_sec = s; tv.tv_usec = 0; } + utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); } + utime_t(const _utime_t &v) : tv(v) {} + /* + utime_t(double d) { + tv.tv_sec = (time_t)trunc(d); + tv.tv_usec = (__suseconds_t)((d - tv.tv_sec) / (double)1000000.0); + } + */ + + // accessors + time_t sec() const { return tv.tv_sec; } + long usec() const { return tv.tv_usec; } + int nsec() const { return tv.tv_usec*1000; } + + // ref accessors/modifiers + time_t& sec_ref() { return tv.tv_sec; } + // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin. + // is just casting it to long& OK? 
+ long& usec_ref() { return (long&) tv.tv_usec; } + + // cast to double + operator double() { + return (double)sec() + ((double)usec() / 1000000.0L); + } +}; + +// arithmetic operators +inline utime_t operator+(const utime_t& l, const utime_t& r) { + return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, + (l.usec()+r.usec())%1000000L ); +} +inline utime_t& operator+=(utime_t& l, const utime_t& r) { + l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; + l.usec_ref() += r.usec(); + l.usec_ref() %= 1000000L; + return l; +} +inline utime_t& operator+=(utime_t& l, double f) { + double fs = trunc(f); + double us = (f - fs) / (double)1000000.0; + l.sec_ref() += (long)fs; + l.usec_ref() += (long)us; + l.normalize(); + return l; +} + +inline utime_t operator-(const utime_t& l, const utime_t& r) { + return utime_t( l.sec() - r.sec() - (l.usec()= r.usec()) + l.usec_ref() -= r.usec(); + else { + l.usec_ref() += 1000000L - r.usec(); + l.sec_ref()--; + } + return l; +} +inline utime_t& operator-=(utime_t& l, double f) { + l += -f; + return l; +} + +inline bool operator>(const utime_t& a, const utime_t& b) +{ + return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); +} +inline bool operator<(const utime_t& a, const utime_t& b) +{ + return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); +} + +// ostream +inline std::ostream& operator<<(std::ostream& out, const utime_t& t) +{ + //return out << t.sec() << "." << t.usec(); + out << (long)t.sec() << "."; + out.setf(std::ios::right); + out.fill('0'); + out << std::setw(6) << t.usec(); + out.unsetf(std::ios::right); + return out; + + //return out << (long)t.sec << "." 
<< ios::setf(ios::right) << ios::fill('0') << t.usec() << ios::usetf(); +} + +#endif diff --git a/trunk/ceph/mds/Anchor.h b/trunk/ceph/mds/Anchor.h index 8da2bbdb52cd5..ba0092d2c4611 100644 --- a/trunk/ceph/mds/Anchor.h +++ b/trunk/ceph/mds/Anchor.h @@ -18,38 +18,90 @@ using std::string; #include "include/types.h" +#include "mdstypes.h" #include "include/buffer.h" + +// anchor ops +#define ANCHOR_OP_LOOKUP 1 +#define ANCHOR_OP_LOOKUP_REPLY 2 + +#define ANCHOR_OP_CREATE_PREPARE 11 +#define ANCHOR_OP_CREATE_AGREE 12 + +#define ANCHOR_OP_DESTROY_PREPARE 21 +#define ANCHOR_OP_DESTROY_AGREE 22 + +#define ANCHOR_OP_UPDATE_PREPARE 31 +#define ANCHOR_OP_UPDATE_AGREE 32 + +#define ANCHOR_OP_COMMIT 41 +#define ANCHOR_OP_ACK 42 +#define ANCHOR_OP_ROLLBACK 43 + + + +inline const char* get_anchor_opname(int o) { + switch (o) { + case ANCHOR_OP_LOOKUP: return "lookup"; + case ANCHOR_OP_LOOKUP_REPLY: return "lookup_reply"; + + case ANCHOR_OP_CREATE_PREPARE: return "create_prepare"; + case ANCHOR_OP_CREATE_AGREE: return "create_agree"; + case ANCHOR_OP_DESTROY_PREPARE: return "destroy_prepare"; + case ANCHOR_OP_DESTROY_AGREE: return "destroy_agree"; + case ANCHOR_OP_UPDATE_PREPARE: return "update_prepare"; + case ANCHOR_OP_UPDATE_AGREE: return "update_agree"; + + case ANCHOR_OP_COMMIT: return "commit"; + case ANCHOR_OP_ACK: return "ack"; + case ANCHOR_OP_ROLLBACK: return "rollback"; + default: assert(0); + } +} + + +// identifies a anchor table mutation + + + +// anchor type + class Anchor { public: - inodeno_t ino; // my ino - inodeno_t dirino; // containing dir - string ref_dn; // referring dentry + inodeno_t ino; // anchored ino + dirfrag_t dirfrag; // containing dirfrag + //string ref_dn; // referring dentry int nref; // reference count Anchor() {} - Anchor(inodeno_t ino, inodeno_t dirino, string& ref_dn, int nref=0) { - this->ino = ino; - this->dirino = dirino; - this->ref_dn = ref_dn; - this->nref = nref; - } + Anchor(inodeno_t i, dirfrag_t df, + //string& rd, + int nr=0) : 
+ ino(i), dirfrag(df), + //ref_dn(rd), + nref(nr) { } void _encode(bufferlist &bl) { bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirino, sizeof(dirino)); + bl.append((char*)&dirfrag, sizeof(dirfrag)); bl.append((char*)&nref, sizeof(nref)); - ::_encode(ref_dn, bl); + //::_encode(ref_dn, bl); } void _decode(bufferlist& bl, int& off) { bl.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); + bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); bl.copy(off, sizeof(nref), (char*)&nref); off += sizeof(nref); - ::_decode(ref_dn, bl, off); + //::_decode(ref_dn, bl, off); } -} ; +}; + +inline ostream& operator<<(ostream& out, Anchor& a) +{ + return out << "a(" << a.ino << " " << a.dirfrag << " " << a.nref << ")"; +} #endif diff --git a/trunk/ceph/mds/AnchorClient.cc b/trunk/ceph/mds/AnchorClient.cc index af84eb6c2448a..d7bfb655f06d8 100644 --- a/trunk/ceph/mds/AnchorClient.cc +++ b/trunk/ceph/mds/AnchorClient.cc @@ -24,21 +24,22 @@ using std::endl; #include "msg/Messenger.h" #include "MDS.h" +#include "MDLog.h" -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" +#include "events/EAnchorClient.h" +#include "messages/MAnchor.h" #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient " -#define derr(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient " +#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " +#define derr(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " void AnchorClient::dispatch(Message *m) { switch (m->get_type()) { - case MSG_MDS_ANCHORREPLY: - handle_anchor_reply((MAnchorReply*)m); + case MSG_MDS_ANCHOR: + handle_anchor_reply((MAnchor*)m); break; default: @@ 
-46,47 +47,165 @@ void AnchorClient::dispatch(Message *m) } } -void AnchorClient::handle_anchor_reply(class MAnchorReply *m) +void AnchorClient::handle_anchor_reply(class MAnchor *m) { + inodeno_t ino = m->get_ino(); + version_t atid = m->get_atid(); + + dout(10) << "handle_anchor_reply " << *m << endl; + switch (m->get_op()) { - case ANCHOR_OP_LOOKUP: + // lookup + case ANCHOR_OP_LOOKUP_REPLY: + assert(pending_lookup.count(ino)); { - assert(pending_lookup_trace.count(m->get_ino()) == 1); + *pending_lookup[ino].trace = m->get_trace(); + Context *onfinish = pending_lookup[ino].onfinish; + pending_lookup.erase(ino); + + if (onfinish) { + onfinish->finish(0); + delete onfinish; + } + } + break; + + // prepare -> agree + case ANCHOR_OP_CREATE_AGREE: + if (pending_create_prepare.count(ino)) { + dout(10) << "got create_agree on " << ino << " atid " << atid << endl; + Context *onfinish = pending_create_prepare[ino].onfinish; + *pending_create_prepare[ino].patid = atid; + pending_create_prepare.erase(ino); + + pending_commit.insert(atid); + + if (onfinish) { + onfinish->finish(0); + delete onfinish; + } + } + else if (pending_commit.count(atid)) { + dout(10) << "stray create_agree on " << ino + << " atid " << atid + << ", already committing, resending COMMIT" + << endl; + MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); + mds->messenger->send_message(req, + mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); + } + else { + dout(10) << "stray create_agree on " << ino + << " atid " << atid + << ", sending ROLLBACK" + << endl; + MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); + mds->messenger->send_message(req, + mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); + } + break; - *(pending_lookup_trace[ m->get_ino() ]) = m->get_trace(); - Context *onfinish = pending_lookup_context[ m->get_ino() ]; + case ANCHOR_OP_DESTROY_AGREE: + if 
(pending_destroy_prepare.count(ino)) { + dout(10) << "got destroy_agree on " << ino << " atid " << atid << endl; + Context *onfinish = pending_destroy_prepare[ino].onfinish; + *pending_destroy_prepare[ino].patid = atid; + pending_destroy_prepare.erase(ino); - pending_lookup_trace.erase(m->get_ino()); - pending_lookup_context.erase(m->get_ino()); + pending_commit.insert(atid); if (onfinish) { onfinish->finish(0); delete onfinish; } + } + else if (pending_commit.count(atid)) { + dout(10) << "stray destroy_agree on " << ino + << " atid " << atid + << ", already committing, resending COMMIT" + << endl; + MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); + mds->messenger->send_message(req, + mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); + } + else { + dout(10) << "stray destroy_agree on " << ino + << " atid " << atid + << ", sending ROLLBACK" + << endl; + MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); + mds->messenger->send_message(req, + mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); } break; - case ANCHOR_OP_UPDATE: - case ANCHOR_OP_CREATE: - case ANCHOR_OP_DESTROY: - { - assert(pending_op.count(m->get_ino()) == 1); + case ANCHOR_OP_UPDATE_AGREE: + if (pending_update_prepare.count(ino)) { + dout(10) << "got update_agree on " << ino << " atid " << atid << endl; + Context *onfinish = pending_update_prepare[ino].onfinish; + *pending_update_prepare[ino].patid = atid; + pending_update_prepare.erase(ino); - Context *onfinish = pending_op[m->get_ino()]; - pending_op.erase(m->get_ino()); + pending_commit.insert(atid); if (onfinish) { onfinish->finish(0); delete onfinish; } } + else if (pending_commit.count(atid)) { + dout(10) << "stray update_agree on " << ino + << " atid " << atid + << ", already committing, resending COMMIT" + << endl; + MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); + mds->messenger->send_message(req, + 
mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); + } + else { + dout(10) << "stray update_agree on " << ino + << " atid " << atid + << ", sending ROLLBACK" + << endl; + MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); + mds->messenger->send_message(req, + mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); + } + break; + + // commit -> ack + case ANCHOR_OP_ACK: + { + dout(10) << "got ack on atid " << atid << ", logging" << endl; + + // remove from committing list + assert(pending_commit.count(atid)); + pending_commit.erase(atid); + + // log ACK. + mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid)); + + // kick any waiters + if (ack_waiters.count(atid)) { + dout(15) << "kicking waiters on atid " << atid << endl; + mds->queue_waiters(ack_waiters[atid]); + ack_waiters.erase(atid); + } + } break; default: assert(0); } + delete m; } @@ -95,55 +214,158 @@ void AnchorClient::handle_anchor_reply(class MAnchorReply *m) * public async interface */ -void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfinish) + +/* + * FIXME: we need to be able to resubmit messages if the anchortable mds fails. 
+ */ + + +void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfinish) { // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_LOOKUP, ino); + MAnchor *req = new MAnchor(ANCHOR_OP_LOOKUP, ino); - pending_lookup_trace[ino] = &trace; - pending_lookup_context[ino] = onfinish; + assert(pending_lookup.count(ino) == 0); + pending_lookup[ino].onfinish = onfinish; + pending_lookup[ino].trace = &trace; - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); + mds->send_message_mds(req, + mds->mdsmap->get_anchortable(), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); } -void AnchorClient::create(inodeno_t ino, vector& trace, Context *onfinish) + +// PREPARE + +void AnchorClient::prepare_create(inodeno_t ino, vector& trace, + version_t *patid, Context *onfinish) { + dout(10) << "prepare_create " << ino << " " << trace << endl; + // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_CREATE, ino); + MAnchor *req = new MAnchor(ANCHOR_OP_CREATE_PREPARE, ino); req->set_trace(trace); - pending_op[ino] = onfinish; + pending_create_prepare[ino].trace = trace; + pending_create_prepare[ino].patid = patid; + pending_create_prepare[ino].onfinish = onfinish; + + mds->send_message_mds(req, + mds->mdsmap->get_anchortable(), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); +} + +void AnchorClient::prepare_destroy(inodeno_t ino, + version_t *patid, Context *onfinish) +{ + dout(10) << "prepare_destroy " << ino << endl; - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); + // send message + MAnchor *req = new MAnchor(ANCHOR_OP_DESTROY_PREPARE, ino); + pending_destroy_prepare[ino].onfinish = onfinish; + pending_destroy_prepare[ino].patid = patid; + mds->messenger->send_message(req, + mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); } -void 
AnchorClient::update(inodeno_t ino, vector& trace, Context *onfinish) + +void AnchorClient::prepare_update(inodeno_t ino, vector& trace, + version_t *patid, Context *onfinish) { + dout(10) << "prepare_update " << ino << " " << trace << endl; + // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_UPDATE, ino); + MAnchor *req = new MAnchor(ANCHOR_OP_UPDATE_PREPARE, ino); req->set_trace(trace); - pending_op[ino] = onfinish; + pending_update_prepare[ino].trace = trace; + pending_update_prepare[ino].patid = patid; + pending_update_prepare[ino].onfinish = onfinish; - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); + mds->messenger->send_message(req, + mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); } -void AnchorClient::destroy(inodeno_t ino, Context *onfinish) + +// COMMIT + +void AnchorClient::commit(version_t atid) { + dout(10) << "commit " << atid << endl; + + assert(pending_commit.count(atid)); + pending_commit.insert(atid); + // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_DESTROY, ino); + MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); + mds->messenger->send_message(req, + mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); +} - pending_op[ino] = onfinish; - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); + +// RECOVERY + +void AnchorClient::finish_recovery() +{ + dout(7) << "finish_recovery" << endl; + + resend_commits(); +} + +void AnchorClient::resend_commits() +{ + for (set::iterator p = pending_commit.begin(); + p != pending_commit.end(); + ++p) { + dout(10) << "resending commit on " << *p << endl; + MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, *p); + mds->send_message_mds(req, + mds->mdsmap->get_anchortable(), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); + } } 
+void AnchorClient::resend_prepares(hash_map& prepares, int op) +{ + for (hash_map::iterator p = prepares.begin(); + p != prepares.end(); + p++) { + dout(10) << "resending " << get_anchor_opname(op) << " on " << p->first << endl; + MAnchor *req = new MAnchor(op, p->first); + req->set_trace(p->second.trace); + mds->send_message_mds(req, + mds->mdsmap->get_anchortable(), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); + } +} + + +void AnchorClient::handle_mds_recovery(int who) +{ + dout(7) << "handle_mds_recovery mds" << who << endl; + + if (who != mds->mdsmap->get_anchortable()) + return; // do nothing. + + // resend any pending lookups. + for (hash_map::iterator p = pending_lookup.begin(); + p != pending_lookup.end(); + p++) { + dout(10) << "resending lookup on " << p->first << endl; + mds->send_message_mds(new MAnchor(ANCHOR_OP_LOOKUP, p->first), + mds->mdsmap->get_anchortable(), + MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); + } + + // resend any pending prepares. + resend_prepares(pending_create_prepare, ANCHOR_OP_CREATE_PREPARE); + resend_prepares(pending_update_prepare, ANCHOR_OP_UPDATE_PREPARE); + resend_prepares(pending_destroy_prepare, ANCHOR_OP_DESTROY_PREPARE); + // resend any pending commits. 
+ resend_commits(); +} diff --git a/trunk/ceph/mds/AnchorClient.h b/trunk/ceph/mds/AnchorClient.h index 80b736a4b65c7..ae62608ce2982 100644 --- a/trunk/ceph/mds/AnchorClient.h +++ b/trunk/ceph/mds/AnchorClient.h @@ -24,32 +24,71 @@ using __gnu_cxx::hash_map; #include "Anchor.h" -class Messenger; -class MDSMap; class Context; +class MDS; class AnchorClient : public Dispatcher { - Messenger *messenger; - MDSMap *mdsmap; + MDS *mds; - // remote state - hash_map pending_op; - hash_map pending_lookup_context; - hash_map*> pending_lookup_trace; + // lookups + struct _pending_lookup { + vector *trace; + Context *onfinish; + }; + hash_map pending_lookup; - void handle_anchor_reply(class MAnchorReply *m); + // prepares + struct _pending_prepare { + vector trace; + Context *onfinish; + version_t *patid; // ptr to atid + }; + hash_map pending_create_prepare; + hash_map pending_destroy_prepare; + hash_map pending_update_prepare; + // pending commits + set pending_commit; + map > ack_waiters; + + void handle_anchor_reply(class MAnchor *m); public: - AnchorClient(Messenger *ms, MDSMap *mm) : messenger(ms), mdsmap(mm) {} + AnchorClient(MDS *m) : mds(m) {} + void dispatch(Message *m); + // async user interface - void lookup(inodeno_t ino, vector& trace, Context *onfinish); - void create(inodeno_t ino, vector& trace, Context *onfinish); - void update(inodeno_t ino, vector& trace, Context *onfinish); - void destroy(inodeno_t ino, Context *onfinish); + void lookup(inodeno_t ino, vector& trace, Context *onfinish); + + void prepare_create(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); + void prepare_destroy(inodeno_t ino, version_t *atid, Context *onfinish); + void prepare_update(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); + + void commit(version_t atid); + + // for recovery (by other nodes) + void handle_mds_recovery(int mds); // called when someone else recovers + + void resend_commits(); + void resend_prepares(hash_map& prepares, int op); + 
+ // for recovery (by me) + void got_journaled_agree(version_t atid) { + pending_commit.insert(atid); + } + void got_journaled_ack(version_t atid) { + pending_commit.erase(atid); + } + bool has_committed(version_t atid) { + return pending_commit.count(atid) == 0; + } + void wait_for_ack(version_t atid, Context *c) { + ack_waiters[atid].push_back(c); + } + void finish_recovery(); // called when i recover and go active + - void dispatch(Message *m); }; #endif diff --git a/trunk/ceph/mds/AnchorTable.cc b/trunk/ceph/mds/AnchorTable.cc index 6f380b0908d8d..48c2dc2226397 100644 --- a/trunk/ceph/mds/AnchorTable.cc +++ b/trunk/ceph/mds/AnchorTable.cc @@ -17,107 +17,93 @@ #include "osdc/Filer.h" #include "msg/Messenger.h" -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" +#include "messages/MAnchor.h" #include "common/Clock.h" +#include "MDLog.h" +#include "events/EAnchor.h" + #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable " -#define derr(x) if (x <= g_conf.debug_mds) cerr << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable " +#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " +#define derr(x) if (x <= g_conf.debug_mds) cerr << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " -AnchorTable::AnchorTable(MDS *mds) -{ - this->mds = mds; - opening = false; - opened = false; -} -void AnchorTable::init_inode() +void AnchorTable::dump() { - memset(&table_inode, 0, sizeof(table_inode)); - table_inode.ino = MDS_INO_ANCHORTABLE+mds->get_nodeid(); - table_inode.layout = g_OSD_FileLayout; + dout(7) << "dump v " << version << endl; + for (hash_map::iterator it = anchor_map.begin(); + it != anchor_map.end(); + it++) + dout(15) << "dump " << it->second << endl; } -void AnchorTable::reset() -{ - init_inode(); - opened = true; - anchor_map.clear(); 
-} /* * basic updates */ -bool AnchorTable::add(inodeno_t ino, inodeno_t dirino, string& ref_dn) +bool AnchorTable::add(inodeno_t ino, dirfrag_t dirfrag) { - dout(7) << "add " << std::hex << ino << " dirino " << dirino << std::dec << " ref_dn " << ref_dn << endl; + //dout(17) << "add " << ino << " dirfrag " << dirfrag << endl; // parent should be there - assert(dirino < 1000 || // system dirino - anchor_map.count(dirino)); // have + assert(dirfrag.ino < MDS_INO_BASE || // system dirino + anchor_map.count(dirfrag.ino)); // have if (anchor_map.count(ino) == 0) { // new item - anchor_map[ ino ] = new Anchor(ino, dirino, ref_dn); - dout(10) << " add: added " << std::hex << ino << std::dec << endl; + anchor_map[ino] = Anchor(ino, dirfrag); + dout(7) << "add added " << anchor_map[ino] << endl; return true; } else { - dout(10) << " add: had " << std::hex << ino << std::dec << endl; + dout(7) << "add had " << anchor_map[ino] << endl; return false; } } void AnchorTable::inc(inodeno_t ino) { - dout(7) << "inc " << std::hex << ino << std::dec << endl; + dout(7) << "inc " << ino << endl; - assert(anchor_map.count(ino) != 0); - Anchor *anchor = anchor_map[ino]; - assert(anchor); + assert(anchor_map.count(ino)); while (1) { - anchor->nref++; + Anchor &anchor = anchor_map[ino]; + anchor.nref++; - dout(10) << " inc: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl; - ino = anchor->dirino; + dout(10) << "inc now " << anchor << endl; + ino = anchor.dirfrag.ino; if (ino == 0) break; if (anchor_map.count(ino) == 0) break; - anchor = anchor_map[ino]; - assert(anchor); } } void AnchorTable::dec(inodeno_t ino) { - dout(7) << "dec " << std::hex << ino << std::dec << endl; + dout(7) << "dec " << ino << endl; - assert(anchor_map.count(ino) != 0); - Anchor *anchor = anchor_map[ino]; - assert(anchor); + assert(anchor_map.count(ino)); + Anchor &anchor = anchor_map[ino]; while (true) { - anchor->nref--; + anchor.nref--; - if (anchor->nref == 0) { - dout(10) << " dec: 
record " << std::hex << ino << std::dec << " now 0, removing" << endl; - inodeno_t dirino = anchor->dirino; + if (anchor.nref == 0) { + dout(10) << "dec removing " << anchor << endl; + dirfrag_t dirfrag = anchor.dirfrag; anchor_map.erase(ino); - delete anchor; - ino = dirino; + ino = dirfrag.ino; } else { - dout(10) << " dec: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl; - ino = anchor->dirino; + dout(10) << "dec now " << anchor << endl; + ino = anchor.dirfrag.ino; } if (ino == 0) break; if (anchor_map.count(ino) == 0) break; anchor = anchor_map[ino]; - assert(anchor); } } @@ -126,40 +112,332 @@ void AnchorTable::dec(inodeno_t ino) * high level */ -void AnchorTable::lookup(inodeno_t ino, vector& trace) + +// LOOKUP + +void AnchorTable::handle_lookup(MAnchor *req) { - dout(7) << "lookup " << std::hex << ino << std::dec << endl; + inodeno_t ino = req->get_ino(); + dout(7) << "handle_lookup " << ino << endl; assert(anchor_map.count(ino) == 1); - Anchor *anchor = anchor_map[ino]; - assert(anchor); + Anchor &anchor = anchor_map[ino]; + vector trace; while (true) { - dout(10) << " record " << std::hex << anchor->ino << " dirino " << anchor->dirino << std::dec << " ref_dn " << anchor->ref_dn << endl; + dout(10) << "handle_lookup adding " << anchor << endl; trace.insert(trace.begin(), anchor); // lame FIXME - if (anchor->dirino < MDS_INO_BASE) break; + if (anchor.dirfrag.ino < MDS_INO_BASE) break; - assert(anchor_map.count(anchor->dirino) == 1); - anchor = anchor_map[anchor->dirino]; - assert(anchor); + assert(anchor_map.count(anchor.dirfrag.ino) == 1); + anchor = anchor_map[anchor.dirfrag.ino]; } + + // reply + MAnchor *reply = new MAnchor(ANCHOR_OP_LOOKUP_REPLY, ino); + reply->set_trace(trace); + mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); + + delete req; } -void AnchorTable::create(inodeno_t ino, vector& trace) + +// MIDLEVEL + +void AnchorTable::create_prepare(inodeno_t ino, vector& trace, int 
reqmds) { - dout(7) << "create " << std::hex << ino << std::dec << endl; - // make sure trace is in table for (unsigned i=0; iino, trace[i]->dirino, trace[i]->ref_dn); + add(trace[i].ino, trace[i].dirfrag); + inc(ino); + + version++; + pending_create[version] = ino; // so we can undo + pending_reqmds[version] = reqmds; + //dump(); +} + +void AnchorTable::destroy_prepare(inodeno_t ino, int reqmds) +{ + version++; + pending_destroy[version] = ino; + pending_reqmds[version] = reqmds; + //dump(); +} + +void AnchorTable::update_prepare(inodeno_t ino, vector& trace, int reqmds) +{ + version++; + pending_update[version].first = ino; + pending_update[version].second = trace; + pending_reqmds[version] = reqmds; + //dump(); +} + +void AnchorTable::commit(version_t atid) +{ + if (pending_create.count(atid)) { + dout(7) << "commit " << atid << " create " << pending_create[atid] << endl; + pending_create.erase(atid); + } + + else if (pending_destroy.count(atid)) { + inodeno_t ino = pending_destroy[atid]; + dout(7) << "commit " << atid << " destroy " << ino << endl; + + dec(ino); // destroy + + pending_destroy.erase(atid); + } + + else if (pending_update.count(atid)) { + inodeno_t ino = pending_update[atid].first; + vector &trace = pending_update[atid].second; + + dout(7) << "commit " << atid << " update " << ino << endl; + + // remove old + dec(ino); + + // add new + for (unsigned i=0; i_create_prepare_logged(req, atid); + } +}; + +void AnchorTable::handle_create_prepare(MAnchor *req) +{ + inodeno_t ino = req->get_ino(); + vector& trace = req->get_trace(); + + dout(7) << "handle_create_prepare " << ino << endl; + + create_prepare(ino, trace, req->get_source().num()); + + // log it + EAnchor *le = new EAnchor(ANCHOR_OP_CREATE_PREPARE, ino, version, req->get_source().num()); + le->set_trace(trace); + mds->mdlog->submit_entry(le, + new C_AT_CreatePrepare(this, req, version)); +} + +void AnchorTable::_create_prepare_logged(MAnchor *req, version_t atid) +{ + inodeno_t ino = 
req->get_ino(); + dout(7) << "_create_prepare_logged " << ino << " atid " << atid << endl; + + // reply + MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, ino, atid); + mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); + + delete req; +} + + + + +// DESTROY + +class C_AT_DestroyPrepare : public Context { + AnchorTable *at; + MAnchor *req; + version_t atid; +public: + C_AT_DestroyPrepare(AnchorTable *a, MAnchor *r, version_t t) : + at(a), req(r), atid(t) { } + void finish(int r) { + at->_destroy_prepare_logged(req, atid); + } +}; + +void AnchorTable::handle_destroy_prepare(MAnchor *req) +{ + inodeno_t ino = req->get_ino(); + dout(7) << "handle_destroy_prepare " << ino << endl; + + destroy_prepare(ino, req->get_source().num()); + + mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_DESTROY_PREPARE, ino, version, req->get_source().num()), + new C_AT_DestroyPrepare(this, req, version)); +} + +void AnchorTable::_destroy_prepare_logged(MAnchor *req, version_t atid) +{ + inodeno_t ino = req->get_ino(); + dout(7) << "_destroy_prepare_logged " << ino << " atid " << atid << endl; + + // reply + MAnchor *reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, ino, atid); + mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); + delete req; +} + + + +// UPDATE + +class C_AT_UpdatePrepare : public Context { + AnchorTable *at; + MAnchor *req; + version_t atid; +public: + C_AT_UpdatePrepare(AnchorTable *a, MAnchor *r, version_t t) : + at(a), req(r), atid(t) { } + void finish(int r) { + at->_update_prepare_logged(req, atid); + } +}; + +void AnchorTable::handle_update_prepare(MAnchor *req) +{ + inodeno_t ino = req->get_ino(); + vector& trace = req->get_trace(); + + dout(7) << "handle_update_prepare " << ino << endl; + + update_prepare(ino, trace, req->get_source().num()); - inc(ino); // ok! 
+ // log it + EAnchor *le = new EAnchor(ANCHOR_OP_UPDATE_PREPARE, ino, version, req->get_source().num()); + le->set_trace(trace); + mds->mdlog->submit_entry(le, + new C_AT_UpdatePrepare(this, req, version)); +} + +void AnchorTable::_update_prepare_logged(MAnchor *req, version_t atid) +{ + inodeno_t ino = req->get_ino(); + dout(7) << "_update_prepare_logged " << ino << " atid " << atid << endl; + + // reply + MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, ino, atid); + mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); + delete req; +} + + + +// COMMIT + +class C_AT_Commit : public Context { + AnchorTable *at; + MAnchor *req; +public: + C_AT_Commit(AnchorTable *a, MAnchor *r) : + at(a), req(r) { } + void finish(int r) { + at->_commit_logged(req); + } +}; + +void AnchorTable::handle_commit(MAnchor *req) +{ + version_t atid = req->get_atid(); + dout(7) << "handle_commit " << atid << endl; + + if (pending_create.count(atid) || + pending_destroy.count(atid) || + pending_update.count(atid)) { + commit(atid); + mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_COMMIT, atid, version)); + } + else if (atid <= version) { + dout(0) << "got commit for atid " << atid << " <= " << version + << ", already committed, sending ack." + << endl; + MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, 0, atid); + mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); + delete req; + return; + } + else { + // wtf. 
+ dout(0) << "got commit for atid " << atid << " > " << version << endl; + assert(atid <= version); + } + + // wait for it to journal + mds->mdlog->wait_for_sync(new C_AT_Commit(this, req)); +} + + +void AnchorTable::_commit_logged(MAnchor *req) +{ + dout(7) << "_commit_logged, sending ACK" << endl; + MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, req->get_ino(), req->get_atid()); + mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); + delete req; } -void AnchorTable::destroy(inodeno_t ino) + + +// ROLLBACK + +void AnchorTable::handle_rollback(MAnchor *req) { - dec(ino); + version_t atid = req->get_atid(); + dout(7) << "handle_rollback " << atid << endl; + rollback(atid); + delete req; } @@ -171,8 +449,8 @@ void AnchorTable::destroy(inodeno_t ino) void AnchorTable::dispatch(Message *m) { switch (m->get_type()) { - case MSG_MDS_ANCHORREQUEST: - handle_anchor_request((MAnchorRequest*)m); + case MSG_MDS_ANCHOR: + handle_anchor_request((MAnchor*)m); break; default: @@ -181,14 +459,13 @@ void AnchorTable::dispatch(Message *m) } - -void AnchorTable::handle_anchor_request(class MAnchorRequest *m) +void AnchorTable::handle_anchor_request(class MAnchor *req) { // make sure i'm open! 
if (!opened) { dout(7) << "not open yet" << endl; - waiting_for_open.push_back(new C_MDS_RetryMessage(mds,m)); + waiting_for_open.push_back(new C_MDS_RetryMessage(mds, req)); if (!opening) { opening = true; @@ -197,35 +474,37 @@ void AnchorTable::handle_anchor_request(class MAnchorRequest *m) return; } + dout(10) << "handle_anchor_request " << *req << endl; + // go - MAnchorReply *reply = new MAnchorReply(m); - - switch (m->get_op()) { + switch (req->get_op()) { case ANCHOR_OP_LOOKUP: - lookup( m->get_ino(), reply->get_trace() ); + handle_lookup(req); break; - case ANCHOR_OP_UPDATE: - destroy( m->get_ino() ); - create( m->get_ino(), m->get_trace() ); + case ANCHOR_OP_CREATE_PREPARE: + handle_create_prepare(req); + break; + case ANCHOR_OP_DESTROY_PREPARE: + handle_destroy_prepare(req); + break; + case ANCHOR_OP_UPDATE_PREPARE: + handle_update_prepare(req); break; - case ANCHOR_OP_CREATE: - create( m->get_ino(), m->get_trace() ); + case ANCHOR_OP_COMMIT: + handle_commit(req); break; - case ANCHOR_OP_DESTROY: - destroy( m->get_ino() ); + case ANCHOR_OP_ROLLBACK: + handle_rollback(req); break; default: assert(0); } - // send reply - mds->messenger->send_message(reply, m->get_source_inst(), m->get_source_port()); - delete m; } @@ -235,38 +514,83 @@ void AnchorTable::handle_anchor_request(class MAnchorRequest *m) // load/save entire table for now! 
+class C_AT_Saved : public Context { + AnchorTable *at; + version_t version; +public: + C_AT_Saved(AnchorTable *a, version_t v) : at(a), version(v) {} + void finish(int r) { + at->_saved(version); + } +}; + void AnchorTable::save(Context *onfinish) { - dout(7) << "save" << endl; - if (!opened) return; + dout(7) << "save v " << version << endl; + if (!opened) { + assert(!onfinish); + return; + } + if (onfinish) + waiting_for_save[version].push_back(onfinish); + + if (committing_version == version) { + dout(7) << "save already committing v " << version << endl; + return; + } + committing_version = version; + // build up write - bufferlist tabbl; + bufferlist bl; - int num = anchor_map.size(); - tabbl.append((char*)&num, sizeof(int)); + // version + bl.append((char*)&version, sizeof(version)); - for (hash_map::iterator it = anchor_map.begin(); + // # anchors + size_t size = anchor_map.size(); + bl.append((char*)&size, sizeof(size)); + + // anchors + for (hash_map::iterator it = anchor_map.begin(); it != anchor_map.end(); it++) { - dout(14) << " saving anchor for " << std::hex << it->first << std::dec << endl; - Anchor *a = it->second; - assert(a); - a->_encode(tabbl); + it->second._encode(bl); + dout(15) << "save encoded " << it->second << endl; } - bufferlist bl; - size_t size = tabbl.length(); - bl.append((char*)&size, sizeof(size)); - bl.claim_append(tabbl); - - dout(7) << " " << num << " anchors, " << size << " bytes" << endl; + // pending + ::_encode(pending_reqmds, bl); + ::_encode(pending_create, bl); + ::_encode(pending_destroy, bl); + size_t s = pending_update.size(); + bl.append((char*)&s, sizeof(s)); + for (map > >::iterator p = pending_update.begin(); + p != pending_update.end(); + ++p) { + bl.append((char*)&p->first, sizeof(p->first)); + bl.append((char*)&p->second.first, sizeof(p->second.first)); + ::_encode(p->second.second, bl); + } + // write! 
- mds->filer->write(table_inode, - 0, bl.length(), - bl, 0, - NULL, onfinish); + mds->objecter->write(object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0), + 0, bl.length(), + bl, + NULL, new C_AT_Saved(this, version)); +} + +void AnchorTable::_saved(version_t v) +{ + dout(7) << "_saved v " << v << endl; + + assert(v <= committing_version); + assert(committed_version < v); + committed_version = v; + + finish_contexts(waiting_for_save[v], 0); + waiting_for_save.erase(v); } @@ -274,85 +598,114 @@ void AnchorTable::save(Context *onfinish) class C_AT_Load : public Context { AnchorTable *at; public: - size_t size; bufferlist bl; - C_AT_Load(size_t size, AnchorTable *at) { - this->size = size; - this->at = at; - } + C_AT_Load(AnchorTable *a) : at(a) {} void finish(int result) { assert(result > 0); - - at->load_2(size, bl); - } -}; - -class C_AT_LoadSize : public Context { - AnchorTable *at; - MDS *mds; -public: - bufferlist bl; - C_AT_LoadSize(AnchorTable *at, MDS *mds) { - this->at = at; - this->mds = mds; - } - void finish(int r) { - size_t size = 0; - assert(bl.length() >= sizeof(size)); - bl.copy(0, sizeof(size), (char*)&size); - cout << "r is " << r << " size is " << size << endl; - if (r > 0 && size > 0) { - C_AT_Load *c = new C_AT_Load(size, at); - mds->filer->read(at->table_inode, - sizeof(size), size, - &c->bl, - c); - } else { - // fail - bufferlist empty; - at->load_2(0, empty); - } + at->_loaded(bl); } }; void AnchorTable::load(Context *onfinish) { dout(7) << "load" << endl; - init_inode(); - assert(!opened); waiting_for_open.push_back(onfinish); - - C_AT_LoadSize *c = new C_AT_LoadSize(this, mds); - mds->filer->read(table_inode, - 0, sizeof(size_t), - &c->bl, - c); + + C_AT_Load *fin = new C_AT_Load(this); + mds->objecter->read(object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0), + 0, 0, &fin->bl, fin); } -void AnchorTable::load_2(size_t size, bufferlist& bl) +void AnchorTable::_loaded(bufferlist& bl) { - // num + dout(10) << "_loaded got " << bl.length() << " 
bytes" << endl; + int off = 0; - int num; - bl.copy(0, sizeof(num), (char*)&num); - off += sizeof(num); - - // parse anchors - for (int i=0; i_decode(bl, off); - dout(10) << "load_2 decoded " << std::hex << a->ino << " dirino " << a->dirino << std::dec << " ref_dn " << a->ref_dn << endl; - anchor_map[a->ino] = a; + bl.copy(off, sizeof(version), (char*)&version); + off += sizeof(version); + + size_t size; + bl.copy(off, sizeof(size), (char*)&size); + off += sizeof(size); + + for (size_t n=0; n::iterator p = pending_reqmds.begin(); + p != pending_reqmds.end(); + p++) + resend_agree(p->first, p->second); +} + + +void AnchorTable::resend_agree(version_t v, int who) +{ + if (pending_create.count(v)) { + MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, pending_create[v], v); + mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); + } + else if (pending_destroy.count(v)) { + MAnchor *reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, pending_destroy[v], v); + mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); + } + else { + assert(pending_update.count(v)); + MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, pending_update[v].first, v); + mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); + } +} + +void AnchorTable::handle_mds_recovery(int who) +{ + dout(7) << "handle_mds_recovery mds" << who << endl; + + // resend agrees for recovered mds + for (map::iterator p = pending_reqmds.begin(); + p != pending_reqmds.end(); + p++) { + if (p->second != who) continue; + resend_agree(p->first, p->second); + } +} diff --git a/trunk/ceph/mds/AnchorTable.h b/trunk/ceph/mds/AnchorTable.h index 0b0af03af5b68..6ef4e9d47bc32 100644 --- a/trunk/ceph/mds/AnchorTable.h +++ b/trunk/ceph/mds/AnchorTable.h @@ -22,59 +22,104 @@ using namespace __gnu_cxx; class MDS; - +class MAnchor; class AnchorTable { MDS *mds; - hash_map anchor_map; - bool opening, opened; - list waiting_for_open; + // keep the entire table in memory. 
+ hash_map anchor_map; - public: - inode_t table_inode; + // uncommitted operations + map pending_reqmds; + map pending_create; + map pending_destroy; + map > > pending_update; - public: - AnchorTable(MDS *mds); + version_t version; // this includes anchor_map AND pending_* state. + version_t committing_version; + version_t committed_version; - protected: - void init_inode(); // call this before doing anything. + // load/save state + bool opening, opened; - // - bool have_ino(inodeno_t ino) { - return true; // always in memory for now. - } - void fetch_ino(inodeno_t ino, Context *onfinish) { - assert(!opened); - load(onfinish); - } + // waiters + list waiting_for_open; + map > waiting_for_save; + +protected: - // adjust table - bool add(inodeno_t ino, inodeno_t dirino, string& ref_dn); + // basic updates + bool add(inodeno_t ino, dirfrag_t dirfrag); void inc(inodeno_t ino); void dec(inodeno_t ino); - + // mid-level + void create_prepare(inodeno_t ino, vector& trace, int reqmds); + void destroy_prepare(inodeno_t ino, int reqmds); + void update_prepare(inodeno_t ino, vector& trace, int reqmds); + void commit(version_t atid); + void rollback(version_t atid); + friend class EAnchor; // used for journal replay. 
+ // high level interface - void lookup(inodeno_t ino, vector& trace); - void create(inodeno_t ino, vector& trace); - void destroy(inodeno_t ino); + void handle_lookup(MAnchor *req); + + void handle_create_prepare(MAnchor *req); + void _create_prepare_logged(MAnchor *req, version_t atid); + friend class C_AT_CreatePrepare; + + void handle_destroy_prepare(MAnchor *req); + void _destroy_prepare_logged(MAnchor *req, version_t atid); + friend class C_AT_DestroyPrepare; + + void handle_update_prepare(MAnchor *req); + void _update_prepare_logged(MAnchor *req, version_t atid); + friend class C_AT_UpdatePrepare; + + void handle_commit(MAnchor *req); + void _commit_logged(MAnchor *req); + friend class C_AT_Commit; + + void handle_rollback(MAnchor *req); // messages - public: - void dispatch(class Message *m); - protected: - void handle_anchor_request(class MAnchorRequest *m); + void handle_anchor_request(MAnchor *m); + void dump(); - public: +public: + AnchorTable(MDS *m) : + mds(m), + version(0), committing_version(0), committed_version(0), + opening(false), opened(false) { } + + void dispatch(class Message *m); + + version_t get_version() { return version; } + version_t get_committed_version() { return committed_version; } + + void create_fresh() { + // reset (i.e. on mkfs) to empty, but unsaved table. + version = 1; + opened = true; + opening = false; + anchor_map.clear(); + pending_create.clear(); + pending_destroy.clear(); + pending_update.clear(); + } // load/save entire table for now! 
- void reset(); void save(Context *onfinish); + void _saved(version_t v); void load(Context *onfinish); - void load_2(size_t size, bufferlist& bl); + void _loaded(bufferlist& bl); + // recovery + void handle_mds_recovery(int who); + void finish_recovery(); + void resend_agree(version_t v, int who); }; diff --git a/trunk/ceph/mds/CDentry.cc b/trunk/ceph/mds/CDentry.cc index 22d292a001e33..40629f1c99f9e 100644 --- a/trunk/ceph/mds/CDentry.cc +++ b/trunk/ceph/mds/CDentry.cc @@ -16,10 +16,13 @@ #include "CDentry.h" #include "CInode.h" #include "CDir.h" +#include "Anchor.h" #include "MDS.h" #include "MDCache.h" +#include "messages/MLock.h" + #include #undef dout @@ -47,11 +50,7 @@ ostream& operator<<(ostream& out, CDentry& dn) if (dn.is_null()) out << " NULL"; if (dn.is_remote()) out << " REMOTE"; - if (dn.is_pinned()) out << " " << dn.num_pins() << " pathpins"; - - if (dn.get_lockstate() == DN_LOCK_UNPINNING) out << " unpinning"; - if (dn.get_lockstate() == DN_LOCK_PREXLOCK) out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set(); - if (dn.get_lockstate() == DN_LOCK_XLOCK) out << " xlock=" << dn.get_xlockedby(); + out << " " << dn.lock; out << " v=" << dn.get_version(); out << " pv=" << dn.get_projected_version(); @@ -60,10 +59,7 @@ ostream& operator<<(ostream& out, CDentry& dn) if (dn.get_num_ref()) { out << " |"; - for(set::iterator it = dn.get_ref_set().begin(); - it != dn.get_ref_set().end(); - it++) - out << " " << CDentry::pin_name(*it); + dn.print_pin_set(out); } out << " " << &dn; @@ -71,8 +67,19 @@ ostream& operator<<(ostream& out, CDentry& dn) return out; } -CDentry::CDentry(const CDentry& m) { - assert(0); //std::cerr << "copy cons called, implement me" << endl; + +bool operator<(const CDentry& l, const CDentry& r) +{ + if (l.get_dir()->ino() < r.get_dir()->ino()) return true; + if (l.get_dir()->ino() == r.get_dir()->ino() && + l.get_name() < r.get_name()) return true; + return false; +} + + +void CDentry::print(ostream& out) +{ + out << 
*this; } @@ -84,16 +91,15 @@ inodeno_t CDentry::get_ino() } -int CDentry::authority() +pair CDentry::authority() { - return dir->dentry_authority( name ); + return dir->authority(); } -version_t CDentry::pre_dirty() +version_t CDentry::pre_dirty(version_t min) { - // NOTE: in the future, this will dirty a particular slice/subset of the dir. - projected_version = dir->pre_dirty(); + projected_version = dir->pre_dirty(min); dout(10) << " pre_dirty " << *this << endl; return projected_version; } @@ -138,8 +144,7 @@ void CDentry::mark_clean() { void CDentry::make_path(string& s) { if (dir) { - if (dir->inode->get_parent_dn()) - dir->inode->get_parent_dn()->make_path(s); + dir->inode->make_path(s); } else { s = "???"; } @@ -147,6 +152,21 @@ void CDentry::make_path(string& s) s += name; } +/** make_anchor_trace + * construct an anchor trace for this dentry, as if it were linked to *in. + */ +void CDentry::make_anchor_trace(vector& trace, CInode *in) +{ + // start with parent dir inode + if (dir) + dir->inode->make_anchor_trace(trace); + + // add this inode (in my dirfrag) to the end + trace.push_back(Anchor(in->ino(), dir->dirfrag())); + dout(10) << "make_anchor_trace added " << trace.back() << endl; +} + + void CDentry::link_remote(CInode *in) { @@ -174,30 +194,20 @@ CDentryDiscover *CDentry::replicate_to(int who) } +// ---------------------------- +// locking +void CDentry::set_mlock_info(MLock *m) +{ + m->set_dn(dir->dirfrag(), name); +} -// = -const CDentry& CDentry::operator= (const CDentry& right) { - assert(0); //std::cerr << "copy op called, implement me" << endl; - return *this; +void CDentry::encode_lock_state(int type, bufferlist& bl) +{ + } - // comparisons - bool CDentry::operator== (const CDentry& right) const { - return name == right.name; - } - bool CDentry::operator!= (const CDentry& right) const { - return name == right.name; - } - bool CDentry::operator< (const CDentry& right) const { - return name < right.name; - } - bool CDentry::operator> (const 
CDentry& right) const { - return name > right.name; - } - bool CDentry::operator>= (const CDentry& right) const { - return name >= right.name; - } - bool CDentry::operator<= (const CDentry& right) const { - return name <= right.name; - } +void CDentry::decode_lock_state(int type, bufferlist& bl) +{ + +} diff --git a/trunk/ceph/mds/CDentry.h b/trunk/ceph/mds/CDentry.h index 65b9155ce69f9..9480a320cddbe 100644 --- a/trunk/ceph/mds/CDentry.h +++ b/trunk/ceph/mds/CDentry.h @@ -26,41 +26,44 @@ using namespace std; #include "include/lru.h" #include "mdstypes.h" +#include "SimpleLock.h" + class CInode; class CDir; - -#define DN_LOCK_SYNC 0 -#define DN_LOCK_PREXLOCK 1 -#define DN_LOCK_XLOCK 2 -#define DN_LOCK_UNPINNING 3 // waiting for pins to go away .. FIXME REVIEW THIS CODE .. - -#define DN_XLOCK_FOREIGN ((Message*)0x1) // not 0, not a valid pointer. +class MDRequest; class Message; class CDentryDiscover; +class Anchor; + +class CDentry; + +// define an ordering +bool operator<(const CDentry& l, const CDentry& r); // dentry class CDentry : public MDSCacheObject, public LRUObject { public: - // state - static const int STATE_AUTH = (1<<0); - static const int STATE_DIRTY = (1<<1); - - // pins - static const int PIN_INODEPIN = 0; // linked inode is pinned - static const int PIN_REPLICATED = 1; // replicated by another MDS - static const int PIN_DIRTY = 2; // - static const int PIN_PROXY = 3; // - static const char *pin_name(int p) { + // -- state -- + + // -- pins -- + static const int PIN_INODEPIN = 1; // linked inode is pinned + const char *pin_name(int p) { switch (p) { case PIN_INODEPIN: return "inodepin"; - case PIN_REPLICATED: return "replicated"; - case PIN_DIRTY: return "dirty"; - case PIN_PROXY: return "proxy"; - default: assert(0); + default: return generic_pin_name(p); } }; + // -- wait -- + static const int WAIT_LOCK_OFFSET = 8; + + + static const int EXPORT_NONCE = 1; + + bool is_lt(const MDSCacheObject *r) const { + return *this < *(CDentry*)r; + } protected: 
string name; @@ -72,14 +75,6 @@ class CDentry : public MDSCacheObject, public LRUObject { version_t version; // dir version when last touched. version_t projected_version; // what it will be when i unlock/commit. - // locking - int lockstate; - Message *xlockedby; - set gather_set; - - // path pins - int npins; - multiset pinset; friend class Migrator; friend class Locker; @@ -90,6 +85,13 @@ class CDentry : public MDSCacheObject, public LRUObject { friend class CInode; friend class C_MDC_XlockRequest; + +public: + // lock + SimpleLock lock; + + + public: // cons CDentry() : @@ -98,9 +100,7 @@ class CDentry : public MDSCacheObject, public LRUObject { remote_ino(0), version(0), projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } + lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } CDentry(const string& n, inodeno_t ino, CInode *in=0) : name(n), inode(in), @@ -108,9 +108,7 @@ class CDentry : public MDSCacheObject, public LRUObject { remote_ino(ino), version(0), projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } + lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } CDentry(const string& n, CInode *in) : name(n), inode(in), @@ -118,13 +116,11 @@ class CDentry : public MDSCacheObject, public LRUObject { remote_ino(0), version(0), projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } + lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CInode *get_inode() { return inode; } - CDir *get_dir() { return dir; } - const string& get_name() { return name; } + CInode *get_inode() const { return inode; } + CDir *get_dir() const { return dir; } + const string& get_name() const { return name; } inodeno_t get_ino(); inodeno_t get_remote_ino() { return remote_ino; } @@ -155,30 +151,19 @@ class CDentry : public MDSCacheObject, public LRUObject { CDentry(const CDentry& m); const CDentry& operator= (const CDentry& right); - // comparisons - bool operator== (const CDentry& right) const; - bool operator!= (const 
CDentry& right) const; - bool operator< (const CDentry& right) const; - bool operator> (const CDentry& right) const; - bool operator>= (const CDentry& right) const; - bool operator<= (const CDentry& right) const; - // misc void make_path(string& p); + void make_anchor_trace(vector& trace, CInode *in); - // -- state + // -- version -- version_t get_version() { return version; } void set_version(version_t v) { projected_version = version = v; } version_t get_projected_version() { return projected_version; } void set_projected_version(version_t v) { projected_version = v; } - int authority(); + pair authority(); - bool is_auth() { return state & STATE_AUTH; } - bool is_dirty() { return state & STATE_DIRTY; } - bool is_clean() { return !is_dirty(); } - - version_t pre_dirty(); + version_t pre_dirty(version_t min=0); void _mark_dirty(); void mark_dirty(version_t projected_dirv); void mark_clean(); @@ -188,55 +173,57 @@ class CDentry : public MDSCacheObject, public LRUObject { CDentryDiscover *replicate_to(int rep); - // -- locking - int get_lockstate() { return lockstate; } - set& get_gather_set() { return gather_set; } - - bool is_sync() { return lockstate == DN_LOCK_SYNC; } - bool can_read() { return (lockstate == DN_LOCK_SYNC) || (lockstate == DN_LOCK_UNPINNING); } - bool can_read(Message *m) { return is_xlockedbyme(m) || can_read(); } - bool is_xlocked() { return lockstate == DN_LOCK_XLOCK; } - Message* get_xlockedby() { return xlockedby; } - bool is_xlockedbyother(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m != xlockedby; } - bool is_xlockedbyme(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m == xlockedby; } - bool is_prexlockbyother(Message *m) { - return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby; + // -- exporting + // note: this assumes the dentry already exists. + // i.e., the name is already extracted... so we just need the other state. 
+ void encode_export_state(bufferlist& bl) { + bl.append((char*)&state, sizeof(state)); + bl.append((char*)&version, sizeof(version)); + bl.append((char*)&projected_version, sizeof(projected_version)); + lock._encode(bl); + ::_encode(replicas, bl); + + // twiddle + clear_replicas(); + replica_nonce = EXPORT_NONCE; + state_clear(CDentry::STATE_AUTH); + if (is_dirty()) + mark_clean(); + } + void decode_import_state(bufferlist& bl, int& off, int from, int to) { + int nstate; + bl.copy(off, sizeof(nstate), (char*)&nstate); + off += sizeof(nstate); + bl.copy(off, sizeof(version), (char*)&version); + off += sizeof(version); + bl.copy(off, sizeof(projected_version), (char*)&projected_version); + off += sizeof(projected_version); + lock._decode(bl, off); + ::_decode(replicas, bl, off); + + // twiddle + state = 0; + state_set(CDentry::STATE_AUTH); + if (nstate & STATE_DIRTY) + _mark_dirty(); + if (!replicas.empty()) + get(PIN_REPLICATED); + add_replica(from, EXPORT_NONCE); + if (is_replica(to)) + remove_replica(to); } - int get_replica_lockstate() { - switch (lockstate) { - case DN_LOCK_XLOCK: - case DN_LOCK_SYNC: - return lockstate; - case DN_LOCK_PREXLOCK: - return DN_LOCK_XLOCK; - case DN_LOCK_UNPINNING: - return DN_LOCK_SYNC; - } - assert(0); - return 0; + // -- locking -- + SimpleLock* get_lock(int type) { + assert(type == LOCK_OTYPE_DN); + return &lock; } - void set_lockstate(int s) { lockstate = s; } + void set_mlock_info(MLock *m); + void encode_lock_state(int type, bufferlist& bl); + void decode_lock_state(int type, bufferlist& bl); + - // path pins - void pin(Message *m) { - npins++; - pinset.insert(m); - assert(pinset.size() == (unsigned)npins); - } - void unpin(Message *m) { - npins--; - assert(npins >= 0); - assert(pinset.count(m) > 0); - pinset.erase(pinset.find(m)); - assert(pinset.size() == (unsigned)npins); - } - bool is_pinnable(Message *m) { - return (lockstate == DN_LOCK_SYNC) || - (lockstate == DN_LOCK_UNPINNING && pinset.count(m)); - } - bool 
is_pinned() { return npins>0; } - int num_pins() { return npins; } + void print(ostream& out); friend class CDir; }; @@ -244,6 +231,7 @@ class CDentry : public MDSCacheObject, public LRUObject { ostream& operator<<(ostream& out, CDentry& dn); + class CDentryDiscover { string dname; int replica_nonce; @@ -256,16 +244,23 @@ public: CDentryDiscover() {} CDentryDiscover(CDentry *dn, int nonce) : dname(dn->get_name()), replica_nonce(nonce), - lockstate(dn->get_replica_lockstate()), + lockstate(dn->lock.get_replica_state()), ino(dn->get_ino()), remote_ino(dn->get_remote_ino()) { } string& get_dname() { return dname; } int get_nonce() { return replica_nonce; } + bool is_remote() { return remote_ino ? true:false; } + inodeno_t get_remote_ino() { return remote_ino; } void update_dentry(CDentry *dn) { dn->set_replica_nonce( replica_nonce ); - dn->set_lockstate( lockstate ); + if (remote_ino) + dn->set_remote_ino(remote_ino); + } + void update_new_dentry(CDentry *dn) { + update_dentry(dn); + dn->lock.set_state( lockstate ); } void _encode(bufferlist& bl) { @@ -285,4 +280,5 @@ public: }; + #endif diff --git a/trunk/ceph/mds/CDir.cc b/trunk/ceph/mds/CDir.cc index c9b9996d91c2d..cd12e96e09891 100644 --- a/trunk/ceph/mds/CDir.cc +++ b/trunk/ceph/mds/CDir.cc @@ -24,11 +24,9 @@ #include "include/Context.h" #include "common/Clock.h" -#include +#include "osdc/Objecter.h" -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") " +#include // PINS @@ -40,84 +38,98 @@ ostream& operator<<(ostream& out, CDir& dir) { string path; dir.get_inode()->make_path(path); - out << "[dir " << dir.ino() << " " << path << "/"; + out << "[dir " << dir.ino(); + if (!dir.frag.is_root()) out << "%" << dir.frag; + out << " " << path << "/"; if (dir.is_auth()) { out << " auth"; if (dir.is_replicated()) out << dir.get_replicas(); - out << " v=" << 
dir.get_version(); out << " pv=" << dir.get_projected_version(); + out << " v=" << dir.get_version(); out << " cv=" << dir.get_committing_version(); - out << " lastcv=" << dir.get_last_committed_version(); + out << "/" << dir.get_committed_version(); } else { out << " rep@" << dir.authority(); if (dir.get_replica_nonce() > 1) out << "." << dir.get_replica_nonce(); } - if (dir.get_dir_auth() != CDIR_AUTH_PARENT) - out << " dir_auth=" << dir.get_dir_auth(); + if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT) { + if (dir.get_dir_auth().second == CDIR_AUTH_UNKNOWN) + out << " dir_auth=" << dir.get_dir_auth().first; + else + out << " dir_auth=" << dir.get_dir_auth(); + } + if (dir.get_cum_auth_pins()) + out << " ap=" << dir.get_auth_pins() << "+" << dir.get_nested_auth_pins(); + out << " state=" << dir.get_state(); - if (dir.state_test(CDIR_STATE_PROXY)) out << "|proxy"; - if (dir.state_test(CDIR_STATE_COMPLETE)) out << "|complete"; - if (dir.state_test(CDIR_STATE_FREEZINGTREE)) out << "|freezingtree"; - if (dir.state_test(CDIR_STATE_FROZENTREE)) out << "|frozentree"; - if (dir.state_test(CDIR_STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; - if (dir.state_test(CDIR_STATE_FROZENDIR)) out << "|frozendir"; - if (dir.state_test(CDIR_STATE_FREEZINGDIR)) out << "|freezingdir"; + if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete"; + if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree"; + if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree"; + //if (dir.state_test(CDir::STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; + if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir"; + if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir"; + if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound"; + if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound"; out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); if (dir.get_num_ref()) { out << " |"; - for(set::iterator it = dir.get_ref_set().begin(); - it 
!= dir.get_ref_set().end(); - it++) - out << " " << CDir::pin_name(*it); + dir.print_pin_set(out); } out << " " << &dir; return out << "]"; } +void CDir::print(ostream& out) +{ + out << *this; +} + + +#include "config.h" +#undef dout +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << get_inode()->inode.ino << ") " +//#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache." << *this << " " + // ------------------------------------------------------------------- // CDir -CDir::CDir(CInode *in, MDCache *mdcache, bool auth) +CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) { inode = in; + frag = fg; this->cache = mdcache; nitems = 0; nnull = 0; - state = CDIR_STATE_INITIAL; + state = STATE_INITIAL; projected_version = version = 0; committing_version = 0; - last_committed_version = 0; + committed_version = 0; - ref = 0; + // dir_auth + dir_auth = CDIR_AUTH_DEFAULT; // auth - dir_auth = -1; assert(in->is_dir()); if (auth) - state |= CDIR_STATE_AUTH; - /* - if (in->dir_is_hashed()) { - assert(0); // when does this happen? - state |= CDIR_STATE_HASHED; - } - */ + state |= STATE_AUTH; auth_pins = 0; nested_auth_pins = 0; request_pins = 0; - dir_rep = CDIR_REP_NONE; + dir_rep = REP_NONE; + //dir_rep = REP_ALL; // hack: to wring out some bugs! 
FIXME FIXME } @@ -143,7 +155,7 @@ CDentry* CDir::add_dentry( const string& dname, inodeno_t ino, bool auth) // add to dir assert(items.count(dn->name) == 0); - assert(null_items.count(dn->name) == 0); + //assert(null_items.count(dn->name) == 0); items[dn->name] = dn; nitems++; @@ -154,7 +166,7 @@ CDentry* CDir::add_dentry( const string& dname, inodeno_t ino, bool auth) if (nnull + nitems == 1) get(PIN_CHILD); assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); + //assert(nnull == null_items.size()); return dn; } @@ -175,7 +187,7 @@ CDentry* CDir::add_dentry( const string& dname, CInode *in, bool auth ) // add to dir assert(items.count(dn->name) == 0); - assert(null_items.count(dn->name) == 0); + //assert(null_items.count(dn->name) == 0); items[dn->name] = dn; @@ -183,7 +195,7 @@ CDentry* CDir::add_dentry( const string& dname, CInode *in, bool auth ) link_inode_work( dn, in ); } else { assert(dn->inode == 0); - null_items[dn->name] = dn; + //null_items[dn->name] = dn; nnull++; } @@ -193,7 +205,7 @@ CDentry* CDir::add_dentry( const string& dname, CInode *in, bool auth ) if (nnull + nitems == 1) get(PIN_CHILD); assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); + //assert(nnull == null_items.size()); return dn; } @@ -208,8 +220,8 @@ void CDir::remove_dentry(CDentry *dn) unlink_inode_work(dn); } else { // remove from null list - assert(null_items.count(dn->name) == 1); - null_items.erase(dn->name); + //assert(null_items.count(dn->name) == 1); + //null_items.erase(dn->name); nnull--; } @@ -224,7 +236,7 @@ void CDir::remove_dentry(CDentry *dn) if (nnull + nitems == 0) put(PIN_CHILD); assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); + //assert(nnull == null_items.size()); } void CDir::link_inode( CDentry *dn, inodeno_t ino) @@ -235,9 +247,10 @@ void CDir::link_inode( CDentry *dn, inodeno_t ino) dn->set_remote_ino(ino); nitems++; - assert(null_items.count(dn->name) == 1); - 
null_items.erase(dn->name); + //assert(null_items.count(dn->name) == 1); + //null_items.erase(dn->name); nnull--; + assert(nnull + nitems == items.size()); } void CDir::link_inode( CDentry *dn, CInode *in ) @@ -248,12 +261,12 @@ void CDir::link_inode( CDentry *dn, CInode *in ) link_inode_work(dn,in); // remove from null list - assert(null_items.count(dn->name) == 1); - null_items.erase(dn->name); + //assert(null_items.count(dn->name) == 1); + //null_items.erase(dn->name); nnull--; assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); + //assert(nnull == null_items.size()); } void CDir::link_inode_work( CDentry *dn, CInode *in ) @@ -263,12 +276,9 @@ void CDir::link_inode_work( CDentry *dn, CInode *in ) nitems++; // adjust dir size - // set dir version - in->inode.version = dn->get_version(); + // set inode version + //in->inode.version = dn->get_version(); - // clear dangling - in->state_clear(CInode::STATE_DANGLING); - // pin dentry? if (in->get_num_ref()) dn->get(CDentry::PIN_INODEPIN); @@ -280,17 +290,21 @@ void CDir::link_inode_work( CDentry *dn, CInode *in ) void CDir::unlink_inode( CDentry *dn ) { - dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl; + if (dn->is_remote()) { + dout(12) << "unlink_inode " << *dn << endl; + } else { + dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl; + } unlink_inode_work(dn); // add to null list - assert(null_items.count(dn->name) == 0); - null_items[dn->name] = dn; + //assert(null_items.count(dn->name) == 0); + //null_items[dn->name] = dn; nnull++; assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); + //assert(nnull == null_items.size()); } void CDir::unlink_inode_work( CDentry *dn ) @@ -307,10 +321,6 @@ void CDir::unlink_inode_work( CDentry *dn ) // primary assert(dn->is_primary()); - // explicitly define auth - in->dangling_auth = in->authority(); - //dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl; - // 
unpin dentry? if (in->get_num_ref()) dn->put(CDentry::PIN_INODEPIN); @@ -319,9 +329,6 @@ void CDir::unlink_inode_work( CDentry *dn ) if (in->auth_pins + in->nested_auth_pins) adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) ); - // set dangling flag - in->state_set(CInode::STATE_DANGLING); - // detach inode in->remove_primary_parent(dn); dn->inode = 0; @@ -334,136 +341,107 @@ void CDir::remove_null_dentries() { dout(12) << "remove_null_dentries " << *this << endl; list dns; - for (CDir_map_t::iterator it = null_items.begin(); - it != null_items.end(); + for (CDir_map_t::iterator it = items.begin(); + it != items.end(); it++) { - dns.push_back(it->second); + if (it->second->is_null()) + dns.push_back(it->second); } - + for (list::iterator it = dns.begin(); it != dns.end(); it++) { CDentry *dn = *it; - assert(dn->is_sync()); remove_dentry(dn); } - assert(null_items.empty()); + //assert(null_items.empty()); assert(nnull == 0); assert(nnull + nitems == items.size()); } + +CDirDiscover *CDir::replicate_to(int mds) +{ + assert(is_auth()); + return new CDirDiscover( this, add_replica(mds) ); +} + + + + + /**************************************** * WAITING */ -bool CDir::waiting_for(int tag) +void CDir::add_dentry_waiter(const string& dname, Context *c) { - return waiting.count(tag) > 0; + if (waiting_on_dentry.empty()) + get(PIN_DNWAITER); + waiting_on_dentry[dname].push_back(c); + dout(10) << "add_dentry_waiter dentry " << dname << " " << c << " on " << *this << endl; } -bool CDir::waiting_for(int tag, const string& dn) +void CDir::take_dentry_waiting(const string& dname, list& ls) { - if (!waiting_on_dentry.count(dn)) - return false; - return waiting_on_dentry[dn].count(tag) > 0; + if (waiting_on_dentry.empty()) return; + if (waiting_on_dentry.count(dname) == 0) return; + dout(10) << "take_dentry_waiting dentry " << dname + << " x " << waiting_on_dentry[dname].size() + << " on " << *this << endl; + ls.splice(ls.end(), waiting_on_dentry[dname]); + 
waiting_on_dentry.erase(dname); + if (waiting_on_dentry.empty()) + put(PIN_DNWAITER); } -void CDir::add_waiter(int tag, - const string& dentry, - Context *c) { - if (waiting.empty() && waiting_on_dentry.size() == 0) - get(PIN_WAITER); - waiting_on_dentry[ dentry ].insert(pair(tag,c)); - dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl; -} -void CDir::add_waiter(int tag, Context *c) { +void CDir::add_waiter(int tag, Context *c) +{ // hierarchical? - if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) { - if (is_freezing_tree_root() || is_frozen_tree_root() || - is_freezing_dir() || is_frozen_dir()) { - // it's us, pin here. (fall thru) - } else { - // pin parent! + + // at free root? + if (tag & WAIT_ATFREEZEROOT) { + if (!(is_freezing_tree_root() || is_frozen_tree_root() || + is_freezing_dir() || is_frozen_dir())) { + // try parent dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << endl; inode->parent->dir->add_waiter(tag, c); return; } } - - // this dir. 
- if (waiting.empty() && waiting_on_dentry.size() == 0) - get(PIN_WAITER); - waiting.insert(pair(tag,c)); - dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; -} - - -void CDir::take_waiting(int mask, - const string& dentry, - list& ls, - int num) -{ - if (waiting_on_dentry.empty()) return; - multimap::iterator it = waiting_on_dentry[dentry].begin(); - while (it != waiting_on_dentry[dentry].end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - waiting_on_dentry[dentry].erase(it++); - - if (num) { - if (num == 1) break; - num--; - } - } else { - dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl; - it++; + // at subtree root? + if (tag & WAIT_ATSUBTREEROOT) { + if (!is_subtree_root()) { + // try parent + dout(10) << "add_waiter " << tag << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << endl; + inode->parent->dir->add_waiter(tag, c); + return; } } - // did we clear dentry? - if (waiting_on_dentry[dentry].empty()) - waiting_on_dentry.erase(dentry); - - // ...whole map? 
- if (waiting_on_dentry.size() == 0 && waiting.empty()) - put(PIN_WAITER); + MDSCacheObject::add_waiter(tag, c); } + + /* NOTE: this checks dentry waiters too */ -void CDir::take_waiting(int mask, - list& ls) +void CDir::take_waiting(int mask, list& ls) { - if (waiting_on_dentry.size()) { - // try each dentry - hash_map >::iterator it = + if (mask & WAIT_DENTRY) { + // take each each dentry waiter + hash_map >::iterator it = waiting_on_dentry.begin(); while (it != waiting_on_dentry.end()) { - take_waiting(mask, (it++)->first, ls); // not post-inc + take_dentry_waiting((it++)->first, ls); // not post-inc } } // waiting - if (!waiting.empty()) { - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - waiting.erase(it++); - } else { - dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this<< endl; - it++; - } - } - - if (waiting_on_dentry.size() == 0 && waiting.empty()) - put(PIN_WAITER); - } + MDSCacheObject::take_waiting(mask, ls); } @@ -473,23 +451,18 @@ void CDir::finish_waiting(int mask, int result) list finished; take_waiting(mask, finished); - finish_contexts(finished, result); + //finish_contexts(finished, result); + cache->mds->queue_waiters(finished); } -void CDir::finish_waiting(int mask, const string& dn, int result) -{ - dout(11) << "finish_waiting mask " << mask << " dn " << dn << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, dn, finished); - finish_contexts(finished, result); -} // dirty/clean -version_t CDir::pre_dirty() +version_t CDir::pre_dirty(version_t min) { + if (min > projected_version) + projected_version = min; ++projected_version; dout(10) << "pre_dirty " << projected_version << endl; return projected_version; @@ -497,8 +470,8 @@ version_t 
CDir::pre_dirty() void CDir::_mark_dirty() { - if (!state_test(CDIR_STATE_DIRTY)) { - state_set(CDIR_STATE_DIRTY); + if (!state_test(STATE_DIRTY)) { + state_set(STATE_DIRTY); dout(10) << "mark_dirty (was clean) " << *this << " version " << version << endl; get(PIN_DIRTY); } else { @@ -508,16 +481,16 @@ void CDir::_mark_dirty() void CDir::mark_dirty(version_t pv) { - ++version; - assert(pv == version); + assert(version < pv); + version = pv; _mark_dirty(); } void CDir::mark_clean() { dout(10) << "mark_clean " << *this << " version " << version << endl; - if (state_test(CDIR_STATE_DIRTY)) { - state_clear(CDIR_STATE_DIRTY); + if (state_test(STATE_DIRTY)) { + state_clear(STATE_DIRTY); put(PIN_DIRTY); } } @@ -537,68 +510,584 @@ void CDir::last_put() -/******************************** - * AUTHORITY +/****************************************************************************** + * FETCH and COMMIT */ -/* - * simple rule: if dir_auth isn't explicit, auth is the same as the inode. +// ----------------------- +// FETCH + +class C_Dir_Fetch : public Context { + protected: + CDir *dir; + public: + bufferlist bl; + + C_Dir_Fetch(CDir *d) : dir(d) { } + void finish(int result) { + dir->_fetched(bl); + } +}; + +void CDir::fetch(Context *c) +{ + dout(10) << "fetch on " << *this << endl; + + assert(is_auth()); + assert(!is_complete()); + + if (c) add_waiter(WAIT_COMPLETE, c); + + // already fetching? + if (state_test(CDir::STATE_FETCHING)) { + dout(7) << "already fetching; waiting" << endl; + return; + } + + state_set(CDir::STATE_FETCHING); + + if (cache->mds->logger) cache->mds->logger->inc("fdir"); + + // start by reading the first hunk of it + C_Dir_Fetch *fin = new C_Dir_Fetch(this); + cache->mds->objecter->read( get_ondisk_object(), + 0, 0, // whole object + &fin->bl, + fin ); +} + +void CDir::_fetched(bufferlist &bl) +{ + dout(10) << "_fetched " << 0 << "~" << bl.length() + << " on " << *this + << endl; + + // give up? 
+ if (!is_auth() || is_frozen()) { + dout(10) << "_fetched canceling (!auth or frozen)" << endl; + //ondisk_bl.clear(); + //ondisk_size = 0; + + // kick waiters? + state_clear(CDir::STATE_FETCHING); + finish_waiting(WAIT_COMPLETE, -1); + return; + } + + // decode. + int len = bl.length(); + int off = 0; + version_t got_version; + + bl.copy(off, sizeof(got_version), (char*)&got_version); + off += sizeof(got_version); + + dout(10) << "_fetched version " << got_version + << ", " << len << " bytes" + << endl; + + while (off < len) { + // marker + char type = bl[off]; + ++off; + + // dname + string dname; + ::_decode(dname, bl, off); + dout(24) << "_fetched parsed marker '" << type << "' dname '" << dname << "'" << endl; + + CDentry *dn = lookup(dname); // existing dentry? + + if (type == 'L') { + // hard link + inodeno_t ino; + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + + if (dn) { + if (dn->get_inode() == 0) { + dout(12) << "_fetched had NEG dentry " << *dn << endl; + } else { + dout(12) << "_fetched had dentry " << *dn << endl; + } + } else { + // (remote) link + CDentry *dn = add_dentry( dname, ino ); + + // link to inode? + CInode *in = cache->get_inode(ino); // we may or may not have it. 
+ if (in) { + dn->link_remote(in); + dout(12) << "_fetched got remote link " << ino << " which we have " << *in << endl; + } else { + dout(12) << "_fetched got remote link " << ino << " (dont' have it)" << endl; + } + } + } + else if (type == 'I') { + // inode + + // parse out inode + inode_t inode; + bl.copy(off, sizeof(inode), (char*)&inode); + off += sizeof(inode); + + string symlink; + if (inode.is_symlink()) + ::_decode(symlink, bl, off); + + fragtree_t fragtree; + fragtree._decode(bl,off); + + if (dn) { + if (dn->get_inode() == 0) { + dout(12) << "_fetched had NEG dentry " << *dn << endl; + } else { + dout(12) << "_fetched had dentry " << *dn << endl; + } + } else { + // add inode + CInode *in = 0; + if (cache->have_inode(inode.ino)) { + in = cache->get_inode(inode.ino); + dout(12) << "_fetched got (but i already had) " << *in + << " mode " << in->inode.mode + << " mtime " << in->inode.mtime << endl; + assert(0); // this shouldn't happen!! + } else { + // inode + in = new CInode(cache); + in->inode = inode; + + // symlink? + if (in->is_symlink()) + in->symlink = symlink; + + // dirfragtree + in->dirfragtree.swap(fragtree); + + // add + cache->add_inode( in ); + + // link + add_dentry( dname, in ); + dout(12) << "_fetched got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; + } + } + } else { + dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) + << " at pos " << off << endl; + assert(0); + } + + /** clean underwater item? + * Underwater item is something that is dirty in our cache from + * journal replay, but was previously flushed to disk before the + * mds failed. + * + * We only do this is committed_version == 0. that implies either + * - this is a fetch after from a clean/empty CDir is created + * (and has no effect, since the dn won't exist); or + * - this is a fetch after _recovery_, which is what we're worried + * about. 
Items that are marked dirty from the journal should be + * marked clean if they appear on disk. + */ + if (committed_version == 0 && + dn && + dn->get_version() <= got_version && + dn->is_dirty()) { + dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << endl; + dn->mark_clean(); + + if (dn->get_inode()) { + assert(dn->get_inode()->get_version() <= got_version); + dout(10) << "_fetched had underwater inode " << *dn->get_inode() << ", marking clean" << endl; + dn->get_inode()->mark_clean(); + } + } + } + assert(off == len); + + // take the loaded version? + // only if we are a fresh CDir* with no prior state. + if (version == 0) { + assert(projected_version == 0); + assert(!state_test(STATE_COMMITTING)); + projected_version = version = committing_version = committed_version = got_version; + } + + // mark complete, !fetching + state_set(STATE_COMPLETE); + state_clear(STATE_FETCHING); + + // kick waiters + finish_waiting(WAIT_COMPLETE, 0); + /* + list waiters; + take_waiting(WAIT_COMPLETE, waiters); + cache->mds->queue_finished(waiters); + */ +} + + + +// ----------------------- +// COMMIT + +/** + * commit + * + * @param want min version i want committed + * @param c callback for completion */ -int CDir::authority() +void CDir::commit(version_t want, Context *c) { - if (dir_auth == CDIR_AUTH_PARENT) - return inode->authority(); - return dir_auth; + dout(10) << "commit want " << want << " on " << *this << endl; + if (want == 0) want = version; + + // preconditions + assert(want <= version || version == 0); // can't commit the future + assert(committed_version < want); // the caller is stupid + assert(is_auth()); + assert(can_auth_pin()); + + // note: queue up a noop if necessary, so that we always + // get an auth_pin. + if (!c) + c = new C_NoopContext; + + // auth_pin on first waiter + if (waiting_for_commit.empty()) + auth_pin(); + waiting_for_commit[want].push_back(c); + + // ok. 
+ _commit(want); } -int CDir::dentry_authority(const string& dn ) + +class C_Dir_RetryCommit : public Context { + CDir *dir; + version_t want; +public: + C_Dir_RetryCommit(CDir *d, version_t v) : + dir(d), want(v) { } + void finish(int r) { + dir->_commit(want); + } +}; + +class C_Dir_Committed : public Context { + CDir *dir; + version_t version; +public: + C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { } + void finish(int r) { + dir->_committed(version); + } +}; + +void CDir::_commit(version_t want) { - // hashing -- subset of nodes have hashed the contents - if (is_hashing() && !hashed_subset.empty()) { - int hashauth = cache->hash_dentry( inode->ino(), dn ); // hashed - if (hashed_subset.count(hashauth)) - return hashauth; + dout(10) << "_commit want " << want << " on " << *this << endl; + + // we can't commit things in the future. + // (even the projected future.) + assert(want <= version || version == 0); + + // check pre+postconditions. + assert(is_auth()); + + // already committed? + if (committed_version >= want) { + dout(10) << "already committed " << committed_version << " >= " << want << endl; + return; } + // already committing >= want? + if (committing_version >= want) { + dout(10) << "already committing " << committing_version << " >= " << want << endl; + assert(state_test(STATE_COMMITTING)); + return; + } + + // complete? + if (!is_complete()) { + dout(7) << "commit not complete, fetching first" << endl; + fetch(new C_Dir_RetryCommit(this, want)); + return; + } + + // commit. 
+ committing_version = version; - // hashed - if (is_hashed()) { - return cache->hash_dentry( inode->ino(), dn ); // hashed + // mark committing (if not already) + if (!state_test(STATE_COMMITTING)) { + dout(10) << "marking committing" << endl; + state_set(STATE_COMMITTING); } - if (get_dir_auth() == CDIR_AUTH_PARENT) { - //dout(15) << "dir_auth = parent at " << *this << endl; - return inode->authority(); // same as my inode + if (cache->mds->logger) cache->mds->logger->inc("cdir"); + + // encode dentries + bufferlist bl; + bl.append((char*)&version, sizeof(version)); + + for (CDir_map_t::iterator it = items.begin(); + it != items.end(); + it++) { + CDentry *dn = it->second; + + if (dn->is_null()) + continue; // skip negative entries + + // primary or remote? + if (dn->is_remote()) { + inodeno_t ino = dn->get_remote_ino(); + dout(14) << " pos " << bl.length() << " dn '" << it->first << "' remote ino " << ino << endl; + + // marker, name, ino + bl.append( "L", 1 ); // remote link + bl.append( it->first.c_str(), it->first.length() + 1); + bl.append((char*)&ino, sizeof(ino)); + } else { + // primary link + CInode *in = dn->get_inode(); + assert(in); + + dout(14) << " pos " << bl.length() << " dn '" << it->first << "' inode " << *in << endl; + + // marker, name, inode, [symlink string] + bl.append( "I", 1 ); // inode + bl.append( it->first.c_str(), it->first.length() + 1); + bl.append( (char*) &in->inode, sizeof(inode_t)); + + if (in->is_symlink()) { + // include symlink destination! + dout(18) << " inlcuding symlink ptr " << in->symlink << endl; + bl.append( (char*) in->symlink.c_str(), in->symlink.length() + 1); + } + + in->dirfragtree._encode(bl); + } } - // it's explicit for this whole dir - //dout(15) << "dir_auth explicit " << dir_auth << " at " << *this << endl; - return get_dir_auth(); + // write it. 
+ cache->mds->objecter->write( get_ondisk_object(), + 0, bl.length(), + bl, + NULL, new C_Dir_Committed(this, version) ); +} + + +/** + * _committed + * + * @param v version i just committed + */ +void CDir::_committed(version_t v) +{ + dout(10) << "_committed v " << v << " on " << *this << endl; + assert(is_auth()); + + // take note. + assert(v > committed_version); + assert(v <= committing_version); + committed_version = v; + + // _all_ commits done? + if (committing_version == committed_version) + state_clear(CDir::STATE_COMMITTING); + + // dir clean? + if (committed_version == version) + mark_clean(); + + // dentries clean? + for (CDir_map_t::iterator it = items.begin(); + it != items.end(); ) { + CDentry *dn = it->second; + it++; + + // dentry + if (committed_version >= dn->get_version()) { + if (dn->is_dirty()) { + dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << endl; + dn->mark_clean(); + } + } else { + dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << endl; + } + + // inode? + if (dn->is_primary()) { + CInode *in = dn->get_inode(); + assert(in); + assert(in->is_auth()); + + if (committed_version >= in->get_version()) { + if (in->is_dirty()) { + dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << endl; + in->mark_clean(); + } + } else { + dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << endl; + assert(in->is_dirty()); + } + } + } + + // finishers? + bool were_waiters = !waiting_for_commit.empty(); + + map >::iterator p = waiting_for_commit.begin(); + while (p != waiting_for_commit.end()) { + map >::iterator n = p; + n++; + if (p->first > committed_version) break; // haven't committed this far yet. + cache->mds->queue_waiters(p->second); + waiting_for_commit.erase(p); + p = n; + } + + // unpin if we kicked the last waiter. 
+ if (were_waiters && + waiting_for_commit.empty()) + auth_unpin(); +} + + + + + + +/******************************** + * AUTHORITY + */ + +/* + * if dir_auth.first == parent, auth is same as inode. + * unless .second != unknown, in which case that sticks. + */ +pair CDir::authority() +{ + if (is_subtree_root()) + return dir_auth; + else + return inode->authority(); +} + +/** is_subtree_root() + * true if this is an auth delegation point. + * that is, dir_auth != default (parent,unknown) + * + * some key observations: + * if i am auth: + * - any region bound will be an export, or frozen. + * + * note that this DOES heed dir_auth.pending + */ +bool CDir::is_subtree_root() +{ + if (dir_auth == CDIR_AUTH_DEFAULT) { + //dout(10) << "is_subtree_root false " << dir_auth << " != " << CDIR_AUTH_DEFAULT + //<< " on " << ino() << endl; + return false; + } else { + //dout(10) << "is_subtree_root true " << dir_auth << " != " << CDIR_AUTH_DEFAULT + //<< " on " << ino() << endl; + return true; + } } -void CDir::set_dir_auth(int d) + + + +/** set_dir_auth + * + * always list ourselves first. + * + * accept 'iamauth' param so that i can intelligently adjust freeze auth_pins + * even when the auth bit isn't correct. + * as when calling MDCache::import_subtree(...). + */ +void CDir::set_dir_auth(pair a, bool iamauth) { - dout(10) << "setting dir_auth=" << d << " from " << dir_auth << " on " << *this << endl; - dir_auth = d; + dout(10) << "setting dir_auth=" << a + << " from " << dir_auth + << " on " << *this << endl; + + bool was_subtree = is_subtree_root(); + bool was_ambiguous = dir_auth.second >= 0; + + // set it. + dir_auth = a; + + // new subtree root? + if (!was_subtree && is_subtree_root()) { + dout(10) << " new subtree root, adjusting auth_pins" << endl; + + // adjust nested auth pins + inode->adjust_nested_auth_pins(-get_cum_auth_pins()); + + // unpin parent of frozen dir/tree? 
+ if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) + inode->auth_unpin(); + } + if (was_subtree && !is_subtree_root()) { + dout(10) << " old subtree root, adjusting auth_pins" << endl; + + // adjust nested auth pins + inode->adjust_nested_auth_pins(get_cum_auth_pins()); + + // pin parent of frozen dir/tree? + if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) + inode->auth_pin(); + } + + // newly single auth? + if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) { + list ls; + take_waiting(WAIT_SINGLEAUTH, ls); + cache->mds->queue_waiters(ls); + } } /***************************************** - * AUTH PINS + * AUTH PINS and FREEZING + * + * the basic plan is that auth_pins only exist in auth regions, and they + * prevent a freeze (and subsequent auth change). + * + * however, we also need to prevent a parent from freezing if a child is frozen. + * for that reason, the parent inode of a frozen directory is auth_pinned. + * + * the oddity is when the frozen directory is a subtree root. if that's the case, + * the parent inode isn't frozen. which means that when subtree authority is adjusted + * at the bounds, inodes for any frozen bound directories need to get auth_pins at that + * time. + * */ -void CDir::auth_pin() { +void CDir::auth_pin() +{ if (auth_pins == 0) get(PIN_AUTHPIN); auth_pins++; dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - + + // nest pins? + if (is_subtree_root()) return; // no. + //assert(!is_import()); + inode->nested_auth_pins++; if (inode->parent) inode->parent->dir->adjust_nested_auth_pins( 1 ); } -void CDir::auth_unpin() { +void CDir::auth_unpin() +{ auth_pins--; if (auth_pins == 0) put(PIN_AUTHPIN); @@ -607,9 +1096,13 @@ void CDir::auth_unpin() { assert(auth_pins >= 0); // pending freeze? - if (auth_pins + nested_auth_pins == 0) + if (auth_pins + nested_auth_pins == 0) on_freezeable(); + // nest? + if (is_subtree_root()) return; // no. 
+ //assert(!is_import()); + inode->nested_auth_pins--; if (inode->parent) inode->parent->dir->adjust_nested_auth_pins( -1 ); @@ -619,25 +1112,23 @@ void CDir::adjust_nested_auth_pins(int inc) { CDir *dir = this; - while (1) { - // dir - dir->nested_auth_pins += inc; - - dout(10) << "adjust_nested_auth_pins on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl; - assert(dir->nested_auth_pins >= 0); - - // pending freeze? - if (dir->auth_pins + dir->nested_auth_pins == 0) - dir->on_freezeable(); - - // it's inode - dir->inode->nested_auth_pins += inc; - - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - break; - } + // dir + dir->nested_auth_pins += inc; + + dout(10) << "adjust_nested_auth_pins " << inc << " on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl; + assert(dir->nested_auth_pins >= 0); + + // pending freeze? + if (is_freezeable()) + dir->on_freezeable(); + // on freezeable_dir too? FIXME + + // adjust my inode? + if (dir->is_subtree_root()) + return; // no, stop. + + // yes. + dir->inode->adjust_nested_auth_pins(inc); } @@ -650,12 +1141,15 @@ void CDir::on_freezeable() { // check for anything pending freezeable + /* NOTE: this will be called on deeper dirs first, walking up toward + the root, meaning that deeper freeze attempts will succeed first. + */ /* NOTE: the first of these will likely freeze the dir, and unmark FREEZING. additional ones will re-flag FREEZING. this isn't particularly graceful, and might cause problems if the first one needs to know about other waiters.... FIXME? 
*/ - finish_waiting(CDIR_WAIT_FREEZEABLE); + finish_waiting(WAIT_FREEZEABLE); } // FREEZE TREE @@ -680,20 +1174,13 @@ void CDir::freeze_tree(Context *c) if (is_freezeable()) { dout(10) << "freeze_tree " << *this << endl; - - state_set(CDIR_STATE_FROZENTREE); - inode->auth_pin(); // auth_pin for duration of freeze - - // easy, we're frozen - c->finish(0); - delete c; - + _freeze_tree(c); } else { - state_set(CDIR_STATE_FREEZINGTREE); + state_set(STATE_FREEZINGTREE); dout(10) << "freeze_tree + wait " << *this << endl; // need to wait for auth pins to expire - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); + add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); } } @@ -703,16 +1190,29 @@ void CDir::freeze_tree_finish(Context *c) if (!is_freezeable()) { // wait again! dout(10) << "freeze_tree_finish still waiting " << *this << endl; - state_set(CDIR_STATE_FREEZINGTREE); - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); + state_set(STATE_FREEZINGTREE); + add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); return; } dout(10) << "freeze_tree_finish " << *this << endl; - state_set(CDIR_STATE_FROZENTREE); - state_clear(CDIR_STATE_FREEZINGTREE); // actually, this may get set again by next context? + _freeze_tree(c); +} + +void CDir::_freeze_tree(Context *c) +{ + dout(10) << "_freeze_tree " << *this << endl; + + // there shouldn't be any conflicting auth_pins. + assert(is_freezeable_dir()); + + // twiddle state + state_clear(STATE_FREEZINGTREE); // actually, this may get set again by next context? + state_set(STATE_FROZENTREE); - inode->auth_pin(); // auth_pin for duration of freeze + // auth_pin inode for duration of freeze, if we are not a subtree root. 
+ if (is_auth() && !is_subtree_root()) + inode->auth_pin(); // continue to frozen land if (c) { @@ -725,22 +1225,23 @@ void CDir::unfreeze_tree() { dout(10) << "unfreeze_tree " << *this << endl; - if (state_test(CDIR_STATE_FROZENTREE)) { + if (state_test(STATE_FROZENTREE)) { // frozen. unfreeze. - state_clear(CDIR_STATE_FROZENTREE); + state_clear(STATE_FROZENTREE); // unpin (may => FREEZEABLE) FIXME: is this order good? - inode->auth_unpin(); + if (is_auth() && !is_subtree_root()) + inode->auth_unpin(); // waiters? - finish_waiting(CDIR_WAIT_UNFREEZE); + finish_waiting(WAIT_UNFREEZE); } else { // freezing. stop it. - assert(state_test(CDIR_STATE_FREEZINGTREE)); - state_clear(CDIR_STATE_FREEZINGTREE); + assert(state_test(STATE_FREEZINGTREE)); + state_clear(STATE_FREEZINGTREE); // cancel freeze waiters - finish_waiting(CDIR_WAIT_FREEZEABLE, -1); + finish_waiting(WAIT_FREEZEABLE, -1); } } @@ -749,8 +1250,7 @@ bool CDir::is_freezing_tree() CDir *dir = this; while (1) { if (dir->is_freezing_tree_root()) return true; - if (dir->is_import()) return false; - if (dir->is_hashed()) return false; + if (dir->is_subtree_root()) return false; if (dir->inode->parent) dir = dir->inode->parent->dir; else @@ -763,9 +1263,7 @@ bool CDir::is_frozen_tree() CDir *dir = this; while (1) { if (dir->is_frozen_tree_root()) return true; - if (dir->is_import()) return false; - if (dir->is_hashed()) return false; - if (dir->is_frozen_tree_leaf()) return false; + if (dir->is_subtree_root()) return false; if (dir->inode->parent) dir = dir->inode->parent->dir; else @@ -773,6 +1271,20 @@ bool CDir::is_frozen_tree() } } +CDir *CDir::get_frozen_tree_root() +{ + assert(is_frozen()); + CDir *dir = this; + while (1) { + if (dir->is_frozen_tree_root()) + return dir; + if (dir->inode->parent) + dir = dir->inode->parent->dir; + else + assert(0); + } +} + // FREEZE DIR @@ -797,57 +1309,56 @@ void CDir::freeze_dir(Context *c) if (is_freezeable_dir()) { dout(10) << "freeze_dir " << *this << endl; - - 
state_set(CDIR_STATE_FROZENDIR); - inode->auth_pin(); // auth_pin for duration of freeze - - // easy, we're frozen - c->finish(0); - delete c; - + _freeze_dir(c); } else { - state_set(CDIR_STATE_FREEZINGDIR); + state_set(STATE_FREEZINGDIR); dout(10) << "freeze_dir + wait " << *this << endl; // need to wait for auth pins to expire - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); + add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); } } -void CDir::freeze_dir_finish(Context *c) -{ - // freezeable now? - if (!is_freezeable_dir()) { - // wait again! - dout(10) << "freeze_dir_finish still waiting " << *this << endl; - state_set(CDIR_STATE_FREEZINGDIR); - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); - return; - } +void CDir::_freeze_dir(Context *c) +{ + dout(10) << "_freeze_dir " << *this << endl; - dout(10) << "freeze_dir_finish " << *this << endl; - state_set(CDIR_STATE_FROZENDIR); - state_clear(CDIR_STATE_FREEZINGDIR); // actually, this may get set again by next context? - - inode->auth_pin(); // auth_pin for duration of freeze + state_set(STATE_FROZENDIR); + + if (is_auth() && !is_subtree_root()) + inode->auth_pin(); // auth_pin for duration of freeze - // continue to frozen land if (c) { c->finish(0); delete c; } } +void CDir::freeze_dir_finish(Context *c) +{ + // freezeable now? + if (is_freezeable_dir()) { + // freeze now + _freeze_dir(c); + } else { + // wait again! + dout(10) << "freeze_dir_finish still waiting " << *this << endl; + state_set(STATE_FREEZINGDIR); + add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); + } +} + void CDir::unfreeze_dir() { dout(10) << "unfreeze_dir " << *this << endl; - state_clear(CDIR_STATE_FROZENDIR); + state_clear(STATE_FROZENDIR); // unpin (may => FREEZEABLE) FIXME: is this order good? - inode->auth_unpin(); + if (is_auth() && !is_subtree_root()) + inode->auth_unpin(); // waiters? 
- finish_waiting(CDIR_WAIT_UNFREEZE); + finish_waiting(WAIT_UNFREEZE); } @@ -857,34 +1368,3 @@ void CDir::unfreeze_dir() - -// ----------------------------------------------------------------- -// debug shite - - -void CDir::dump(int depth) { - string ind(depth, '\t'); - - dout(10) << "dump:" << ind << *this << endl; - - map::iterator iter = items.begin(); - while (iter != items.end()) { - CDentry* d = iter->second; - if (d->inode) { - char isdir = ' '; - if (d->inode->dir != NULL) isdir = '/'; - dout(10) << "dump: " << ind << *d << " = " << *d->inode << endl; - d->inode->dump(depth+1); - } else { - dout(10) << "dump: " << ind << *d << " = [null]" << endl; - } - iter++; - } - - if (!(state_test(CDIR_STATE_COMPLETE))) - dout(10) << ind << "..." << endl; - if (state_test(CDIR_STATE_DIRTY)) - dout(10) << ind << "[dirty]" << endl; - -} - diff --git a/trunk/ceph/mds/CDir.h b/trunk/ceph/mds/CDir.h index 6283bef7c0aff..7d538d2e8a5be 100644 --- a/trunk/ceph/mds/CDir.h +++ b/trunk/ceph/mds/CDir.h @@ -41,125 +41,9 @@ class CDentry; class MDCache; class MDCluster; class Context; +class CDirDiscover; -// directory authority types -// >= 0 is the auth mds -#define CDIR_AUTH_PARENT -1 // default -#define CDIR_AUTH_UNKNOWN -2 - - -#define CDIR_NONCE_EXPORT 1 - - -// state bits -#define CDIR_STATE_AUTH (1<<0) // auth for this dir (hashing doesn't count) -#define CDIR_STATE_PROXY (1<<1) // proxy auth - -#define CDIR_STATE_COMPLETE (1<<2) // the complete contents are in cache -#define CDIR_STATE_DIRTY (1<<3) // has been modified since last commit - -#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports) -#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing -#define CDIR_STATE_FROZENTREELEAF (1<<6) // outer bound of frozen region (on import) -#define CDIR_STATE_FROZENDIR (1<<7) -#define CDIR_STATE_FREEZINGDIR (1<<8) - -#define CDIR_STATE_COMMITTING (1<<9) // mid-commit -#define CDIR_STATE_FETCHING (1<<10) // currenting fetching - -#define 
CDIR_STATE_DELETED (1<<11) - -#define CDIR_STATE_IMPORT (1<<12) // flag set if this is an import. -#define CDIR_STATE_EXPORT (1<<13) -#define CDIR_STATE_IMPORTINGEXPORT (1<<14) - -#define CDIR_STATE_HASHED (1<<15) // if hashed -#define CDIR_STATE_HASHING (1<<16) -#define CDIR_STATE_UNHASHING (1<<17) - - - - - -// these state bits are preserved by an import/export -// ...except if the directory is hashed, in which case none of them are! -#define CDIR_MASK_STATE_EXPORTED (CDIR_STATE_COMPLETE\ - |CDIR_STATE_DIRTY) -#define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\ - |CDIR_STATE_EXPORT\ - |CDIR_STATE_IMPORTINGEXPORT\ - |CDIR_STATE_FROZENTREE\ - |CDIR_STATE_PROXY) - -#define CDIR_MASK_STATE_EXPORT_KEPT (CDIR_STATE_HASHED\ - |CDIR_STATE_FROZENTREE\ - |CDIR_STATE_FROZENDIR\ - |CDIR_STATE_EXPORT\ - |CDIR_STATE_PROXY) - -// common states -#define CDIR_STATE_CLEAN 0 -#define CDIR_STATE_INITIAL 0 - -// directory replication -#define CDIR_REP_ALL 1 -#define CDIR_REP_NONE 0 -#define CDIR_REP_LIST 2 - - - - - - -// wait reasons -#define CDIR_WAIT_DENTRY 1 // wait for item to be in cache - // waiters: path_traverse - // trigger: handle_discover, fetch_dir_2 -#define CDIR_WAIT_COMPLETE 2 // wait for complete dir contents - // waiters: fetch_dir, commit_dir - // trigger: fetch_dir_2 -#define CDIR_WAIT_FREEZEABLE 4 // hard_pins removed - // waiters: freeze, freeze_finish - // trigger: auth_unpin, adjust_nested_auth_pins -#define CDIR_WAIT_UNFREEZE 8 // unfreeze - // waiters: path_traverse, handle_discover, handle_inode_update, - // export_dir_frozen (mdcache) - // handle_client_readdir (mds) - // trigger: unfreeze -#define CDIR_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE - // waiters: commit_dir (mdstore) - // trigger: (see CDIR_WAIT_UNFREEZE) -#define CDIR_WAIT_COMMITTED 32 // did commit (who uses this?**) - // waiters: commit_dir (if already committing) - // trigger: commit_dir_2 -#define CDIR_WAIT_IMPORTED 64 // import finish - // waiters: import_dir_block - // triggers: 
handle_export_dir_finish - -#define CDIR_WAIT_EXPORTWARNING 8192 // on bystander. - // watiers: handle_export_dir_notify - // triggers: handle_export_dir_warning -#define CDIR_WAIT_EXPORTPREPACK 16384 - // waiter export_dir - // trigger handel_export_dir_prep_ack - -#define CDIR_WAIT_HASHED (1<<17) // hash finish -#define CDIR_WAIT_THISHASHEDREADDIR (1<<18) // current readdir lock -#define CDIR_WAIT_NEXTHASHEDREADDIR (1<<19) // after current readdir lock finishes - -#define CDIR_WAIT_DNREAD (1<<20) -#define CDIR_WAIT_DNLOCK (1<<21) -#define CDIR_WAIT_DNUNPINNED (1<<22) -#define CDIR_WAIT_DNPINNABLE (CDIR_WAIT_DNREAD|CDIR_WAIT_DNUNPINNED) - -#define CDIR_WAIT_DNREQXLOCK (1<<23) - -#define CDIR_WAIT_ANY (0xffffffff) - -#define CDIR_WAIT_ATFREEZEROOT (CDIR_WAIT_AUTHPINNABLE|\ - CDIR_WAIT_UNFREEZE) // hmm, same same - ostream& operator<<(ostream& out, class CDir& dir); @@ -174,88 +58,127 @@ typedef map CDir_map_t; class CDir : public MDSCacheObject { public: // -- pins -- - static const int PIN_CHILD = 0; - static const int PIN_OPENED = 1; // open by another node - static const int PIN_WAITER = 2; // waiter(s) - static const int PIN_IMPORT = 3; + static const int PIN_DNWAITER = 1; + static const int PIN_CHILD = 2; static const int PIN_EXPORT = 4; - //static const int PIN_FREEZE = 5; - static const int PIN_FREEZELEAF = 6; - static const int PIN_PROXY = 7; // auth just changed. 
static const int PIN_AUTHPIN = 8; static const int PIN_IMPORTING = 9; - static const int PIN_IMPORTINGEXPORT = 10; - static const int PIN_HASHED = 11; - static const int PIN_HASHING = 12; - static const int PIN_DIRTY = 13; - static const int PIN_REQUEST = 14; - static const char *pin_name(int p) { + static const int PIN_EXPORTING = 10; + static const int PIN_IMPORTBOUND = 11; + static const int PIN_EXPORTBOUND = 12; + static const int PIN_LOGGINGEXPORTFINISH = 17; + const char *pin_name(int p) { switch (p) { + case PIN_DNWAITER: return "dnwaiter"; case PIN_CHILD: return "child"; - case PIN_OPENED: return "opened"; - case PIN_WAITER: return "waiter"; - case PIN_IMPORT: return "import"; case PIN_EXPORT: return "export"; - //case PIN_FREEZE: return "freeze"; - case PIN_FREEZELEAF: return "freezeleaf"; - case PIN_PROXY: return "proxy"; - case PIN_AUTHPIN: return "authpin"; + case PIN_EXPORTING: return "exporting"; case PIN_IMPORTING: return "importing"; - case PIN_IMPORTINGEXPORT: return "importingexport"; - case PIN_HASHED: return "hashed"; - case PIN_HASHING: return "hashing"; - case PIN_DIRTY: return "dirty"; - case PIN_REQUEST: return "request"; - default: assert(0); + case PIN_IMPORTBOUND: return "importbound"; + case PIN_EXPORTBOUND: return "exportbound"; + case PIN_AUTHPIN: return "authpin"; + case PIN_LOGGINGEXPORTFINISH: return "loggingexportfinish"; + default: return generic_pin_name(p); } } + // -- state -- + static const unsigned STATE_COMPLETE = (1<< 2); // the complete contents are in cache + static const unsigned STATE_FROZENTREE = (1<< 4); // root of tree (bounded by exports) + static const unsigned STATE_FREEZINGTREE = (1<< 5); // in process of freezing + static const unsigned STATE_FROZENDIR = (1<< 6); + static const unsigned STATE_FREEZINGDIR = (1<< 7); + static const unsigned STATE_COMMITTING = (1<< 8); // mid-commit + static const unsigned STATE_FETCHING = (1<< 9); // currenting fetching + static const unsigned STATE_DELETED = (1<<10); + //static 
const unsigned STATE_IMPORT = (1<<11); // flag set if this is an import. + static const unsigned STATE_EXPORT = (1<<12); + static const unsigned STATE_IMPORTBOUND = (1<<13); + static const unsigned STATE_EXPORTBOUND = (1<<14); + static const unsigned STATE_EXPORTING = (1<<15); + static const unsigned STATE_IMPORTING = (1<<16); + + // common states + static const unsigned STATE_CLEAN = 0; + static const unsigned STATE_INITIAL = 0; + + // these state bits are preserved by an import/export + // ...except if the directory is hashed, in which case none of them are! + static const unsigned MASK_STATE_EXPORTED = + STATE_COMPLETE|STATE_DIRTY; + static const unsigned MASK_STATE_IMPORT_KEPT = + //STATE_IMPORT| + STATE_EXPORT + |STATE_IMPORTING + |STATE_IMPORTBOUND|STATE_EXPORTBOUND + |STATE_FROZENTREE; + static const unsigned MASK_STATE_EXPORT_KEPT = + STATE_EXPORTING + |STATE_IMPORTBOUND|STATE_EXPORTBOUND + |STATE_FROZENTREE + |STATE_FROZENDIR + |STATE_EXPORT; + + + // -- rep spec -- + static const int REP_NONE = 0; + static const int REP_ALL = 1; + static const int REP_LIST = 2; + + + static const int NONCE_EXPORT = 1; + + + // -- wait masks -- + static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache + static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents + static const int WAIT_FREEZEABLE = (1<<2); // hard_pins removed + static const int WAIT_UNFREEZE = (1<<3); // unfreeze + static const int WAIT_AUTHPINNABLE = WAIT_UNFREEZE; + static const int WAIT_IMPORTED = (1<<4); // import finish + //static const int WAIT_SINGLEAUTH = (1<<5); + + static const int WAIT_DNLOCK_OFFSET = 6; + + static const int WAIT_ANY = (0xffffffff); + static const int WAIT_ATFREEZEROOT = (WAIT_AUTHPINNABLE|WAIT_UNFREEZE); + static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); + + + public: // context MDCache *cache; - // my inode - CInode *inode; + CInode *inode; // my inode + frag_t frag; // my frag + + bool is_lt(const MDSCacheObject *r) const { + return 
dirfrag() < ((const CDir*)r)->dirfrag(); + } protected: // contents CDir_map_t items; // non-null AND null - CDir_map_t null_items; // null and foreign - size_t nitems; // non-null - size_t nnull; // null + size_t nitems; // # non-null + size_t nnull; // # null // state version_t version; version_t committing_version; - version_t last_committed_version; // slight lie; we bump this on import. + version_t committed_version; version_t projected_version; - // authority, replicas - int dir_auth; - // lock nesting, freeze int auth_pins; int nested_auth_pins; int request_pins; - // hashed dirs - set hashed_subset; // HASHING: subset of mds's that are hashed - public: - // for class MDS - map, list > > hashed_readdir; - protected: - - - - // waiters - multimap waiting; // tag -> context - hash_map< string, multimap > - waiting_on_dentry; // cache control (defined for authority; hints for replicas) int dir_rep; - set dir_rep_by; // if dir_rep == CDIR_REP_LIST + set dir_rep_by; // if dir_rep == REP_LIST // popularity meta_load_t popularity[MDS_NPOP]; @@ -271,12 +194,14 @@ class CDir : public MDSCacheObject { friend class CDirExport; public: - CDir(CInode *in, MDCache *mdcache, bool auth); + CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); // -- accessors -- - inodeno_t ino() { return inode->ino(); } + inodeno_t ino() const { return inode->ino(); } // deprecate me? 
+ dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); } + CInode *get_inode() { return inode; } CDir *get_parent_dir() { return inode->get_parent_dir(); } @@ -315,20 +240,42 @@ class CDir : public MDSCacheObject { void link_inode_work( CDentry *dn, CInode *in ); void unlink_inode_work( CDentry *dn ); - void remove_null_dentries(); // on empty, clean dir + void remove_null_dentries(); // -- authority -- + /* + * normal: !subtree_root + * delegation: subtree_root + * ambiguous: subtree_root + * subtree_root + */ + pair dir_auth; + public: - int authority(); - int dentry_authority(const string& d); - int get_dir_auth() { return dir_auth; } - void set_dir_auth(int d); + pair authority(); + pair get_dir_auth() { return dir_auth; } + void set_dir_auth(pair a, bool iamauth=false); + void set_dir_auth(int a) { + set_dir_auth(pair(a, CDIR_AUTH_UNKNOWN), false); + } + bool is_ambiguous_dir_auth() { + return dir_auth.second != CDIR_AUTH_UNKNOWN; + } + bool is_full_dir_auth() { + return is_auth() && !is_ambiguous_dir_auth(); + } + bool is_full_dir_nonauth() { + return !is_auth() && !is_ambiguous_dir_auth(); + } + + bool is_subtree_root(); // for giving to clients void get_dist_spec(set& ls, int auth) { - if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > g_conf.mds_bal_replicate_threshold)) { + if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > + g_conf.mds_bal_replicate_threshold)) { //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; for (map::iterator p = replicas_begin(); p != replicas_end(); @@ -339,46 +286,46 @@ class CDir : public MDSCacheObject { } } + CDirDiscover *replicate_to(int mds); - // -- state -- - bool is_complete() { return state & CDIR_STATE_COMPLETE; } - bool is_dirty() { return state_test(CDIR_STATE_DIRTY); } - - bool is_auth() { return state & CDIR_STATE_AUTH; } - bool is_proxy() { return state & CDIR_STATE_PROXY; } - bool is_import() { return state & CDIR_STATE_IMPORT; } - bool 
is_export() { return state & CDIR_STATE_EXPORT; } - bool is_hashed() { return state & CDIR_STATE_HASHED; } - bool is_hashing() { return state & CDIR_STATE_HASHING; } - bool is_unhashing() { return state & CDIR_STATE_UNHASHING; } + // -- state -- + bool is_complete() { return state & STATE_COMPLETE; } + bool is_exporting() { return state & STATE_EXPORTING; } + bool is_importing() { return state & STATE_IMPORTING; } bool is_rep() { - if (dir_rep == CDIR_REP_NONE) return false; + if (dir_rep == REP_NONE) return false; return true; } + // -- fetch -- + object_t get_ondisk_object() { return object_t(ino(), frag); } + void fetch(Context *c); + void _fetched(bufferlist &bl); + + // -- commit -- + map > waiting_for_commit; + void commit_to(version_t want); + void commit(version_t want, Context *c); + void _commit(version_t want); + void _committed(version_t v); + void wait_for_commit(Context *c, version_t v=0); // -- dirtyness -- version_t get_version() { return version; } void set_version(version_t v) { projected_version = version = v; } version_t get_projected_version() { return projected_version; } - version_t get_committing_version() { return committing_version; } - version_t get_last_committed_version() { return last_committed_version; } - // as in, we're committing the current version. 
- void set_committing_version() { committing_version = version; } - void set_last_committed_version(version_t v) { last_committed_version = v; } + version_t get_committed_version() { return committed_version; } + void set_committed_version(version_t v) { committed_version = v; } - version_t pre_dirty(); + version_t pre_dirty(version_t min=0); void _mark_dirty(); void mark_dirty(version_t pv); void mark_clean(); - void mark_complete() { state_set(CDIR_STATE_COMPLETE); } - bool is_clean() { return !state_test(CDIR_STATE_DIRTY); } - - + void mark_complete() { state_set(STATE_COMPLETE); } // -- reference counting -- @@ -396,24 +343,27 @@ class CDir : public MDSCacheObject { // -- waiters -- - bool waiting_for(int tag); - bool waiting_for(int tag, const string& dn); - void add_waiter(int tag, Context *c); - void add_waiter(int tag, - const string& dentry, - Context *c); - void take_waiting(int mask, list& ls); // includes dentry waiters - void take_waiting(int mask, - const string& dentry, - list& ls, - int num=0); - void finish_waiting(int mask, int result = 0); // ditto - void finish_waiting(int mask, const string& dn, int result = 0); // ditto +protected: + hash_map< string, list > waiting_on_dentry; +public: + bool is_waiting_for_dentry(const string& dn) { + return waiting_on_dentry.count(dn); + } + void add_dentry_waiter(const string& dentry, Context *c); + void take_dentry_waiting(const string& dentry, list& ls); + + void add_waiter(int mask, Context *c); + void take_waiting(int mask, list& ls); // may include dentry waiters + void finish_waiting(int mask, int result = 0); // ditto + // -- auth pins -- - bool can_auth_pin() { return !(is_frozen() || is_freezing()); } + bool can_auth_pin() { return is_auth() && !(is_frozen() || is_freezing()); } int is_auth_pinned() { return auth_pins; } + int get_cum_auth_pins() { return auth_pins + nested_auth_pins; } + int get_auth_pins() { return auth_pins; } + int get_nested_auth_pins() { return nested_auth_pins; } void 
auth_pin(); void auth_unpin(); void adjust_nested_auth_pins(int inc); @@ -423,35 +373,50 @@ class CDir : public MDSCacheObject { void freeze_tree(Context *c); void freeze_tree_finish(Context *c); void unfreeze_tree(); + void _freeze_tree(Context *c=0); void freeze_dir(Context *c); void freeze_dir_finish(Context *c); + void _freeze_dir(Context *c=0); void unfreeze_dir(); bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } bool is_freezing_tree(); - bool is_freezing_tree_root() { return state & CDIR_STATE_FREEZINGTREE; } - bool is_freezing_dir() { return state & CDIR_STATE_FREEZINGDIR; } + bool is_freezing_tree_root() { return state & STATE_FREEZINGTREE; } + bool is_freezing_dir() { return state & STATE_FREEZINGDIR; } bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } bool is_frozen_tree(); - bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; } - bool is_frozen_tree_leaf() { return state & CDIR_STATE_FROZENTREELEAF; } - bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; } + bool is_frozen_tree_root() { return state & STATE_FROZENTREE; } + bool is_frozen_dir() { return state & STATE_FROZENDIR; } bool is_freezeable() { - if (auth_pins == 0 && nested_auth_pins == 0) return true; - return false; + // no nested auth pins. + if (auth_pins > 0 || nested_auth_pins > 0) + return false; + + // inode must not be frozen. + if (!is_subtree_root() && inode->is_frozen()) + return false; + + return true; } bool is_freezeable_dir() { - if (auth_pins == 0) return true; - return false; + if (auth_pins > 0) + return false; + + // if not subtree root, inode must not be frozen. 
+ if (!is_subtree_root() && inode->is_frozen()) + return false; + + return true; } + CDir *get_frozen_tree_root(); - // debuggin bs - void dump(int d = 0); + + void print(ostream& out); }; @@ -461,50 +426,44 @@ class CDir : public MDSCacheObject { // discover class CDirDiscover { - inodeno_t ino; + dirfrag_t dirfrag; int nonce; - int dir_auth; int dir_rep; set rep_by; public: CDirDiscover() {} CDirDiscover(CDir *dir, int nonce) { - ino = dir->ino(); + dirfrag = dir->dirfrag(); this->nonce = nonce; - dir_auth = dir->dir_auth; dir_rep = dir->dir_rep; rep_by = dir->dir_rep_by; } void update_dir(CDir *dir) { - assert(dir->ino() == ino); + assert(dir->dirfrag() == dirfrag); assert(!dir->is_auth()); dir->replica_nonce = nonce; - dir->dir_auth = dir_auth; dir->dir_rep = dir_rep; dir->dir_rep_by = rep_by; } - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } void _encode(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); + bl.append((char*)&dirfrag, sizeof(dirfrag)); bl.append((char*)&nonce, sizeof(nonce)); - bl.append((char*)&dir_auth, sizeof(dir_auth)); bl.append((char*)&dir_rep, sizeof(dir_rep)); ::_encode(rep_by, bl); } void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); bl.copy(off, sizeof(nonce), (char*)&nonce); off += sizeof(nonce); - bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); - off += sizeof(dir_auth); bl.copy(off, sizeof(dir_rep), (char*)&dir_rep); off += sizeof(dir_rep); ::_decode(rep_by, bl, off); @@ -517,10 +476,10 @@ class CDirDiscover { class CDirExport { struct { - inodeno_t ino; - long nitems; // actual real entries + dirfrag_t dirfrag; long nden; // num dentries (including null ones) version_t version; + version_t committed_version; unsigned state; meta_load_t popularity_justme; meta_load_t popularity_curdom; @@ -536,10 +495,10 @@ class CDirExport { assert(dir->get_version() == 
dir->get_projected_version()); - st.ino = dir->ino(); - st.nitems = dir->nitems; + st.dirfrag = dir->dirfrag(); st.nden = dir->items.size(); st.version = dir->version; + st.committed_version = dir->committed_version; st.state = dir->state; st.dir_rep = dir->dir_rep; @@ -552,24 +511,19 @@ class CDirExport { replicas = dir->replicas; } - inodeno_t get_ino() { return st.ino; } + dirfrag_t get_dirfrag() { return st.dirfrag; } __uint64_t get_nden() { return st.nden; } void update_dir(CDir *dir) { - assert(dir->ino() == st.ino); - - //dir->nitems = st.nitems; + assert(dir->dirfrag() == st.dirfrag); - // set last_committed_version at old version - dir->committing_version = dir->last_committed_version = st.version; - dir->projected_version = dir->version = st.version; // this is bumped, below, if dirty + // set committed_version at old version + dir->committing_version = dir->committed_version = st.committed_version; + dir->projected_version = dir->version = st.version; // twiddle state - if (dir->state & CDIR_STATE_HASHED) - dir->state_set( CDIR_STATE_AUTH ); // just inherit auth flag when hashed - else - dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. - (st.state & CDIR_MASK_STATE_EXPORTED); + dir->state = (dir->state & CDir::MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. 
+ (st.state & CDir::MASK_STATE_EXPORTED); dir->dir_rep = st.dir_rep; dir->popularity[MDS_POP_JUSTME] += st.popularity_justme; @@ -586,12 +540,9 @@ class CDirExport { dir->replicas = replicas; dout(12) << "replicas in export is " << replicas << ", dir now " << dir->replicas << endl; if (!replicas.empty()) - dir->get(CDir::PIN_OPENED); + dir->get(CDir::PIN_REPLICATED); if (dir->is_dirty()) { dir->get(CDir::PIN_DIRTY); - - // bump dir version + 1 if dirty - dir->projected_version = dir->version = st.version + 1; } } diff --git a/trunk/ceph/mds/CInode.cc b/trunk/ceph/mds/CInode.cc index f431184fb199b..79555575482c9 100644 --- a/trunk/ceph/mds/CInode.cc +++ b/trunk/ceph/mds/CInode.cc @@ -23,7 +23,10 @@ #include "common/Clock.h" +#include "messages/MLock.h" + #include +#include #include "config.h" #undef dout @@ -31,6 +34,11 @@ //int cinode_pins[CINODE_NUM_PINS]; // counts +ostream& CInode::print_db_line_prefix(ostream& out) +{ + return out << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "; +} + ostream& operator<<(ostream& out, CInode& in) @@ -52,15 +60,15 @@ ostream& operator<<(ostream& out, CInode& in) out << " v" << in.get_version(); - out << " hard=" << in.hardlock; + out << " auth=" << in.authlock; + out << " link=" << in.linklock; + out << " dft=" << in.dirfragtreelock; out << " file=" << in.filelock; - + out << " dir=" << in.dirlock; + if (in.get_num_ref()) { out << " |"; - for(set::iterator it = in.get_ref_set().begin(); - it != in.get_ref_set().end(); - it++) - out << " " << CInode::pin_name(*it); + in.print_pin_set(out); } // hack: spit out crap on which clients have caps @@ -80,31 +88,52 @@ ostream& operator<<(ostream& out, CInode& in) } -// ====== CInode ======= -CInode::CInode(MDCache *c, bool auth) { - mdcache = c; +void CInode::print(ostream& out) +{ + out << *this; +} - ref = 0; - - num_parents = 0; - parent = NULL; - - dir = NULL; // CDir opened separately - auth_pins = 0; - nested_auth_pins = 0; - 
num_request_pins = 0; +// ====== CInode ======= + +// dirfrags - state = 0; +frag_t CInode::pick_dirfrag(const string& dn) +{ + if (dirfragtree.empty()) + return frag_t(); // avoid the string hash if we can. - if (auth) state_set(STATE_AUTH); + static hash H; + return dirfragtree[H(dn)]; } -CInode::~CInode() { - if (dir) { delete dir; dir = 0; } +void CInode::get_dirfrags(list& ls) +{ + // all dirfrags + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + ++p) + ls.push_back(p->second); +} +void CInode::get_nested_dirfrags(list& ls) +{ + // dirfrags in same subtree + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + ++p) + if (!p->second->is_subtree_root()) + ls.push_back(p->second); +} +void CInode::get_subtree_dirfrags(list& ls) +{ + // dirfrags that are roots of new subtrees + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + ++p) + if (p->second->is_subtree_root()) + ls.push_back(p->second); } - // pins void CInode::first_get() @@ -120,10 +149,11 @@ void CInode::last_put() if (parent) { parent->put(CDentry::PIN_INODEPIN); } - if (num_parents == 0 && get_num_ref() == 0) - mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection + //if (num_parents == 0 && get_num_ref() == 0) + //mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection } +/* void CInode::get_parent() { num_parents++; @@ -134,6 +164,21 @@ void CInode::put_parent() if (num_parents == 0 && get_num_ref() == 0) mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection } +*/ + +void CInode::add_remote_parent(CDentry *p) +{ + if (remote_parents.empty()) + get(PIN_REMOTEPARENT); + remote_parents.insert(p); +} +void CInode::remove_remote_parent(CDentry *p) +{ + remote_parents.erase(p); + if (remote_parents.empty()) + put(PIN_REMOTEPARENT); +} + @@ -150,53 +195,42 @@ CInode *CInode::get_parent_inode() return NULL; } -bool CInode::dir_is_auth() { - if (dir) - return dir->is_auth(); - 
else - return is_auth(); -} - -CDir *CInode::get_or_open_dir(MDCache *mdcache) +CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) { assert(is_dir()); + // have it? + CDir *dir = get_dirfrag(fg); if (dir) return dir; - - // can't open a dir if we're frozen_dir, bc of hashing stuff. - assert(!is_frozen_dir()); - - // only auth can open dir alone. + + // create it. assert(is_auth()); - set_dir( new CDir(this, mdcache, true) ); - dir->dir_auth = -1; + dir = dirfrags[fg] = new CDir(this, fg, mdcache, true); return dir; } -CDir *CInode::set_dir(CDir *newdir) +CDir *CInode::add_dirfrag(CDir *dir) { - assert(dir == 0); - dir = newdir; + assert(dirfrags.count(dir->dirfrag().frag) == 0); + dirfrags[dir->dirfrag().frag] = dir; return dir; } -void CInode::close_dir() +void CInode::close_dirfrag(frag_t fg) { - assert(dir); - assert(dir->get_num_ref() == 0); - delete dir; - dir = 0; + assert(dirfrags.count(fg)); + + dirfrags[fg]->remove_null_dentries(); + + assert(dirfrags[fg]->get_num_ref() == 0); + delete dirfrags[fg]; + dirfrags.erase(fg); } - -void CInode::set_auth(bool a) +void CInode::close_dirfrags() { - if (!is_dangling() && !is_root() && - is_auth() != a) { - } - - if (a) state_set(STATE_AUTH); - else state_clear(STATE_AUTH); + while (!dirfrags.empty()) + close_dirfrag(dirfrags.begin()->first); } @@ -209,33 +243,31 @@ void CInode::make_path(string& s) else if (is_root()) { s = ""; // root } + else if (is_stray()) { + s = "~"; + } else { s = "(dangling)"; // dangling } } -void CInode::make_anchor_trace(vector& trace) +void CInode::make_anchor_trace(vector& trace) { if (parent) { parent->dir->inode->make_anchor_trace(trace); - - dout(7) << "make_anchor_trace adding " << ino() << " dirino " << parent->dir->inode->ino() << " dn " << parent->name << endl; - trace.push_back( new Anchor(ino(), - parent->dir->inode->ino(), - parent->name) ); - } - else if (state_test(STATE_DANGLING)) { - dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << 
dangling_auth << endl; - string ref_dn; - trace.push_back( new Anchor(ino(), - MDS_INO_INODEFILE_OFFSET+dangling_auth, - ref_dn) ); + trace.push_back(Anchor(ino(), parent->dir->dirfrag())); + dout(10) << "make_anchor_trace added " << trace.back() << endl; } else assert(is_root()); } - +void CInode::name_stray_dentry(string& dname) +{ + stringstream ss; + ss << inode.ino; + ss >> dname; +} version_t CInode::pre_dirty() @@ -277,6 +309,7 @@ void CInode::mark_dirty(version_t pv) { parent->mark_dirty(pv); } + void CInode::mark_clean() { dout(10) << " mark_clean " << *this << endl; @@ -286,69 +319,107 @@ void CInode::mark_clean() } } -// state +// ------------------ +// locking - - -// new state encoders - -void CInode::encode_file_state(bufferlist& bl) +void CInode::set_mlock_info(MLock *m) { - bl.append((char*)&inode.size, sizeof(inode.size)); - bl.append((char*)&inode.mtime, sizeof(inode.mtime)); - bl.append((char*)&inode.atime, sizeof(inode.atime)); // ?? + m->set_ino(ino()); } -void CInode::decode_file_state(bufferlist& r, int& off) +void CInode::encode_lock_state(int type, bufferlist& bl) { - r.copy(off, sizeof(inode.size), (char*)&inode.size); - off += sizeof(inode.size); - r.copy(off, sizeof(inode.mtime), (char*)&inode.mtime); - off += sizeof(inode.mtime); - r.copy(off, sizeof(inode.atime), (char*)&inode.atime); - off += sizeof(inode.atime); + switch (type) { + case LOCK_OTYPE_IAUTH: + ::_encode(inode.ctime, bl); + ::_encode(inode.mode, bl); + ::_encode(inode.uid, bl); + ::_encode(inode.gid, bl); + break; + + case LOCK_OTYPE_ILINK: + ::_encode(inode.ctime, bl); + ::_encode(inode.nlink, bl); + ::_encode(inode.anchored, bl); + break; + + case LOCK_OTYPE_IDIRFRAGTREE: + dirfragtree._encode(bl); + break; + + case LOCK_OTYPE_IFILE: + ::_encode(inode.size, bl); + ::_encode(inode.mtime, bl); + ::_encode(inode.atime, bl); + break; + + case LOCK_OTYPE_IDIR: + ::_encode(inode.mtime, bl); + { + map dfsz; + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + 
++p) + if (p->second->is_auth()) + dfsz[p->first] = p->second->get_nitems(); + ::_encode(dfsz, bl); + } + break; + + default: + assert(0); + } } -/* not used currently -void CInode::decode_merge_file_state(crope& r, int& off) +void CInode::decode_lock_state(int type, bufferlist& bl) { - __uint64_t size; - r.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - if (size > inode.size) inode.size = size; - - time_t t; - r.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - if (t > inode.mtime) inode.mtime = t; - - r.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - if (t > inode.atime) inode.atime = t; -} -*/ + int off = 0; + utime_t tm; + + switch (type) { + case LOCK_OTYPE_IAUTH: + ::_decode(tm, bl, off); + if (inode.ctime < tm) inode.ctime = tm; + ::_decode(inode.mode, bl, off); + ::_decode(inode.uid, bl, off); + ::_decode(inode.gid, bl, off); + break; + + case LOCK_OTYPE_ILINK: + ::_decode(tm, bl, off); + if (inode.ctime < tm) inode.ctime = tm; + ::_decode(inode.nlink, bl, off); + ::_decode(inode.anchored, bl, off); + break; + + case LOCK_OTYPE_IDIRFRAGTREE: + dirfragtree._decode(bl, off); + break; + + case LOCK_OTYPE_IFILE: + ::_decode(inode.size, bl, off); + ::_decode(inode.mtime, bl, off); + ::_decode(inode.atime, bl, off); + break; + + case LOCK_OTYPE_IDIR: + //::_decode(inode.size, bl, off); + ::_decode(tm, bl, off); + if (inode.mtime < tm) inode.mtime = tm; + { + map dfsz; + ::_decode(dfsz, bl, off); + // hmm which to keep? 
+ } + break; -void CInode::encode_hard_state(bufferlist& r) -{ - r.append((char*)&inode.mode, sizeof(inode.mode)); - r.append((char*)&inode.uid, sizeof(inode.uid)); - r.append((char*)&inode.gid, sizeof(inode.gid)); - r.append((char*)&inode.ctime, sizeof(inode.ctime)); + default: + assert(0); + } } -void CInode::decode_hard_state(bufferlist& r, int& off) -{ - r.copy(off, sizeof(inode.mode), (char*)&inode.mode); - off += sizeof(inode.mode); - r.copy(off, sizeof(inode.uid), (char*)&inode.uid); - off += sizeof(inode.uid); - r.copy(off, sizeof(inode.gid), (char*)&inode.gid); - off += sizeof(inode.gid); - r.copy(off, sizeof(inode.ctime), (char*)&inode.ctime); - off += sizeof(inode.ctime); -} @@ -375,54 +446,18 @@ bool CInode::is_freezing() return false; } -bool CInode::waiting_for(int tag) +void CInode::add_waiter(int tag, Context *c) { - return waiting.count(tag) > 0; -} - -void CInode::add_waiter(int tag, Context *c) { - // waiting on hierarchy? - if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) { - parent->dir->add_waiter(tag, c); + // wait on the directory? + if (tag & WAIT_AUTHPINNABLE) { + parent->dir->add_waiter(CDir::WAIT_AUTHPINNABLE, c); return; } - - // this inode. 
- if (waiting.size() == 0) - get(PIN_WAITER); - waiting.insert(pair(tag,c)); - dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; - -} - -void CInode::take_waiting(int mask, list& ls) -{ - if (waiting.empty()) return; - - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - - waiting.erase(it++); - } else { - dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl; - it++; - } + if (tag & WAIT_SINGLEAUTH) { + parent->dir->add_waiter(CDir::WAIT_SINGLEAUTH, c); + return; } - - if (waiting.empty()) - put(PIN_WAITER); -} - -void CInode::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); + MDSCacheObject::add_waiter(tag, c); } @@ -433,49 +468,52 @@ bool CInode::can_auth_pin() { return true; } -void CInode::auth_pin() { +void CInode::auth_pin() +{ if (auth_pins == 0) get(PIN_AUTHPIN); auth_pins++; dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - + if (parent) parent->dir->adjust_nested_auth_pins( 1 ); } -void CInode::auth_unpin() { +void CInode::auth_unpin() +{ auth_pins--; if (auth_pins == 0) put(PIN_AUTHPIN); - + dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - + assert(auth_pins >= 0); - + if (parent) parent->dir->adjust_nested_auth_pins( -1 ); } +void CInode::adjust_nested_auth_pins(int a) +{ + if (!parent) return; + nested_auth_pins += a; + parent->get_dir()->adjust_nested_auth_pins(a); +} + // authority -int CInode::authority() { - if (is_dangling()) - return dangling_auth; // explicit - - if 
(is_root()) { // i am root - if (dir) - return dir->get_dir_auth(); // bit of a chicken/egg issue here! - else - return CDIR_AUTH_UNKNOWN; - } +pair CInode::authority() +{ + if (is_root()) + return CDIR_AUTH_ROOTINODE; // root _inode_ is locked to mds0. if (parent) - return parent->dir->dentry_authority( parent->name ); + return parent->dir->authority(); - return -1; // undefined (inode must not be linked yet!) + return CDIR_AUTH_UNDEF; } @@ -493,14 +531,4 @@ CInodeDiscover* CInode::replicate_to( int rep ) } -// debug crap ----------------------------- - -void CInode::dump(int dep) -{ - string ind(dep, '\t'); - //cout << ind << "[inode " << this << "]" << endl; - - if (dir) - dir->dump(dep); -} diff --git a/trunk/ceph/mds/CInode.h b/trunk/ceph/mds/CInode.h index d2292196a5ebc..e743e5e0d581e 100644 --- a/trunk/ceph/mds/CInode.h +++ b/trunk/ceph/mds/CInode.h @@ -23,7 +23,9 @@ #include "mdstypes.h" #include "CDentry.h" -#include "Lock.h" +#include "SimpleLock.h" +#include "FileLock.h" +#include "ScatterLock.h" #include "Capability.h" @@ -35,51 +37,6 @@ #include using namespace std; - -// wait reasons -#define CINODE_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE - // waiters: write_hard_start, read_file_start, write_file_start (mdcache) - // handle_client_chmod, handle_client_touch (mds) - // trigger: (see CDIR_WAIT_UNFREEZE) -#define CINODE_WAIT_GETREPLICA (1<<11) // update/replicate individual inode - // waiters: import_dentry_inode - // trigger: handle_inode_replicate_ack - -#define CINODE_WAIT_DIR (1<<13) - // waiters: traverse_path - // triggers: handle_disocver_reply - -#define CINODE_WAIT_LINK (1<<14) // as in remotely nlink++ -#define CINODE_WAIT_ANCHORED (1<<15) -#define CINODE_WAIT_UNLINK (1<<16) // as in remotely nlink-- - -#define CINODE_WAIT_HARDR (1<<17) // 131072 -#define CINODE_WAIT_HARDW (1<<18) // 262... 
-#define CINODE_WAIT_HARDB (1<<19) -#define CINODE_WAIT_HARDRWB (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB) -#define CINODE_WAIT_HARDSTABLE (1<<20) -#define CINODE_WAIT_HARDNORD (1<<21) -#define CINODE_WAIT_FILER (1<<22) -#define CINODE_WAIT_FILEW (1<<23) -#define CINODE_WAIT_FILEB (1<<24) -#define CINODE_WAIT_FILERWB (CINODE_WAIT_FILER|CINODE_WAIT_FILEW|CINODE_WAIT_FILEB) -#define CINODE_WAIT_FILESTABLE (1<<25) -#define CINODE_WAIT_FILENORD (1<<26) -#define CINODE_WAIT_FILENOWR (1<<27) - -#define CINODE_WAIT_RENAMEACK (1<<28) -#define CINODE_WAIT_RENAMENOTIFYACK (1<<29) - -#define CINODE_WAIT_CAPS (1<<30) - -#define CINODE_WAIT_ANY 0xffffffff - - - -// misc -#define CINODE_EXPORT_NONCE 1 // nonce given to replicas created by export -#define CINODE_HASHREPLICA_NONCE 1 // hashed inodes that are duped ???FIXME??? - class Context; class CDentry; class CDir; @@ -96,86 +53,97 @@ ostream& operator<<(ostream& out, CInode& in); class CInode : public MDSCacheObject { public: // -- pins -- - static const int PIN_CACHED = 1; + //static const int PIN_REPLICATED = 1; static const int PIN_DIR = 2; - static const int PIN_DIRTY = 4; // must flush - static const int PIN_PROXY = 5; // can't expire yet - static const int PIN_WAITER = 6; // waiter - static const int PIN_CAPS = 7; // local fh's + static const int PIN_CAPS = 7; // client caps static const int PIN_AUTHPIN = 8; - static const int PIN_IMPORTING = 9; // multipurpose, for importing - static const int PIN_REQUEST = 10; // request is logging, finishing - static const int PIN_RENAMESRC = 11; // pinned on dest for foreign rename + static const int PIN_IMPORTING = -9; // importing static const int PIN_ANCHORING = 12; - - static const int PIN_OPENINGDIR = 13; - - static const int PIN_DENTRYLOCK = 14; + static const int PIN_UNANCHORING = 13; + static const int PIN_OPENINGDIR = 14; + static const int PIN_REMOTEPARENT = 15; + static const int PIN_BATCHOPENJOURNAL = 16; - static const char *pin_name(int p) { + const char 
*pin_name(int p) { switch (p) { - case PIN_CACHED: return "cached"; case PIN_DIR: return "dir"; - case PIN_DIRTY: return "dirty"; - case PIN_PROXY: return "proxy"; - case PIN_WAITER: return "waiter"; case PIN_CAPS: return "caps"; case PIN_AUTHPIN: return "authpin"; case PIN_IMPORTING: return "importing"; - case PIN_REQUEST: return "request"; - case PIN_RENAMESRC: return "renamesrc"; case PIN_ANCHORING: return "anchoring"; + case PIN_UNANCHORING: return "unanchoring"; case PIN_OPENINGDIR: return "openingdir"; - case PIN_DENTRYLOCK: return "dentrylock"; - default: assert(0); + case PIN_REMOTEPARENT: return "remoteparent"; + case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; + default: return generic_pin_name(p); } } - // state - static const int STATE_AUTH = (1<<0); - static const int STATE_ROOT = (1<<1); - static const int STATE_DIRTY = (1<<2); - static const int STATE_UNSAFE = (1<<3); // not logged yet - static const int STATE_DANGLING = (1<<4); // delete me when i expire; i have no dentry - static const int STATE_UNLINKING = (1<<5); - static const int STATE_PROXY = (1<<6); // can't expire yet - static const int STATE_EXPORTING = (1<<7); // on nonauth bystander. - static const int STATE_ANCHORING = (1<<8); + // -- state -- + static const int STATE_ROOT = (1<<2); + //static const int STATE_DANGLING = (1<<4); // delete me when i expire; i have no dentry + static const int STATE_EXPORTING = (1<<6); // on nonauth bystander. 
+ static const int STATE_ANCHORING = (1<<7); + static const int STATE_UNANCHORING = (1<<8); static const int STATE_OPENINGDIR = (1<<9); - //static const int STATE_RENAMING = (1<<8); // moving me - //static const int STATE_RENAMINGTO = (1<<9); // rename target (will be unlinked) + // -- waiters -- + static const int WAIT_SLAVEAGREE = (1<<0); + static const int WAIT_AUTHPINNABLE = (1<<1); + static const int WAIT_DIR = (1<<2); + static const int WAIT_ANCHORED = (1<<3); + static const int WAIT_UNANCHORED = (1<<4); + static const int WAIT_CAPS = (1<<5); + + static const int WAIT_AUTHLOCK_OFFSET = 6; + static const int WAIT_LINKLOCK_OFFSET = 6 + SimpleLock::WAIT_BITS; + static const int WAIT_DIRFRAGTREELOCK_OFFSET = 6 + 2*SimpleLock::WAIT_BITS; + static const int WAIT_FILELOCK_OFFSET = 6 + 3*SimpleLock::WAIT_BITS; + static const int WAIT_DIRLOCK_OFFSET = 6 + 4*SimpleLock::WAIT_BITS; + + static const int WAIT_ANY = 0xffffffff; + // misc + static const int EXPORT_NONCE = 1; // nonce given to replicas created by export + ostream& print_db_line_prefix(ostream& out); public: MDCache *mdcache; - inode_t inode; // the inode itself + // inode contents proper + inode_t inode; // the inode itself + string symlink; // symlink dest, if symlink + fragtree_t dirfragtree; // dir frag tree, if any + map dirfrag_size; // size of each dirfrag - CDir *dir; // directory, if we have it opened. 
- string symlink; // symlink dest, if symlink + off_t last_open_journaled; // log offset for the last journaled EOpen + + // -- cache infrastructure -- + map dirfrags; // cached dir fragments + + frag_t pick_dirfrag(const string &dn); + CDir* get_dirfrag(frag_t fg) { + if (dirfrags.count(fg)) + return dirfrags[fg]; + else + return 0; + } + void get_dirfrags(list& ls); + void get_nested_dirfrags(list& ls); + void get_subtree_dirfrags(list& ls); + CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg); + CDir *add_dirfrag(CDir *dir); + void close_dirfrag(frag_t fg); + void close_dirfrags(); protected: // parent dentries in cache - int num_parents; CDentry *parent; // primary link set remote_parents; // if hard linked - // -- distributed caching - int dangling_auth; // explicit auth, when dangling. - - int num_request_pins; - - // waiters - multimap waiting; - // -- distributed state -- -public: - // inode metadata locks - CLock hardlock; - CLock filelock; protected: // file capabilities map client_caps; // client -> caps @@ -185,7 +153,7 @@ protected: private: - // lock nesting + // auth pin int auth_pins; int nested_auth_pins; @@ -203,70 +171,61 @@ protected: public: // --------------------------- - CInode(MDCache *c, bool auth=true); - ~CInode(); + CInode(MDCache *c, bool auth=true) : + mdcache(c), + last_open_journaled(0), + parent(0), + replica_caps_wanted(0), + auth_pins(0), nested_auth_pins(0), + authlock(this, LOCK_OTYPE_IAUTH, WAIT_AUTHLOCK_OFFSET), + linklock(this, LOCK_OTYPE_ILINK, WAIT_LINKLOCK_OFFSET), + dirfragtreelock(this, LOCK_OTYPE_IDIRFRAGTREE, WAIT_DIRFRAGTREELOCK_OFFSET), + filelock(this, LOCK_OTYPE_IFILE, WAIT_FILELOCK_OFFSET), + dirlock(this, LOCK_OTYPE_IDIR, WAIT_DIRLOCK_OFFSET) + { + state = 0; + if (auth) state_set(STATE_AUTH); + }; + ~CInode() { + close_dirfrags(); + } // -- accessors -- - bool is_file() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_FILE) ? 
true:false; } - bool is_symlink() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) ? true:false; } - bool is_dir() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR) ? true:false; } + bool is_file() { return inode.is_file(); } + bool is_symlink() { return inode.is_symlink(); } + bool is_dir() { return inode.is_dir(); } bool is_anchored() { return inode.anchored; } - + bool is_anchoring() { return state_test(STATE_ANCHORING); } + bool is_unanchoring() { return state_test(STATE_UNANCHORING); } + bool is_root() { return state & STATE_ROOT; } - bool is_proxy() { return state & STATE_PROXY; } + bool is_stray() { return MDS_INO_IS_STRAY(inode.ino); } - bool is_auth() { return state & STATE_AUTH; } - void set_auth(bool auth); - inodeno_t ino() { return inode.ino; } + inodeno_t ino() const { return inode.ino; } inode_t& get_inode() { return inode; } CDentry* get_parent_dn() { return parent; } CDir *get_parent_dir(); CInode *get_parent_inode(); - CInode *get_realm_root(); // import, hash, or root - - CDir *get_or_open_dir(MDCache *mdcache); - CDir *set_dir(CDir *newdir); - void close_dir(); - bool dir_is_auth(); + bool is_lt(const MDSCacheObject *r) const { + return ino() < ((CInode*)r)->ino(); + } // -- misc -- void make_path(string& s); - void make_anchor_trace(vector& trace); - - - - // -- state -- - bool is_unsafe() { return state & STATE_UNSAFE; } - bool is_dangling() { return state & STATE_DANGLING; } - bool is_unlinking() { return state & STATE_UNLINKING; } - - void mark_unsafe() { state |= STATE_UNSAFE; } - void mark_safe() { state &= ~STATE_UNSAFE; } + void make_anchor_trace(vector& trace); + void name_stray_dentry(string& dname); - // -- state encoding -- - //void encode_basic_state(bufferlist& r); - //void decode_basic_state(bufferlist& r, int& off); - - - void encode_file_state(bufferlist& r); - void decode_file_state(bufferlist& r, int& off); - - void encode_hard_state(bufferlist& r); - void decode_hard_state(bufferlist& r, int& off); // 
-- dirtyness -- version_t get_version() { return inode.version; } - bool is_dirty() { return state & STATE_DIRTY; } - bool is_clean() { return !is_dirty(); } - version_t pre_dirty(); void _mark_dirty(); void mark_dirty(version_t projected_dirv); @@ -279,21 +238,35 @@ protected: // -- waiting -- - bool waiting_for(int tag); void add_waiter(int tag, Context *c); - void take_waiting(int tag, list& ls); - void finish_waiting(int mask, int result = 0); - bool is_hardlock_write_wanted() { - return waiting_for(CINODE_WAIT_HARDW); - } - bool is_filelock_write_wanted() { - return waiting_for(CINODE_WAIT_FILEW); + // -- locks -- +public: + SimpleLock authlock; + SimpleLock linklock; + SimpleLock dirfragtreelock; + FileLock filelock; + ScatterLock dirlock; + + SimpleLock* get_lock(int type) { + switch (type) { + case LOCK_OTYPE_IFILE: return &filelock; + case LOCK_OTYPE_IAUTH: return &authlock; + case LOCK_OTYPE_ILINK: return &linklock; + case LOCK_OTYPE_IDIRFRAGTREE: return &dirfragtreelock; + case LOCK_OTYPE_IDIR: return &dirlock; + default: assert(0); + } } + void set_mlock_info(MLock *m); + void encode_lock_state(int type, bufferlist& bl); + void decode_lock_state(int type, bufferlist& bl); + // -- caps -- (new) // client caps + bool is_any_caps() { return !client_caps.empty(); } map& get_client_caps() { return client_caps; } void add_client_cap(int client, Capability& cap) { if (client_caps.empty()) @@ -372,36 +345,30 @@ protected: void replicate_relax_locks() { + dout(10) << " relaxing locks on " << *this << endl; assert(is_auth()); assert(!is_replicated()); - dout(10) << " relaxing locks on " << *this << endl; - if (hardlock.get_state() == LOCK_LOCK && - !hardlock.is_used()) { - dout(10) << " hard now sync " << *this << endl; - hardlock.set_state(LOCK_SYNC); - } - if (filelock.get_state() == LOCK_LOCK) { - if (!filelock.is_used() && - (get_caps_issued() & CAP_FILE_WR) == 0) { - filelock.set_state(LOCK_SYNC); - dout(10) << " file now sync " << *this << endl; - } else { 
- dout(10) << " can't relax filelock on " << *this << endl; - } - } + authlock.replicate_relax(); + linklock.replicate_relax(); + dirfragtreelock.replicate_relax(); + + if ((get_caps_issued() & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) == 0) + filelock.replicate_relax(); + + dirlock.replicate_relax(); } // -- authority -- - int authority(); + pair authority(); // -- auth pins -- int is_auth_pinned() { return auth_pins; } - int adjust_nested_auth_pins(int a); + void adjust_nested_auth_pins(int a); bool can_auth_pin(); void auth_pin(); void auth_unpin(); @@ -419,13 +386,10 @@ protected: linked to an active_request, so they're automatically cleaned up when a request is finished. pin at will! */ void request_pin_get() { - if (num_request_pins == 0) get(PIN_REQUEST); - num_request_pins++; + get(PIN_REQUEST); } void request_pin_put() { - num_request_pins--; - if (num_request_pins == 0) put(PIN_REQUEST); - assert(num_request_pins >= 0); + put(PIN_REQUEST); } void bad_put(int by) { @@ -443,30 +407,20 @@ protected: // -- hierarchy stuff -- private: - void get_parent(); - void put_parent(); + //void get_parent(); + //void put_parent(); public: void set_primary_parent(CDentry *p) { assert(parent == 0); parent = p; - get_parent(); } void remove_primary_parent(CDentry *dn) { assert(dn == parent); parent = 0; - put_parent(); - } - void add_remote_parent(CDentry *p) { - if (remote_parents.empty()) - get_parent(); - remote_parents.insert(p); - } - void remove_remote_parent(CDentry *p) { - remote_parents.erase(p); - if (remote_parents.empty()) - put_parent(); } + void add_remote_parent(CDentry *p); + void remove_remote_parent(CDentry *p); int num_remote_parents() { return remote_parents.size(); } @@ -483,8 +437,8 @@ public: } */ - // dbg - void dump(int d = 0); + void print(ostream& out); + }; @@ -497,19 +451,31 @@ public: class CInodeDiscover { inode_t inode; + string symlink; + fragtree_t dirfragtree; + int replica_nonce; - int hardlock_state; + int authlock_state; + int linklock_state; +
int dirfragtreelock_state; int filelock_state; + int dirlock_state; public: CInodeDiscover() {} CInodeDiscover(CInode *in, int nonce) { inode = in->inode; + symlink = in->symlink; + dirfragtree = in->dirfragtree; + replica_nonce = nonce; - hardlock_state = in->hardlock.get_replica_state(); + authlock_state = in->authlock.get_replica_state(); + linklock_state = in->linklock.get_replica_state(); + dirfragtreelock_state = in->dirfragtreelock.get_replica_state(); filelock_state = in->filelock.get_replica_state(); + dirlock_state = in->dirlock.get_replica_state(); } inodeno_t get_ino() { return inode.ino; } @@ -517,28 +483,39 @@ class CInodeDiscover { void update_inode(CInode *in) { in->inode = inode; + in->symlink = symlink; + in->dirfragtree = dirfragtree; in->replica_nonce = replica_nonce; - in->hardlock.set_state(hardlock_state); + in->authlock.set_state(authlock_state); + in->linklock.set_state(linklock_state); + in->dirfragtreelock.set_state(dirfragtreelock_state); in->filelock.set_state(filelock_state); + in->dirlock.set_state(dirlock_state); } void _encode(bufferlist& bl) { - bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&replica_nonce, sizeof(replica_nonce)); - bl.append((char*)&hardlock_state, sizeof(hardlock_state)); - bl.append((char*)&filelock_state, sizeof(filelock_state)); + ::_encode(inode, bl); + ::_encode(symlink, bl); + dirfragtree._encode(bl); + ::_encode(replica_nonce, bl); + ::_encode(authlock_state, bl); + ::_encode(linklock_state, bl); + ::_encode(dirfragtreelock_state, bl); + ::_encode(filelock_state, bl); + ::_encode(dirlock_state, bl); } void _decode(bufferlist& bl, int& off) { - bl.copy(off,sizeof(inode_t), (char*)&inode); - off += sizeof(inode_t); - bl.copy(off, sizeof(int), (char*)&replica_nonce); - off += sizeof(int); - bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state); - off += sizeof(hardlock_state); - bl.copy(off, sizeof(filelock_state), (char*)&filelock_state); - off += sizeof(filelock_state); + 
::_decode(inode, bl, off); + ::_decode(symlink, bl, off); + dirfragtree._decode(bl, off); + ::_decode(replica_nonce, bl, off); + ::_decode(authlock_state, bl, off); + ::_decode(linklock_state, bl, off); + ::_decode(dirfragtreelock_state, bl, off); + ::_decode(filelock_state, bl, off); + ::_decode(dirlock_state, bl, off); } }; @@ -548,8 +525,9 @@ class CInodeDiscover { class CInodeExport { - struct { + struct st_ { inode_t inode; + meta_load_t popularity_justme; meta_load_t popularity_curdom; bool is_dirty; // dirty inode? @@ -557,21 +535,29 @@ class CInodeExport { int num_caps; } st; + string symlink; + fragtree_t dirfragtree; + map replicas; map cap_map; - CLock hardlock,filelock; - //int remaining_issued; + bufferlist locks; public: CInodeExport() {} CInodeExport(CInode *in) { st.inode = in->inode; + symlink = in->symlink; + dirfragtree = in->dirfragtree; + st.is_dirty = in->is_dirty(); replicas = in->replicas; - hardlock = in->hardlock; - filelock = in->filelock; + in->authlock._encode(locks); + in->linklock._encode(locks); + in->dirfragtreelock._encode(locks); + in->filelock._encode(locks); + in->dirlock._encode(locks); st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); @@ -582,13 +568,18 @@ public: in->take_client_caps(cap_map); //remaining_issued = in->get_caps_issued(); } - ~CInodeExport() { - } inodeno_t get_ino() { return st.inode.ino; } void update_inode(CInode *in, set& new_client_caps) { + // treat scatterlocked mtime special, since replica may have newer info + if (in->dirlock.get_state() == LOCK_SCATTER || + in->dirlock.get_state() == LOCK_GSYNCS) + st.inode.mtime = MAX(in->inode.mtime, st.inode.mtime); + in->inode = st.inode; + in->symlink = symlink; + in->dirfragtree = dirfragtree; in->popularity[MDS_POP_JUSTME] += st.popularity_justme; in->popularity[MDS_POP_CURDOM] += st.popularity_curdom; @@ -600,10 +591,14 @@ public: in->replicas = replicas; if (!replicas.empty()) - 
in->get(CInode::PIN_CACHED); + in->get(CInode::PIN_REPLICATED); - in->hardlock = hardlock; - in->filelock = filelock; + int off = 0; + in->authlock._decode(locks, off); + in->linklock._decode(locks, off); + in->dirfragtreelock._decode(locks, off); + in->filelock._decode(locks, off); + in->dirlock._decode(locks, off); // caps in->merge_client_caps(cap_map, new_client_caps); @@ -611,13 +606,12 @@ public: void _encode(bufferlist& bl) { st.num_caps = cap_map.size(); - bl.append((char*)&st, sizeof(st)); - - // cached_by + nonce - ::_encode(replicas, bl); - hardlock.encode_state(bl); - filelock.encode_state(bl); + ::_encode(st, bl); + ::_encode(symlink, bl); + dirfragtree._encode(bl); + ::_encode(replicas, bl); + ::_encode(locks, bl); // caps for (map::iterator it = cap_map.begin(); @@ -629,13 +623,11 @@ public: } int _decode(bufferlist& bl, int off = 0) { - bl.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - + ::_decode(st, bl, off); + ::_decode(symlink, bl, off); + dirfragtree._decode(bl, off); ::_decode(replicas, bl, off); - - hardlock.decode_state(bl, off); - filelock.decode_state(bl, off); + ::_decode(locks, bl, off); // caps for (int i=0; i client_inst; - set client_mount; - hash_map client_ref; - - void inc_ref(int client, const entity_inst_t& inst) { - if (client_inst.count(client)) { - assert(client_inst[client] == inst); - assert(client_ref.count(client)); - } else { - client_inst[client] = inst; - } - client_ref[client]++; +private: + version_t version; + version_t projected; + version_t committing; + version_t committed; + map > commit_waiters; + +public: + ClientMap() : version(0), projected(0), committing(0), committed(0) {} + + version_t get_version() { return version; } + version_t get_projected() { return projected; } + version_t get_committing() { return committing; } + version_t get_committed() { return committed; } + + version_t inc_projected() { return ++projected; } + void reset_projected() { projected = version; } + void 
set_committing(version_t v) { committing = v; } + void set_committed(version_t v) { committed = v; } + + void add_commit_waiter(Context *c) { + commit_waiters[committing].push_back(c); } - void dec_ref(int client) { - assert(client_ref.count(client)); - assert(client_ref[client] > 0); - client_ref[client]--; - if (client_ref[client] == 0) { - client_ref.erase(client); - client_inst.erase(client); - } + void take_commit_waiters(version_t v, list& ls) { + ls.swap(commit_waiters[v]); + commit_waiters.erase(v); } - + +private: + // effects version + hash_map client_inst; + set sessions; + set opening; + set closing; + public: + bool empty() { + return client_inst.empty(); + } + const entity_inst_t& get_inst(int client) { assert(client_inst.count(client)); return client_inst[client]; } - const set& get_mount_set() { return client_mount; } + const set& get_session_set() { return sessions; } - void add_mount(int client, const entity_inst_t& inst) { - inc_ref(client, inst); - client_mount.insert(client); + bool is_opening(int c) { return opening.count(c); } + void add_opening(int c) { opening.insert(c); } + bool is_closing(int c) { return closing.count(c); } + void add_closing(int c) { closing.insert(c); } + bool have_session(int client) { + return client_inst.count(client); } - void rem_mount(int client) { - dec_ref(client); - client_mount.erase(client); + void open_session(const entity_inst_t& inst) { + opening.erase(inst.name.num()); + client_inst[inst.name.num()] = inst; + sessions.insert(inst.name.num()); + version++; + } + void close_session(int client) { + closing.erase(client); + sessions.erase(client); + client_inst.erase(client); + version++; } - - void add_open(int client, const entity_inst_t& inst) { - inc_ref(client, inst); +private: + // -- completed requests -- + // client id -> tid -> result code + map > completed_requests; // completed client requests + map > waiting_for_trim; + +public: + void add_completed_request(metareqid_t ri) { + 
completed_requests[ri.client].insert(ri.tid); } - void dec_open(int client) { - dec_ref(client); + void trim_completed_requests(int client, + tid_t mintid) { // zero means trim all! + map >::iterator p = completed_requests.find(client); + if (p == completed_requests.end()) + return; + + // trim + while (!p->second.empty() && (mintid == 0 || *p->second.begin() < mintid)) + p->second.erase(p->second.begin()); + if (p->second.empty()) + completed_requests.erase(p); + + // kick waiters + map >::iterator q = waiting_for_trim.find(client); + if (q != waiting_for_trim.end()) { + list fls; + while (!q->second.empty() && + (mintid == 0 || q->second.begin()->first < mintid)) { + fls.push_back(q->second.begin()->second); + q->second.erase(q->second.begin()); + } + if (q->second.empty()) + waiting_for_trim.erase(q); + finish_contexts(fls); + } + } + void add_trim_waiter(metareqid_t ri, Context *c) { + waiting_for_trim[ri.client][ri.tid] = c; + } + bool have_completed_request(metareqid_t ri) { + return completed_requests.count(ri.client) && + completed_requests[ri.client].count(ri.tid); + } + + + // -- encoding -- + void encode(bufferlist& bl) { + bl.append((char*)&version, sizeof(version)); + ::_encode(client_inst, bl); + ::_encode(sessions, bl); + } + void decode(bufferlist& bl, int& off) { + bl.copy(off, sizeof(version), (char*)&version); + off += sizeof(version); + ::_decode(client_inst, bl, off); + ::_decode(sessions, bl, off); + + projected = committing = committed = version; } }; diff --git a/trunk/ceph/mds/FileLock.h b/trunk/ceph/mds/FileLock.h new file mode 100644 index 0000000000000..8499280147e3e --- /dev/null +++ b/trunk/ceph/mds/FileLock.h @@ -0,0 +1,224 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, 
as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __FILELOCK_H +#define __FILELOCK_H + +#include +#include +using namespace std; + +#include "include/buffer.h" + +#include "SimpleLock.h" +#include "Capability.h" + +// states and such. +// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio + +// -----auth-------- ---replica------- +#define LOCK_SYNC_ 1 // AR R . / C R . . . L R . / C R . . . L stat() +#define LOCK_GSYNCL -12 // A . . / C ? . . . L loner -> sync (*) +#define LOCK_GSYNCM -13 // A . . / . R . . . L + +#define LOCK_LOCK_ 2 // AR R W / C . . . . . . . / C . . . . . truncate() +#define LOCK_GLOCKR_ -3 // AR R . / C . . . . . . . / C . . . . . +#define LOCK_GLOCKL -4 // A . . / C . . . . . loner -> lock +#define LOCK_GLOCKM -5 // A . . / . . . . . . + +#define LOCK_MIXED 6 // AR . . / . R W A . L . . / . R . . . L +#define LOCK_GMIXEDR -7 // AR R . / . R . . . L . . / . R . . . L +#define LOCK_GMIXEDL -8 // A . . / . . . . . L loner -> mixed + +#define LOCK_LONER 9 // A . . / C R W A B L (lock) +#define LOCK_GLONERR -10 // A . . / . R . . . L +#define LOCK_GLONERM -11 // A . . / . R W A . L + +// (*) FIXME: how to let old loner keep R, somehow, during GSYNCL + +// 4 stable +// +9 transition +// 13 total + +inline const char *get_filelock_state_name(int n) { + switch (n) { + case LOCK_SYNC: return "sync"; + case LOCK_GSYNCL: return "gsyncl"; + case LOCK_GSYNCM: return "gsyncm"; + case LOCK_LOCK: return "lock"; + case LOCK_GLOCKR: return "glockr"; + case LOCK_GLOCKL: return "glockl"; + case LOCK_GLOCKM: return "glockm"; + case LOCK_MIXED: return "mixed"; + case LOCK_GMIXEDR: return "gmixedr"; + case LOCK_GMIXEDL: return "gmixedl"; + case LOCK_LONER: return "loner"; + case LOCK_GLONERR: return "glonerr"; + case LOCK_GLONERM: return "glonerm"; + default: assert(0); + } +} + + +/* no append scenarios: + +loner + truncate(): + - loner needs to lose A (?unless it's the loner doing the truncate?) 
+loner + statlite(size): + - loner needs to lose A + +any + statlite(size) + - all lose A + +any + statlite(mtime) + - all lose W + +-> we need to add lonerfixed and mixedfixed states (and associated transitions) + in order to efficiently support statlite(size) and truncate(). until then, + we have to LOCK. + + */ + +// -- lock... hard or file + +class MDRequest; + +class FileLock : public SimpleLock { + public: + FileLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo) { } + + char get_replica_state() { + switch (state) { + case LOCK_LOCK: + case LOCK_GLOCKM: + case LOCK_GLOCKL: + case LOCK_GLOCKR: + case LOCK_LONER: + case LOCK_GLONERR: + case LOCK_GLONERM: + return LOCK_LOCK; + case LOCK_MIXED: + case LOCK_GMIXEDR: + return LOCK_MIXED; + case LOCK_SYNC: + return LOCK_SYNC; + + // after gather auth will bc LOCK_AC_MIXED or whatever + case LOCK_GSYNCM: + return LOCK_MIXED; + case LOCK_GSYNCL: + case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. + return LOCK_LOCK; + + default: + assert(0); + } + return 0; + } + + + // read/write access + bool can_rdlock(MDRequest *mdr) { + if (!parent->is_auth()) + return (state == LOCK_SYNC); + if (state == LOCK_LOCK && mdr && xlock_by == mdr) + return true; + if (state == LOCK_LOCK && !xlock_by) + return true; + return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR) + || (state == LOCK_GLOCKR); + } + bool can_rdlock_soon() { + if (parent->is_auth()) + return (state == LOCK_GLOCKL); + else + return false; + } + bool can_xlock_soon() { + if (parent->is_auth()) + return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) + || (state == LOCK_GLOCKM); + else + return false; + } + + // client caps allowed + int caps_allowed_ever() { + if (parent->is_auth()) + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; + else + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; + } + int caps_allowed() { + if (parent->is_auth()) + switch (state) { + case 
LOCK_SYNC: + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; + case LOCK_LOCK: + case LOCK_GLOCKR: + case LOCK_GLOCKL: + return CAP_FILE_RDCACHE; + + case LOCK_GLOCKM: + return 0; + + case LOCK_MIXED: + return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; + case LOCK_GMIXEDR: + return CAP_FILE_RD | CAP_FILE_LAZYIO; + case LOCK_GMIXEDL: + return 0; + + case LOCK_LONER: // single client writer, of course. + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; + case LOCK_GLONERR: + return CAP_FILE_RD | CAP_FILE_LAZYIO; + case LOCK_GLONERM: + return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; + + case LOCK_GSYNCL: + return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; + case LOCK_GSYNCM: + return CAP_FILE_RD | CAP_FILE_LAZYIO; + } + else + switch (state) { + case LOCK_SYNC: + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; + case LOCK_LOCK: + case LOCK_GLOCKR: + return CAP_FILE_RDCACHE; + case LOCK_GMIXEDR: + case LOCK_MIXED: + return CAP_FILE_RD | CAP_FILE_LAZYIO; + } + assert(0); + return 0; + } + + void print(ostream& out) { + out << "("; + //out << get_lock_type_name(l.get_type()) << " "; + out << get_filelock_state_name(get_state()); + if (!get_gather_set().empty()) out << " g=" << get_gather_set(); + if (is_rdlocked()) + out << " r=" << get_num_rdlocks(); + if (is_xlocked()) + out << " x=" << get_xlocked_by(); + out << ")"; + } +}; + + +#endif diff --git a/trunk/ceph/mds/Hasher.cc b/trunk/ceph/mds/Hasher.cc new file mode 100644 index 0000000000000..08e0c7f21a776 --- /dev/null +++ b/trunk/ceph/mds/Hasher.cc @@ -0,0 +1,1580 @@ + + +// ======================================================================= +// HASHING + + +void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth) +{ + int off = 0; + + for (; nden>0; nden--) { + // dentry + string dname; + _decode(dname, bl, off); + dout(15) << "dname is " << dname << endl; + + char 
icode; + bl.copy(off, 1, &icode); + off++; + + CDentry *dn = dir->lookup(dname); + if (!dn) + dn = dir->add_dentry(dname); // null + + // mark dn dirty _after_ we link the inode (scroll down) + + if (icode == 'N') { + + // null dentry + assert(dn->is_null()); + + // fall thru + } + else if (icode == 'L') { + // remote link + inodeno_t ino; + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + dir->link_inode(dn, ino); + } + else if (icode == 'I') { + // inode + decode_import_inode(dn, bl, off, oldauth); + + // fix up subdir export? + if (dn->inode->dir) { + assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTBOUND)); + dn->inode->dir->put(CDir::PIN_IMPORTBOUND); + dn->inode->dir->state_clear(CDIR_STATE_IMPORTBOUND); + + if (dn->inode->dir->is_auth()) { + // mine. must have been an import. + assert(dn->inode->dir->is_import()); + dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl; + dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); + cache->imports.erase(dn->inode->dir); + dn->inode->dir->put(CDir::PIN_IMPORT); + dn->inode->dir->state_clear(CDIR_STATE_IMPORT); + + // move nested under hashdir + for (set::iterator it = cache->nested_exports[dn->inode->dir].begin(); + it != cache->nested_exports[dn->inode->dir].end(); + it++) + cache->nested_exports[dir].insert(*it); + cache->nested_exports.erase(dn->inode->dir); + + // now it matches the inode + dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); + } + else { + // not mine. make it an export. + dout(7) << "making subdir into export " << *dn->inode->dir << endl; + dn->inode->dir->get(CDir::PIN_EXPORT); + dn->inode->dir->state_set(CDIR_STATE_EXPORT); + cache->exports.insert(dn->inode->dir); + cache->nested_exports[dir].insert(dn->inode->dir); + + if (dn->inode->dir->get_dir_auth().first == CDIR_AUTH_PARENT) + dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode + assert(dn->inode->dir->get_dir_auth().first >= 0); + } + } + } + + // mark dentry dirty? 
(only _after_ we link the inode!) + dn->_mark_dirty(); // fixme + } +} + +/* + + notes on interaction of hashing and export/import: + + - dir->is_auth() is completely independent of hashing. for a hashed dir, + - all nodes are partially authoritative + - all nodes dir->is_hashed() == true + - all nodes dir->inode->dir_is_hashed() == true + - one node dir->is_auth() == true, the rest == false + - dir_auth for all subdirs in a hashed dir will (likely?) be explicit. + + - remember simple rule: dir auth follows inode, unless dir_auth is explicit. + + - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export) + - on export, -1 is changed to mds->get_nodeid() + - on import, nothing special, actually. + + - hashed dir files aren't included in export; subdirs are converted to imports + or exports as necessary. + - hashed dir subdirs are discovered on export. this is important + because dirs are needed to tie together auth hierarchy, for auth to know about + imports/exports, etc. + + - dir state is maintained on auth. + - COMPLETE and HASHED are transfered to importers. + - DIRTY is set everywhere. + + - hashed dir is like an import: hashed dir used for nested_exports map. + - nested_exports is updated appropriately on auth and replicas. + - a subtree terminates as a hashed dir, since the hashing explicitly + redelegates all inodes. thus export_dir_walk includes hashed dirs, but + not their inodes. +*/ + +// HASH on auth + +class C_MDC_HashFreeze : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} + virtual void finish(int r) { + mig->hash_dir_frozen(dir); + } +}; + +class C_MDC_HashComplete : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_HashComplete(Migrator *mig, CDir *dir) { + this->mig = mig; + this->dir = dir; + } + virtual void finish(int r) { + mig->hash_dir_complete(dir); + } +}; + + +/** hash_dir(dir) + * start hashing a directory. 
+ */ +void Migrator::hash_dir(CDir *dir) +{ + dout(-7) << "hash_dir " << *dir << endl; + + assert(!dir->is_hashed()); + assert(dir->is_auth()); + + if (dir->is_frozen() || + dir->is_freezing()) { + dout(7) << " can't hash, freezing|frozen." << endl; + return; + } + + // pin path? + vector trace; + cache->make_trace(trace, dir->inode); + if (!cache->path_pin(trace, 0, 0)) { + dout(7) << "hash_dir couldn't pin path, failing." << endl; + return; + } + + // ok, go + dir->state_set(CDIR_STATE_HASHING); + dir->get(CDir::PIN_HASHING); + assert(dir->hashed_subset.empty()); + + // discover on all mds + assert(hash_gather.count(dir) == 0); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; // except me + hash_gather[dir].insert(i); + mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR); + } + dir->auth_pin(); // pin until discovers are all acked. + + // start freeze + dir->freeze_dir(new C_MDC_HashFreeze(this, dir)); + + // make complete + if (!dir->is_complete()) { + dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl; + mds->mdstore->fetch_dir(dir, + new C_MDC_HashComplete(this, dir)); + } else + hash_dir_complete(dir); +} + + +/* + * wait for everybody to discover and open the hashing dir + * then auth_unpin, to let the freeze happen + */ +void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + int from = m->get_source().num(); + assert(hash_gather[dir].count(from)); + hash_gather[dir].erase(from); + + if (hash_gather[dir].empty()) { + hash_gather.erase(dir); + dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; + dir->auth_unpin(); // unpin to allow freeze to complete + } else { + dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl; + } + + delete m; // done +} + + + +/* + * once the dir is completely 
in memory, + * mark all migrating inodes dirty (to pin in cache) + */ +void Migrator::hash_dir_complete(CDir *dir) +{ + dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl; + + assert(!dir->is_hashed()); + assert(dir->is_auth()); + + // mark dirty to pin in cache + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CInode *in = it->second->inode; + in->_mark_dirty(); // fixme + } + + if (dir->is_frozen_dir()) + hash_dir_go(dir); +} + + +/* + * once the dir is frozen, + * make sure it's complete + * send the prep messages! + */ +void Migrator::hash_dir_frozen(CDir *dir) +{ + dout(7) << "hash_dir_frozen " << *dir << endl; + + assert(!dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + + if (!dir->is_complete()) { + dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl; + return; + } + + // send prep messages w/ export directories to open + vector msgs(mds->get_mds_map()->get_num_mds()); + + // check for subdirs + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + if (!in->is_dir()) continue; + if (!in->dir) continue; + + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode == mds->get_nodeid()) continue; + + // msg? + if (msgs[dentryhashcode] == 0) { + msgs[dentryhashcode] = new MHashDirPrep(dir->ino()); + } + msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode)); + } + + // send them! + assert(hash_gather[dir].empty()); + for (unsigned i=0; isend_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); + hash_gather[dir].insert(i); + } + } + + if (hash_gather[dir].empty()) { + // no subdirs! continue! + hash_gather.erase(dir); + hash_dir_go(dir); + } else { + // wait! 
+ } +} + +/* + * wait for peers to open all subdirs + */ +void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + int from = m->get_source().num(); + + assert(hash_gather[dir].count(from) == 1); + hash_gather[dir].erase(from); + + if (hash_gather[dir].empty()) { + hash_gather.erase(dir); + dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl; + hash_dir_go(dir); + } else { + dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; + } + + delete m; +} + + +/* + * once the dir is frozen, + * make sure it's complete + * do the hashing! + */ +void Migrator::hash_dir_go(CDir *dir) +{ + dout(7) << "hash_dir_go " << *dir << endl; + + assert(!dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + + // get messages to other nodes ready + vector msgs(mds->get_mds_map()->get_num_mds()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + msgs[i] = new MHashDir(dir->ino()); + } + + // pick a hash seed. + dir->inode->inode.hash_seed = 1;//dir->ino(); + + // suck up all waiters + C_Contexts *fin = new C_Contexts; + list waiting; + dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters + fin->take(waiting); + + // get containing import. might be me. + CDir *containing_import = cache->get_auth_container(dir); + assert(containing_import != dir || dir->is_import()); + + // divy up contents + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode == mds->get_nodeid()) { + continue; // still mine! 
+ } + + bufferlist *bl = msgs[dentryhashcode]->get_state_ptr(); + assert(bl); + + // -- dentry + dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; + _encode(it->first, *bl); + + // null dentry? + if (dn->is_null()) { + bl->append("N", 1); // null dentry + assert(dn->is_sync()); + continue; + } + + if (dn->is_remote()) { + // remote link + bl->append("L", 1); // remote link + + inodeno_t ino = dn->get_remote_ino(); + bl->append((char*)&ino, sizeof(ino)); + continue; + } + + // primary link + // -- inode + bl->append("I", 1); // inode dentry + + encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export + msgs[dentryhashcode]->inc_nden(); + + if (dn->is_dirty()) + dn->mark_clean(); + + // add to proxy + hash_proxy_inos[dir].push_back(in); + in->state_set(CInode::STATE_PROXY); + in->get(CInode::PIN_PROXY); + + // fix up subdirs + if (in->dir) { + if (in->dir->is_auth()) { + // mine. make it into an import. + dout(7) << "making subdir into import " << *in->dir << endl; + in->dir->set_dir_auth( mds->get_nodeid() ); + cache->imports.insert(in->dir); + in->dir->get(CDir::PIN_IMPORT); + in->dir->state_set(CDIR_STATE_IMPORT); + + // fix nested bits + for (set::iterator it = cache->nested_exports[containing_import].begin(); + it != cache->nested_exports[containing_import].end(); ) { + CDir *ex = *it; + it++; + if (cache->get_auth_container(ex) == in->dir) { + dout(10) << "moving nested export " << *ex << endl; + cache->nested_exports[containing_import].erase(ex); + cache->nested_exports[in->dir].insert(ex); + } + } + } + else { + // not mine. 
+ dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl; + assert(in->dir->is_export()); + in->dir->put(CDir::PIN_EXPORT); + in->dir->state_clear(CDIR_STATE_EXPORT); + cache->exports.erase(in->dir); + cache->nested_exports[containing_import].erase(in->dir); + if (in->dir->authority() == dentryhashcode) + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + else + in->dir->set_dir_auth( in->dir->authority() ); + } + } + + // waiters + list waiters; + in->take_waiting(CINODE_WAIT_ANY, waiters); + fin->take(waiters); + } + + // dir state + dir->state_set(CDIR_STATE_HASHED); + dir->get(CDir::PIN_HASHED); + cache->hashdirs.insert(dir); + dir->mark_dirty(dir->pre_dirty()); // fixme + mds->mdlog->submit_entry(new EString("dirty dir fixme")); + + // inode state + if (dir->inode->is_auth()) { + dir->inode->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("hash dirty fixme")); + } + + // fix up nested_exports? + if (containing_import != dir) { + dout(7) << "moving nested exports under hashed dir" << endl; + for (set::iterator it = cache->nested_exports[containing_import].begin(); + it != cache->nested_exports[containing_import].end(); ) { + CDir *ex = *it; + it++; + if (cache->get_auth_container(ex) == dir) { + dout(7) << " moving nested export under hashed dir: " << *ex << endl; + cache->nested_exports[containing_import].erase(ex); + cache->nested_exports[dir].insert(ex); + } else { + dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl; + } + } + } + + // send hash messages + assert(hash_gather[dir].empty()); + assert(hash_notify_gather[dir].empty()); + assert(dir->hashed_subset.empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + // all nodes hashed locally.. 
+ dir->hashed_subset.insert(i); + + if (i == mds->get_nodeid()) continue; + + // init hash_gather and hash_notify_gather sets + hash_gather[dir].insert(i); + + assert(hash_notify_gather[dir][i].empty()); + for (int j=0; jget_mds_map()->get_num_mds(); j++) { + if (j == mds->get_nodeid()) continue; + if (j == i) continue; + hash_notify_gather[dir][i].insert(j); + } + + mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); + } + + // wait for all the acks. +} + + +void Migrator::handle_hash_dir_ack(MHashDirAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + assert(dir->is_hashed()); + assert(dir->is_hashing()); + + int from = m->get_source().num(); + assert(hash_gather[dir].count(from) == 1); + hash_gather[dir].erase(from); + + if (hash_gather[dir].empty()) { + dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl; + + if (hash_notify_gather[dir].empty()) { + dout(7) << "got notifies too, all done" << endl; + hash_dir_finish(dir); + } else { + dout(7) << "waiting on notifies " << endl; + } + + } else { + dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; + } + + delete m; +} + + +void Migrator::hash_dir_finish(CDir *dir) +{ + dout(7) << "hash_dir_finish finishing " << *dir << endl; + assert(dir->is_hashed()); + assert(dir->is_hashing()); + + // dir state + hash_gather.erase(dir); + dir->state_clear(CDIR_STATE_HASHING); + dir->put(CDir::PIN_HASHING); + dir->hashed_subset.clear(); + + // unproxy inodes + // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds. 
+ for (list::iterator it = hash_proxy_inos[dir].begin(); + it != hash_proxy_inos[dir].end(); + it++) { + CInode *in = *it; + assert(in->state_test(CInode::STATE_PROXY)); + in->state_clear(CInode::STATE_PROXY); + in->put(CInode::PIN_PROXY); + } + hash_proxy_inos.erase(dir); + + // unpin path + vector trace; + cache->make_trace(trace, dir->inode); + cache->path_unpin(trace, 0); + + // unfreeze + dir->unfreeze_dir(); + + show_imports(); + assert(hash_gather.count(dir) == 0); + + // stats + //if (mds->logger) mds->logger->inc("nh", 1); + +} + + + + +// HASH on auth and non-auth + +void Migrator::handle_hash_dir_notify(MHashDirNotify *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + assert(dir->is_hashing()); + + dout(5) << "handle_hash_dir_notify " << *dir << endl; + int from = m->get_from(); + + int source = m->get_source().num(); + if (dir->is_auth()) { + // gather notifies + assert(dir->is_hashed()); + + assert( hash_notify_gather[dir][from].count(source) ); + hash_notify_gather[dir][from].erase(source); + + if (hash_notify_gather[dir][from].empty()) { + dout(7) << "last notify from " << from << endl; + hash_notify_gather[dir].erase(from); + + if (hash_notify_gather[dir].empty()) { + dout(7) << "last notify!" 
<< endl; + hash_notify_gather.erase(dir); + + if (hash_gather[dir].empty()) { + dout(7) << "got acks too, all done" << endl; + hash_dir_finish(dir); + } else { + dout(7) << "still waiting on acks from " << hash_gather[dir] << endl; + } + } else { + dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl; + } + } else { + dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl; + } + + // delete msg + delete m; + } else { + // update dir hashed_subset + assert(dir->hashed_subset.count(from) == 0); + dir->hashed_subset.insert(from); + + // update open subdirs + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->get_inode(); + if (!in) continue; + if (!in->dir) continue; + + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode != from) continue; // we'll import these in a minute + + if (in->dir->authority() != dentryhashcode) + in->dir->set_dir_auth( in->dir->authority() ); + else + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + } + + // remove from notify gather set + assert(hash_gather[dir].count(from)); + hash_gather[dir].erase(from); + + // last notify? 
+ if (hash_gather[dir].empty()) { + dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl; + hash_gather.erase(dir); + + dir->state_clear(CDIR_STATE_HASHING); + dir->put(CDir::PIN_HASHING); + dir->hashed_subset.clear(); + } else { + dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; + } + + // fw notify to auth + mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR); + } +} + + + + +// HASH on non-auth + +/* + * discover step: + * each peer needs to open up the directory and pin it before we start + */ +class C_MDC_HashDirDiscover : public Context { + Migrator *mig; + MHashDirDiscover *m; +public: + vector trace; + C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) { + this->mig = mig; + this->m = m; + } + void finish(int r) { + CInode *in = 0; + if (r >= 0) { + if (trace.size()) + in = trace[trace.size()-1]->get_inode(); + else + in = mig->cache->get_root(); + } + mig->handle_hash_dir_discover_2(m, in, r); + } +}; + +void Migrator::handle_hash_dir_discover(MHashDirDiscover *m) +{ + assert(m->get_source().num() != mds->get_nodeid()); + + dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl; + + // must discover it! + C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m); + filepath fpath(m->get_path()); + cache->path_traverse(fpath, onfinish->trace, true, + m, new C_MDS_RetryMessage(mds,m), // on delay/retry + MDS_TRAVERSE_DISCOVER, + onfinish); // on completion|error +} + +void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r) +{ + // yay! + if (in) { + dout(7) << "handle_hash_dir_discover_2 has " << *in << endl; + } + + if (r < 0 || !in->is_dir()) { + dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; + assert(0); // this shouldn't happen if the auth pins his path properly!!!! + } + assert(in->is_dir()); + + // is dir open? 
+ if (!in->dir) { + dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl; + cache->open_remote_dir(in, + new C_MDS_RetryMessage(mds, m)); + return; + } + CDir *dir = in->dir; + + // pin dir, set hashing flag + dir->state_set(CDIR_STATE_HASHING); + dir->get(CDir::PIN_HASHING); + assert(dir->hashed_subset.empty()); + + // inode state + dir->inode->inode.hash_seed = 1;// dir->ino(); + if (dir->inode->is_auth()) { + dir->inode->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("hash dirty fixme")); + } + + // get gather set ready for notifies + assert(hash_gather[dir].empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + if (i == dir->authority()) continue; + hash_gather[dir].insert(i); + } + + // reply + dout(7) << " sending hash_dir_discover_ack on " << *dir << endl; + mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + delete m; +} + +/* + * prep step: + * peers need to open up all subdirs of the hashed dir + */ + +void Migrator::handle_hash_dir_prep(MHashDirPrep *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_hash_dir_prep " << *dir << endl; + + if (!m->did_assim()) { + m->mark_assim(); // only do this the first time! + + // assimilate dentry+inodes for exports + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = cache->get_inode( it->second->get_ino() ); + if (in) { + it->second->update_inode(in); + dout(5) << " updated " << *in << endl; + } else { + in = new CInode(mds->mdcache, false); + it->second->update_inode(in); + cache->add_inode(in); + + // link + dir->add_dentry( it->first, in ); + dout(5) << " added " << *in << endl; + } + + // open! 
+ if (!in->dir) { + dout(5) << " opening nested export on " << *in << endl; + cache->open_remote_dir(in, + new C_MDS_RetryMessage(mds, m)); + } + } + } + + // verify! + int waiting_for = 0; + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = cache->get_inode( it->second->get_ino() ); + assert(in); + + if (in->dir) { + if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) { + dout(5) << " pinning nested export " << *in->dir << endl; + in->dir->get(CDir::PIN_IMPORTBOUND); + in->dir->state_set(CDIR_STATE_IMPORTBOUND); + } else { + dout(5) << " already pinned nested export " << *in << endl; + } + } else { + dout(5) << " waiting for nested export dir on " << *in << endl; + waiting_for++; + } + } + + if (waiting_for) { + dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; + return; + } + + // ack! + mds->send_message_mds(new MHashDirPrepAck(dir->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + + // done. + delete m; +} + + +/* + * hash step: + */ + +void Migrator::handle_hash_dir(MHashDir *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + assert(!dir->is_auth()); + assert(!dir->is_hashed()); + assert(dir->is_hashing()); + + dout(5) << "handle_hash_dir " << *dir << endl; + int oldauth = m->get_source().num(); + + // content + import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth); + + // dir state + dir->state_set(CDIR_STATE_HASHED); + dir->get(CDir::PIN_HASHED); + cache->hashdirs.insert(dir); + dir->hashed_subset.insert(mds->get_nodeid()); + + // dir is complete + dir->mark_complete(); + dir->mark_dirty(dir->pre_dirty()); // fixme + mds->mdlog->submit_entry(new EString("dirty dir fixme")); + + // commit + mds->mdstore->commit_dir(dir, 0); + + // send notifies + dout(7) << "sending notifies" << endl; + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + if (i == m->get_source().num()) 
continue; + mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()), + i, MDS_PORT_MIGRATOR); + } + + // ack + dout(7) << "acking" << endl; + mds->send_message_mds(new MHashDirAck(dir->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + + // done. + delete m; + + show_imports(); +} + + + + + +// UNHASH on auth + +class C_MDC_UnhashFreeze : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} + virtual void finish(int r) { + mig->unhash_dir_frozen(dir); + } +}; + +class C_MDC_UnhashComplete : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {} + virtual void finish(int r) { + mig->unhash_dir_complete(dir); + } +}; + + +void Migrator::unhash_dir(CDir *dir) +{ + dout(-7) << "unhash_dir " << *dir << endl; + + assert(dir->is_hashed()); + assert(!dir->is_unhashing()); + assert(dir->is_auth()); + assert(hash_gather.count(dir)==0); + + // pin path? + vector trace; + cache->make_trace(trace, dir->inode); + if (!cache->path_pin(trace, 0, 0)) { + dout(7) << "unhash_dir couldn't pin path, failing." << endl; + return; + } + + // twiddle state + dir->state_set(CDIR_STATE_UNHASHING); + + // first, freeze the dir. 
+ dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir)); + + // make complete + if (!dir->is_complete()) { + dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; + mds->mdstore->fetch_dir(dir, + new C_MDC_UnhashComplete(this, dir)); + } else + unhash_dir_complete(dir); + +} + +void Migrator::unhash_dir_frozen(CDir *dir) +{ + dout(7) << "unhash_dir_frozen " << *dir << endl; + + assert(dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + + if (!dir->is_complete()) { + dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl; + } else + unhash_dir_prep(dir); +} + + +/* + * ask peers to freeze and complete hashed dir + */ +void Migrator::unhash_dir_prep(CDir *dir) +{ + dout(7) << "unhash_dir_prep " << *dir << endl; + assert(dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + assert(dir->is_complete()); + + if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous + + // send unhash prep to all peers + assert(hash_gather[dir].empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + hash_gather[dir].insert(i); + mds->send_message_mds(new MUnhashDirPrep(dir->ino()), + i, MDS_PORT_MIGRATOR); + } +} + +/* + * wait for peers to freeze and complete hashed dirs + */ +void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + int from = m->get_source().num(); + dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl; + + if (!m->did_assim()) { + m->mark_assim(); // only do this the first time! 
+ + // assimilate dentry+inodes for exports + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = cache->get_inode( it->second->get_ino() ); + if (in) { + it->second->update_inode(in); + dout(5) << " updated " << *in << endl; + } else { + in = new CInode(mds->mdcache, false); + it->second->update_inode(in); + cache->add_inode(in); + + // link + dir->add_dentry( it->first, in ); + dout(5) << " added " << *in << endl; + } + + // open! + if (!in->dir) { + dout(5) << " opening nested export on " << *in << endl; + cache->open_remote_dir(in, + new C_MDS_RetryMessage(mds, m)); + } + } + } + + // verify! + int waiting_for = 0; + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = cache->get_inode( it->second->get_ino() ); + assert(in); + + if (in->dir) { + if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) { + dout(5) << " pinning nested export " << *in->dir << endl; + in->dir->get(CDir::PIN_IMPORTBOUND); + in->dir->state_set(CDIR_STATE_IMPORTBOUND); + } else { + dout(5) << " already pinned nested export " << *in << endl; + } + } else { + dout(5) << " waiting for nested export dir on " << *in << endl; + waiting_for++; + } + } + + if (waiting_for) { + dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; + return; + } + + // ok, done with this PrepAck + assert(hash_gather[dir].count(from) == 1); + hash_gather[dir].erase(from); + + if (hash_gather[dir].empty()) { + hash_gather.erase(dir); + dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl; + unhash_dir_go(dir); + } else { + dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; + } + + delete m; +} + + +/* + * auth: + * send out MHashDir's to peers + */ +void Migrator::unhash_dir_go(CDir *dir) +{ + dout(7) << "unhash_dir_go " << *dir << endl; + assert(dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + 
assert(dir->is_complete()); + + // send unhash prep to all peers + assert(hash_gather[dir].empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + hash_gather[dir].insert(i); + mds->send_message_mds(new MUnhashDir(dir->ino()), + i, MDS_PORT_MIGRATOR); + } +} + +/* + * auth: + * assimilate unhashing content + */ +void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir_ack " << *dir << endl; + assert(dir->is_hashed()); + + // assimilate content + int from = m->get_source().num(); + import_hashed_content(dir, m->get_state(), m->get_nden(), from); + delete m; + + // done? + assert(hash_gather[dir].count(from)); + hash_gather[dir].erase(from); + + if (!hash_gather[dir].empty()) { + dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl; + return; + } + + // done! + + // fix up nested_exports + CDir *containing_import = cache->get_auth_container(dir); + if (containing_import != dir) { + for (set::iterator it = cache->nested_exports[dir].begin(); + it != cache->nested_exports[dir].end(); + it++) { + dout(7) << "moving nested export out from under hashed dir : " << **it << endl; + cache->nested_exports[containing_import].insert(*it); + } + cache->nested_exports.erase(dir); + } + + // dir state + //dir->state_clear(CDIR_STATE_UNHASHING); //later + dir->state_clear(CDIR_STATE_HASHED); + dir->put(CDir::PIN_HASHED); + cache->hashdirs.erase(dir); + + // commit! 
+ assert(dir->is_complete()); + //dir->mark_complete(); + dir->mark_dirty(dir->pre_dirty()); // fixme + mds->mdstore->commit_dir(dir, 0); + + // inode state + dir->inode->inode.hash_seed = 0; + if (dir->inode->is_auth()) { + dir->inode->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("hash inode dirty fixme")); + } + + // notify + assert(hash_gather[dir].empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + + hash_gather[dir].insert(i); + + mds->send_message_mds(new MUnhashDirNotify(dir->ino()), + i, MDS_PORT_MIGRATOR); + } +} + + +/* + * sent by peer to flush mds links. unfreeze when all gathered. + */ +void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir_ack " << *dir << endl; + assert(!dir->is_hashed()); + assert(dir->is_unhashing()); + assert(dir->is_frozen_dir()); + + // done? + int from = m->get_source().num(); + assert(hash_gather[dir].count(from)); + hash_gather[dir].erase(from); + delete m; + + if (!hash_gather[dir].empty()) { + dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl; + } else { + unhash_dir_finish(dir); + } +} + + +/* + * all mds links are flushed. unfreeze dir! + */ +void Migrator::unhash_dir_finish(CDir *dir) +{ + dout(7) << "unhash_dir_finish " << *dir << endl; + hash_gather.erase(dir); + + // unpin path + vector trace; + cache->make_trace(trace, dir->inode); + cache->path_unpin(trace, 0); + + // state + dir->state_clear(CDIR_STATE_UNHASHING); + + // unfreeze + dir->unfreeze_dir(); + +} + + + +// UNHASH on all + +/* + * hashed dir is complete. 
+ * mark all migrating inodes dirty (to pin in cache) + * if frozen too, then go to next step (depending on auth) + */ +void Migrator::unhash_dir_complete(CDir *dir) +{ + dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl; + + assert(dir->is_hashed()); + assert(dir->is_complete()); + + // mark dirty to pin in cache + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CInode *in = it->second->inode; + if (in->is_auth()) { + in->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("unhash dirty fixme")); + } + } + + if (!dir->is_frozen_dir()) { + dout(7) << "dir complete but !frozen, waiting " << *dir << endl; + } else { + if (dir->is_auth()) + unhash_dir_prep(dir); // auth + else + unhash_dir_prep_finish(dir); // nonauth + } +} + + +// UNHASH on non-auth + +class C_MDC_UnhashPrepFreeze : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} + virtual void finish(int r) { + mig->unhash_dir_prep_frozen(dir); + } +}; + + +/* + * peers need to freeze their dir and make them complete + */ +void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir_prep " << *dir << endl; + assert(dir->is_hashed()); + + // freeze + dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir)); + + // make complete + if (!dir->is_complete()) { + dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; + mds->mdstore->fetch_dir(dir, + new C_MDC_UnhashComplete(this, dir)); + } else { + unhash_dir_complete(dir); + } + + delete m; +} + +/* + * peer has hashed dir frozen. + * complete too? 
+ */ +void Migrator::unhash_dir_prep_frozen(CDir *dir) +{ + dout(7) << "unhash_dir_prep_frozen " << *dir << endl; + + assert(dir->is_hashed()); + assert(dir->is_frozen_dir()); + assert(!dir->is_auth()); + + if (!dir->is_complete()) { + dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl; + } else + unhash_dir_prep_finish(dir); +} + +/* + * peer has hashed dir complete and frozen. ack. + */ +void Migrator::unhash_dir_prep_finish(CDir *dir) +{ + dout(7) << "unhash_dir_prep_finish " << *dir << endl; + assert(dir->is_hashed()); + assert(!dir->is_auth()); + assert(dir->is_frozen()); + assert(dir->is_complete()); + + // twiddle state + if (dir->is_unhashing()) + return; // already replied. + dir->state_set(CDIR_STATE_UNHASHING); + + // send subdirs back to auth + MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino()); + int auth = dir->authority(); + + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + if (!in->is_dir()) continue; + if (!in->dir) continue; + + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode != mds->get_nodeid()) continue; + + // msg? + ack->add_inode(it->first, in->replicate_to(auth)); + } + + // ack + mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR); +} + + + +/* + * peer needs to send hashed dir content back to auth. + * unhash dir. + */ +void Migrator::handle_unhash_dir(MUnhashDir *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir " << *dir << endl;//" .. 
hash_seed is " << dir->inode->inode.hash_seed << endl; + assert(dir->is_hashed()); + assert(dir->is_unhashing()); + assert(!dir->is_auth()); + + // get message ready + bufferlist bl; + int nden = 0; + + // suck up all waiters + C_Contexts *fin = new C_Contexts; + list waiting; + dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters + fin->take(waiting); + + // divy up contents + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode != mds->get_nodeid()) { + // not mine! + // twiddle dir_auth? + if (in->dir) { + if (in->dir->authority() != dir->authority()) + in->dir->set_dir_auth( in->dir->authority() ); + else + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + } + continue; + } + + // -- dentry + dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; + _encode(it->first, bl); + + // null dentry? + if (dn->is_null()) { + bl.append("N", 1); // null dentry + assert(dn->is_sync()); + continue; + } + + if (dn->is_remote()) { + // remote link + bl.append("L", 1); // remote link + + inodeno_t ino = dn->get_remote_ino(); + bl.append((char*)&ino, sizeof(ino)); + continue; + } + + // primary link + // -- inode + bl.append("I", 1); // inode dentry + + encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export + nden++; + + if (dn->is_dirty()) + dn->mark_clean(); + + // proxy + in->state_set(CInode::STATE_PROXY); + in->get(CInode::PIN_PROXY); + hash_proxy_inos[dir].push_back(in); + + if (in->dir) { + if (in->dir->is_auth()) { + // mine. make it into an import. + dout(7) << "making subdir into import " << *in->dir << endl; + in->dir->set_dir_auth( mds->get_nodeid() ); + cache->imports.insert(in->dir); + in->dir->get(CDir::PIN_IMPORT); + in->dir->state_set(CDIR_STATE_IMPORT); + } + else { + // not mine. 
+ dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl; + assert(in->dir->is_export()); + in->dir->put(CDir::PIN_EXPORT); + in->dir->state_clear(CDIR_STATE_EXPORT); + cache->exports.erase(in->dir); + cache->nested_exports[dir].erase(in->dir); + } + } + + // waiters + list waiters; + in->take_waiting(CINODE_WAIT_ANY, waiters); + fin->take(waiters); + } + + // we should have no nested exports; we're not auth for the dir! + assert(cache->nested_exports[dir].empty()); + cache->nested_exports.erase(dir); + + // dir state + //dir->state_clear(CDIR_STATE_UNHASHING); // later + dir->state_clear(CDIR_STATE_HASHED); + dir->put(CDir::PIN_HASHED); + cache->hashdirs.erase(dir); + dir->mark_clean(); + + // inode state + dir->inode->inode.hash_seed = 0; + if (dir->inode->is_auth()) { + dir->inode->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("unhash inode dirty fixme")); + } + + // init gather set + mds->get_mds_map()->get_active_mds_set( hash_gather[dir] ); + hash_gather[dir].erase(mds->get_nodeid()); + + // send unhash message + mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden), + dir->authority(), MDS_PORT_MIGRATOR); +} + + +/* + * first notify comes from auth. + * send notifies to all other peers, with peer = self + * if we get notify from peer=other, remove from our gather list. + * when we've gotten notifies from everyone, + * unpin proxies, + * send notify_ack to auth. + * this ensures that all mds links are flushed of cache_expire type messages. + */ +void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir_finish " << *dir << endl; + assert(!dir->is_hashed()); + assert(dir->is_unhashing()); + assert(!dir->is_auth()); + + int from = m->get_source().num(); + assert(hash_gather[dir].count(from) == 1); + hash_gather[dir].erase(from); + delete m; + + // did we send our shout out? 
+ if (from == dir->authority()) { + // send notify to everyone else in weird chatter storm + // (skip the auth, whose notify we just received, and ourselves) + for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) { + if (i == from) continue; + if (i == mds->get_nodeid()) continue; + mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR); + } + } + + // are we done? + if (!hash_gather[dir].empty()) { + dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; + return; + } + hash_gather.erase(dir); + + // all done! + dout(7) << "all mds links flushed, unpinning unhash proxies" << endl; + + // unpin proxies + for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin(); + it != hash_proxy_inos[dir].end(); + it++) { + CInode *in = *it; + assert(in->state_test(CInode::STATE_PROXY)); + in->state_clear(CInode::STATE_PROXY); + in->put(CInode::PIN_PROXY); + } + // every proxy is unpinned now; drop the list (like hash_gather above) so these pointers are not walked again + hash_proxy_inos.erase(dir); + + // unfreeze + dir->unfreeze_dir(); + + // ack + dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl; + mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); + +} diff --git a/trunk/ceph/mds/Lock.h b/trunk/ceph/mds/Lock.h deleted file mode 100644 index 59d04d5b66eb7..0000000000000 --- a/trunk/ceph/mds/Lock.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOCK_H -#define __LOCK_H - -#include -#include -using namespace std; - -#include "include/buffer.h" - -#include "Capability.h" - -// states and such. -// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio - -// basic lock -----auth-------- ---replica------- -#define LOCK_SYNC 0 // AR R . / C R . . . L R . / C R . . . 
L stat() -#define LOCK_LOCK 1 // AR R W / C . . . . . . . / C . . . . . truncate() -#define LOCK_GLOCKR 2 // AR R . / C . . . . . . . / C . . . . . - -// file lock states -#define LOCK_GLOCKL 3 // A . . / C . . . . . loner -> lock -#define LOCK_GLOCKM 4 // A . . / . . . . . . -#define LOCK_MIXED 5 // AR . . / . R W A . L . . / . R . . . L -#define LOCK_GMIXEDR 6 // AR R . / . R . . . L . . / . R . . . L -#define LOCK_GMIXEDL 7 // A . . / . . . . . L loner -> mixed - -#define LOCK_LONER 8 // A . . / C R W A B L (lock) -#define LOCK_GLONERR 9 // A . . / . R . . . L -#define LOCK_GLONERM 10 // A . . / . R W A . L - -#define LOCK_GSYNCL 11 // A . . / C ? . . . L loner -> sync (*) FIXME: let old loner keep R, somehow... -#define LOCK_GSYNCM 12 // A . . / . R . . . L - -// 4 stable -// +9 transition -// 13 total - -/* no append scenarios: - -loner + truncate(): - - loner needs to lose A (?unless it's the loner doing the truncate?) -loner + statlite(size): - - loner needs to lose A - -any + statlite(size) - - all lose A - -any + statlite(mtime) - - all lose W - --> we need to add lonerfixed and mixedfixed states (and associated transitions) - in order to efficiently support statlite(size) and truncate(). until then, - we have to LOCK. - - */ - -// -- lock... 
hard or file - -class Message; - -class CLock { - protected: - // lock state - char state; - set gather_set; // auth - - // local state - int nread; - Message *wrlock_by; - - - public: - CLock() : - state(LOCK_SYNC), - nread(0), - wrlock_by(0) { - } - - // encode/decode - void encode_state(bufferlist& bl) { - bl.append((char*)&state, sizeof(state)); - _encode(gather_set, bl); - - //bl.append((char*)&nread, sizeof(nread)); - //bl.append((char*)&nwrite, sizeof(nwrite)); - } - void decode_state(bufferlist& bl, int& off) { - bl.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - _decode(gather_set, bl, off); - - //bl.copy(off, sizeof(nread), (char*)&nread); - //off += sizeof(nread); - //bl.copy(off, sizeof(nwrite), (char*)&nwrite); - //off += sizeof(nwrite); - } - - char get_state() { return state; } - char set_state(char s) { - state = s; - assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. - return s; - }; - - char get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - case LOCK_GLOCKR: - case LOCK_LONER: - case LOCK_GLONERR: - case LOCK_GLONERM: - return LOCK_LOCK; - case LOCK_MIXED: - case LOCK_GMIXEDR: - return LOCK_MIXED; - case LOCK_SYNC: - return LOCK_SYNC; - - // after gather auth will bc LOCK_AC_MIXED or whatever - case LOCK_GSYNCM: - return LOCK_MIXED; - case LOCK_GSYNCL: - case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. 
- return LOCK_LOCK; - - default: - assert(0); - } - return 0; - } - - // gather set - set& get_gather_set() { return gather_set; } - void init_gather(const map& i) { - for (map::const_iterator p = i.begin(); p != i.end(); ++p) - gather_set.insert(p->first); - } - bool is_gathering(int i) { - return gather_set.count(i); - } - void clear_gather() { - gather_set.clear(); - } - - // ref counting - int get_read() { return ++nread; } - int put_read() { - assert(nread>0); - return --nread; - } - int get_nread() { return nread; } - - void get_write(Message *who) { - assert(wrlock_by == 0); - wrlock_by = who; - } - void put_write() { - assert(wrlock_by); - wrlock_by = 0; - } - bool is_wrlocked() { return wrlock_by ? true:false; } - Message *get_wrlocked_by() { return wrlock_by; } - bool is_used() { - return (is_wrlocked() || (nread>0)) ? true:false; - } - - - // stable - bool is_stable() { - return (state == LOCK_SYNC) || - (state == LOCK_LOCK) || - (state == LOCK_MIXED) || - (state == LOCK_LONER); - } - - // read/write access - bool can_read(bool auth) { - if (auth) - return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR) - || (state == LOCK_GLOCKR) || (state == LOCK_LOCK); - else - return (state == LOCK_SYNC); - } - bool can_read_soon(bool auth) { - if (auth) - return (state == LOCK_GLOCKL); - else - return false; - } - - bool can_write(bool auth) { - if (auth) - return (state == LOCK_LOCK) && !is_wrlocked(); - else - return false; - } - bool can_write_soon(bool auth) { - if (auth) - return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) - || (state == LOCK_GLOCKM); - else - return false; - } - - // client caps allowed - int caps_allowed_ever(bool auth) { - if (auth) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - else - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - } - int caps_allowed(bool auth) { - if (auth) - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | 
CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - case LOCK_GLOCKL: - return CAP_FILE_RDCACHE; - - case LOCK_GLOCKM: - return 0; - - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - case LOCK_GMIXEDR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GMIXEDL: - return 0; - - case LOCK_LONER: // single client writer, of course. - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - case LOCK_GLONERR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - - case LOCK_GSYNCL: - return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; - case LOCK_GSYNCM: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - else - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - case LOCK_GMIXEDR: - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - assert(0); - return 0; - } - - friend class MDCache; - friend class Locker; - friend class Migrator; -}; - -//ostream& operator<<(ostream& out, CLock& l); -inline ostream& operator<<(ostream& out, CLock& l) -{ - static char* __lock_states[] = { - "sync", - "lock", - "glockr", - "glockl", - "glockm", - "mixed", - "gmixedr", - "gmixedl", - "loner", - "glonerr", - "glonerm", - "gsyncl", - "gsyncm" - }; - - out << "(" << __lock_states[(int)l.get_state()]; - - if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set(); - - if (l.get_nread()) - out << " r=" << l.get_nread(); - if (l.is_wrlocked()) - out << " w=" << l.get_wrlocked_by(); - - // rw? 
- /* - out << " "; - if (l.can_read(true)) out << "r[" << l.get_nread() << "]"; - if (l.can_write(true)) out << "w[" << l.get_nwrite() << "]"; - out << "/"; - if (l.can_read(false)) out << "r[" << l.get_nread() << "]"; - if (l.can_write(false)) out << "w[" << l.get_nwrite() << "]"; - */ - out << ")"; - return out; -} - -#endif diff --git a/trunk/ceph/mds/Locker.cc b/trunk/ceph/mds/Locker.cc index f1ada7ea26913..475a323fcd80e 100644 --- a/trunk/ceph/mds/Locker.cc +++ b/trunk/ceph/mds/Locker.cc @@ -15,13 +15,10 @@ #include "MDS.h" #include "MDCache.h" #include "Locker.h" -#include "Server.h" #include "CInode.h" #include "CDir.h" #include "CDentry.h" -#include "Migrator.h" -#include "MDBalancer.h" #include "MDLog.h" #include "MDSMap.h" @@ -29,7 +26,6 @@ #include "events/EString.h" #include "events/EUpdate.h" -#include "events/EUnlink.h" #include "msg/Messenger.h" @@ -87,43 +83,256 @@ void Locker::dispatch(Message *m) } -void Locker::send_lock_message(CInode *in, int msg, int type) +void Locker::send_lock_message(SimpleLock *lock, int msg) { - for (map::iterator it = in->replicas_begin(); - it != in->replicas_end(); + for (map::iterator it = lock->get_parent()->replicas_begin(); + it != lock->get_parent()->replicas_end(); it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_ino(in->ino(), type); + MLock *m = new MLock(lock, msg, mds->get_nodeid()); mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); } } - -void Locker::send_lock_message(CInode *in, int msg, int type, bufferlist& data) +void Locker::send_lock_message(SimpleLock *lock, int msg, bufferlist &data) { - for (map::iterator it = in->replicas_begin(); - it != in->replicas_end(); + for (map::iterator it = lock->get_parent()->replicas_begin(); + it != lock->get_parent()->replicas_end(); it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_ino(in->ino(), type); + MLock *m = new MLock(lock, msg, mds->get_nodeid()); m->set_data(data); mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); } 
} -void Locker::send_lock_message(CDentry *dn, int msg) + + + + + + + + + + +/* + * acquire_locks: take every lock named in rdlocks/wrlocks/xlocks on behalf of mdr, + * walking one combined 'sorted' set so locks are always taken in that set's order. + * - wrlock and xlock targets are auth_pinned first (only on objects we are auth for); + * if one cannot be pinned yet we register a C_MDS_RetryRequest waiter, drop the + * request's locks and auth pins, and return false. + * - locks mdr already holds are kept only while they line up with 'sorted' and are of + * the requested kind; anything else is released before locking continues. + * returns true once everything is held, false if an auth pin was unavailable or a + * *_start call returned false. + */ +bool Locker::acquire_locks(MDRequest *mdr, + set &rdlocks, + set &wrlocks, + set &xlocks) { - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); + dout(10) << "acquire_locks " << *mdr << endl; + + // sort everything we will lock + set sorted; + + // (local) AUTH PINS + + // make list of items to authpin + set mustpin = xlocks; + for (set::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p) + mustpin.insert(*p); + + // can i auth pin them all now? + for (set::iterator p = mustpin.begin(); + p != mustpin.end(); + ++p) { + dout(10) << "must authpin " << **p << " " << *(*p)->get_parent() << endl; + + // sort in + sorted.insert(*p); + + if ((*p)->get_type() == LOCK_OTYPE_DN) { + CDir *dir = ((CDentry*)(*p)->get_parent())->dir; + dout(10) << "might auth_pin " << *dir << endl; + + if (!dir->is_auth()) continue; + if (!mdr->is_auth_pinned(dir) && + !dir->can_auth_pin()) { + // wait + dir->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + mdcache->request_drop_locks(mdr); + mdr->drop_auth_pins(); + return false; + } + } else { + CInode *in = (CInode*)(*p)->get_parent(); + if (!in->is_auth()) continue; + if (!mdr->is_auth_pinned(in) && + !in->can_auth_pin()) { + in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + mdcache->request_drop_locks(mdr); + mdr->drop_auth_pins(); + return false; + } + } + } + + // ok, grab the auth pins + for (set::iterator p = mustpin.begin(); + p != mustpin.end(); + ++p) { + if ((*p)->get_type() == LOCK_OTYPE_DN) { + CDir *dir = ((CDentry*)(*p)->get_parent())->dir; + if (!dir->is_auth()) continue; + dout(10) << "auth_pinning " << *dir << endl; + mdr->auth_pin(dir); + } else { + CInode *in = (CInode*)(*p)->get_parent(); + if (!in->is_auth()) 
continue; + dout(10) << "auth_pinning " << *in << endl; + mdr->auth_pin(in); + } } + + // sort in rdlocks too + for (set::iterator p = rdlocks.begin(); + p != rdlocks.end(); + ++p) { + dout(10) << "will rdlock " << **p << " " << *(*p)->get_parent() << endl; + sorted.insert(*p); + } + + // acquire locks. + // make sure they match currently acquired locks. + set::iterator existing = mdr->locks.begin(); + for (set::iterator p = sorted.begin(); + p != sorted.end(); + ++p) { + + // already locked? + if (existing != mdr->locks.end() && *existing == *p) { + // right kind? + SimpleLock *had = *existing; + if (xlocks.count(*p) == mdr->xlocks.count(*p) && + wrlocks.count(*p) == mdr->wrlocks.count(*p) && + rdlocks.count(*p) == mdr->rdlocks.count(*p)) { + dout(10) << "acquire_locks already locked " << *had << " " << *had->get_parent() << endl; + existing++; + continue; + } + } + + // hose any stray locks + while (existing != mdr->locks.end()) { + SimpleLock *had = *existing; + existing++; + // log via 'had': 'existing' has already moved on and may be end() + dout(10) << "acquire_locks unlocking out-of-order " << *had + << " " << *had->get_parent() << endl; + if (mdr->xlocks.count(had)) + xlock_finish(had, mdr); + else if (mdr->wrlocks.count(had)) + wrlock_finish(had, mdr); + else + rdlock_finish(had, mdr); + } + + // lock + if (xlocks.count(*p)) { + if (!xlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got xlock on " << **p << " " << *(*p)->get_parent() << endl; + } else if (wrlocks.count(*p)) { + if (!wrlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got wrlock on " << **p << " " << *(*p)->get_parent() << endl; + } else { + if (!rdlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got rdlock on " << **p << " " << *(*p)->get_parent() << endl; + } + } + + // any extra unneeded locks? 
+ while (existing != mdr->locks.end()) { + // step past the entry before releasing it: the *_finish() calls erase the lock + // from mdr->locks, which would invalidate an iterator still pointing at it + SimpleLock *stray = *existing; + existing++; + dout(10) << "acquire_locks unlocking " << *stray + << " " << *stray->get_parent() << endl; + if (mdr->xlocks.count(stray)) + xlock_finish(stray, mdr); + else if (mdr->wrlocks.count(stray)) + wrlock_finish(stray, mdr); + else + rdlock_finish(stray, mdr); + } + + return true; } +// generics +// (each dispatcher picks the file/scatter/simple implementation from lock->get_type()) + +bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_rdlock_start((FileLock*)lock, mdr); + case LOCK_OTYPE_IDIR: + return scatter_rdlock_start((ScatterLock*)lock, mdr); + default: + return simple_rdlock_start(lock, mdr); + } +} + +void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_rdlock_finish((FileLock*)lock, mdr); + case LOCK_OTYPE_IDIR: + return scatter_rdlock_finish((ScatterLock*)lock, mdr); + default: + return simple_rdlock_finish(lock, mdr); + } +} + +bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IDIR: + return scatter_wrlock_start((ScatterLock*)lock, mdr); + default: + assert(0); + return false; // only reached if asserts are compiled out; never fall off a non-void function + } +} + +void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IDIR: + return scatter_wrlock_finish((ScatterLock*)lock, mdr); + default: + assert(0); + } +} + +bool Locker::xlock_start(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_xlock_start((FileLock*)lock, mdr); + case LOCK_OTYPE_IDIR: + assert(0); + default: + return simple_xlock_start(lock, mdr); + } +} + +void Locker::xlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_xlock_finish((FileLock*)lock, mdr); + case LOCK_OTYPE_IDIR: + assert(0); + default: + return simple_xlock_finish(lock, mdr); + } +} + + + + + + // file i/o ----------------------------------------- 
__uint64_t Locker::issue_file_data_version(CInode *in) @@ -152,10 +361,9 @@ Capability* Locker::issue_new_caps(CInode *in, Capability c(my_want); in->add_client_cap(my_client, c); cap = in->get_client_cap(my_client); - - // note client addr - mds->clientmap.add_open(my_client, req->get_client_inst()); - + + // suppress file cap messages for new cap (we'll bundle with the open() reply) + cap->set_suppress(true); } else { // make sure it has sufficient caps if (cap->wanted() & ~my_want) { @@ -164,13 +372,11 @@ Capability* Locker::issue_new_caps(CInode *in, } } - // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply) - cap->set_suppress(true); int before = cap->pending(); if (in->is_auth()) { // [auth] twiddle mode? - inode_file_eval(in); + file_eval(&in->filelock); } else { // [replica] tell auth about any new caps wanted request_inode_file_caps(in); @@ -207,7 +413,7 @@ Capability* Locker::issue_new_caps(CInode *in, bool Locker::issue_caps(CInode *in) { // allowed caps are determined by the lock mode. 
- int allowed = in->filelock.caps_allowed(in->is_auth()); + int allowed = in->filelock.caps_allowed(); dout(7) << "issue_caps filelock allows=" << cap_string(allowed) << " on " << *in << endl; @@ -285,7 +491,7 @@ void Locker::request_inode_file_caps(CInode *in) } assert(!in->is_auth()); - int auth = in->authority(); + int auth = in->authority().first; dout(7) << "request_inode_file_caps " << cap_string(wanted) << " was " << cap_string(in->replica_caps_wanted) << " on " << *in << " to mds" << auth << endl; @@ -304,22 +510,23 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m) { CInode *in = mdcache->get_inode(m->get_ino()); assert(in); - assert(in->is_auth() || in->is_proxy()); + assert(in->is_auth());// || in->is_proxy()); dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl; - if (in->is_proxy()) { + /*if (in->is_proxy()) { dout(7) << "proxy, fw" << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_LOCKER); + mds->send_message_mds(m, in->authority().first, MDS_PORT_LOCKER); return; } + */ if (m->get_caps()) in->mds_caps_wanted[m->get_from()] = m->get_caps(); else in->mds_caps_wanted.erase(m->get_from()); - inode_file_eval(in); + file_eval(&in->filelock); delete m; } @@ -350,7 +557,7 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) assert(cap); // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth()); + int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(); dout(7) << "handle_client_file_caps seq " << m->get_seq() << " confirms caps " << cap_string(m->get_caps()) @@ -372,13 +579,10 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) if (!in->is_auth()) request_inode_file_caps(in); - // dec client addr counter - mds->clientmap.dec_open(client); - // tell client. 
MClientFileCaps *r = new MClientFileCaps(in->inode, 0, 0, 0, - MClientFileCaps::FILECAP_RELEASE); + MClientFileCaps::OP_RELEASE); mds->messenger->send_message(r, m->get_source_inst(), 0, MDS_PORT_LOCKER); } @@ -412,8 +616,8 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) } // reevaluate, waiters - inode_file_eval(in); - in->finish_waiting(CINODE_WAIT_CAPS, 0); + file_eval(&in->filelock); + in->finish_waiting(CInode::WAIT_CAPS, 0); delete m; } @@ -499,20 +703,60 @@ ALSO: void Locker::handle_lock(MLock *m) { switch (m->get_otype()) { - case LOCK_OTYPE_IHARD: - handle_lock_inode_hard(m); - break; + case LOCK_OTYPE_DN: + { + CDir *dir = mdcache->get_dirfrag(m->get_dirfrag()); + CDentry *dn = 0; + if (dir) + dn = dir->lookup(m->get_dn()); + if (!dn) { + dout(7) << "dont' have dn " << m->get_dirfrag() << " " << m->get_dn() << endl; + delete m; + return; + } - case LOCK_OTYPE_IFILE: - handle_lock_inode_file(m); + handle_simple_lock(&dn->lock, m); + } break; - - case LOCK_OTYPE_DIR: - handle_lock_dir(m); + + case LOCK_OTYPE_IAUTH: + case LOCK_OTYPE_ILINK: + case LOCK_OTYPE_IDIRFRAGTREE: + case LOCK_OTYPE_IFILE: + { + CInode *in = mdcache->get_inode(m->get_ino()); + if (!in) { + dout(7) << "dont' have ino " << m->get_ino() << endl; + delete m; + return; + } + switch (m->get_otype()) { + case LOCK_OTYPE_IAUTH: + handle_simple_lock(&in->authlock, m); + break; + case LOCK_OTYPE_ILINK: + handle_simple_lock(&in->linklock, m); + break; + case LOCK_OTYPE_IDIRFRAGTREE: + handle_simple_lock(&in->dirfragtreelock, m); + break; + case LOCK_OTYPE_IFILE: + handle_file_lock(&in->filelock, m); + break; + } + } break; - - case LOCK_OTYPE_DN: - handle_lock_dn(m); + + case LOCK_OTYPE_IDIR: + { + CInode *in = mdcache->get_inode(m->get_ino()); + if (!in) { + dout(7) << "dont' have ino " << m->get_ino() << endl; + delete m; + return; + } + handle_scatter_lock(&in->dirlock, m); + } break; default: @@ -524,484 +768,812 @@ void Locker::handle_lock(MLock *m) -// 
=============================== -// hard inode metadata -bool Locker::inode_hard_read_try(CInode *in, Context *con) -{ - dout(7) << "inode_hard_read_try on " << *in << endl; - // can read? grab ref. - if (in->hardlock.can_read(in->is_auth())) - return true; - - assert(!in->is_auth()); - - // wait! - dout(7) << "inode_hard_read_try waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDR, con); - return false; -} +// ========================================================================== +// simple lock -bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m) +void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) { - dout(7) << "inode_hard_read_start on " << *in << endl; - - // can read? grab ref. - if (in->hardlock.can_read(in->is_auth())) { - in->hardlock.get_read(); - return true; - } + int from = m->get_asker(); - // can't read, and replicated. - assert(!in->is_auth()); + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_SYNC: + assert(lock->get_state() == LOCK_LOCK); + lock->decode_locked_state(m->get_data()); + lock->set_state(LOCK_SYNC); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + break; + + case LOCK_AC_LOCK: + assert(lock->get_state() == LOCK_SYNC); + //|| lock->get_state() == LOCK_GLOCKR); + + // wait for readers to finish? + if (lock->is_rdlocked()) { + dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock + << " on " << *lock->get_parent() << endl; + lock->set_state(LOCK_GLOCKR); + lock->add_waiter(SimpleLock::WAIT_NOLOCKS, new C_MDS_RetryMessage(mds, m)); + return; + } - // wait! 
- dout(7) << "inode_hard_read_start waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryRequest(mds, m, in)); - return false; -} + // update lock and reply + lock->set_state(LOCK_LOCK); + + mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), + from, MDS_PORT_LOCKER); + break; -void Locker::inode_hard_read_finish(CInode *in) -{ - // drop ref - assert(in->hardlock.can_read(in->is_auth())); - in->hardlock.put_read(); + case LOCK_AC_REQXLOCKACK: + dout(7) << "handle_simple_lock got remote xlock on " + << *lock << " " << *lock->get_parent() << endl; + { + MDRequest *mdr = mdcache->request_get(m->get_reqid()); + mdr->xlocks.insert(lock); + mdr->locks.insert(lock); + lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); + } + break; - dout(7) << "inode_hard_read_finish on " << *in << endl; - - //if (in->hardlock.get_nread() == 0) in->finish_waiting(CINODE_WAIT_HARDNORD); -} + // -- auth -- + case LOCK_AC_LOCKACK: + assert(lock->get_state() == LOCK_GLOCKR); + assert(lock->is_gathering(from)); + lock->remove_gather(from); + + if (lock->is_gathering()) { + dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from + << ", still gathering " << lock->get_gather_set() << endl; + } else { + dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from + << ", last one" << endl; + simple_eval(lock); + } + break; + case LOCK_AC_REQXLOCK: + assert(lock->get_parent()->is_auth()); + { + // register request + MDRequest *mdr = mdcache->request_start(m); -bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m) -{ - dout(7) << "inode_hard_write_start on " << *in << endl; + dout(7) << "handle_simple_lock " << m->get_source() << " " << *mdr << " requesting xlock " + << *lock << " on " << *lock->get_parent() + << endl; - // if not replicated, i can twiddle lock at will - if (in->is_auth() && - !in->is_replicated() && - in->hardlock.get_state() != LOCK_LOCK) - 
in->hardlock.set_state(LOCK_LOCK); - - // can write? grab ref. - if (in->hardlock.can_write(in->is_auth())) { - assert(in->is_auth()); - if (!in->can_auth_pin()) { - dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } + if (!simple_xlock_start(lock, mdr)) + return; - in->auth_pin(); // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_* - in->hardlock.get_write(m); - return true; - } - - // can't write, replicated. - if (in->is_auth()) { - // auth - if (in->hardlock.can_write_soon(in->is_auth())) { - // just wait - } else { - // initiate lock - inode_hard_lock(in); + // ack + MLock *m = new MLock(lock, LOCK_AC_REQXLOCKACK, mds->get_nodeid()); + mds->send_message_mds(m, mdr->request->get_source().num(), MDS_PORT_LOCKER); } - - dout(7) << "inode_hard_write_start waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryRequest(mds, m, in)); + return; - return false; - } else { - // replica - // fw to auth - int auth = in->authority(); - dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; - } -} + case LOCK_AC_UNXLOCK: + assert(lock->get_parent()->is_auth()); + { + // get request + MDRequest *mdr = mdcache->request_get(m->get_reqid()); + dout(7) << "handle_simple_lock " << m->get_source() << " " << *mdr << " dropping xlock " + << *lock << " on " << *lock->get_parent() + + << endl; -void Locker::inode_hard_write_finish(CInode *in) -{ - // drop ref - //assert(in->hardlock.can_write(in->is_auth())); - in->hardlock.put_write(); - in->auth_unpin(); - dout(7) << "inode_hard_write_finish on " << *in << endl; + simple_xlock_finish(lock, mdr); + + if (mdr->locks.empty()) + mdcache->request_finish(mdr); + + } + return; - // others waiting? 
- if (in->is_hardlock_write_wanted()) { - // wake 'em up - in->take_waiting(CINODE_WAIT_HARDW, mds->finished_queue); - } else { - // auto-sync if alone. - if (in->is_auth() && - !in->is_replicated() && - in->hardlock.get_state() != LOCK_SYNC) - in->hardlock.set_state(LOCK_SYNC); - - inode_hard_eval(in); } + + delete m; } -void Locker::inode_hard_eval(CInode *in) +void Locker::simple_eval(SimpleLock *lock) { // finished gather? - if (in->is_auth() && - !in->hardlock.is_stable() && - in->hardlock.gather_set.empty()) { - dout(7) << "inode_hard_eval finished gather on " << *in << endl; - switch (in->hardlock.get_state()) { + if (lock->get_parent()->is_auth() && + !lock->is_stable() && + !lock->is_gathering()) { + dout(7) << "simple_eval finished gather on " << *lock << " on " << *lock->get_parent() << endl; + switch (lock->get_state()) { case LOCK_GLOCKR: - in->hardlock.set_state(LOCK_LOCK); - - // waiters - //in->hardlock.get_write(); - in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE); - //in->hardlock.put_write(); + lock->set_state(LOCK_LOCK); + lock->finish_waiters(SimpleLock::WAIT_STABLE); break; default: assert(0); } } - if (!in->hardlock.is_stable()) return; + if (!lock->is_stable()) return; - if (in->is_auth()) { - + if (lock->get_parent()->is_auth()) { + // sync? 
- if (in->is_replicated() && - in->is_hardlock_write_wanted() && - in->hardlock.get_state() != LOCK_SYNC) { - dout(7) << "inode_hard_eval stable, syncing " << *in << endl; - inode_hard_sync(in); + if (lock->get_state() != LOCK_SYNC && + lock->get_parent()->is_replicated() && + !lock->is_waiter_for(SimpleLock::WAIT_WR)) { + dout(7) << "simple_eval stable, syncing " << *lock + << " on " << *lock->get_parent() << endl; + simple_sync(lock); } } else { // replica } + } // mid -void Locker::inode_hard_sync(CInode *in) +void Locker::simple_sync(SimpleLock *lock) { - dout(7) << "inode_hard_sync on " << *in << endl; - assert(in->is_auth()); + dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << endl; + assert(lock->get_parent()->is_auth()); // check state - if (in->hardlock.get_state() == LOCK_SYNC) + if (lock->get_state() == LOCK_SYNC) return; // already sync - if (in->hardlock.get_state() == LOCK_GLOCKR) + if (lock->get_state() == LOCK_GLOCKR) assert(0); // um... hmm! - assert(in->hardlock.get_state() == LOCK_LOCK); + assert(lock->get_state() == LOCK_LOCK); // hard data - bufferlist harddata; - in->encode_hard_state(harddata); + bufferlist data; + lock->encode_locked_state(data); // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IHARD, harddata); + send_lock_message(lock, LOCK_AC_SYNC, data); // change lock - in->hardlock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); // waiters? 
- in->finish_waiting(CINODE_WAIT_HARDSTABLE); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } -void Locker::inode_hard_lock(CInode *in) +void Locker::simple_lock(SimpleLock *lock) { - dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl; - assert(in->is_auth()); + dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << endl; + assert(lock->get_parent()->is_auth()); // check state - if (in->hardlock.get_state() == LOCK_LOCK || - in->hardlock.get_state() == LOCK_GLOCKR) + if (lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_GLOCKR) return; // already lock or locking - assert(in->hardlock.get_state() == LOCK_SYNC); + assert(lock->get_state() == LOCK_SYNC); - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IHARD); - - // change lock - in->hardlock.set_state(LOCK_GLOCKR); - in->hardlock.init_gather(in->get_replicas()); + if (lock->get_parent()->is_replicated()) { + // bcast to replicas + send_lock_message(lock, LOCK_AC_LOCK); + + // change lock + lock->set_state(LOCK_GLOCKR); + lock->init_gather(); + } else { + lock->set_state(LOCK_LOCK); + } } +// top +bool Locker::simple_rdlock_try(SimpleLock *lock, Context *con) +{ + dout(7) << "simple_rdlock_try on " << *lock << " on " << *lock->get_parent() << endl; + // can read? grab ref. + if (lock->can_rdlock(0)) + return true; + + assert(!lock->get_parent()->is_auth()); -// messenger + // wait! + dout(7) << "simple_rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_RD, con); + return false; +} -void Locker::handle_lock_inode_hard(MLock *m) +bool Locker::simple_rdlock_start(SimpleLock *lock, MDRequest *mdr) { - assert(m->get_otype() == LOCK_OTYPE_IHARD); + dout(7) << "simple_rdlock_start on " << *lock << " on " << *lock->get_parent() << endl; + + // can read? grab ref. 
+ if (lock->can_rdlock(mdr)) { + lock->get_rdlock(); + mdr->rdlocks.insert(lock); + mdr->locks.insert(lock); + return true; + } - if (mds->logger) mds->logger->inc("lih"); + // can't read, and replicated. + assert(!lock->get_parent()->is_auth()); - int from = m->get_asker(); - CInode *in = mdcache->get_inode(m->get_ino()); + // wait! + dout(7) << "simple_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); + return false; +} + +void Locker::simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + // drop ref + lock->put_rdlock(); + if (mdr) { + mdr->rdlocks.erase(lock); + mdr->locks.erase(lock); + } + + dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - if (LOCK_AC_FOR_AUTH(m->get_action())) { + if (lock->get_state() == LOCK_GLOCKR && + !lock->is_rdlocked()) { + lock->set_state(LOCK_SYNC); // return state to sync, in case the unpinner flails + lock->finish_waiters(SimpleLock::WAIT_NOLOCKS); + } +} + +bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) +{ + dout(7) << "simple_xlock_start on " << *lock << " on " << *lock->get_parent() << endl; + + // xlock by me? + if (lock->is_xlocked() && + lock->get_xlocked_by() == mdr) + return true; + + // auth? + if (lock->get_parent()->is_auth()) { // auth - assert(in); - assert(in->is_auth() || in->is_proxy()); - dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl; - - if (in->is_proxy()) { - // fw - int newauth = in->authority(); - assert(newauth >= 0); - if (from == newauth) { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; - delete m; - } else { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); + + // lock. 
+ if (lock->get_state() == LOCK_SYNC) + simple_lock(lock); + + // already locked? + if (lock->get_state() == LOCK_LOCK) { + if (lock->is_xlocked()) { + // by someone else. + lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); + return false; } - return; + + // xlock. + lock->get_xlock(mdr); + mdr->xlocks.insert(lock); + mdr->locks.insert(lock); + return true; + } else { + // wait for lock + lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); + return false; } } else { // replica - if (!in) { - dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl; - /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness - to keep gather_set a proper/correct subset of cached_by. better to use the existing - cacheexpire mechanism instead! - */ - delete m; - return; + + // wait for single auth + if (lock->get_parent()->is_ambiguous_auth()) { + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, + new C_MDS_RetryRequest(mdcache, mdr)); + return false; } + + // wait for sync. + // (???????????) + if (lock->get_state() != LOCK_SYNC) { + lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + // send lock request + int auth = lock->get_parent()->authority().first; + MLock *m = new MLock(lock, LOCK_AC_REQXLOCK, mds->get_nodeid()); + mds->send_message_mds(m, auth, MDS_PORT_LOCKER); + + // wait + lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } +} + + +void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + // drop ref + assert(lock->can_xlock(mdr)); + lock->put_xlock(); + mdr->xlocks.erase(lock); + mdr->locks.erase(lock); + dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl; + + // slave? 
+ if (!lock->get_parent()->is_auth()) { + mds->send_message_mds(new MLock(lock, LOCK_AC_UNXLOCK, mds->get_nodeid()), + lock->get_parent()->authority().first, MDS_PORT_LOCKER); + } + + // others waiting? + if (lock->is_waiter_for(SimpleLock::WAIT_WR)) { + // wake 'em up + lock->finish_waiters(SimpleLock::WAIT_WR, 0); + } else { + // auto-sync if alone. + if (lock->get_parent()->is_auth() && + !lock->get_parent()->is_replicated() && + lock->get_state() != LOCK_SYNC) + lock->set_state(LOCK_SYNC); - assert(!in->is_auth()); + simple_eval(lock); } +} - dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl; - - CLock *lock = &in->hardlock; + + +// dentry specific helpers + +// trace helpers + +/** dentry_can_rdlock_trace + * see if we can _anonymously_ rdlock an entire trace. + * if not, and req is specified, wait and retry that message. + */ +bool Locker::dentry_can_rdlock_trace(vector& trace, MClientRequest *req) +{ + // verify dentries are rdlockable. + // we do this because + // - we're being less aggressive about locks acquisition, and + // - we're not acquiring the locks in order! 
+ for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) { + CDentry *dn = *it; + if (!dn->lock.can_rdlock(0)) { + if (req) { + dout(10) << "can_rdlock_trace can't rdlock " << *dn << ", waiting" << endl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, req)); + } else { + dout(10) << "can_rdlock_trace can't rdlock " << *dn << endl; + } + return false; + } + } + return true; +} + +void Locker::dentry_anon_rdlock_trace_start(vector& trace) +{ + // grab dentry rdlocks + for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) + (*it)->lock.get_rdlock(); +} + + +void Locker::dentry_anon_rdlock_trace_finish(vector& trace) +{ + for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) + simple_rdlock_finish(&(*it)->lock, 0); +} + + + +// ========================================================================== +// scatter lock + +bool Locker::scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr) +{ + dout(7) << "scatter_rdlock_start on " << *lock + << " on " << *lock->get_parent() << endl; + + // pre-twiddle? + if (lock->get_state() == LOCK_SCATTER && + lock->get_parent()->is_auth() && + !lock->get_parent()->is_replicated() && + !lock->is_wrlocked()) + scatter_sync(lock); + + // can rdlock? + if (lock->can_rdlock(mdr)) { + lock->get_rdlock(); + mdr->rdlocks.insert(lock); + mdr->locks.insert(lock); + return true; + } + + // wait for read. + lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); + + // initiate sync? 
+ if (lock->get_state() == LOCK_SCATTER && + lock->get_parent()->is_auth()) + scatter_sync(lock); + + return false; +} + +void Locker::scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr) +{ + dout(7) << "scatter_rdlock_finish on " << *lock + << " on " << *lock->get_parent() << endl; + lock->put_rdlock(); + if (mdr) { + mdr->rdlocks.erase(lock); + mdr->locks.erase(lock); + } - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - - { // assim data - int off = 0; - in->decode_hard_state(m->get_data(), off); + scatter_eval(lock); +} + + +bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) +{ + dout(7) << "scatter_wrlock_start on " << *lock + << " on " << *lock->get_parent() << endl; + + // pre-twiddle? + if (lock->get_state() == LOCK_SYNC && + lock->get_parent()->is_auth() && + !lock->get_parent()->is_replicated() && + !lock->is_rdlocked()) + scatter_scatter(lock); + + // can wrlock? + if (lock->can_wrlock()) { + lock->get_wrlock(); + mdr->wrlocks.insert(lock); + mdr->locks.insert(lock); + return true; + } + + // wait for write. + lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); + + // initiate scatter? 
+ if (lock->get_state() == LOCK_SYNC && + lock->get_parent()->is_auth()) + scatter_scatter(lock); + + return false; +} + +void Locker::scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr) +{ + dout(7) << "scatter_wrlock_finish on " << *lock + << " on " << *lock->get_parent() << endl; + lock->put_wrlock(); + if (mdr) { + mdr->wrlocks.erase(lock); + mdr->locks.erase(lock); + } + + scatter_eval(lock); +} + +void Locker::scatter_eval(ScatterLock *lock) +{ + if (!lock->get_parent()->is_auth()) { + // REPLICA + + if (lock->get_state() == LOCK_GSYNCS && + !lock->is_wrlocked()) { + dout(10) << "scatter_eval no wrlocks, acking sync" << endl; + bufferlist data; + lock->encode_locked_state(data); + mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data), + lock->get_parent()->authority().first, MDS_PORT_LOCKER); + lock->set_state(LOCK_SYNC); } + + } else { + // AUTH - // update lock - lock->set_state(LOCK_SYNC); - - // no need to reply + // gsyncs -> sync? + if (lock->get_state() == LOCK_GSYNCS && + !lock->is_gathering() && + !lock->is_wrlocked()) { + dout(7) << "scatter_eval finished gather/un-wrlock on " << *lock + << " on " << *lock->get_parent() << endl; + lock->set_state(LOCK_SYNC); + lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD|SimpleLock::WAIT_NOLOCKS); + } - // waiters - in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE); - break; + // gscatters -> scatter? + if (lock->get_state() == LOCK_GSCATTERS && + !lock->is_rdlocked()) { + assert(lock->get_parent()->is_auth()); + if (lock->get_parent()->is_replicated()) { + // encode and bcast + bufferlist data; + lock->encode_locked_state(data); + send_lock_message(lock, LOCK_AC_SCATTER, data); + } + + lock->set_state(LOCK_SCATTER); + lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + } - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC); - //|| lock->get_state() == LOCK_GLOCKR); + // waiting for rd? 
+ if (lock->get_state() == LOCK_SCATTER && + !lock->is_wrlocked() && + lock->is_waiter_for(SimpleLock::WAIT_RD)) { + dout(10) << "scatter_eval no wrlocks, read waiter, syncing" << endl; + scatter_sync(lock); + } - // wait for readers to finish? - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_GLOCKR); - in->add_waiter(CINODE_WAIT_HARDNORD, - new C_MDS_RetryMessage(mds,m)); - assert(0); // does this ever happen? (if so, fix hard_read_finish, and CInodeExport.update_inode!) - return; - } else { + // re-scatter? + if (lock->get_state() == LOCK_SYNC && + !lock->is_rdlocked()) { + dout(10) << "scatter_eval no rdlocks, scattering" << endl; + scatter_scatter(lock); + } + } +} - // update lock and reply - lock->set_state(LOCK_LOCK); - - { - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IHARD); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } + +void Locker::scatter_sync(ScatterLock *lock) +{ + dout(10) << "scatter_sync " << *lock + << " on " << *lock->get_parent() << endl; + assert(lock->get_parent()->is_auth()); + + if (lock->get_state() == LOCK_SYNC) return; + assert(lock->get_state() == LOCK_SCATTER); + + // bcast + if (lock->get_parent()->is_replicated()) { + send_lock_message(lock, LOCK_AC_SYNC); + lock->set_state(LOCK_GSYNCS); + lock->init_gather(); + } + else if (lock->is_wrlocked()) { + lock->set_state(LOCK_GSYNCS); + } else { + lock->set_state(LOCK_SYNC); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + } +} + + +void Locker::scatter_scatter(ScatterLock *lock) +{ + dout(10) << "scatter_scatter " << *lock + << " on " << *lock->get_parent() << endl; + assert(lock->get_parent()->is_auth()); + + if (lock->get_state() == LOCK_SCATTER) return; + assert(lock->get_state() == LOCK_SYNC); + + if (lock->is_rdlocked()) { + lock->set_state(LOCK_GSCATTERS); + } else { + if 
(lock->get_parent()->is_replicated()) { + // encode and bcast + bufferlist data; + lock->encode_locked_state(data); + send_lock_message(lock, LOCK_AC_SCATTER, data); + } + lock->set_state(LOCK_SCATTER); + lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + } +} + + + +void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) +{ + int from = m->get_asker(); + + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_SYNC: + assert(lock->get_state() == LOCK_SCATTER); + + // wait for wrlocks to close? + if (lock->is_wrlocked()) { + dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock + << " on " << *lock->get_parent() << endl; + lock->set_state(LOCK_GSYNCS); + } else { + // encode and reply + bufferlist data; + lock->encode_locked_state(data); + mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data), + from, MDS_PORT_LOCKER); } break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_GLOCKR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + case LOCK_AC_SCATTER: + assert(lock->get_state() == LOCK_SYNC); + lock->decode_locked_state(m->get_data()); + lock->set_state(LOCK_SCATTER); + lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + break; + + // -- for auth -- + case LOCK_AC_SYNCACK: + assert(lock->get_state() == LOCK_GSYNCS); + assert(lock->is_gathering(from)); + lock->remove_gather(from); + lock->decode_locked_state(m->get_data()); + + if (lock->is_gathering()) { + dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() + << " from " << from << ", still gathering " << lock->get_gather_set() + << endl; } else { - dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl; - inode_hard_eval(in); + dout(7) << "handle_scatter_lock " << *lock << 
" on " << *lock->get_parent() + << " from " << from << ", last one" + << endl; + simple_eval(lock); } - } + break; + } + delete m; } - -// ===================== -// soft inode metadata +// ========================================================================== +// file lock -bool Locker::inode_file_read_start(CInode *in, MClientRequest *m) +bool Locker::file_rdlock_start(FileLock *lock, MDRequest *mdr) { - dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl; + dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << endl; // can read? grab ref. - if (in->filelock.can_read(in->is_auth())) { - in->filelock.get_read(); + if (lock->can_rdlock(mdr)) { + lock->get_rdlock(); + mdr->rdlocks.insert(lock); + mdr->locks.insert(lock); return true; } // can't read, and replicated. - if (in->filelock.can_read_soon(in->is_auth())) { + if (lock->can_rdlock_soon()) { // wait - dout(7) << "inode_file_read_start can_read_soon " << *in << endl; + dout(7) << "file_rdlock_start can_rdlock_soon " << *lock << " on " << *lock->get_parent() << endl; } else { - if (in->is_auth()) { + if (lock->get_parent()->is_auth()) { // auth // FIXME or qsync? - if (in->filelock.is_stable()) { - inode_file_lock(in); // lock, bc easiest to back off - - if (in->filelock.can_read(in->is_auth())) { - in->filelock.get_read(); + if (lock->is_stable()) { + file_lock(lock); // lock, bc easiest to back off ... 
FIXME + + if (lock->can_rdlock(mdr)) { + lock->get_rdlock(); + mdr->rdlocks.insert(lock); + mdr->locks.insert(lock); - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); - //in->filelock.put_write(); + lock->finish_waiters(SimpleLock::WAIT_STABLE); return true; } } else { - dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); + dout(7) << "file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } else { // replica - if (in->filelock.is_stable()) { - + if (lock->is_stable()) { + // fw to auth - int auth = in->authority(); - dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl; + CInode *in = (CInode*)lock->get_parent(); + int auth = in->authority().first; + dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << " on replica and async, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); + mdcache->request_forward(mdr, auth); return false; } else { // wait until stable - dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); + dout(7) << "inode_file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } } - + // wait - dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in)); + dout(7) << "file_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << endl; + 
lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return false; } -void Locker::inode_file_read_finish(CInode *in) + +void Locker::file_rdlock_finish(FileLock *lock, MDRequest *mdr) { // drop ref - assert(in->filelock.can_read(in->is_auth())); - in->filelock.put_read(); + assert(lock->can_rdlock(mdr)); + lock->put_rdlock(); + mdr->rdlocks.erase(lock); + mdr->locks.erase(lock); - dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl; + dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - if (in->filelock.get_nread() == 0) { - in->finish_waiting(CINODE_WAIT_FILENORD); - inode_file_eval(in); + if (!lock->is_rdlocked()) { + lock->finish_waiters(SimpleLock::WAIT_NOLOCKS); + file_eval(lock); } } -bool Locker::inode_file_write_start(CInode *in, MClientRequest *m) +bool Locker::file_xlock_start(FileLock *lock, MDRequest *mdr) { - dout(7) << "inode_file_write_start on " << *in << endl; + dout(7) << "file_xlock_start on " << *lock << " on " << *lock->get_parent() << endl; + + assert(lock->get_parent()->is_auth()); // remote file xlock not implemented + + // already xlocked by me? + if (lock->get_xlocked_by() == mdr) + return true; // can't write? - if (!in->filelock.can_write(in->is_auth())) { - - // can't write. - if (in->is_auth()) { - // auth - if (!in->filelock.can_write_soon(in->is_auth())) { - if (!in->filelock.is_stable()) { - dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - // initiate lock - inode_file_lock(in); - - // fall-thru to below. 
+ if (!lock->can_xlock(mdr)) { + + // auth + if (!lock->can_xlock_soon()) { + if (!lock->is_stable()) { + dout(7) << "file_xlock_start on auth, waiting for stable on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); + return false; } - } else { - // replica - // fw to auth - int auth = in->authority(); - dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; + + // initiate lock + file_lock(lock); + + // fall-thru to below. } } // check again - if (in->filelock.can_write(in->is_auth())) { - // can i auth pin? - assert(in->is_auth()); - if (!in->can_auth_pin()) { - dout(7) << "inode_file_write_start waiting for authpinnable on " << *in << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - in->auth_pin(); - in->filelock.get_write(m); + if (lock->can_xlock(mdr)) { + assert(lock->get_parent()->is_auth()); + lock->get_xlock(mdr); + mdr->locks.insert(lock); + mdr->xlocks.insert(lock); return true; } else { - dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in)); + dout(7) << "file_xlock_start on auth, waiting for write on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } -void Locker::inode_file_write_finish(CInode *in) +void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) { // drop ref - //assert(in->filelock.can_write(in->is_auth())); - in->filelock.put_write(); - in->auth_unpin(); - dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl; + assert(lock->can_xlock(mdr)); + lock->put_xlock(); + mdr->locks.erase(lock); + mdr->xlocks.erase(lock); + dout(7) << 
"file_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl; + + assert(lock->get_parent()->is_auth()); // or implement remote xlocks // drop lock? - if (!in->is_filelock_write_wanted()) { - in->finish_waiting(CINODE_WAIT_FILENOWR); - inode_file_eval(in); - } + if (!lock->is_waiter_for(SimpleLock::WAIT_STABLE)) + file_eval(lock); } @@ -1013,70 +1585,70 @@ void Locker::inode_file_write_finish(CInode *in) * - checks if soft state should change (eg bc last writer closed) */ -void Locker::inode_file_eval(CInode *in) +void Locker::file_eval(FileLock *lock) { + CInode *in = (CInode*)lock->get_parent(); + int issued = in->get_caps_issued(); // [auth] finished gather? if (in->is_auth() && - !in->filelock.is_stable() && - in->filelock.gather_set.size() == 0) { - dout(7) << "inode_file_eval finished mds gather on " << *in << endl; + !lock->is_stable() && + !lock->is_gathering()) { + dout(7) << "file_eval finished mds gather on " << *lock << " on " << *lock->get_parent() << endl; - switch (in->filelock.get_state()) { + switch (lock->get_state()) { // to lock case LOCK_GLOCKR: case LOCK_GLOCKM: case LOCK_GLOCKL: if ((issued & ~CAP_FILE_RDCACHE) == 0) { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); // waiters - in->filelock.get_read(); - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - //in->filelock.put_write(); + lock->get_rdlock(); + lock->finish_waiters(SimpleLock::WAIT_STABLE); + lock->put_rdlock(); } break; // to mixed case LOCK_GMIXEDR: if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_MIXED); - in->finish_waiting(CINODE_WAIT_FILESTABLE); + lock->set_state(LOCK_MIXED); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } break; case LOCK_GMIXEDL: if ((issued & ~(CAP_FILE_WR)) == 0) { - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); if (in->is_replicated()) { // data bufferlist softdata; - in->encode_file_state(softdata); + 
lock->encode_locked_state(softdata); // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); + send_lock_message(lock, LOCK_AC_MIXED, softdata); } - in->finish_waiting(CINODE_WAIT_FILESTABLE); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } break; // to loner case LOCK_GLONERR: if (issued == 0) { - in->filelock.set_state(LOCK_LONER); - in->finish_waiting(CINODE_WAIT_FILESTABLE); + lock->set_state(LOCK_LONER); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } break; case LOCK_GLONERM: if ((issued & ~CAP_FILE_WR) == 0) { - in->filelock.set_state(LOCK_LONER); - in->finish_waiting(CINODE_WAIT_FILESTABLE); + lock->set_state(LOCK_LONER); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } break; @@ -1084,19 +1656,19 @@ void Locker::inode_file_eval(CInode *in) case LOCK_GSYNCL: case LOCK_GSYNCM: if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); { // bcast data to replicas bufferlist softdata; - in->encode_file_state(softdata); + lock->encode_locked_state(softdata); - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); + send_lock_message(lock, LOCK_AC_SYNC, softdata); } // waiters - in->filelock.get_read(); - in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); + lock->get_rdlock(); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + lock->put_rdlock(); } break; @@ -1109,27 +1681,25 @@ void Locker::inode_file_eval(CInode *in) // [replica] finished caps gather? 
if (!in->is_auth() && - !in->filelock.is_stable()) { - switch (in->filelock.get_state()) { + !lock->is_stable()) { + switch (lock->get_state()) { case LOCK_GMIXEDR: if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); // ack - MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER); + MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); + mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); } break; case LOCK_GLOCKR: if (issued == 0) { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); // ack - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER); + MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); + mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); } break; @@ -1139,58 +1709,58 @@ void Locker::inode_file_eval(CInode *in) } // !stable -> do nothing. - if (!in->filelock.is_stable()) return; + if (!lock->is_stable()) return; // stable. - assert(in->filelock.is_stable()); + assert(lock->is_stable()); if (in->is_auth()) { // [auth] int wanted = in->get_caps_wanted(); bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "inode_file_eval wanted=" << cap_string(wanted) - << " filelock=" << in->filelock + dout(7) << "file_eval wanted=" << cap_string(wanted) + << " filelock=" << *lock << " on " << *lock->get_parent() << " loner=" << loner << endl; // * -> loner? 
- if (in->filelock.get_nread() == 0 && - !in->is_filelock_write_wanted() && + if (!lock->is_rdlocked() && + !lock->is_waiter_for(SimpleLock::WAIT_WR) && (wanted & CAP_FILE_WR) && loner && - in->filelock.get_state() != LOCK_LONER) { - dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl; - inode_file_loner(in); + lock->get_state() != LOCK_LONER) { + dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << endl; + file_loner(lock); } // * -> mixed? - else if (in->filelock.get_nread() == 0 && - !in->is_filelock_write_wanted() && + else if (!lock->is_rdlocked() && + !lock->is_waiter_for(SimpleLock::WAIT_WR) && (wanted & CAP_FILE_RD) && (wanted & CAP_FILE_WR) && - !(loner && in->filelock.get_state() == LOCK_LONER) && - in->filelock.get_state() != LOCK_MIXED) { - dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl; - inode_file_mixed(in); + !(loner && lock->get_state() == LOCK_LONER) && + lock->get_state() != LOCK_MIXED) { + dout(7) << "file_eval stable, bump to mixed " << *lock << " on " << *lock->get_parent() << endl; + file_mixed(lock); } // * -> sync? - else if (!in->is_filelock_write_wanted() && - !(wanted & CAP_FILE_WR) && + else if (!in->filelock.is_waiter_for(SimpleLock::WAIT_WR) && + !(wanted & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) && ((wanted & CAP_FILE_RD) || in->is_replicated() || - (!loner && in->filelock.get_state() == LOCK_LONER)) && - in->filelock.get_state() != LOCK_SYNC) { - dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl; - inode_file_sync(in); + (!loner && lock->get_state() == LOCK_LONER)) && + lock->get_state() != LOCK_SYNC) { + dout(7) << "file_eval stable, bump to sync " << *lock << " on " << *lock->get_parent() << endl; + file_sync(lock); } // * -> lock? 
(if not replicated or open) else if (!in->is_replicated() && wanted == 0 && - in->filelock.get_state() != LOCK_LOCK) { - inode_file_lock(in); + lock->get_state() != LOCK_LOCK) { + file_lock(lock); } } else { @@ -1202,77 +1772,78 @@ void Locker::inode_file_eval(CInode *in) // mid -bool Locker::inode_file_sync(CInode *in) +bool Locker::file_sync(FileLock *lock) { - dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl; + CInode *in = (CInode*)lock->get_parent(); + dout(7) << "file_sync " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); // check state - if (in->filelock.get_state() == LOCK_SYNC || - in->filelock.get_state() == LOCK_GSYNCL || - in->filelock.get_state() == LOCK_GSYNCM) + if (lock->get_state() == LOCK_SYNC || + lock->get_state() == LOCK_GSYNCL || + lock->get_state() == LOCK_GSYNCM) return true; - assert(in->filelock.is_stable()); + assert(lock->is_stable()); int issued = in->get_caps_issued(); assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); - if (in->filelock.get_state() == LOCK_LOCK) { + if (lock->get_state() == LOCK_LOCK) { if (in->is_replicated()) { // soft data bufferlist softdata; - in->encode_file_state(softdata); + lock->encode_locked_state(softdata); // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); + send_lock_message(lock, LOCK_AC_SYNC, softdata); } // change lock - in->filelock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); // reissue caps issue_caps(in); return true; } - else if (in->filelock.get_state() == LOCK_MIXED) { + else if (lock->get_state() == LOCK_MIXED) { // writers? 
if (issued & CAP_FILE_WR) { // gather client write caps - in->filelock.set_state(LOCK_GSYNCM); + lock->set_state(LOCK_GSYNCM); issue_caps(in); } else { // no writers, go straight to sync if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); + send_lock_message(lock, LOCK_AC_SYNC); } // change lock - in->filelock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); } return false; } - else if (in->filelock.get_state() == LOCK_LONER) { + else if (lock->get_state() == LOCK_LONER) { // writers? if (issued & CAP_FILE_WR) { // gather client write caps - in->filelock.set_state(LOCK_GSYNCL); + lock->set_state(LOCK_GSYNCL); issue_caps(in); } else { // no writers, go straight to sync if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); + send_lock_message(lock, LOCK_AC_SYNC); } // change lock - in->filelock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); } return false; } @@ -1284,31 +1855,32 @@ bool Locker::inode_file_sync(CInode *in) -void Locker::inode_file_lock(CInode *in) +void Locker::file_lock(FileLock *lock) { - dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl; + CInode *in = (CInode*)lock->get_parent(); + dout(7) << "inode_file_lock " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); // check state - if (in->filelock.get_state() == LOCK_LOCK || - in->filelock.get_state() == LOCK_GLOCKR || - in->filelock.get_state() == LOCK_GLOCKM || - in->filelock.get_state() == LOCK_GLOCKL) + if (lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_GLOCKR || + lock->get_state() == LOCK_GLOCKM || + lock->get_state() == LOCK_GLOCKL) return; // lock or locking - assert(in->filelock.is_stable()); + assert(lock->is_stable()); int issued = in->get_caps_issued(); - if (in->filelock.get_state() == LOCK_SYNC) { + if (lock->get_state() == LOCK_SYNC) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, 
LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); // change lock - in->filelock.set_state(LOCK_GLOCKR); + lock->set_state(LOCK_GLOCKR); // call back caps if (issued) @@ -1316,22 +1888,22 @@ void Locker::inode_file_lock(CInode *in) } else { if (issued) { // call back caps - in->filelock.set_state(LOCK_GLOCKR); + lock->set_state(LOCK_GLOCKR); issue_caps(in); } else { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); } } } - else if (in->filelock.get_state() == LOCK_MIXED) { + else if (lock->get_state() == LOCK_MIXED) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); // change lock - in->filelock.set_state(LOCK_GLOCKM); + lock->set_state(LOCK_GLOCKM); // call back caps issue_caps(in); @@ -1339,25 +1911,25 @@ void Locker::inode_file_lock(CInode *in) //assert(issued); // ??? 
-sage 2/19/06 if (issued) { // change lock - in->filelock.set_state(LOCK_GLOCKM); + lock->set_state(LOCK_GLOCKM); // call back caps issue_caps(in); } else { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); } } } - else if (in->filelock.get_state() == LOCK_LONER) { + else if (lock->get_state() == LOCK_LONER) { if (issued & CAP_FILE_WR) { // change lock - in->filelock.set_state(LOCK_GLOCKL); + lock->set_state(LOCK_GLOCKL); // call back caps issue_caps(in); } else { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); } } else @@ -1365,67 +1937,68 @@ void Locker::inode_file_lock(CInode *in) } -void Locker::inode_file_mixed(CInode *in) +void Locker::file_mixed(FileLock *lock) { - dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl; + dout(7) << "file_mixed " << *lock << " on " << *lock->get_parent() << endl; + CInode *in = (CInode*)lock->get_parent(); assert(in->is_auth()); // check state - if (in->filelock.get_state() == LOCK_GMIXEDR || - in->filelock.get_state() == LOCK_GMIXEDL) + if (lock->get_state() == LOCK_GMIXEDR || + lock->get_state() == LOCK_GMIXEDL) return; // mixed or mixing - assert(in->filelock.is_stable()); + assert(lock->is_stable()); int issued = in->get_caps_issued(); - if (in->filelock.get_state() == LOCK_SYNC) { + if (lock->get_state() == LOCK_SYNC) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_MIXED); + lock->init_gather(); - in->filelock.set_state(LOCK_GMIXEDR); + lock->set_state(LOCK_GMIXEDR); issue_caps(in); } else { if (issued) { - in->filelock.set_state(LOCK_GMIXEDR); + lock->set_state(LOCK_GMIXEDR); issue_caps(in); } else { - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); } } } - else if (in->filelock.get_state() == LOCK_LOCK) { + else if (lock->get_state() == LOCK_LOCK) { if (in->is_replicated()) { // data bufferlist 
softdata; - in->encode_file_state(softdata); + lock->encode_locked_state(softdata); // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); + send_lock_message(lock, LOCK_AC_MIXED, softdata); } // change lock - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); issue_caps(in); } - else if (in->filelock.get_state() == LOCK_LONER) { + else if (lock->get_state() == LOCK_LONER) { if (issued & CAP_FILE_WRBUFFER) { // gather up WRBUFFER caps - in->filelock.set_state(LOCK_GMIXEDL); + lock->set_state(LOCK_GMIXEDL); issue_caps(in); } else if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); - in->filelock.set_state(LOCK_MIXED); + send_lock_message(lock, LOCK_AC_MIXED); + lock->set_state(LOCK_MIXED); issue_caps(in); } else { - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); issue_caps(in); } } @@ -1435,52 +2008,53 @@ void Locker::inode_file_mixed(CInode *in) } -void Locker::inode_file_loner(CInode *in) +void Locker::file_loner(FileLock *lock) { - dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl; + CInode *in = (CInode*)lock->get_parent(); + dout(7) << "inode_file_loner " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); // check state - if (in->filelock.get_state() == LOCK_LONER || - in->filelock.get_state() == LOCK_GLONERR || - in->filelock.get_state() == LOCK_GLONERM) + if (lock->get_state() == LOCK_LONER || + lock->get_state() == LOCK_GLONERR || + lock->get_state() == LOCK_GLONERM) return; - assert(in->filelock.is_stable()); + assert(lock->is_stable()); assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); - if (in->filelock.get_state() == LOCK_SYNC) { + if (lock->get_state() == LOCK_SYNC) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_LOCK); + 
lock->init_gather(); // change lock - in->filelock.set_state(LOCK_GLONERR); + lock->set_state(LOCK_GLONERR); } else { // only one guy with file open, who gets it all, so - in->filelock.set_state(LOCK_LONER); + lock->set_state(LOCK_LONER); issue_caps(in); } } - else if (in->filelock.get_state() == LOCK_LOCK) { + else if (lock->get_state() == LOCK_LOCK) { // change lock. ignore replicas; they don't know about LONER. - in->filelock.set_state(LOCK_LONER); + lock->set_state(LOCK_LONER); issue_caps(in); } - else if (in->filelock.get_state() == LOCK_MIXED) { + else if (lock->get_state() == LOCK_MIXED) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); // change lock - in->filelock.set_state(LOCK_GLONERM); + lock->set_state(LOCK_GLONERM); } else { - in->filelock.set_state(LOCK_LONER); + lock->set_state(LOCK_LONER); issue_caps(in); } } @@ -1489,51 +2063,20 @@ void Locker::inode_file_loner(CInode *in) assert(0); } + + // messenger -void Locker::handle_lock_inode_file(MLock *m) +void Locker::handle_file_lock(FileLock *lock, MLock *m) { - assert(m->get_otype() == LOCK_OTYPE_IFILE); - if (mds->logger) mds->logger->inc("lif"); - CInode *in = mdcache->get_inode(m->get_ino()); + CInode *in = (CInode*)lock->get_parent(); int from = m->get_asker(); - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - assert(in); - assert(in->is_auth() || in->is_proxy()); - dout(7) << "handle_lock_inode_file " << *in << " hardlock=" << in->hardlock << endl; - - if (in->is_proxy()) { - // fw - int newauth = in->authority(); - assert(newauth >= 0); - if (from == newauth) { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; - delete m; - } else { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - 
mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); - } - return; - } - } else { - // replica - if (!in) { - // drop it. don't nak. - dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl; - delete m; - return; - } - - assert(!in->is_auth()); - } - - dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl; + dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " " + << *in << " filelock=" << *lock << endl; - CLock *lock = &in->filelock; int issued = in->get_caps_issued(); switch (m->get_action()) { @@ -1542,21 +2085,16 @@ void Locker::handle_lock_inode_file(MLock *m) assert(lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_MIXED); - { // assim data - int off = 0; - in->decode_file_state(m->get_data(), off); - } - - // update lock + lock->decode_locked_state(m->get_data()); lock->set_state(LOCK_SYNC); // no need to reply. // waiters - in->filelock.get_read(); - in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - inode_file_eval(in); + lock->get_rdlock(); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + lock->put_rdlock(); + file_eval(lock); break; case LOCK_AC_LOCK: @@ -1565,13 +2103,12 @@ void Locker::handle_lock_inode_file(MLock *m) // call back caps? 
if (issued & CAP_FILE_RD) { - dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl; + dout(7) << "handle_file_lock client readers, gathering caps on " << *in << endl; issue_caps(in); } - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILENORD, - new C_MDS_RetryMessage(mds,m)); + if (lock->is_rdlocked()) { + dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << endl; + in->add_waiter(SimpleLock::WAIT_NOLOCKS, new C_MDS_RetryMessage(mds, m)); lock->set_state(LOCK_GLOCKR); assert(0);// i am broken.. why retry message when state captures all the info i need? return; @@ -1584,9 +2121,8 @@ void Locker::handle_lock_inode_file(MLock *m) // nothing to wait for, lock and ack. { lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + + MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); mds->send_message_mds(reply, from, MDS_PORT_LOCKER); } break; @@ -1607,8 +2143,7 @@ void Locker::handle_lock_inode_file(MLock *m) lock->set_state(LOCK_MIXED); // ack - MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); mds->send_message_mds(reply, from, MDS_PORT_LOCKER); } } else { @@ -1621,10 +2156,8 @@ void Locker::handle_lock_inode_file(MLock *m) issue_caps(in); // waiters - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE); - //in->filelock.put_write(); - inode_file_eval(in); + lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + file_eval(lock); break; @@ -1632,25 +2165,27 @@ void Locker::handle_lock_inode_file(MLock *m) // -- auth -- case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_GLOCKR || - lock->state == LOCK_GLOCKM || - lock->state == LOCK_GLONERM || 
- lock->state == LOCK_GLONERR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + assert(lock->get_state() == LOCK_GLOCKR || + lock->get_state() == LOCK_GLOCKM || + lock->get_state() == LOCK_GLONERM || + lock->get_state() == LOCK_GLONERR); + assert(lock->is_gathering(from)); + lock->remove_gather(from); + + if (lock->is_gathering()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << endl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", last one" << endl; + file_eval(lock); } break; case LOCK_AC_SYNCACK: - assert(lock->state == LOCK_GSYNCM); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); + assert(lock->get_state() == LOCK_GSYNCM); + assert(lock->is_gathering(from)); + lock->remove_gather(from); /* not used currently { @@ -1660,24 +2195,28 @@ void Locker::handle_lock_inode_file(MLock *m) } */ - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + if (lock->is_gathering()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << endl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", last one" << endl; + file_eval(lock); } break; case LOCK_AC_MIXEDACK: - assert(lock->state == LOCK_GMIXEDR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); + assert(lock->get_state() == LOCK_GMIXEDR); + assert(lock->is_gathering(from)); + 
lock->remove_gather(from); - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + if (lock->is_gathering()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << endl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", last one" << endl; + file_eval(lock); } break; @@ -1694,553 +2233,3 @@ void Locker::handle_lock_inode_file(MLock *m) - - - - - - - - -void Locker::handle_lock_dir(MLock *m) -{ - -} - - - -// DENTRY - -bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) -{ - dout(7) << "dentry_xlock_start on " << *dn << endl; - - // locked? - if (dn->lockstate == DN_LOCK_XLOCK) { - if (dn->xlockedby == m) return true; // locked by me! - - // not by me, wait - dout(7) << "dentry " << *dn << " xlock by someone else" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // prelock? - if (dn->lockstate == DN_LOCK_PREXLOCK) { - if (dn->xlockedby == m) { - dout(7) << "dentry " << *dn << " prexlock by me" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - } else { - dout(7) << "dentry " << *dn << " prexlock by someone else" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - } - return false; - } - - - // lockable! - assert(dn->lockstate == DN_LOCK_SYNC || - dn->lockstate == DN_LOCK_UNPINNING); - - // dir auth pinnable? - if (!dn->dir->can_auth_pin()) { - dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl; - dn->dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // is dentry path pinned? 
- if (dn->is_pinned()) { - dout(7) << "dentry " << *dn << " pinned, waiting" << endl; - dn->lockstate = DN_LOCK_UNPINNING; - dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED, - dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // pin path up to dentry! (if success, point of no return) - CDentry *pdn = dn->dir->inode->get_parent_dn(); - if (pdn) { - if (mdcache->active_requests[m].traces.count(pdn)) { - dout(7) << "already path pinned parent dentry " << *pdn << endl; - } else { - dout(7) << "pinning parent dentry " << *pdn << endl; - vector trace; - mdcache->make_trace(trace, pdn->inode); - assert(trace.size()); - - if (!mdcache->path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref))) return false; - - mdcache->active_requests[m].traces[trace[trace.size()-1]] = trace; - } - } - - // pin dir! - dn->dir->auth_pin(); - - // mine! - dn->xlockedby = m; - - if (dn->is_replicated()) { - dn->lockstate = DN_LOCK_PREXLOCK; - - // xlock with whom? - set who; - for (map::iterator p = dn->replicas_begin(); - p != dn->replicas_end(); - ++p) - who.insert(p->first); - dn->gather_set = who; - - // make path - string path; - dn->make_path(path); - dout(10) << "path is " << path << " for " << *dn << endl; - - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - m->set_path(path); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - - // wait - dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl; - dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mds, m, ref)); - return false; - } else { - dn->lockstate = DN_LOCK_XLOCK; - mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); - return true; - } -} - -void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) -{ - dout(7) << "dentry_xlock_finish on " << *dn << endl; - - assert(dn->xlockedby); - if (dn->xlockedby == DN_XLOCK_FOREIGN) { - dout(7) << "this was a 
foreign xlock" << endl; - } else { - // remove from request record - assert(mdcache->active_requests[dn->xlockedby].xlocks.count(dn) == 1); - mdcache->active_requests[dn->xlockedby].xlocks.erase(dn); - } - - dn->xlockedby = 0; - dn->lockstate = DN_LOCK_SYNC; - - // unpin parent dir? - // -> no? because we might have xlocked 2 things in this dir. - // instead, we let request_finish clean up the mess. - - // tell replicas? - if (!quiet) { - // tell even if dn is null. - if (dn->is_replicated()) { - send_lock_message(dn, LOCK_AC_SYNC); - } - } - - // unpin dir - dn->dir->auth_unpin(); - - // kick waiters - list finished; - dn->dir->take_waiting(CDIR_WAIT_DNREAD, finished); - mds->queue_finished(finished); -} - - -/* - * onfinish->finish() will be called with - * 0 on successful xlock, - * -1 on failure - */ - -class C_MDC_XlockRequest : public Context { - Locker *mdc; - CDir *dir; - string dname; - Message *req; - Context *finisher; -public: - C_MDC_XlockRequest(Locker *mdc, - CDir *dir, string& dname, - Message *req, - Context *finisher) { - this->mdc = mdc; - this->dir = dir; - this->dname = dname; - this->req = req; - this->finisher = finisher; - } - - void finish(int r) { - mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher); - } -}; - -void Locker::dentry_xlock_request_finish(int r, - CDir *dir, string& dname, - Message *req, - Context *finisher) -{ - dout(10) << "dentry_xlock_request_finish r = " << r << endl; - if (r == 1) { // 1 for xlock request success - CDentry *dn = dir->lookup(dname); - if (dn && dn->xlockedby == 0) { - // success - dn->xlockedby = req; // our request was the winner - dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *dn << endl; - - // remember! 
- mdcache->active_requests[req].foreign_xlocks.insert(dn); - } - } - - // retry request (or whatever) - finisher->finish(0); - delete finisher; -} - -void Locker::dentry_xlock_request(CDir *dir, string& dname, bool create, - Message *req, Context *onfinish) -{ - dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl; - // send request - int dauth = dir->dentry_authority(dname); - MLock *m = new MLock(create ? LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid()); - m->set_dn(dir->ino(), dname); - mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); - - // add waiter - dir->add_waiter(CDIR_WAIT_DNREQXLOCK, dname, - new C_MDC_XlockRequest(this, - dir, dname, req, - onfinish)); -} - - - - -void Locker::handle_lock_dn(MLock *m) -{ - assert(m->get_otype() == LOCK_OTYPE_DN); - - CInode *diri = mdcache->get_inode(m->get_ino()); // may be null - CDir *dir = 0; - if (diri) dir = diri->dir; // may be null - string dname = m->get_dn(); - int from = m->get_asker(); - CDentry *dn = 0; - - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - - // normally we have it always - if (diri && dir) { - int dauth = dir->dentry_authority(dname); - assert(dauth == mds->get_nodeid() || dir->is_proxy() || // mine or proxy, - m->get_action() == LOCK_AC_REQXLOCKACK || // or we did a REQXLOCK and this is our ack/nak - m->get_action() == LOCK_AC_REQXLOCKNAK); - - if (dir->is_proxy()) { - - assert(dauth >= 0); - - if (dauth == m->get_asker() && - (m->get_action() == LOCK_AC_REQXLOCK || - m->get_action() == LOCK_AC_REQXLOCKC)) { - dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. 
dropping on floor (their import will have woken them up)" << endl; - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - else - delete m; - return; - } - - dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl; - - // forward - if (mdcache->active_requests.count(m)) { - // xlock requests are requests, use request_* functions! - assert(m->get_action() == LOCK_AC_REQXLOCK || - m->get_action() == LOCK_AC_REQXLOCKC); - // forward as a request - mdcache->request_forward(m, dauth, MDS_PORT_LOCKER); - } else { - // not an xlock req, or it is and we just didn't register the request yet - // forward normally - mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); - } - return; - } - - dn = dir->lookup(dname); - } - - // except with.. an xlock request? - if (!dn) { - assert(dir); // we should still have the dir, though! the requester has the dir open. - switch (m->get_action()) { - - case LOCK_AC_LOCK: - dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl; - dn = dir->add_dentry(dname); - break; - - case LOCK_AC_REQXLOCK: - // send nak - if (dir->state_test(CDIR_STATE_DELETED)) { - dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl; - } else { - dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl; - } - { - MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(m->get_path()); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - } - - // finish request (if we got that far) - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - - delete m; - return; - - case LOCK_AC_REQXLOCKC: - dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl; - break; - - default: - assert(0); - } - } - } else { - // replica - if (dir) dn = dir->lookup(dname); - if (!dn) { - 
dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl; - - if (m->get_action() == LOCK_AC_REQXLOCKACK || - m->get_action() == LOCK_AC_REQXLOCKNAK) { - dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl; - //assert(0); // how can this happen? tell me now! - - vector trace; - filepath path = m->get_path(); - int r = mdcache->path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds,m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - return; - } - - if (m->get_action() == LOCK_AC_LOCK) { - if (0) { // not anymore - dout(7) << "handle_lock_dn don't have " << m->get_path() << ", discovering" << endl; - - vector trace; - filepath path = m->get_path(); - int r = mdcache->path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds,m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - } - if (1) { - // NAK - MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid()); - reply->set_dn(m->get_ino(), dname); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - } - } else { - dout(7) << "safely ignoring." << endl; - delete m; - } - return; - } - - assert(dn); - } - - if (dn) { - dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl; - } else { - dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl; - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_LOCK: - assert(dn->lockstate == DN_LOCK_SYNC || - dn->lockstate == DN_LOCK_UNPINNING || - dn->lockstate == DN_LOCK_XLOCK); // <-- bc the handle_lock_dn did the discover! 
- - if (dn->is_pinned()) { - dn->lockstate = DN_LOCK_UNPINNING; - - // wait - dout(7) << "dn pinned, waiting " << *dn << endl; - dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED, - dn->name, - new C_MDS_RetryMessage(mds, m)); - return; - } else { - dn->lockstate = DN_LOCK_XLOCK; - dn->xlockedby = 0; - - // ack now - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_dn(diri->ino(), dname); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - - // wake up waiters - dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); // ? will this happen on replica ? - break; - - case LOCK_AC_SYNC: - assert(dn->lockstate == DN_LOCK_XLOCK); - dn->lockstate = DN_LOCK_SYNC; - dn->xlockedby = 0; - - // null? hose it. - if (dn->is_null()) { - dout(7) << "hosing null (and now sync) dentry " << *dn << endl; - dir->remove_dentry(dn); - } - - // wake up waiters - dir->finish_waiting(CDIR_WAIT_DNREAD, dname); // will this happen either? YES: if a rename lock backs out - break; - - case LOCK_AC_REQXLOCKACK: - case LOCK_AC_REQXLOCKNAK: - { - dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl; - list finished; - dir->take_waiting(CDIR_WAIT_DNREQXLOCK, m->get_dn(), finished, 1); // TAKE ONE ONLY! - finish_contexts(finished, - (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1); - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - case LOCK_AC_LOCKNAK: - assert(dn->gather_set.count(from) == 1); - dn->gather_set.erase(from); - if (dn->gather_set.size() == 0) { - dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl; - dn->lockstate = DN_LOCK_XLOCK; - mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); - dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); - } - break; - - - case LOCK_AC_REQXLOCKC: - // make sure it's a _file_, if it exists. 
- if (dn && dn->inode && dn->inode->is_dir()) { - dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl; - - // nak - string path; - dn->make_path(path); - - MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(path); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - - // done - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - else - delete m; - return; - } - - case LOCK_AC_REQXLOCK: - if (dn) { - dout(7) << "handle_lock_dn reqxlock on " << *dn << endl; - } else { - dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl; - } - - - // start request? - if (!mdcache->active_requests.count(m)) { - vector trace; - if (!mdcache->request_start(m, dir->inode, trace)) - return; // waiting for pin - } - - // try to xlock! - if (!dn) { - assert(m->get_action() == LOCK_AC_REQXLOCKC); - dn = dir->add_dentry(dname); - } - - if (dn->xlockedby != m) { - if (!dentry_xlock_start(dn, m, dir->inode)) { - // hose null dn if we're waiting on something - if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); - return; // waiting for xlock - } - } else { - // successfully xlocked! on behalf of requestor. - string path; - dn->make_path(path); - - dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl; - - // ACK xlock request - MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(path); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - - // note: keep request around in memory (to hold the xlock/pins on behalf of requester) - return; - } - break; - - case LOCK_AC_UNXLOCK: - dout(7) << "handle_lock_dn unxlock on " << *dn << endl; - { - string dname = dn->name; - Message *m = dn->xlockedby; - - // finish request - mdcache->request_finish(m); // this will drop the locks (and unpin paths!) 
- return; - } - break; - - default: - assert(0); - } - - delete m; -} - - - - - - - diff --git a/trunk/ceph/mds/Locker.h b/trunk/ceph/mds/Locker.h index d8dcb2c541a37..fabf3fc512042 100644 --- a/trunk/ceph/mds/Locker.h +++ b/trunk/ceph/mds/Locker.h @@ -43,6 +43,9 @@ class MClientRequest; class Anchor; class Capability; +class SimpleLock; +class FileLock; +class ScatterLock; class Locker { private: @@ -53,43 +56,63 @@ private: Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} void dispatch(Message *m); + void handle_lock(MLock *m); - void send_lock_message(CInode *in, int msg, int type); - void send_lock_message(CInode *in, int msg, int type, bufferlist& data); - void send_lock_message(CDentry *dn, int msg); + void send_lock_message(SimpleLock *lock, int msg); + void send_lock_message(SimpleLock *lock, int msg, bufferlist &data); // -- locks -- - // high level interface - public: - bool inode_hard_read_try(CInode *in, Context *con); - bool inode_hard_read_start(CInode *in, MClientRequest *m); - void inode_hard_read_finish(CInode *in); - bool inode_hard_write_start(CInode *in, MClientRequest *m); - void inode_hard_write_finish(CInode *in); - bool inode_file_read_start(CInode *in, MClientRequest *m); - void inode_file_read_finish(CInode *in); - bool inode_file_write_start(CInode *in, MClientRequest *m); - void inode_file_write_finish(CInode *in); - - void inode_hard_eval(CInode *in); - void inode_file_eval(CInode *in); + bool acquire_locks(MDRequest *mdr, + set &rdlocks, + set &wrlocks, + set &xlocks); + + bool rdlock_start(SimpleLock *lock, MDRequest *mdr); + void rdlock_finish(SimpleLock *lock, MDRequest *mdr); + bool xlock_start(SimpleLock *lock, MDRequest *mdr); + void xlock_finish(SimpleLock *lock, MDRequest *mdr); + bool wrlock_start(SimpleLock *lock, MDRequest *mdr); + void wrlock_finish(SimpleLock *lock, MDRequest *mdr); + + // simple + void handle_simple_lock(SimpleLock *lock, MLock *m); + void simple_eval(SimpleLock *lock); + void simple_sync(SimpleLock 
*lock); + void simple_lock(SimpleLock *lock); + bool simple_rdlock_try(SimpleLock *lock, Context *con); + bool simple_rdlock_start(SimpleLock *lock, MDRequest *mdr); + void simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr); + bool simple_xlock_start(SimpleLock *lock, MDRequest *mdr); + void simple_xlock_finish(SimpleLock *lock, MDRequest *mdr); + + bool dentry_can_rdlock_trace(vector& trace, MClientRequest *req); + void dentry_anon_rdlock_trace_start(vector& trace); + void dentry_anon_rdlock_trace_finish(vector& trace); + + // scatter + void handle_scatter_lock(ScatterLock *lock, MLock *m); + void scatter_eval(ScatterLock *lock); + void scatter_sync(ScatterLock *lock); + void scatter_scatter(ScatterLock *lock); + bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr); + void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr); + bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); + void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr); + + // file + void handle_file_lock(FileLock *lock, MLock *m); + void file_eval(FileLock *lock); + bool file_sync(FileLock *lock); + void file_lock(FileLock *lock); + void file_mixed(FileLock *lock); + void file_loner(FileLock *lock); + bool file_rdlock_try(FileLock *lock, Context *con); + bool file_rdlock_start(FileLock *lock, MDRequest *mdr); + void file_rdlock_finish(FileLock *lock, MDRequest *mdr); + bool file_xlock_start(FileLock *lock, MDRequest *mdr); + void file_xlock_finish(FileLock *lock, MDRequest *mdr); + - protected: - void inode_hard_mode(CInode *in, int mode); - void inode_file_mode(CInode *in, int mode); - - // low level triggers - void inode_hard_sync(CInode *in); - void inode_hard_lock(CInode *in); - bool inode_file_sync(CInode *in); - void inode_file_lock(CInode *in); - void inode_file_mixed(CInode *in); - void inode_file_loner(CInode *in); - - // messengers - void handle_lock(MLock *m); - void handle_lock_inode_hard(MLock *m); - void handle_lock_inode_file(MLock *m); // -- file 
i/o -- public: @@ -104,23 +127,6 @@ private: void handle_inode_file_caps(class MInodeFileCaps *m); - // dirs - void handle_lock_dir(MLock *m); - - // dentry locks - public: - bool dentry_xlock_start(CDentry *dn, - Message *m, CInode *ref); - void dentry_xlock_finish(CDentry *dn, bool quiet=false); - void handle_lock_dn(MLock *m); - void dentry_xlock_request(CDir *dir, string& dname, bool create, - Message *req, Context *onfinish); - void dentry_xlock_request_finish(int r, - CDir *dir, string& dname, - Message *req, - Context *finisher); - - }; diff --git a/trunk/ceph/mds/LogEvent.cc b/trunk/ceph/mds/LogEvent.cc index 4a83902c5c6c4..fa9c57c1a76a2 100644 --- a/trunk/ceph/mds/LogEvent.cc +++ b/trunk/ceph/mds/LogEvent.cc @@ -17,16 +17,24 @@ // events i know of #include "events/EString.h" + +#include "events/ESession.h" +#include "events/EClientMap.h" #include "events/EImportMap.h" -#include "events/EMetaBlob.h" +#include "events/EExport.h" +#include "events/EImportStart.h" +#include "events/EImportFinish.h" + #include "events/EUpdate.h" -#include "events/EUnlink.h" +#include "events/ESlaveUpdate.h" +#include "events/EOpen.h" + #include "events/EAlloc.h" #include "events/EPurgeFinish.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" + +#include "events/EAnchor.h" +#include "events/EAnchorClient.h" + LogEvent *LogEvent::decode(bufferlist& bl) { @@ -44,16 +52,24 @@ LogEvent *LogEvent::decode(bufferlist& bl) // create event LogEvent *le; switch (type) { - case EVENT_STRING: le = new EString(); break; + case EVENT_STRING: le = new EString; break; + + case EVENT_SESSION: le = new ESession; break; + case EVENT_CLIENTMAP: le = new EClientMap; break; case EVENT_IMPORTMAP: le = new EImportMap; break; - case EVENT_UPDATE: le = new EUpdate; break; - case EVENT_UNLINK: le = new EUnlink(); break; - case EVENT_PURGEFINISH: le = new EPurgeFinish(); break; - case EVENT_ALLOC: le = new EAlloc(); 
break; - case EVENT_EXPORTSTART: le = new EExportStart; break; - case EVENT_EXPORTFINISH: le = new EExportFinish; break; + case EVENT_EXPORT: le = new EExport; break; case EVENT_IMPORTSTART: le = new EImportStart; break; case EVENT_IMPORTFINISH: le = new EImportFinish; break; + + case EVENT_UPDATE: le = new EUpdate; break; + case EVENT_SLAVEUPDATE: le = new ESlaveUpdate; break; + case EVENT_OPEN: le = new EOpen; break; + + case EVENT_ALLOC: le = new EAlloc; break; + case EVENT_PURGEFINISH: le = new EPurgeFinish; break; + + case EVENT_ANCHOR: le = new EAnchor; break; + case EVENT_ANCHORCLIENT: le = new EAnchorClient; break; default: dout(1) << "uh oh, unknown log event type " << type << endl; assert(0); diff --git a/trunk/ceph/mds/LogEvent.h b/trunk/ceph/mds/LogEvent.h index 6895ed54074d4..cb0febe07b567 100644 --- a/trunk/ceph/mds/LogEvent.h +++ b/trunk/ceph/mds/LogEvent.h @@ -16,25 +16,23 @@ #define EVENT_STRING 1 -#define EVENT_INODEUPDATE 2 -#define EVENT_DIRUPDATE 3 +#define EVENT_SESSION 7 +#define EVENT_CLIENTMAP 8 +#define EVENT_IMPORTMAP 2 +#define EVENT_EXPORT 30 +#define EVENT_IMPORTSTART 31 +#define EVENT_IMPORTFINISH 32 -#define EVENT_IMPORTMAP 4 -#define EVENT_UPDATE 5 +#define EVENT_UPDATE 3 +#define EVENT_SLAVEUPDATE 4 +#define EVENT_OPEN 5 #define EVENT_ALLOC 10 -#define EVENT_MKNOD 11 -#define EVENT_MKDIR 12 -#define EVENT_LINK 13 - -#define EVENT_UNLINK 20 -#define EVENT_RMDIR 21 #define EVENT_PURGEFINISH 22 -#define EVENT_EXPORTSTART 30 -#define EVENT_EXPORTFINISH 31 -#define EVENT_IMPORTSTART 32 -#define EVENT_IMPORTFINISH 33 +#define EVENT_ANCHOR 40 +#define EVENT_ANCHORCLIENT 41 + diff --git a/trunk/ceph/mds/MDBalancer.cc b/trunk/ceph/mds/MDBalancer.cc index 57e79dcdf51fc..9006a147f9609 100644 --- a/trunk/ceph/mds/MDBalancer.cc +++ b/trunk/ceph/mds/MDBalancer.cc @@ -70,6 +70,7 @@ void MDBalancer::tick() // balance? 
if (true && mds->get_nodeid() == 0 && + g_conf.mds_bal_interval > 0 && (num_bal_times || (g_conf.mds_bal_max_until >= 0 && elapsed.sec() > g_conf.mds_bal_max_until)) && @@ -133,19 +134,21 @@ void MDBalancer::send_heartbeat() mds_load_t load = get_load(); mds_load[ mds->get_nodeid() ] = load; - // import_map + // import_map -- how much do i import from whom map import_map; - - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); + set authsubs; + mds->mdcache->get_auth_subtrees(authsubs); + for (set::iterator it = authsubs.begin(); + it != authsubs.end(); it++) { CDir *im = *it; - if (im->inode->is_root()) continue; - int from = im->inode->authority(); + int from = im->inode->authority().first; + if (from == mds->get_nodeid()) continue; + if (im->get_inode()->is_stray()) continue; import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load(); } mds_import_map[ mds->get_nodeid() ] = import_map; - + dout(5) << "mds" << mds->get_nodeid() << " sending heartbeat " << beat_epoch << " " << load << endl; for (map::iterator it = import_map.begin(); @@ -207,15 +210,17 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m) void MDBalancer::export_empties() { dout(5) << "export_empties checking for empty imports" << endl; - - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); + dout(0) << "IMPLEMENT ME" << endl; + /* + for (set::iterator it = mds->mdcache->subtrees.begin(); + it != mds->mdcache->subtrees.end(); it++) { CDir *dir = *it; if (!dir->inode->is_root() && dir->get_size() == 0) mds->mdcache->migrator->export_empty_import(dir); } + */ } @@ -259,12 +264,14 @@ void MDBalancer::do_hashing() inodeno_t dirino = *i; CInode *in = mds->mdcache->get_inode(dirino); if (!in) continue; + /* CDir *dir = in->dir; if (!dir) continue; if (!dir->is_auth()) continue; dout(0) << "do_hashing hashing " << *dir << endl; mds->mdcache->migrator->hash_dir(dir); + */ } hash_queue.clear(); } @@ -288,7 +295,8 @@ void 
MDBalancer::do_rebalance(int beat) if (mds_load[whoami].mds_load() > 0) { load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load(); dout(7) << " load_fac is " << load_fac - << " <- " << mds_load[whoami].root.meta_load() << " / " << mds_load[whoami].mds_load() + << " <- " << mds_load[whoami].root.meta_load() + << " / " << mds_load[whoami].mds_load() << endl; } @@ -417,23 +425,29 @@ void MDBalancer::do_rebalance(int beat) // make a sorted list of my imports map import_pop_map; multimap import_from_map; - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); + set fullauthsubs; + + mds->mdcache->get_fullauth_subtrees(fullauthsubs); + for (set::iterator it = fullauthsubs.begin(); + it != fullauthsubs.end(); it++) { - if ((*it)->is_hashed()) continue; - double pop = (*it)->popularity[MDS_POP_CURDOM].meta_load(); + CDir *im = *it; + if (im->get_inode()->is_stray()) continue; + + double pop = im->popularity[MDS_POP_CURDOM].meta_load(); if (pop < g_conf.mds_bal_idle_threshold && - (*it)->inode != mds->mdcache->get_root()) { - dout(-5) << " exporting idle import " << **it - << " back to mds" << (*it)->inode->authority() + im->inode != mds->mdcache->get_root() && + im->inode->authority().first != mds->get_nodeid()) { + dout(-5) << " exporting idle import " << *im + << " back to mds" << im->inode->authority().first << endl; - mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority()); + mds->mdcache->migrator->export_dir(im, im->inode->authority().first); continue; } - import_pop_map[ pop ] = *it; - int from = (*it)->inode->authority(); - dout(15) << " map: i imported " << **it << " from " << from << endl; - import_from_map.insert(pair(from, *it)); + import_pop_map[ pop ] = im; + int from = im->inode->authority().first; + dout(15) << " map: i imported " << *im << " from " << from << endl; + import_from_map.insert(pair(from, im)); } @@ -482,10 +496,9 @@ void MDBalancer::do_rebalance(int beat) multimap::iterator 
plast = p.first++; if (dir->inode->is_root()) continue; - if (dir->is_hashed()) continue; if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - assert(dir->inode->authority() == target); // cuz that's how i put it in the map, dummy + assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy if (pop <= amount-have) { dout(-5) << "reexporting " << *dir @@ -521,7 +534,7 @@ void MDBalancer::do_rebalance(int beat) << " back to mds" << imp->inode->authority() << endl; have += pop; - mds->mdcache->migrator->export_dir(imp, imp->inode->authority()); + mds->mdcache->migrator->export_dir(imp, imp->inode->authority().first); } if (amount-have < MIN_OFFLOAD) break; } @@ -532,13 +545,15 @@ void MDBalancer::do_rebalance(int beat) } // okay, search for fragments of my workload - set candidates = mds->mdcache->imports; + set candidates; + mds->mdcache->get_fullauth_subtrees(candidates); list exports; for (set::iterator pot = candidates.begin(); pot != candidates.end(); pot++) { + if ((*pot)->get_inode()->is_stray()) continue; find_exports(*pot, amount, exports, have, already_exporting); if (have > amount-MIN_OFFLOAD) { break; @@ -593,33 +608,38 @@ void MDBalancer::find_exports(CDir *dir, CInode *in = it->second->get_inode(); if (!in) continue; if (!in->is_dir()) continue; - if (!in->dir) continue; // clearly not popular - if (in->dir->is_export()) continue; - if (in->dir->is_hashed()) continue; - if (already_exporting.count(in->dir)) continue; - - if (in->dir->is_frozen()) continue; // can't export this right now! - //if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now! - - // how popular? 
- double pop = in->dir->popularity[MDS_POP_CURDOM].meta_load(); - dir_sum += pop; - dout(20) << " pop " << pop << " " << *in->dir << endl; - - if (pop < minchunk) continue; + list dfls; + in->get_dirfrags(dfls); + for (list::iterator p = dfls.begin(); + p != dfls.end(); + ++p) { + CDir *dir = *p; + if (!dir->is_auth()) continue; + if (already_exporting.count(dir)) continue; + + if (dir->is_frozen()) continue; // can't export this right now! + //if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now! + + // how popular? + double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); + dir_sum += pop; + dout(20) << " pop " << pop << " " << *dir << endl; - // lucky find? - if (pop > needmin && pop < needmax) { - exports.push_back(in->dir); - have += pop; - return; + if (pop < minchunk) continue; + + // lucky find? + if (pop > needmin && pop < needmax) { + exports.push_back(dir); + have += pop; + return; + } + + if (pop > need) + bigger.push_back(dir); + else + smaller.insert(pair(pop, dir)); } - - if (pop > need) - bigger.push_back(in->dir); - else - smaller.insert(pair(pop, in->dir)); } dout(7) << " .. sum " << dir_sum << " / " << dir_pop << endl; @@ -713,7 +733,6 @@ void MDBalancer::hit_dir(CDir *dir, int type) if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) || //(v > g_conf.mds_bal_hash_wr && type == META_POP_IWR) || (v > g_conf.mds_bal_hash_wr && type == META_POP_DWR)) && - !(dir->is_hashed() || dir->is_hashing()) && hash_queue.count(dir->ino()) == 0) { dout(0) << "hit_dir " << type << " pop is " << v << ", putting in hash_queue: " << *dir << endl; hash_queue.insert(dir->ino()); @@ -746,7 +765,7 @@ void MDBalancer::hit_recursive(CDir *dir, int type) dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. 
rdp " << rdp << " adj " << rd_adj << endl; - dir->dir_rep = CDIR_REP_ALL; + dir->dir_rep = CDir::REP_ALL; mds->mdcache->send_dir_updates(dir, true); dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj); @@ -759,7 +778,7 @@ void MDBalancer::hit_recursive(CDir *dir, int type) // unreplicate dout(1) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; - dir->dir_rep = CDIR_REP_NONE; + dir->dir_rep = CDir::REP_NONE; mds->mdcache->send_dir_updates(dir); } } @@ -783,7 +802,7 @@ void MDBalancer::hit_recursive(CDir *dir, int type) in->popularity[MDS_POP_CURDOM].pop[type].hit(); } - if (dir->is_import()) + if (dir->is_subtree_root()) curdom = false; // end of auth domain, stop hitting auth counters. dir = dir->inode->get_parent_dir(); } @@ -797,7 +816,7 @@ void MDBalancer::subtract_export(CDir *dir) { meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - bool in_domain = !dir->is_import(); + bool in_domain = !dir->is_subtree_root(); while (true) { CInode *in = dir->inode; @@ -808,7 +827,7 @@ void MDBalancer::subtract_export(CDir *dir) dir = in->get_parent_dir(); if (!dir) break; - if (dir->is_import()) in_domain = false; + if (dir->is_subtree_root()) in_domain = false; dir->popularity[MDS_POP_ANYDOM] -= curdom; if (in_domain) dir->popularity[MDS_POP_CURDOM] -= curdom; @@ -820,7 +839,7 @@ void MDBalancer::add_import(CDir *dir) { meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - bool in_domain = !dir->is_import(); + bool in_domain = !dir->is_subtree_root(); while (true) { CInode *in = dir->inode; @@ -831,7 +850,7 @@ void MDBalancer::add_import(CDir *dir) dir = in->get_parent_dir(); if (!dir) break; - if (dir->is_import()) in_domain = false; + if (dir->is_subtree_root()) in_domain = false; dir->popularity[MDS_POP_ANYDOM] += curdom; if (in_domain) dir->popularity[MDS_POP_CURDOM] += curdom; @@ -846,7 +865,7 @@ void MDBalancer::add_import(CDir *dir) void MDBalancer::show_imports(bool external) { - mds->mdcache->show_imports(); + 
mds->mdcache->show_subtrees(); } diff --git a/trunk/ceph/mds/MDCache.cc b/trunk/ceph/mds/MDCache.cc index eb8ad591d6a35..ae8a90edb4adb 100644 --- a/trunk/ceph/mds/MDCache.cc +++ b/trunk/ceph/mds/MDCache.cc @@ -14,7 +14,6 @@ #include "MDCache.h" -#include "MDStore.h" #include "MDS.h" #include "Server.h" #include "Locker.h" @@ -22,7 +21,6 @@ #include "MDBalancer.h" #include "AnchorClient.h" #include "Migrator.h" -#include "Renamer.h" #include "MDSMap.h" @@ -39,15 +37,15 @@ #include "osdc/Filer.h" #include "events/EImportMap.h" +#include "events/EUpdate.h" +#include "events/ESlaveUpdate.h" #include "events/EString.h" -#include "events/EUnlink.h" #include "events/EPurgeFinish.h" #include "messages/MGenericMessage.h" #include "messages/MMDSImportMap.h" #include "messages/MMDSCacheRejoin.h" -#include "messages/MMDSCacheRejoinAck.h" #include "messages/MDiscover.h" #include "messages/MDiscoverReply.h" @@ -91,12 +89,11 @@ MDCache::MDCache(MDS *m) { mds = m; migrator = new Migrator(mds, this); - renamer = new Renamer(mds, this); + // renamer = new Renamer(mds, this); root = NULL; lru.lru_set_max(g_conf.mds_cache_size); lru.lru_set_midpoint(g_conf.mds_cache_mid); - did_shutdown_exports = false; did_shutdown_log_cap = false; shutdown_commits = 0; } @@ -104,7 +101,7 @@ MDCache::MDCache(MDS *m) MDCache::~MDCache() { delete migrator; - delete renamer; + //delete renamer; } @@ -130,14 +127,15 @@ bool MDCache::shutdown() if (lru.lru_get_size() > 0) { dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl; //show_cache(); - show_imports(); + show_subtrees(); //dump(); } return true; } -// MDCache +// ==================================================================== +// some inode functions CInode *MDCache::create_inode() { @@ -157,12 +155,6 @@ CInode *MDCache::create_inode() return in; } -void MDCache::destroy_inode(CInode *in) -{ - mds->idalloc->reclaim_id(in->ino()); - remove_inode(in); -} - void MDCache::add_inode(CInode *in) { @@ -174,18 +166,636 @@ void 
MDCache::add_inode(CInode *in) void MDCache::remove_inode(CInode *o) { dout(14) << "remove_inode " << *o << endl; + if (o->get_parent_dn()) { // FIXME: multiple parents? CDentry *dn = o->get_parent_dn(); assert(!dn->is_dirty()); - if (dn->is_sync()) - dn->dir->remove_dentry(dn); // unlink inode AND hose dentry - else - dn->dir->unlink_inode(dn); // leave dentry + dn->dir->unlink_inode(dn); // leave dentry ... FIXME? + } + + // remove from inode map + inode_map.erase(o->ino()); + + // delete it + delete o; + + if (o == root) root = 0; + if (o == stray) stray = 0; +} + + + +CInode *MDCache::create_root_inode() +{ + CInode *root = new CInode(this); + memset(&root->inode, 0, sizeof(inode_t)); + root->inode.ino = MDS_INO_ROOT; + + // make it up (FIXME) + root->inode.mode = 0755 | INODE_MODE_DIR; + root->inode.size = 0; + root->inode.ctime = + root->inode.mtime = g_clock.now(); + + root->inode.nlink = 1; + root->inode.layout = g_OSD_MDDirLayout; + + set_root( root ); + add_inode( root ); + + return root; +} + + +void MDCache::open_root(Context *c) +{ + int whoami = mds->get_nodeid(); + + // open root inode + if (whoami == 0) { + // i am root inode + CInode *root = create_root_inode(); + + // root directory too + CDir *dir = root->get_or_open_dirfrag(this, frag_t()); + adjust_subtree_auth(dir, 0); + dir->dir_rep = CDir::REP_ALL; //NONE; + + show_subtrees(); + + if (c) { + c->finish(0); + delete c; + } + } else { + // request inode from root mds + if (waiting_for_root.empty()) { + dout(7) << "discovering root" << endl; + + filepath want; + MDiscover *req = new MDiscover(whoami, + MDS_INO_ROOT, + want, + false); // there _is_ no base dir for the root inode + mds->send_message_mds(req, 0, MDS_PORT_CACHE); + } else { + dout(7) << "waiting for root" << endl; + } + + // wait + waiting_for_root.push_back(c); + + } +} + +CInode *MDCache::create_stray_inode(int whose) +{ + if (whose < 0) whose = mds->get_nodeid(); + stray = new CInode(this, whose == mds->get_nodeid()); + 
memset(&stray->inode, 0, sizeof(inode_t)); + stray->inode.ino = MDS_INO_STRAY(whose); + + // make it up (FIXME) + stray->inode.mode = 0755 | INODE_MODE_DIR; + stray->inode.size = 0; + stray->inode.ctime = + stray->inode.mtime = g_clock.now(); + + stray->inode.nlink = 1; + stray->inode.layout = g_OSD_MDDirLayout; + + add_inode( stray ); + + return stray; +} + +void MDCache::open_local_stray() +{ + create_stray_inode(); + CDir *dir = stray->get_or_open_dirfrag(this, frag_t()); + adjust_subtree_auth(dir, mds->get_nodeid()); +} + +void MDCache::open_foreign_stray(int who, Context *c) +{ + inodeno_t ino = MDS_INO_STRAY(who); + dout(10) << "open_foreign_stray mds" << who << " " << ino << endl; + assert(!have_inode(ino)); + + // discover + filepath want; + MDiscover *req = new MDiscover(who, + ino, + want, + false); + mds->send_message_mds(req, 0, MDS_PORT_CACHE); + + // wait + waiting_for_stray[ino].push_back(c); +} + + + + + + +// ==================================================================== +// subtree management + +/* + * adjust the dir_auth of a subtree. + * merge with parent and/or child subtrees, if is it appropriate. + * merge can ONLY happen if both parent and child have unambiguous auth. + */ +void MDCache::adjust_subtree_auth(CDir *dir, pair auth) +{ + dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth + << " on " << *dir << endl; + + show_subtrees(); + + CDir *root; + if (dir->ino() < MDS_INO_BASE) { + root = dir; // bootstrap hack. + if (subtrees.count(root) == 0) + subtrees[root].clear(); + } else { + root = get_subtree_root(dir); // subtree root + } + assert(root); + assert(subtrees.count(root)); + dout(7) << " current root is " << *root << endl; + + if (root == dir) { + // i am already a subtree. + dir->set_dir_auth(auth); + } else { + // i am a new subtree. + dout(10) << " new subtree at " << *dir << endl; + assert(subtrees.count(dir) == 0); + subtrees[dir].clear(); // create empty subtree bounds list for me. 
+ + // set dir_auth + dir->set_dir_auth(auth); + + // move items nested beneath me, under me. + set::iterator p = subtrees[root].begin(); + while (p != subtrees[root].end()) { + set::iterator next = p; + next++; + if (get_subtree_root((*p)->get_parent_dir()) == dir) { + // move under me + dout(10) << " claiming child bound " << **p << endl; + subtrees[dir].insert(*p); + subtrees[root].erase(p); + } + p = next; + } + + // i am a bound of the parent subtree. + subtrees[root].insert(dir); + + // i am now the subtree root. + root = dir; + } + + // adjust export pins + adjust_export_state(dir); + for (set::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) + adjust_export_state(*p); + + show_subtrees(); +} + + +/* + * any "export" point must be pinned in cache to ensure a proper + * chain of delegation. we do this by pinning when a dir is nonauth + * but the inode is auth. + * + * import points don't need to be pinned the same way simply because the + * exporting mds is pinning the exprot (as above) thus the dir is + * always open on the importer. + */ +void MDCache::adjust_export_state(CDir *dir) +{ + // be auth bit agnostic, so that we work during recovery + // (before recalc_auth_bits) + if (dir->authority().first != mds->get_nodeid() && + dir->inode->authority().first == mds->get_nodeid()) { + // export. + if (!dir->state_test(CDir::STATE_EXPORT)) { + dout(10) << "adjust_export_state pinning new export " << *dir << endl; + dir->state_set(CDir::STATE_EXPORT); + dir->get(CDir::PIN_EXPORT); + } + } + else { + // not export. 
+ if (dir->state_test(CDir::STATE_EXPORT)) { + dout(10) << "adjust_export_state unpinning old export " << *dir << endl; + dir->state_clear(CDir::STATE_EXPORT); + dir->put(CDir::PIN_EXPORT); + } + } +} + +void MDCache::try_subtree_merge(CDir *dir) +{ + dout(7) << "try_subtree_merge " << *dir << endl; + assert(subtrees.count(dir)); + set oldbounds = subtrees[dir]; + + // try merge at my root + try_subtree_merge_at(dir); + + // try merge at my old bounds + for (set::iterator p = oldbounds.begin(); + p != oldbounds.end(); + ++p) + try_subtree_merge_at(*p); + +} + +void MDCache::try_subtree_merge_at(CDir *dir) +{ + dout(10) << "try_subtree_merge_at " << *dir << endl; + assert(subtrees.count(dir)); + + // merge with parent? + CDir *parent = dir; + if (dir->ino() >= MDS_INO_BASE) + parent = get_subtree_root(dir->get_parent_dir()); + + if (parent != dir && // we have a parent, + parent->dir_auth == dir->dir_auth && // auth matches, + dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous, + !dir->state_test(CDir::STATE_EXPORTBOUND)) { // not an exportbound, + // merge with parent. + dout(10) << " subtree merge at " << *dir << endl; + dir->set_dir_auth(CDIR_AUTH_DEFAULT); + + // move our bounds under the parent + for (set::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) + subtrees[parent].insert(*p); + + // we are no longer a subtree or bound + subtrees.erase(dir); + subtrees[parent].erase(dir); + } + + show_subtrees(15); +} + + +void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth) +{ + dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth + << " on " << *dir + << " bounds " << bounds + << endl; + + show_subtrees(); + + CDir *root; + if (dir->ino() < MDS_INO_BASE) { + root = dir; // bootstrap hack. 
+ if (subtrees.count(root) == 0) + subtrees[root].clear(); + } else { + root = get_subtree_root(dir); // subtree root + } + assert(root); + assert(subtrees.count(root)); + dout(7) << " current root is " << *root << endl; + + pair oldauth = dir->authority(); + + if (root == dir) { + // i am already a subtree. + dir->set_dir_auth(auth); + } else { + // i am a new subtree. + dout(10) << " new subtree at " << *dir << endl; + assert(subtrees.count(dir) == 0); + subtrees[dir].clear(); // create empty subtree bounds list for me. + + // set dir_auth + dir->set_dir_auth(auth); + + // move items nested beneath me, under me. + set::iterator p = subtrees[root].begin(); + while (p != subtrees[root].end()) { + set::iterator next = p; + next++; + if (get_subtree_root((*p)->get_parent_dir()) == dir) { + // move under me + dout(10) << " claiming child bound " << **p << endl; + subtrees[dir].insert(*p); + subtrees[root].erase(p); + } + p = next; + } + + // i am a bound of the parent subtree. + subtrees[root].insert(dir); + + // i am now the subtree root. + root = dir; + } + + // verify/adjust bounds. + // - these may be new, or + // - beneath existing ambiguous bounds (which will be collapsed), + // - but NOT beneath unambiguous bounds. + for (set::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *bound = *p; + + // new bound? + if (subtrees[dir].count(bound) == 0) { + if (get_subtree_root(bound) == dir) { + dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << endl; + adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound. 
+ } + else { + dout(10) << " want bound " << *bound << endl; + // make sure it's nested beneath ambiguous subtree(s) + while (1) { + CDir *t = get_subtree_root(bound->get_parent_dir()); + if (t == dir) break; + while (subtrees[dir].count(t) == 0) + t = get_subtree_root(t->get_parent_dir()); + dout(10) << " swallowing intervening subtree at " << *t << endl; + adjust_subtree_auth(t, auth); + try_subtree_merge_at(t); + } + } + } + else { + dout(10) << " already have bound " << *bound << endl; + } + } + // merge stray bounds? + set::iterator p = subtrees[dir].begin(); + while (p != subtrees[dir].end()) { + set::iterator n = p; + n++; + if (bounds.count(*p) == 0) { + CDir *stray = *p; + dout(10) << " swallowing extra subtree at " << *stray << endl; + adjust_subtree_auth(stray, auth); + try_subtree_merge_at(stray); + } + p = n; } - inode_map.erase(o->ino()); // remove from map + + // bound should now match. + verify_subtree_bounds(dir, bounds); + + show_subtrees(); } +void MDCache::adjust_bounded_subtree_auth(CDir *dir, list& bound_dfs, pair auth) +{ + dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth + << " on " << *dir + << " bound_dfs " << bound_dfs + << endl; + + // make bounds list + set bounds; + for (list::iterator p = bound_dfs.begin(); + p != bound_dfs.end(); + ++p) { + CDir *bd = get_dirfrag(*p); + if (bd) + bounds.insert(bd); + } + + adjust_bounded_subtree_auth(dir, bounds, auth); +} + + + +CDir *MDCache::get_subtree_root(CDir *dir) +{ + // find the underlying dir that delegates (or is about to delegate) auth + while (true) { + if (dir->is_subtree_root()) + return dir; + dir = dir->get_parent_dir(); + if (!dir) + return 0; // none + } +} + +void MDCache::remove_subtree(CDir *dir) +{ + dout(10) << "remove_subtree " << *dir << endl; + assert(subtrees.count(dir)); + assert(subtrees[dir].empty()); + subtrees.erase(dir); + if (dir->get_parent_dir()) { + CDir *p = get_subtree_root(dir->get_parent_dir()); + 
assert(subtrees[p].count(dir)); + subtrees[p].erase(dir); + } +} + +void MDCache::get_subtree_bounds(CDir *dir, set& bounds) +{ + assert(subtrees.count(dir)); + bounds = subtrees[dir]; +} + +void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set& bounds) +{ + if (subtrees.count(dir)) { + // just copy them, dir is a subtree. + get_subtree_bounds(dir, bounds); + } else { + // find them + CDir *root = get_subtree_root(dir); + for (set::iterator p = subtrees[root].begin(); + p != subtrees[root].end(); + ++p) { + CDir *t = *p; + while (t != root) { + t = t->get_parent_dir(); + assert(t); + if (t == dir) { + bounds.insert(*p); + continue; + } + } + } + } +} + +void MDCache::verify_subtree_bounds(CDir *dir, const set& bounds) +{ + // for debugging only. + assert(subtrees.count(dir)); + if (bounds != subtrees[dir]) { + dout(0) << "verify_subtree_bounds failed" << endl; + set b = bounds; + for (set::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) { + if (bounds.count(*p)) { + b.erase(*p); + continue; + } + dout(0) << " missing bound " << **p << endl; + } + for (set::iterator p = b.begin(); + p != b.end(); + ++p) + dout(0) << " extra bound " << **p << endl; + } + assert(bounds == subtrees[dir]); +} + +void MDCache::verify_subtree_bounds(CDir *dir, const list& bounds) +{ + // for debugging only. + assert(subtrees.count(dir)); + + // make sure that any bounds i do have are properly noted as such. 
+ int failed = 0; + for (list::const_iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *bd = get_dirfrag(*p); + if (!bd) continue; + if (subtrees[dir].count(bd) == 0) { + dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << endl; + failed++; + } + } + assert(failed == 0); +} + +void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir) +{ + dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << endl; + + list dfls; + diri->get_dirfrags(dfls); + for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { + CDir *dir = *p; + + CDir *oldparent = get_subtree_root(olddir); + CDir *newparent = get_subtree_root(diri->get_parent_dir()); + + if (oldparent == newparent) { + dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << endl; + continue; + } + + if (dir->is_subtree_root()) { + // children are fine. change parent. + dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << endl; + assert(subtrees[oldparent].count(dir)); + subtrees[oldparent].erase(dir); + assert(subtrees.count(newparent)); + subtrees[newparent].insert(dir); + } else { + // mid-subtree. + + // see if any old bounds move to the new parent. + for (set::iterator p = subtrees[oldparent].begin(); + p != subtrees[oldparent].end(); + ++p) { + CDir *bound = *p; + CDir *broot = get_subtree_root(bound->get_parent_dir()); + if (broot != oldparent) { + assert(broot == newparent); + dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << endl; + subtrees[oldparent].erase(broot); + subtrees[newparent].insert(broot); + } + } + + // did auth change? + if (oldparent->authority() != newparent->authority()) + adjust_subtree_auth(dir, oldparent->authority()); // caller is responsible for *diri. 
+ } + } + +} + + +void MDCache::get_fullauth_subtrees(set& s) +{ + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *root = p->first; + if (root->is_full_dir_auth()) + s.insert(root); + } +} +void MDCache::get_auth_subtrees(set& s) +{ + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *root = p->first; + if (root->is_auth()) + s.insert(root); + } +} + + +// count. + +int MDCache::num_subtrees() +{ + return subtrees.size(); +} + +int MDCache::num_subtrees_fullauth() +{ + int n = 0; + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *root = p->first; + if (root->is_full_dir_auth()) + n++; + } + return n; +} + +int MDCache::num_subtrees_fullnonauth() +{ + int n = 0; + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *root = p->first; + if (root->is_full_dir_nonauth()) + n++; + } + return n; +} + + + + + + + +// ==================================================================== +// import map, recovery /* * take note of where we write import_maps in the log, as we need @@ -205,34 +815,34 @@ public: }; - void MDCache::log_import_map(Context *onsync) { - dout(10) << "log_import_map " << imports.size() << " imports, " - << exports.size() << " exports" << endl; + dout(10) << "log_import_map " << num_subtrees() << " subtrees" + << num_subtrees_fullauth() << " fullauth" + << endl; EImportMap *le = new EImportMap; + + // include all auth subtrees, and their bounds. + // and a spanning tree to tie it to the root. 
+ for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + if (!dir->is_auth()) continue; - // include import/export inodes, - // and a spanning tree to tie it to the root of the fs - for (set::iterator p = imports.begin(); - p != imports.end(); - p++) { - CDir *im = *p; - le->imports.insert(im->ino()); - le->metablob.add_dir_context(im, true); - le->metablob.add_dir(im, false); - - if (nested_exports.count(im)) { - for (set::iterator q = nested_exports[im].begin(); - q != nested_exports[im].end(); - ++q) { - CDir *ex = *q; - le->nested_exports[im->ino()].insert(ex->ino()); - le->exports.insert(ex->ino()); - le->metablob.add_dir_context(ex); - le->metablob.add_dir(ex, false); - } + le->imports.insert(dir->dirfrag()); + le->metablob.add_dir_context(dir, true); + le->metablob.add_dir(dir, false); + + // bounds + for (set::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + CDir *bound = *q; + le->bounds[dir->dirfrag()].insert(bound->dirfrag()); + le->metablob.add_dir_context(bound); + le->metablob.add_dir(bound, false); } } @@ -244,11 +854,19 @@ void MDCache::log_import_map(Context *onsync) } +void MDCache::send_import_map(int who) +{ + if (migrator->is_exporting()) + send_import_map_later(who); + else + send_import_map_now(who); +} - - -// ===================== -// recovery stuff +void MDCache::send_import_map_later(int who) +{ + dout(10) << "send_import_map_later to mds" << who << endl; + wants_import_map.insert(who); +} void MDCache::send_pending_import_maps() { @@ -256,11 +874,12 @@ void MDCache::send_pending_import_maps() return; // nothing to send. // only if it's appropriate! - if (migrator->is_exporting()) { - dout(7) << "send_pending_import_maps waiting, exports still in progress" << endl; + if (migrator->is_exporting() || + migrator->is_importing()) { + dout(7) << "send_pending_import_maps waiting, imports/exports still in progress" << endl; return; // not now } - + // ok, send them. 
for (set::iterator p = wants_import_map.begin(); p != wants_import_map.end(); @@ -269,55 +888,191 @@ void MDCache::send_pending_import_maps() wants_import_map.clear(); } -void MDCache::send_import_map(int who) -{ - if (migrator->is_exporting()) - send_import_map_later(who); - else - send_import_map_now(who); +void MDCache::send_import_map_now(int who) +{ + dout(10) << "send_import_map_now to mds" << who << endl; + MMDSImportMap *m = new MMDSImportMap; + + show_subtrees(); + + // known + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + p++) { + CDir *dir = p->first; + + // only our subtrees + if (dir->authority().first != mds->get_nodeid()) + continue; + + if (migrator->is_importing(dir->dirfrag())) { + // ambiguous (mid-import) + m->add_ambiguous_import(dir->dirfrag(), + migrator->get_import_bound_inos(dir->dirfrag())); + } else { + // not ambiguous. + m->add_import(dir->dirfrag()); + + // bounds too + for (set::iterator q = subtrees[dir].begin(); + q != subtrees[dir].end(); + ++q) { + CDir *bound = *q; + m->add_import_export(dir->dirfrag(), bound->dirfrag()); + } + } + } + + // ambiguous + for (map >::iterator p = my_ambiguous_imports.begin(); + p != my_ambiguous_imports.end(); + ++p) + m->add_ambiguous_import(p->first, p->second); + + // send + mds->send_message_mds(m, who, MDS_PORT_CACHE); +} + + +void MDCache::handle_mds_failure(int who) +{ + dout(7) << "handle_mds_failure mds" << who << endl; + + // make note of recovery set + mds->mdsmap->get_recovery_mds_set(recovery_set); + recovery_set.erase(mds->get_nodeid()); + dout(1) << "my recovery peers will be " << recovery_set << endl; + + // adjust my recovery lists + wants_import_map.erase(who); // MDS will ask again + got_import_map.erase(who); // i'll get another. + rejoin_ack_gather.erase(who); // i'll need/get another. + + // adjust subtree auth + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + // only if we are a _bystander_. 
+ if (dir->dir_auth.first == who && + dir->dir_auth.second >= 0 && + dir->dir_auth.second != mds->get_nodeid()) { + dout(7) << "disambiguating auth for " << *dir << endl; + adjust_subtree_auth(dir, dir->dir_auth.second); + try_subtree_merge(dir); + } + else if (dir->dir_auth.second == who && + dir->dir_auth.first != mds->get_nodeid()) { + dout(7) << "disambiguating auth for " << *dir << endl; + adjust_subtree_auth(dir, dir->dir_auth.first); + try_subtree_merge(dir); + } + } + + // tell the migrator too. + migrator->handle_mds_failure(who); + + // kick any dir discovers that are waiting + hash_map >::iterator p = dir_discovers.begin(); + while (p != dir_discovers.end()) { + hash_map >::iterator n = p; + n++; + + // waiting on this mds? + if (p->second.count(who)) { + CInode *in = get_inode(p->first); + assert(in); + + // take waiters + list waiters; + in->take_waiting(CInode::WAIT_DIR, waiters); + mds->queue_waiters(waiters); + dout(10) << "kicking WAIT_DIR on " << *in << endl; + + // remove from mds list + p->second.erase(who); + if (p->second.empty()) + dir_discovers.erase(p); + } + p = n; + } + + // clean up any slave requests from this node + list ls; + for (hash_map::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) + if (p->second->by_mds == who) + ls.push_back(p->second); + while (!ls.empty()) { + dout(10) << "cleaning up slave request " << *ls.front() << endl; + request_finish(ls.front()); + ls.pop_front(); + } + + show_subtrees(); } -void MDCache::send_import_map_now(int who) +/* + * handle_mds_recovery - called on another node's transition + * from resolve -> active. 
+ */ +void MDCache::handle_mds_recovery(int who) { - dout(10) << "send_import_map to mds" << who << endl; - - MMDSImportMap *m = new MMDSImportMap; + dout(7) << "handle_mds_recovery mds" << who << endl; - // known - for (set::iterator p = imports.begin(); - p != imports.end(); - p++) { - CDir *im = *p; + list waiters; - if (migrator->is_importing(im->ino())) { - // ambiguous (mid-import) - m->add_ambiguous_import(im->ino(), - migrator->get_import_bounds(im->ino())); - } else { - // not ambiguous. - m->add_import(im->ino()); - - if (nested_exports.count(im)) { - for (set::iterator q = nested_exports[im].begin(); - q != nested_exports[im].end(); - ++q) { - CDir *ex = *q; - m->add_import_export(im->ino(), ex->ino()); + // wake up any waiters in their subtrees + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + + if (dir->authority().first != who) continue; + assert(!dir->is_auth()); + + // wake any waiters + list q; + q.push_back(dir); + + while (!q.empty()) { + CDir *d = q.front(); + q.pop_front(); + d->take_waiting(CDir::WAIT_ANY, waiters); + + // inode waiters too + for (CDir_map_t::iterator p = d->items.begin(); + p != d->items.end(); + ++p) { + CDentry *dn = p->second; + if (dn->is_primary()) { + dn->get_inode()->take_waiting(CInode::WAIT_ANY, waiters); + + // recurse? + list ls; + dn->get_inode()->get_dirfrags(ls); + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) { + CDir *subdir = *p; + if (!subdir->is_subtree_root()) + q.push_back(subdir); + } } } } } - // ambiguous - for (map >::iterator p = my_ambiguous_imports.begin(); - p != my_ambiguous_imports.end(); - ++p) - m->add_ambiguous_import(p->first, p->second); - - // second - mds->send_message_mds(m, who, MDS_PORT_CACHE); + // queue them up. 
+ mds->queue_waiters(waiters); } +void MDCache::set_recovery_set(set& s) +{ + dout(7) << "set_recovery_set " << s << endl; + recovery_set = s; +} /* @@ -331,55 +1086,74 @@ void MDCache::handle_import_map(MMDSImportMap *m) dout(7) << "handle_import_map from " << m->get_source() << endl; int from = m->get_source().num(); - // FIXME: check if we are a surviving ambiguous importer - // update my dir_auth values - for (map >::iterator pi = m->imap.begin(); + for (map >::iterator pi = m->imap.begin(); pi != m->imap.end(); ++pi) { - CInode *imi = get_inode(pi->first); - if (!imi) continue; - CDir *im = imi->dir; - if (!im) continue; - - im->set_dir_auth(from); - - for (set::iterator pe = pi->second.begin(); - pe != pi->second.end(); - ++pe) { - CInode *exi = get_inode(*pe); - if (!exi) continue; - CDir *ex = exi->dir; - if (!ex) continue; - - if (ex->get_dir_auth() == CDIR_AUTH_PARENT) - ex->set_dir_auth(CDIR_AUTH_UNKNOWN); + CDir *im = get_dirfrag(pi->first); + if (im) { + adjust_bounded_subtree_auth(im, pi->second, from); + try_subtree_merge(im); } } - // note ambiguous imports too - for (map >::iterator pi = m->ambiguous_imap.begin(); - pi != m->ambiguous_imap.end(); - ++pi) - mds->mdcache->other_ambiguous_imports[from][pi->first].swap( pi->second ); + // am i a surviving ambiguous importer? 
+ if (mds->is_active() || mds->is_stopping()) { + // check for any import success/failure (from this node) + map >::iterator p = my_ambiguous_imports.begin(); + while (p != my_ambiguous_imports.end()) { + map >::iterator n = p; + n++; + CDir *dir = get_dirfrag(p->first); + assert(dir); + dout(10) << "checking ambiguous import " << *dir << endl; + assert(migrator->is_importing(dir->dirfrag())); + assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); + if (migrator->get_import_peer(dir->dirfrag()) == from) { + if (dir->is_ambiguous_dir_auth()) { + dout(7) << "ambiguous import succeeded on " << *dir << endl; + migrator->import_finish(dir, true); // don't wait for log flush + } else { + dout(7) << "ambiguous import failed on " << *dir << endl; + migrator->import_reverse(dir, false); // don't adjust dir_auth. + } + my_ambiguous_imports.erase(p); + } + p = n; + } + } - // did i get them all? - got_import_map.insert(from); - - if (got_import_map == recovery_set) { - dout(10) << "got all import maps, ready to rejoin" << endl; - disambiguate_imports(); - recalc_auth_bits(); - trim_non_auth(); + show_subtrees(); - // move to rejoin state - mds->set_want_state(MDSMap::STATE_REJOIN); - - } else { - dout(10) << "still waiting for more importmaps, got " << got_import_map - << ", need " << recovery_set << endl; + + // recovering? + if (!mds->is_rejoin() && !mds->is_active() && !mds->is_stopping()) { + // note ambiguous imports too.. unless i'm already active + for (map >::iterator pi = m->ambiguous_imap.begin(); + pi != m->ambiguous_imap.end(); + ++pi) { + dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << endl; + other_ambiguous_imports[from][pi->first].swap( pi->second ); + } + + // did i get them all? 
+ got_import_map.insert(from); + + if (got_import_map == recovery_set) { + dout(10) << "got all import maps, done resolving subtrees" << endl; + disambiguate_imports(); + recalc_auth_bits(); + trim_non_auth(); + + // reconnect clients + mds->set_want_state(MDSMap::STATE_RECONNECT); + + } else { + dout(10) << "still waiting for more importmaps, got " << got_import_map + << ", need " << recovery_set << endl; + } } - + delete m; } @@ -388,38 +1162,28 @@ void MDCache::disambiguate_imports() { dout(10) << "disambiguate_imports" << endl; + // FIXME what about surviving bystanders + // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.begin(); + for (map > >::iterator p = other_ambiguous_imports.begin(); + p != other_ambiguous_imports.end(); ++p) { int who = p->first; + dout(10) << "ambiguous imports for mds" << who << endl; - for (map >::iterator q = p->second.begin(); + for (map >::iterator q = p->second.begin(); q != p->second.end(); ++q) { - CInode *diri = get_inode(q->first); - if (!diri) continue; - CDir *dir = diri->dir; + dout(10) << " ambiguous import " << q->first << " bounds " << q->second << endl; + CDir *dir = get_dirfrag(q->first); if (!dir) continue; - if (dir->authority() >= CDIR_AUTH_UNKNOWN) { - dout(10) << "mds" << who << " did not import " << *dir << endl; - } else { + if (dir->authority().first == CDIR_AUTH_UNKNOWN) { dout(10) << "mds" << who << " did import " << *dir << endl; - int was = dir->authority(); - dir->set_dir_auth(who); - - for (set::iterator r = q->second.begin(); - r != q->second.end(); - ++r) { - CInode *exi = get_inode(q->first); - if (!exi) continue; - CDir *ex = exi->dir; - if (!ex) continue; - if (ex->get_dir_auth() == CDIR_AUTH_PARENT) - ex->set_dir_auth(was); - dout(10) << " bound " << *ex << endl; - } + adjust_bounded_subtree_auth(dir, q->second, who); + try_subtree_merge(dir); + } else { + dout(10) << "mds" << who << " did not import " << *dir << endl; } } 
} @@ -427,14 +1191,12 @@ void MDCache::disambiguate_imports() // my ambiguous imports while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); + map >::iterator q = my_ambiguous_imports.begin(); - CInode *diri = get_inode(q->first); - if (!diri) continue; - CDir *dir = diri->dir; + CDir *dir = get_dirfrag(q->first); if (!dir) continue; - if (dir->authority() != CDIR_AUTH_UNKNOWN) { + if (dir->authority().first != CDIR_AUTH_UNKNOWN) { dout(10) << "ambiguous import auth known, must not be me " << *dir << endl; cancel_ambiguous_import(q->first); } else { @@ -444,166 +1206,119 @@ void MDCache::disambiguate_imports() } assert(my_ambiguous_imports.empty()); - show_imports(); + show_subtrees(); } -void MDCache::cancel_ambiguous_import(inodeno_t dirino) -{ - assert(my_ambiguous_imports.count(dirino)); - dout(10) << "cancel_ambiguous_import " << dirino - << " bounds " << my_ambiguous_imports[dirino] - << endl; - my_ambiguous_imports.erase(dirino); -} -void MDCache::finish_ambiguous_import(inodeno_t dirino) +void MDCache::add_ambiguous_import(dirfrag_t base, list& bounds) { - assert(my_ambiguous_imports.count(dirino)); - set bounds; - bounds.swap(my_ambiguous_imports[dirino]); - my_ambiguous_imports.erase(dirino); - - dout(10) << "finish_ambiguous_import " << dirino - << " bounds " << bounds - << endl; - - CInode *diri = get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - - // adjust dir_auth - CDir *im = dir; - if (dir->get_inode()->authority() == mds->get_nodeid()) { - // parent is already me. adding to existing import. - im = get_auth_container(dir); - if (!im) im = dir; - nested_exports[im].erase(dir); - exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. 
- imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } + assert(my_ambiguous_imports.count(base) == 0); + my_ambiguous_imports[base].swap( bounds ); +} - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - // bounds (exports, before) - for (set::iterator p = bounds.begin(); +void MDCache::add_ambiguous_import(CDir *base, const set& bounds) +{ + // make a list + list binos; + for (set::iterator p = bounds.begin(); p != bounds.end(); - ++p) { - CInode *bi = get_inode(*p); - assert(bi); - CDir *bd = bi->dir; - assert(bd); - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. - for (set::iterator q = nested_exports[bd].begin(); - q != nested_exports[bd].end(); - ++q) - nested_exports[im].insert(*q); - nested_exports.erase(bd); + ++p) + binos.push_back((*p)->dirfrag()); + + // note: this can get called twice if the exporter fails during recovery + if (my_ambiguous_imports.count(base->dirfrag())) + my_ambiguous_imports.erase(base->dirfrag()); - } else { - // not me anymore. now an export. 
- exports.insert(bd); - nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - dout(10) << " bound " << *bd << endl; - } + add_ambiguous_import(base->dirfrag(), binos); } -void MDCache::finish_ambiguous_export(inodeno_t dirino, set& bounds) +void MDCache::cancel_ambiguous_import(dirfrag_t df) { - CInode *diri = get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; - assert(dir); + assert(my_ambiguous_imports.count(df)); + dout(10) << "cancel_ambiguous_import " << df + << " bounds " << my_ambiguous_imports[df] + << endl; + my_ambiguous_imports.erase(df); +} + +void MDCache::finish_ambiguous_import(dirfrag_t df) +{ + assert(my_ambiguous_imports.count(df)); + list bound_inos; + bound_inos.swap(my_ambiguous_imports[df]); + my_ambiguous_imports.erase(df); - dout(10) << "finish_ambiguous_export " << dirino - << " bounds " << bounds + dout(10) << "finish_ambiguous_import " << df + << " bounds " << bound_inos << endl; + CDir *dir = get_dirfrag(df); + assert(dir); - // adjust dir_auth - CDir *im = get_auth_container(dir); - if (dir->get_inode()->authority() == CDIR_AUTH_UNKNOWN) { - // was an import, hose it - assert(im == dir); - assert(imports.count(dir)); - imports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_IMPORT); - dir->put(CDir::PIN_IMPORT); - } else { - // i'm now an export - exports.insert(dir); - nested_exports[im].insert(dir); - dir->set_dir_auth( CDIR_AUTH_UNKNOWN ); // not me - dir->state_set(CDIR_STATE_EXPORT); - dir->get(CDir::PIN_EXPORT); - } - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds (there were exports, before) - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CInode *bi = get_inode(*p); - assert(bi); - CDir *bd = bi->dir; - assert(bd); - - // hose export - assert(exports.count(bd)); 
- exports.erase(bd); - nested_exports[im].erase(bd); - - // fix dir_auth - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_PARENT ); // not me + // adjust dir_auth, import maps + adjust_bounded_subtree_auth(dir, bound_inos, mds->get_nodeid()); + try_subtree_merge(dir); +} - bd->state_clear(CDIR_STATE_EXPORT); - bd->put(CDir::PIN_EXPORT); - dout(10) << " bound " << *bd << endl; - } - - show_imports(); -} +/* + * once subtree auth is disambiguated, we need to adjust all the + * auth (and dirty) bits in our cache before moving on. + */ +void MDCache::recalc_auth_bits() +{ + dout(7) << "recalc_auth_bits" << endl; + for (hash_map::iterator p = inode_map.begin(); + p != inode_map.end(); + ++p) { + CInode *in = p->second; + if (in->authority().first == mds->get_nodeid()) + in->state_set(CInode::STATE_AUTH); + else { + in->state_clear(CInode::STATE_AUTH); + if (in->is_dirty()) + in->mark_clean(); + } + if (in->parent) { + if (in->parent->authority().first == mds->get_nodeid()) + in->parent->state_set(CDentry::STATE_AUTH); + else { + in->parent->state_clear(CDentry::STATE_AUTH); + if (in->parent->is_dirty()) + in->parent->mark_clean(); + } + } + list ls; + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) { + CDir *dir = *p; + if (dir->authority().first == mds->get_nodeid()) + dir->state_set(CDir::STATE_AUTH); + else { + dir->state_clear(CDir::STATE_AUTH); + if (dir->is_dirty()) + dir->mark_clean(); + } + } + } + show_subtrees(); + show_cache(); +} /* * rejoin phase! * we start out by sending rejoins to everyone in the recovery set. * - * if _were_ are rejoining, send for all regions in our cache. + * if we are rejoin, send for all regions in our cache. * if we are active|stopping, send only to nodes that are are rejoining. 
*/ void MDCache::send_cache_rejoins() { - dout(10) << "send_cache_rejoins " << endl; + dout(10) << "send_cache_rejoins with recovery_set " << recovery_set << endl; map rejoins; @@ -615,38 +1330,27 @@ void MDCache::send_cache_rejoins() if (*p == mds->get_nodeid()) continue; // nothing to myself! if (mds->is_rejoin() || mds->mdsmap->is_rejoin(*p)) - rejoins[*p] = new MMDSCacheRejoin; + rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_REJOIN); } - // build list of dir_auth regions - list dir_auth_regions; - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); + assert(!migrator->is_importing()); + assert(!migrator->is_exporting()); + + // check all subtrees + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); ++p) { - if (!p->second->is_dir()) continue; - if (!p->second->dir) continue; - if (p->second->dir->get_dir_auth() == CDIR_AUTH_PARENT) continue; + CDir *dir = p->first; + assert(dir->is_subtree_root()); + assert(!dir->is_ambiguous_dir_auth()); - int auth = p->second->dir->get_dir_auth(); + int auth = dir->get_dir_auth().first; assert(auth >= 0); - - if (auth == mds->get_nodeid()) continue; // skip my own regions! - if (rejoins.count(auth) == 0) - continue; // don't care about this node's regions - - // add to list - dout(10) << " on mds" << auth << " region " << *p->second << endl; - dir_auth_regions.push_back(p->second->dir); - } + if (auth == mds->get_nodeid()) continue; // skip my own regions! + if (rejoins.count(auth) == 0) continue; // don't care about this node's regions - // walk the regions - for (list::iterator p = dir_auth_regions.begin(); - p != dir_auth_regions.end(); - ++p) { - CDir *dir = *p; - int to = dir->authority(); - cache_rejoin_walk(dir, rejoins[to]); + cache_rejoin_walk(dir, rejoins[auth]); } // send the messages @@ -659,19 +1363,22 @@ void MDCache::send_cache_rejoins() } // nothing? 
- if (rejoins.empty()) { + if (mds->is_rejoin() && rejoins.empty()) { dout(10) << "nothing to rejoin, going active" << endl; mds->set_want_state(MDSMap::STATE_ACTIVE); } } - void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) { dout(10) << "cache_rejoin_walk " << *dir << endl; - rejoin->add_dir(dir->ino()); + //if (mds->is_rejoin()) + rejoin->add_weak_dirfrag(dir->dirfrag()); + //else + //rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce()); + list nested; // finish this dir, then do nested items // walk dentries @@ -679,18 +1386,52 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) p != dir->items.end(); ++p) { // dentry - rejoin->add_dentry(dir->ino(), p->first); + CDentry *dn = p->second; + if (mds->is_rejoin()) + rejoin->add_weak_dentry(dir->dirfrag(), p->first); + else { + rejoin->add_strong_dentry(dir->dirfrag(), p->first, + dn->get_replica_nonce(), + dn->lock.get_state()); + if (dn->lock.is_xlocked()) + rejoin->add_dentry_xlock(dir->dirfrag(), p->first, + dn->lock.get_xlocked_by()->reqid); + } // inode? 
- if (p->second->is_primary() && p->second->get_inode()) { - CInode *in = p->second->get_inode(); - rejoin->add_inode(in->ino(), - in->get_caps_wanted()); + if (dn->is_primary() && dn->get_inode()) { + CInode *in = dn->get_inode(); + if (mds->is_rejoin() && in->get_caps_wanted() == 0) + rejoin->add_weak_inode(in->ino()); + else { + rejoin->add_strong_inode(in->ino(), in->get_replica_nonce(), + in->get_caps_wanted(), + in->authlock.get_state(), + in->linklock.get_state(), + in->dirfragtreelock.get_state(), + in->filelock.get_state()); + if (in->authlock.is_xlocked()) + rejoin->add_inode_xlock(in->ino(), in->authlock.get_type(), + in->authlock.get_xlocked_by()->reqid); + if (in->linklock.is_xlocked()) + rejoin->add_inode_xlock(in->ino(), in->linklock.get_type(), + in->linklock.get_xlocked_by()->reqid); + if (in->dirfragtreelock.is_xlocked()) + rejoin->add_inode_xlock(in->ino(), in->dirfragtreelock.get_type(), + in->dirfragtreelock.get_xlocked_by()->reqid); + if (in->filelock.is_xlocked()) + rejoin->add_inode_xlock(in->ino(), in->filelock.get_type(), + in->filelock.get_xlocked_by()->reqid); + } - // dir? - if (in->dir && - in->dir->get_dir_auth() == CDIR_AUTH_PARENT) - nested.push_back(in->dir); + // dirfrags in this subtree? + list dfs; + in->get_dirfrags(dfs); + for (list::iterator p = dfs.begin(); + p != dfs.end(); + ++p) + if (!(*p)->is_subtree_root()) + nested.push_back(*p); } } @@ -704,7 +1445,6 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) /* * i got a rejoin. 
- * * - reply with the lockstate * * if i am active|stopping, @@ -712,20 +1452,49 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) */ void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) { - dout(7) << "handle_cache_rejoin from " << m->get_source() << endl; + dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() << endl; + + switch (m->op) { + case MMDSCacheRejoin::OP_REJOIN: + handle_cache_rejoin_rejoin(m); + break; + + case MMDSCacheRejoin::OP_ACK: + handle_cache_rejoin_ack(m); + break; + + case MMDSCacheRejoin::OP_MISSING: + handle_cache_rejoin_missing(m); + break; + + case MMDSCacheRejoin::OP_FULL: + handle_cache_rejoin_full(m); + break; + + default: + assert(0); + } + delete m; +} + +void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) +{ int from = m->get_source().num(); - MMDSCacheRejoinAck *ack = new MMDSCacheRejoinAck; + // do immediate ack? + MMDSCacheRejoin *ack = 0; + MMDSCacheRejoin *missing = 0; if (mds->is_active() || mds->is_stopping()) { - dout(10) << "removing stale cache replicas" << endl; + dout(10) << "i am active. 
removing stale cache replicas" << endl; + // first, scour cache of replica references for (hash_map::iterator p = inode_map.begin(); p != inode_map.end(); ++p) { // inode CInode *in = p->second; - if (in->is_replica(from) && m->inodes.count(p->first) == 0) { + if (in->is_replica(from) && m->weak_inodes.count(p->first) == 0) { inode_remove_replica(in, from); dout(10) << " rem " << *in << endl; } @@ -734,132 +1503,411 @@ void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) if (in->parent) { CDentry *dn = in->parent; if (dn->is_replica(from) && - (m->dentries.count(dn->get_dir()->ino()) == 0 || - m->dentries[dn->get_dir()->ino()].count(dn->get_name()) == 0)) { + (m->weak_dentries.count(dn->get_dir()->dirfrag()) == 0 || + m->weak_dentries[dn->get_dir()->dirfrag()].count(dn->get_name()) == 0)) { dn->remove_replica(from); dout(10) << " rem " << *dn << endl; } } // dir - if (in->dir) { - CDir *dir = in->dir; - if (dir->is_replica(from) && m->dirs.count(p->first) == 0) { + list dfs; + in->get_dirfrags(dfs); + for (list::iterator p = dfs.begin(); + p != dfs.end(); + ++p) { + CDir *dir = *p; + if (dir->is_replica(from) && m->weak_dirfrags.count(dir->dirfrag()) == 0) { dir->remove_replica(from); dout(10) << " rem " << *dir << endl; } } } - } else { - assert(mds->is_rejoin()); + + // do immediate ack. 
+ ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); + } + + // dirs + for (set::iterator p = m->weak_dirfrags.begin(); + p != m->weak_dirfrags.end(); + ++p) { + CDir *dir = get_dirfrag(*p); + if (dir) { + int nonce = dir->add_replica(from); + dout(10) << " have " << *dir << endl; + if (ack) + ack->add_strong_dirfrag(*p, nonce); + + // dentries + for (set::iterator q = m->weak_dentries[*p].begin(); + q != m->weak_dentries[*p].end(); + ++q) { + CDentry *dn = dir->lookup(*q); + if (dn) { + int nonce = dn->add_replica(from); + dout(10) << " have " << *dn << endl; + ack->add_strong_dentry(*p, *q, dn->lock.get_state(), nonce); + } else { + dout(10) << " missing " << *p << " " << *q << endl; + if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); + missing->add_weak_dentry(*p, *q); + } + if (ack) + ack->add_strong_dentry(*p, *q, nonce, dn->lock.get_state()); + } + } else { + dout(10) << " missing " << *p << endl; + if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); + missing->add_weak_dirfrag(*p); + + // dentries + for (set::iterator q = m->weak_dentries[*p].begin(); + q != m->weak_dentries[*p].end(); + ++q) + missing->add_weak_dentry(*p, *q); + } + } + + // inodes + for (set::iterator p = m->weak_inodes.begin(); + p != m->weak_inodes.end(); + ++p) { + CInode *in = get_inode(*p); + if (in) { + int nonce = in->add_replica(from); + in->mds_caps_wanted.erase(from); + in->authlock.remove_gather(from); // just in case + in->linklock.remove_gather(from); // just in case + in->dirfragtreelock.remove_gather(from); // just in case + in->filelock.remove_gather(from); // just in case + dout(10) << " have (weak) " << *in << endl; + if (ack) + ack->add_strong_inode(in->ino(), + nonce, + 0, + in->authlock.get_replica_state(), + in->linklock.get_replica_state(), + in->dirfragtreelock.get_replica_state(), + in->filelock.get_replica_state()); + } else { + dout(10) << " missing " << *p << endl; + if (!missing) missing = new 
MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); + missing->add_weak_inode(*p); + } + } + + // strong inodes too? + for (map::iterator p = m->strong_inodes.begin(); + p != m->strong_inodes.end(); + ++p) { + CInode *in = get_inode(p->first); + if (in) { + int nonce = in->add_replica(from); + if (p->second.caps_wanted) + in->mds_caps_wanted[from] = p->second.caps_wanted; + else + in->mds_caps_wanted.erase(from); + in->authlock.remove_gather(from); // just in case + in->linklock.remove_gather(from); // just in case + in->dirfragtreelock.remove_gather(from); // just in case + in->filelock.remove_gather(from); // just in case + dout(10) << " have (strong) " << *in << endl; + if (ack) { + ack->add_strong_inode(in->ino(), + nonce, + 0, + in->authlock.get_replica_state(), + in->linklock.get_replica_state(), + in->dirfragtreelock.get_replica_state(), + in->filelock.get_replica_state()); + } else { + // note strong replica filelock state requests + //if (p->second.filelock & CAP_FILE_RD) + //filelock_replica_readers.insert(in); + } + } else { + dout(10) << " missing " << p->first << endl; + if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); + missing->add_weak_inode(p->first); + } + } + + // xlocks + for (map >::iterator p = m->xlocked_inodes.begin(); + p != m->xlocked_inodes.end(); + ++p) { + for (map::iterator q = p->second.begin(); + q != p->second.end(); + q++) { + CInode *in = get_inode(p->first); + if (!in) continue; // already missing, from strong_inodes list above. 
+ + dout(10) << " inode xlock by " << q->second << " on " << *in << endl; + + // create slave mdrequest + MDRequest *mdr = request_start(q->second); + + // auth_pin + mdr->auth_pin(in); + + // xlock + SimpleLock *lock = in->get_lock(q->first); + lock->set_state(LOCK_LOCK); + lock->get_xlock(mdr); + mdr->xlocks.insert(lock); + mdr->locks.insert(lock); + } + } + for (map >::iterator p = m->xlocked_dentries.begin(); + p != m->xlocked_dentries.end(); + ++p) { + CDir *dir = get_dirfrag(p->first); + if (!dir) continue; // already missing, from above. + for (map::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + CDentry *dn = dir->lookup(q->first); + if (!dn) continue; // already missing, from above. + dout(10) << " dn xlock by " << q->second << " on " << *dn << endl; + + // create slave mdrequest + MDRequest *mdr = request_start(q->second); + + // auth_pin + mdr->auth_pin(dn->dir); + + // xlock + dn->lock.set_state(LOCK_LOCK); + dn->lock.get_xlock(mdr); + mdr->xlocks.insert(&dn->lock); + mdr->locks.insert(&dn->lock); + } } + + // send ack? + if (ack) + mds->send_message_mds(ack, from, MDS_PORT_CACHE); + else + want_rejoin_ack.insert(from); + + // send missing? 
+ if (missing) + mds->send_message_mds(missing, from, MDS_PORT_CACHE); +} + +void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *m) +{ + dout(7) << "handle_cache_rejoin_ack from " << m->get_source() << endl; + int from = m->get_source().num(); + // dirs - for (set::iterator p = m->dirs.begin(); - p != m->dirs.end(); + for (map::iterator p = m->strong_dirfrags.begin(); + p != m->strong_dirfrags.end(); ++p) { - CInode *diri = get_inode(*p); - assert(diri); - CDir *dir = diri->dir; + CDir *dir = get_dirfrag(p->first); assert(dir); - int nonce = dir->add_replica(from); - dout(10) << " has " << *dir << endl; - ack->add_dir(*p, nonce); - + + dir->set_replica_nonce(p->second.nonce); + dout(10) << " got " << *dir << endl; + // dentries - for (set::iterator q = m->dentries[*p].begin(); - q != m->dentries[*p].end(); + for (map::iterator q = m->strong_dentries[p->first].begin(); + q != m->strong_dentries[p->first].end(); ++q) { - CDentry *dn = dir->lookup(*q); + CDentry *dn = dir->lookup(q->first); assert(dn); - int nonce = dn->add_replica(from); - dout(10) << " has " << *dn << endl; - ack->add_dentry(*p, *q, dn->get_lockstate(), nonce); + dn->set_replica_nonce(q->second.nonce); + dn->lock.set_state(q->second.lock); + dout(10) << " got " << *dn << endl; } } // inodes - for (map::iterator p = m->inodes.begin(); - p != m->inodes.end(); + for (map::iterator p = m->strong_inodes.begin(); + p != m->strong_inodes.end(); ++p) { CInode *in = get_inode(p->first); assert(in); - int nonce = in->add_replica(from); - if (p->second) - in->mds_caps_wanted[from] = p->second; - else - in->mds_caps_wanted.erase(from); - in->hardlock.gather_set.erase(from); // just in case - in->filelock.gather_set.erase(from); // just in case - dout(10) << " has " << *in << endl; - ack->add_inode(p->first, - in->hardlock.get_replica_state(), in->filelock.get_replica_state(), - nonce); + in->set_replica_nonce(p->second.nonce); + in->authlock.set_state(p->second.authlock); + 
in->linklock.set_state(p->second.linklock); + in->dirfragtreelock.set_state(p->second.dirfragtreelock); + in->filelock.set_state(p->second.filelock); + dout(10) << " got " << *in << endl; + } + + // done? + rejoin_ack_gather.erase(from); + if (mds->is_rejoin() && + rejoin_ack_gather.empty()) { + dout(7) << "all done, going active!" << endl; + send_cache_rejoin_acks(); + + show_subtrees(); + show_cache(); + mds->set_want_state(MDSMap::STATE_ACTIVE); + } else { + dout(7) << "still need rejoin_ack from " << rejoin_ack_gather << endl; } - // send ack - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - - delete m; } -void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoinAck *m) +void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *m) { - dout(7) << "handle_cache_rejoin from " << m->get_source() << endl; - int from = m->get_source().num(); - + dout(7) << "handle_cache_rejoin_missing from " << m->get_source() << endl; + + MMDSCacheRejoin *full = new MMDSCacheRejoin(MMDSCacheRejoin::OP_FULL); + // dirs - for (list::iterator p = m->dirs.begin(); - p != m->dirs.end(); + for (set::iterator p = m->weak_dirfrags.begin(); + p != m->weak_dirfrags.end(); ++p) { - CInode *diri = get_inode(p->dirino); - CDir *dir = diri->dir; + CDir *dir = get_dirfrag(*p); assert(dir); + dout(10) << " sending " << *dir << endl; + + // dentries + for (set::iterator q = m->weak_dentries[*p].begin(); + q != m->weak_dentries[*p].end(); + ++q) { + CDentry *dn = dir->lookup(*q); + assert(dn); + dout(10) << " sending " << *dn << endl; + if (mds->is_rejoin()) + full->add_weak_dentry(*p, *q); + else + full->add_strong_dentry(*p, *q, dn->get_replica_nonce(), dn->lock.get_state()); + } + } + + // inodes + for (set::iterator p = m->weak_inodes.begin(); + p != m->weak_inodes.end(); + ++p) { + CInode *in = get_inode(*p); + assert(in); + + dout(10) << " sending " << *in << endl; + full->add_full_inode(in->inode, in->symlink, in->dirfragtree); + if (mds->is_rejoin()) + full->add_weak_inode(in->ino()); + else 
+ full->add_strong_inode(in->ino(), + in->get_replica_nonce(), + in->get_caps_wanted(), + in->authlock.get_replica_state(), + in->linklock.get_replica_state(), + in->dirfragtreelock.get_replica_state(), + in->filelock.get_replica_state()); + } + + mds->send_message_mds(full, m->get_source().num(), MDS_PORT_CACHE); +} + +void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *m) +{ + assert(0); // write me +} + +void MDCache::send_cache_rejoin_acks() +{ + dout(7) << "send_cache_rejoin_acks to " << want_rejoin_ack << endl; + + assert(mds->is_rejoin()); - dir->set_replica_nonce(p->nonce); - dout(10) << " got " << *dir << endl; + /* nope, not necessary, we adjust lock state gradually. + after we've processed all rejoins, lockstate is legal. + we just have to do a final _eval-ish thing at the end... - // dentries - for (map::iterator q = m->dentries[p->dirino].begin(); - q != m->dentries[p->dirino].end(); + // calculate proper filelock states + for (set::iterator p = filelock_replica_readers.begin(); + p != filelock_replica_readers.end(); + ++p) { + dout(10) << "replica(s) have RD caps on " << *p->first << endl; + + for (set::iterator q = p->second.begin(); + q != p->second.end(); ++q) { - CDentry *dn = dir->lookup(q->first); - assert(dn); - dn->set_replica_nonce(q->second.nonce); - dn->set_lockstate(q->second.lock); - dout(10) << " got " << *dn << endl; + if (*q == LOCK_ } } + */ - // inodes - for (list::iterator p = m->inodes.begin(); - p != m->inodes.end(); - ++p) { - CInode *in = get_inode(p->ino); - assert(in); - in->set_replica_nonce(p->nonce); - in->hardlock.set_state(p->hardlock); - in->filelock.set_state(p->filelock); - dout(10) << " got " << *in << endl; - } + // send acks + map ack; + + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + p++) { + CDir *dir = p->first; + if (!dir->is_auth()) continue; + dout(10) << "subtree " << *dir << endl; + + // auth items in this subtree + list dq; + dq.push_back(dir); - delete m; + while (!dq.empty()) { + 
CDir *dir = dq.front(); + dq.pop_front(); + + // dir + for (map::iterator r = dir->replicas_begin(); + r != dir->replicas_end(); + ++r) { + if (!ack[r->first]) ack[r->first] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); + ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second); + } + + for (map::iterator q = dir->items.begin(); + q != dir->items.end(); + ++q) { + CDentry *dn = q->second; - // done? - rejoin_ack_gather.erase(from); - if (rejoin_ack_gather.empty()) { - dout(7) << "all done, going active!" << endl; - show_imports(); - show_cache(); - mds->set_want_state(MDSMap::STATE_ACTIVE); - } else { - dout(7) << "still need rejoin_ack from " << rejoin_ack_gather << endl; - } + // dentry + for (map::iterator r = dn->replicas_begin(); + r != dn->replicas_end(); + ++r) { + //if (!ack[r->first]) ack[r->first] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); + ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, r->second, + dn->lock.get_replica_state()); + } + + if (!dn->is_primary()) continue; -} + // inode + CInode *in = dn->inode; + + // twiddle filelock at all? + // hmm. + for (map::iterator r = in->replicas_begin(); + r != in->replicas_end(); + ++r) { + //if (!ack[r->first]) ack[r->first] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); + ack[r->first]->add_strong_inode(in->ino(), r->second, 0, + in->authlock.get_replica_state(), + in->linklock.get_replica_state(), + in->dirfragtreelock.get_replica_state(), + in->filelock.get_replica_state()); + } + + // subdirs in this subtree? 
+ in->get_nested_dirfrags(dq); + } + } + } + // send acks + for (map::iterator p = ack.begin(); + p != ack.end(); + ++p) + mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); + +} @@ -889,53 +1937,6 @@ void MDCache::set_root(CInode *in) root->state_set(CInode::STATE_ROOT); } -void MDCache::add_import(CDir *dir) -{ - imports.insert(dir); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); -} - - -void MDCache::recalc_auth_bits() -{ - dout(7) << "recalc_auth_bits" << endl; - - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - if (in->authority() == mds->get_nodeid()) - in->state_set(CInode::STATE_AUTH); - else { - in->state_clear(CInode::STATE_AUTH); - if (in->is_dirty()) - in->mark_clean(); - } - - if (in->parent) { - if (in->parent->authority() == mds->get_nodeid()) - in->parent->state_set(CDentry::STATE_AUTH); - else { - in->parent->state_clear(CDentry::STATE_AUTH); - if (in->parent->is_dirty()) - in->parent->mark_clean(); - } - } - - if (in->dir) { - if (in->dir->authority() == mds->get_nodeid()) - in->dir->state_set(CDIR_STATE_AUTH); - else { - in->dir->state_clear(CDIR_STATE_AUTH); - if (in->dir->is_dirty()) - in->dir->mark_clean(); - } - } - } - show_imports(); - show_cache(); -} @@ -947,78 +1948,115 @@ void MDCache::recalc_auth_bits() class C_MDC_PurgeFinish : public Context { MDCache *mdc; inodeno_t ino; + off_t newsize; public: - C_MDC_PurgeFinish(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} + C_MDC_PurgeFinish(MDCache *c, inodeno_t i, off_t s) : mdc(c), ino(i), newsize(s) {} void finish(int r) { - mdc->purge_inode_finish(ino); + mdc->purge_inode_finish(ino, newsize); } }; class C_MDC_PurgeFinish2 : public Context { MDCache *mdc; inodeno_t ino; + off_t newsize; public: - C_MDC_PurgeFinish2(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} + C_MDC_PurgeFinish2(MDCache *c, inodeno_t i, off_t s) : mdc(c), ino(i), newsize(s) {} void finish(int r) { - mdc->purge_inode_finish_2(ino); + 
mdc->purge_inode_finish_2(ino, newsize); } }; /* purge_inode in - * will be called by on unlink or rmdir - * caller responsible for journaling an appropriate EUnlink or ERmdir + * will be called by on unlink or rmdir or truncate + * caller responsible for journaling an appropriate EUpdate */ -void MDCache::purge_inode(inode_t &inode) +void MDCache::purge_inode(inode_t *inode, off_t newsize) { - dout(10) << "purge_inode " << inode.ino << " size " << inode.size << endl; + dout(10) << "purge_inode " << inode->ino << " size " << inode->size + << " -> " << newsize + << endl; // take note - assert(purging.count(inode.ino) == 0); - purging[inode.ino] = inode; + assert(purging[inode->ino].count(newsize) == 0); + purging[inode->ino][newsize] = *inode; + + assert(inode->size > newsize); // remove - mds->filer->remove(inode, 0, inode.size, - 0, new C_MDC_PurgeFinish(this, inode.ino)); + mds->filer->remove(*inode, newsize, inode->size, + 0, new C_MDC_PurgeFinish(this, inode->ino, newsize)); + + /*} else { + // no need, empty file, just log it + purge_inode_finish(inode->ino, newsize); + } + */ } -void MDCache::purge_inode_finish(inodeno_t ino) +void MDCache::purge_inode_finish(inodeno_t ino, off_t newsize) { - dout(10) << "purge_inode_finish " << ino << " - logging our completion" << endl; - + dout(10) << "purge_inode_finish " << ino << " to " << newsize + << " - logging our completion" << endl; + // log completion - mds->mdlog->submit_entry(new EPurgeFinish(ino), - new C_MDC_PurgeFinish2(this, ino)); + mds->mdlog->submit_entry(new EPurgeFinish(ino, newsize), + new C_MDC_PurgeFinish2(this, ino, newsize)); } -void MDCache::purge_inode_finish_2(inodeno_t ino) +void MDCache::purge_inode_finish_2(inodeno_t ino, off_t newsize) { - dout(10) << "purge_inode_finish_2 " << ino << endl; + dout(10) << "purge_inode_finish_2 " << ino << " to " << newsize << endl; // remove from purging list - purging.erase(ino); - + purging[ino].erase(newsize); + if (purging[ino].empty()) + 
purging.erase(ino); + // tell anyone who cares (log flusher?) list ls; - ls.swap(waiting_for_purge[ino]); - waiting_for_purge.erase(ino); + ls.swap(waiting_for_purge[ino][newsize]); + waiting_for_purge[ino].erase(newsize); + if (waiting_for_purge[ino].empty()) + waiting_for_purge.erase(ino); finish_contexts(ls, 0); +} - // reclaim ino? - +void MDCache::add_recovered_purge(const inode_t& inode, off_t newsize) +{ + assert(purging[inode.ino].count(newsize) == 0); + purging[inode.ino][newsize] = inode; +} + +void MDCache::remove_recovered_purge(inodeno_t ino, off_t newsize) +{ + purging[ino].erase(newsize); } void MDCache::start_recovered_purges() { - for (map::iterator p = purging.begin(); + dout(10) << "start_recovered_purges (" << purging.size() << " purges)" << endl; + + for (map >::iterator p = purging.begin(); p != purging.end(); ++p) { - dout(10) << "start_recovered_purges " << p->first << " size " << p->second.size << endl; - mds->filer->remove(p->second, 0, p->second.size, - 0, new C_MDC_PurgeFinish(this, p->first)); + for (map::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + dout(10) << "start_recovered_purges " << p->first + << " size " << q->second.size + << " to " << q->first << endl; + mds->filer->remove(q->second, q->first, q->second.size, + 0, new C_MDC_PurgeFinish(this, p->first, q->first)); + } } } +// ================================================================================ +// cache trimming + + bool MDCache::trim(int max) { // trim LRU @@ -1030,82 +2068,96 @@ bool MDCache::trim(int max) map expiremap; + // DENTRIES from the LRU + while (lru.lru_get_size() > (unsigned)max) { CDentry *dn = (CDentry*)lru.lru_expire(); if (!dn) break; CDir *dir = dn->get_dir(); assert(dir); + + CDir *con = get_subtree_root(dir); + assert(con); + + dout(12) << "trim removing " << *dn << endl; + dout(12) << " in container " << *con << endl; // notify dentry authority? 
if (!dn->is_auth()) { - int auth = dn->authority(); - dout(17) << "sending expire to mds" << auth << " on " << *dn << endl; - if (expiremap.count(auth) == 0) - expiremap[auth] = new MCacheExpire(mds->get_nodeid()); - expiremap[auth]->add_dentry(dir->ino(), dn->get_name(), dn->get_replica_nonce()); + pair auth = dn->authority(); + + for (int p=0; p<2; p++) { + int a = auth.first; + if (p) a = auth.second; + if (a < 0 || (p == 1 && auth.second == auth.first)) break; + if (mds->get_nodeid() == auth.second && + con->is_importing()) break; // don't send any expire while importing. + if (a == mds->get_nodeid()) continue; // on export, ignore myself. + + dout(12) << " sending expire to mds" << a << " on " << *dn << endl; + assert(a != mds->get_nodeid()); + if (expiremap.count(a) == 0) + expiremap[a] = new MCacheExpire(mds->get_nodeid()); + expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->get_replica_nonce()); + } } // unlink the dentry - dout(15) << "trim removing " << *dn << endl; - if (!dn->is_null()) + if (dn->is_remote()) { + // just unlink. dir->unlink_inode(dn); - dir->remove_dentry(dn); + } + else if (dn->is_primary()) { + // expire the inode, too. + CInode *in = dn->get_inode(); + assert(in); + trim_inode(dn, in, con, expiremap); + } + else { + assert(dn->is_null()); + } // adjust the dir state - CInode *diri = dir->get_inode(); - diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete! + // NOTE: we can safely remove a clean, null dentry without effecting + // directory completeness. + if (!(dn->is_null() && dn->is_clean())) + dir->state_clear(CDir::STATE_COMPLETE); + + // remove dentry + dir->remove_dentry(dn); // reexport? 
- if (diri->dir->is_import() && // import - diri->dir->get_size() == 0 && // no children - !diri->is_root()) // not root - migrator->export_empty_import(diri->dir); + if (dir->get_size() == 0 && dir->is_subtree_root()) + migrator->export_empty_import(dir); if (mds->logger) mds->logger->inc("cex"); } - // inode expire_queue - while (!inode_expire_queue.empty()) { - CInode *in = inode_expire_queue.front(); - inode_expire_queue.pop_front(); - - assert(in->get_num_ref() == 0); - - int dirauth = -2; - if (in->dir) { - // notify dir authority? - dirauth = in->dir->authority(); - if (dirauth != mds->get_nodeid()) { - dout(17) << "sending expire to mds" << dirauth << " on " << *in->dir << endl; - if (expiremap.count(dirauth) == 0) - expiremap[dirauth] = new MCacheExpire(mds->get_nodeid()); - expiremap[dirauth]->add_dir(in->ino(), in->dir->replica_nonce); - } - - in->close_dir(); - } - - // notify inode authority - int auth = in->authority(); - if (auth == CDIR_AUTH_UNKNOWN) { - assert(in->ino() == 1); - assert(dirauth >= 0); - auth = dirauth; - } - if (auth != mds->get_nodeid()) { - assert(!in->is_auth()); - dout(17) << "sending expire to mds" << auth << " on " << *in << endl; - if (expiremap.count(auth) == 0) - expiremap[auth] = new MCacheExpire(mds->get_nodeid()); - expiremap[auth]->add_inode(in->ino(), in->get_replica_nonce()); - } else { - assert(in->is_auth()); - } + // trim root inode+dir? + if (max == 0 && // only if we're trimming everything! + lru.lru_get_size() == 0) { + hash_map::iterator p = inode_map.begin(); + while (p != inode_map.end()) { + hash_map::iterator n = p; + n++; + + CInode *in = p->second; - dout(15) << "trim removing " << *in << endl; - if (in == root) root = 0; - remove_inode(in); + list ls; + in->get_dirfrags(ls); + for (list::iterator q = ls.begin(); + q != ls.end(); + ++q) + if ((*q)->get_num_ref() == 0) + trim_dirfrag(*q, *q, expiremap); + + // root inode? 
+ if (in->get_num_ref() == 0) + trim_inode(0, in, 0, expiremap); // hrm, FIXME + + p = n; + } } // send expires @@ -1116,15 +2168,103 @@ bool MDCache::trim(int max) mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); } - return true; } +void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expiremap) +{ + assert(dir->get_num_ref() == 0); + + dout(15) << "trim_dirfrag " << *dir << endl; + + CInode *in = dir->get_inode(); + + if (!dir->is_auth()) { + pair auth = dir->authority(); + + // was this an auth delegation? (if so, slightly modified container) + dirfrag_t condf; + if (dir->is_subtree_root()) { + dout(12) << " subtree root, container is " << *dir << endl; + con = dir; + condf = dir->dirfrag(); + } else { + condf = con->dirfrag(); + } + + for (int p=0; p<2; p++) { + int a = auth.first; + if (p) a = auth.second; + if (a < 0 || (p == 1 && auth.second == auth.first)) break; + if (mds->get_nodeid() == auth.second && + con->is_importing()) break; // don't send any expire while importing. + if (a == mds->get_nodeid()) continue; // on export, ignore myself. + + dout(12) << " sending expire to mds" << a << " on " << *dir << endl; + assert(a != mds->get_nodeid()); + if (expiremap.count(a) == 0) + expiremap[a] = new MCacheExpire(mds->get_nodeid()); + expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce); + } + } + + if (dir->is_subtree_root()) + remove_subtree(dir); // remove from subtree map + in->close_dirfrag(dir->dirfrag().frag); +} + +void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap) +{ + dout(15) << "trim_inode " << *in << endl; + assert(in->get_num_ref() == 0); + + // DIR + list dfls; + in->get_dirfrags(dfls); + for (list::iterator p = dfls.begin(); + p != dfls.end(); + ++p) + trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. 
root dirfrag), use *p + + // INODE + if (!in->is_auth()) { + pair auth = in->authority(); + + dirfrag_t df; + if (con) + df = con->dirfrag(); + else + df = dirfrag_t(1,frag_t()); + + for (int p=0; p<2; p++) { + int a = auth.first; + if (p) a = auth.second; + if (a < 0 || (p == 1 && auth.second == auth.first)) break; + if (con && mds->get_nodeid() == auth.second && + con->is_importing()) break; // don't send any expire while importing. + if (a == mds->get_nodeid()) continue; // on export, ignore myself. + + dout(12) << " sending expire to mds" << a << " on " << *in << endl; + assert(a != mds->get_nodeid()); + if (expiremap.count(a) == 0) + expiremap[a] = new MCacheExpire(mds->get_nodeid()); + expiremap[a]->add_inode(df, in->ino(), in->get_replica_nonce()); + } + } + + // unlink + if (dn) + dn->get_dir()->unlink_inode(dn); + remove_inode(in); +} + void MDCache::trim_non_auth() { dout(7) << "trim_non_auth" << endl; + // note first auth item we see. + // when we see it the second time, stop. CDentry *first_auth = 0; // trim non-auth items from the lru @@ -1135,7 +2275,7 @@ void MDCache::trim_non_auth() if (dn->is_auth()) { // add back into lru (at the top) lru.lru_insert_top(dn); - + if (!first_auth) { first_auth = dn; } else { @@ -1149,28 +2289,240 @@ void MDCache::trim_non_auth() // unlink the dentry dout(15) << "trim_non_auth removing " << *dn << endl; - if (!dn->is_null()) + if (dn->is_remote()) { + dir->unlink_inode(dn); + } + else if (dn->is_primary()) { + CInode *in = dn->get_inode(); + list ls; + in->get_dirfrags(ls); + for (list::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *subdir = *p; + if (subdir->is_subtree_root()) + remove_subtree(subdir); + in->close_dirfrag(subdir->dirfrag().frag); + } dir->unlink_inode(dn); + remove_inode(in); + } + else { + assert(dn->is_null()); + } dir->remove_dentry(dn); // adjust the dir state - CInode *diri = dir->get_inode(); - diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete! 
+ dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! + } + } + + if (lru.lru_get_size() == 0) { + // root, stray, etc.? + hash_map::iterator p = inode_map.begin(); + while (p != inode_map.end()) { + hash_map::iterator next = p; + ++next; + CInode *in = p->second; + if (!in->is_auth()) { + list ls; + in->get_dirfrags(ls); + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) { + assert((*p)->get_num_ref() == 0); + remove_subtree((*p)); + in->close_dirfrag((*p)->dirfrag().frag); + } + assert(in->get_num_ref() == 0); + remove_inode(in); + } + p = next; + } + } + + show_subtrees(); +} + +void MDCache::handle_cache_expire(MCacheExpire *m) +{ + int from = m->get_from(); + + dout(7) << "cache_expire from mds" << from << endl; + + // loop over realms + for (map::iterator p = m->realms.begin(); + p != m->realms.end(); + ++p) { + // get container + CDir *con = get_dirfrag(p->first); + assert(con); // we had better have this. + + if (!con->is_auth() || + (con->is_auth() && con->is_exporting() && + migrator->get_export_state(con) == Migrator::EXPORT_WARNING && + migrator->export_has_warned(con,from))) { + // not auth. 
+ dout(7) << "delaying nonauth|warned expires for " << *con << endl; + assert(con->is_frozen_tree_root()); + + // make a message container + if (delayed_expire[con].count(from) == 0) + delayed_expire[con][from] = new MCacheExpire(from); + + // merge these expires into it + delayed_expire[con][from]->add_realm(p->first, p->second); + continue; + } + dout(7) << "expires for " << *con << endl; + + // INODES + for (map::iterator it = p->second.inodes.begin(); + it != p->second.inodes.end(); + it++) { + CInode *in = get_inode(it->first); + int nonce = it->second; + + if (!in) { + dout(0) << " inode expire on " << it->first << " from " << from << ", don't have it" << endl; + assert(in); + } + assert(in->is_auth()); + + // check nonce + if (nonce == in->get_replica_nonce(from)) { + // remove from our cached_by + dout(7) << " inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << endl; + inode_remove_replica(in, from); + } + else { + // this is an old nonce, ignore expire. + dout(7) << " inode expire on " << *in << " from mds" << from + << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping" + << endl; + assert(in->get_replica_nonce(from) > nonce); + } + } + + // DIRS + for (map::iterator it = p->second.dirs.begin(); + it != p->second.dirs.end(); + it++) { + CDir *dir = get_dirfrag(it->first); + int nonce = it->second; + + if (!dir) { + dout(0) << " dir expire on " << it->first << " from " << from << ", don't have it" << endl; + assert(dir); + } + assert(dir->is_auth()); + + // check nonce + if (nonce == dir->get_replica_nonce(from)) { + // remove from our cached_by + dout(7) << " dir expire on " << *dir << " from mds" << from + << " replicas was " << dir->replicas << endl; + dir->remove_replica(from); + } + else { + // this is an old nonce, ignore expire. 
+ dout(7) << " dir expire on " << *dir << " from mds" << from + << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) + << "), dropping" << endl; + assert(dir->get_replica_nonce(from) > nonce); + } + } + + // DENTRIES + for (map >::iterator pd = p->second.dentries.begin(); + pd != p->second.dentries.end(); + ++pd) { + dout(0) << " dn expires in dir " << pd->first << endl; + CDir *dir = get_dirfrag(pd->first); + + if (!dir) { + dout(0) << " dn expires on " << pd->first << " from " << from << ", don't have it" << endl; + assert(dir); + } + assert(dir->is_auth()); + + for (map::iterator p = pd->second.begin(); + p != pd->second.end(); + ++p) { + int nonce = p->second; + + CDentry *dn = dir->lookup(p->first); + if (!dn) + dout(0) << " missing dentry for " << p->first << " in " << *dir << endl; + assert(dn); + + if (nonce == dn->get_replica_nonce(from)) { + dout(7) << " dentry_expire on " << *dn << " from mds" << from << endl; + dn->remove_replica(from); + } + else { + dout(7) << " dentry_expire on " << *dn << " from mds" << from + << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) + << "), dropping" << endl; + assert(dn->get_replica_nonce(from) > nonce); + } + } } } - // inode expire queue - while (!inode_expire_queue.empty()) { - CInode *in = inode_expire_queue.front(); - inode_expire_queue.pop_front(); - dout(15) << "trim_non_auth removing " << *in << endl; - if (in == root) root = 0; - remove_inode(in); + + // done + delete m; +} + +void MDCache::process_delayed_expire(CDir *dir) +{ + dout(7) << "process_delayed_expire on " << *dir << endl; + for (map::iterator p = delayed_expire[dir].begin(); + p != delayed_expire[dir].end(); + ++p) + handle_cache_expire(p->second); + delayed_expire.erase(dir); +} + +void MDCache::discard_delayed_expire(CDir *dir) +{ + dout(7) << "discard_delayed_expire on " << *dir << endl; + for (map::iterator p = delayed_expire[dir].begin(); + p != delayed_expire[dir].end(); + ++p) + delete 
p->second; + delayed_expire.erase(dir); +} + +void MDCache::inode_remove_replica(CInode *in, int from) +{ + in->remove_replica(from); + in->mds_caps_wanted.erase(from); + + // note: this code calls _eval more often than it needs to! + // fix lock + if (in->authlock.remove_replica(from)) + mds->locker->simple_eval(&in->authlock); + if (in->linklock.remove_replica(from)) + mds->locker->simple_eval(&in->linklock); + if (in->dirfragtreelock.remove_replica(from)) + mds->locker->simple_eval(&in->dirfragtreelock); + if (in->filelock.remove_replica(from)) + mds->locker->simple_eval(&in->filelock); + + // alone now? + if (!in->is_replicated()) { + mds->locker->simple_eval(&in->authlock); + mds->locker->simple_eval(&in->linklock); + mds->locker->simple_eval(&in->dirfragtreelock); + mds->locker->file_eval(&in->filelock); } } +// ========================================================================================= +// shutdown + class C_MDC_ShutdownCommit : public Context { MDCache *mdc; public: @@ -1207,9 +2559,6 @@ void MDCache::shutdown_check() dout(0) << "log len " << mds->mdlog->get_num_events() << endl; - if (exports.size()) - dout(0) << "still have " << exports.size() << " exports" << endl; - if (mds->filer->is_active()) dout(0) << "filer still active" << endl; } @@ -1227,30 +2576,14 @@ void MDCache::shutdown_start() bool MDCache::shutdown_pass() { dout(7) << "shutdown_pass" << endl; - //assert(mds->is_shutting_down()); + if (mds->is_out()) { dout(7) << " already shut down" << endl; show_cache(); - show_imports(); + show_subtrees(); return true; } - // unhash dirs? - if (!hashdirs.empty()) { - // unhash any of my dirs? - for (set::iterator it = hashdirs.begin(); - it != hashdirs.end(); - it++) { - CDir *dir = *it; - if (!dir->is_auth()) continue; - if (dir->is_unhashing()) continue; - migrator->unhash_dir(dir); - } - - dout(7) << "waiting for dirs to unhash" << endl; - return false; - } - // commit dirs? 
if (g_conf.mds_commit_on_shutdown) { @@ -1262,12 +2595,18 @@ bool MDCache::shutdown_pass() it != inode_map.end(); it++) { CInode *in = it->second; + if (!in->is_dir()) continue; - // commit any dirty dir that's ours - if (in->is_dir() && in->dir && in->dir->is_auth() && in->dir->is_dirty()) { - mds->mdstore->commit_dir(in->dir, new C_MDC_ShutdownCommit(this)); - shutdown_commits++; - } + // commit any dirty dirfrag that's ours + list dfs; + in->get_dirfrags(dfs); + for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { + CDir *dir = *p; + if (dir->is_auth() && dir->is_dirty()) { + dir->commit(0, new C_MDC_ShutdownCommit(this)); + shutdown_commits++; + } + } } } @@ -1282,93 +2621,78 @@ bool MDCache::shutdown_pass() trim(0); dout(5) << "lru size now " << lru.lru_get_size() << endl; - mds->mdlog->trim(0); - - // (wait for) flush log? - if (g_conf.mds_log_flush_on_shutdown) { - if (mds->mdlog->get_non_importmap_events()) { - dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; - return false; - } - } + // flush batching eopens, so that we can properly expire them. + mds->server->journal_opens(); // hrm, this is sort of a hack. + // flush what we can from the log + mds->mdlog->trim(0); + // SUBTREES // send all imports back to 0. - if (mds->get_nodeid() != 0 && !did_shutdown_exports) { - // flush what i can from the cache first.. 
- trim(0); - + if (!subtrees.empty() && + mds->get_nodeid() != 0 && + !migrator->is_exporting() && + !migrator->is_importing()) { // export to root - for (set::iterator it = imports.begin(); - it != imports.end(); - ) { - CDir *im = *it; - it++; - if (im->inode->is_root()) continue; - if (im->is_frozen() || im->is_freezing()) continue; - - dout(7) << "sending " << *im << " back to mds0" << endl; - migrator->export_dir(im,0); + dout(7) << "looking for subtrees to export to mds0" << endl; + list ls; + for (map >::iterator it = subtrees.begin(); + it != subtrees.end(); + it++) { + CDir *dir = it->first; + if (dir->get_inode()->is_stray()) continue; + if (dir->is_frozen() || dir->is_freezing()) continue; + if (!dir->is_full_dir_auth()) continue; + ls.push_back(dir); } - did_shutdown_exports = true; - } - + for (list::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + dout(7) << "sending " << *dir << " back to mds0" << endl; + migrator->export_dir(dir, 0); + } + } - // waiting for imports? (e.g. root?) - if (exports.size()) { - dout(7) << "still have " << exports.size() << " exports" << endl; - //show_cache(); + // subtrees map not empty yet? + if (!subtrees.empty()) { + dout(7) << "still have " << num_subtrees() << " subtrees" << endl; + show_subtrees(); + show_cache(); return false; } + assert(subtrees.empty()); + assert(!migrator->is_exporting()); + assert(!migrator->is_importing()); - - // close root? - if (mds->get_nodeid() == 0 && - lru.lru_get_size() == 0 && - root && - root->dir && - root->dir->is_import() && - root->dir->get_num_ref() == 1) { // 1 is the import! 
- // un-import - dout(7) << "removing root import" << endl; - imports.erase(root->dir); - root->dir->state_clear(CDIR_STATE_IMPORT); - root->dir->put(CDir::PIN_IMPORT); - if (root->is_pinned_by(CInode::PIN_DIRTY)) { - dout(7) << "clearing root inode dirty flag" << endl; - root->put(CInode::PIN_DIRTY); - } + // empty out stray contents + // FIXME + dout(7) << "FIXME: i need to empty out stray dir contents..." << endl; - trim(0); - } - - // imports? - if (!imports.empty() || migrator->is_exporting()) { - dout(7) << "still have " << imports.size() << " imports, or still exporting" << endl; - show_cache(); - return false; + // (wait for) flush log? + if (g_conf.mds_log_flush_on_shutdown) { + if (mds->mdlog->get_non_importmap_events()) { + dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() + << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; + return false; + } } - + // cap log? if (g_conf.mds_log_flush_on_shutdown) { - if (imports.empty() && exports.empty()) { - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << endl; - mds->mdlog->cap(); - // note that this won't flush right away, so we'll make at least one more pass - } + // (only do this once!) + if (!mds->mdlog->is_capped()) { + dout(7) << "capping the log" << endl; + mds->mdlog->cap(); + // note that this won't flush right away, so we'll make at least one more pass } - + if (mds->mdlog->get_num_events()) { dout(7) << "waiting for log to flush (including import_map, now) .. 
" << mds->mdlog->get_num_events() << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; return false; } - + if (!did_shutdown_log_cap) { // flush journal header dout(7) << "writing header for (now-empty) journal" << endl; @@ -1404,79 +2728,6 @@ bool MDCache::shutdown_pass() -CInode *MDCache::create_root_inode() -{ - CInode *root = new CInode(this); - memset(&root->inode, 0, sizeof(inode_t)); - root->inode.ino = 1; - root->inode.hash_seed = 0; // not hashed! - - // make it up (FIXME) - root->inode.mode = 0755 | INODE_MODE_DIR; - root->inode.size = 0; - root->inode.ctime = 0; - root->inode.mtime = g_clock.gettime(); - - root->inode.nlink = 1; - root->inode.layout = g_OSD_MDDirLayout; - - set_root( root ); - add_inode( root ); - - return root; -} - - -int MDCache::open_root(Context *c) -{ - int whoami = mds->get_nodeid(); - - // open root inode - if (whoami == 0) { - // i am root inode - CInode *root = create_root_inode(); - - // root directory too - assert(root->dir == NULL); - root->set_dir( new CDir(root, this, true) ); - root->dir->set_dir_auth( 0 ); // me! 
- root->dir->dir_rep = CDIR_REP_ALL; //NONE; - - // root is sort of technically an import (from a vacuum) - imports.insert( root->dir ); - root->dir->state_set(CDIR_STATE_IMPORT); - root->dir->get(CDir::PIN_IMPORT); - - if (c) { - c->finish(0); - delete c; - } - } else { - // request inode from root mds - if (waiting_for_root.empty()) { - dout(7) << "discovering root" << endl; - - filepath want; - MDiscover *req = new MDiscover(whoami, - 0, - want, - false); // there _is_ no base dir for the root inode - mds->send_message_mds(req, 0, MDS_PORT_CACHE); - } else { - dout(7) << "waiting for root" << endl; - } - - // wait - waiting_for_root.push_back(c); - - } - - return 0; -} - - - - @@ -1495,9 +2746,11 @@ void MDCache::dispatch(Message *m) case MSG_MDS_CACHEREJOIN: handle_cache_rejoin((MMDSCacheRejoin*)m); break; + /* case MSG_MDS_CACHEREJOINACK: handle_cache_rejoin_ack((MMDSCacheRejoinAck*)m); break; + */ case MSG_MDS_DISCOVER: @@ -1516,9 +2769,6 @@ void MDCache::dispatch(Message *m) case MSG_MDS_INODELINK: handle_inode_link((MInodeLink*)m); break; - case MSG_MDS_INODELINKACK: - handle_inode_link_ack((MInodeLinkAck*)m); - break; case MSG_MDS_DIRUPDATE: handle_dir_update((MDirUpdate*)m); @@ -1559,6 +2809,7 @@ void MDCache::dispatch(Message *m) * the context is needed to pass a (failure) result code. */ +/* class C_MDC_TraverseDiscover : public Context { Context *onfinish, *ondelay; public: @@ -1577,17 +2828,19 @@ class C_MDC_TraverseDiscover : public Context { delete ondelay; } }; +*/ -int MDCache::path_traverse(filepath& origpath, +int MDCache::path_traverse(MDRequest *mdr, + CInode *base, // traverse starting from here. + filepath& origpath, vector& trace, bool follow_trailing_symlink, Message *req, Context *ondelay, int onfail, - Context *onfinish, - bool is_client_req) // true if req is MClientRequest .. gross, FIXME + bool is_client_req, + bool null_okay) // true if req is MClientRequest .. 
gross, FIXME { - int whoami = mds->get_nodeid(); set< pair > symlinks_resolved; // keep a list of symlinks we touch to avoid loops bool noperm = false; @@ -1595,11 +2848,11 @@ int MDCache::path_traverse(filepath& origpath, onfail == MDS_TRAVERSE_DISCOVERXLOCK) noperm = true; // root - CInode *cur = get_root(); + CInode *cur = base; + if (!cur) cur = get_root(); if (cur == NULL) { dout(7) << "traverse: i don't have root" << endl; open_root(ondelay); - if (onfinish) delete onfinish; return 1; } @@ -1617,61 +2870,63 @@ int MDCache::path_traverse(filepath& origpath, if (!cur->is_dir()) { dout(7) << "traverse: " << *cur << " not a dir " << endl; delete ondelay; - if (onfinish) { - onfinish->finish(-ENOTDIR); - delete onfinish; - } return -ENOTDIR; } // open dir - if (!cur->dir) { - if (cur->dir_is_auth()) { + frag_t fg = cur->pick_dirfrag(path[depth]); + CDir *curdir = cur->get_dirfrag(fg); + if (!curdir) { + if (cur->is_auth()) { // parent dir frozen_dir? if (cur->is_frozen_dir()) { dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl; - cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, ondelay); - if (onfinish) delete onfinish; + cur->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, ondelay); return 1; } - cur->get_or_open_dir(this); - assert(cur->dir); + curdir = cur->get_or_open_dirfrag(this, fg); } else { - // discover dir from/via inode auth + // discover? 
assert(!cur->is_auth()); - if (cur->waiting_for(CINODE_WAIT_DIR)) { - dout(10) << "traverse: need dir for " << *cur << ", already doing discover" << endl; - } else { - filepath want = path.postfixpath(depth); - dout(10) << "traverse: need dir for " << *cur << ", doing discover, want " << want.get_path() << endl; - mds->send_message_mds(new MDiscover(mds->get_nodeid(), + if (cur->is_waiter_for(CInode::WAIT_DIR)) { + dout(10) << "traverse: need dir, already doing discover for " << *cur << endl; + } + else if (cur->is_ambiguous_auth()) { + dout(10) << "traverse: need dir, waiting for single auth on " << *cur << endl; + cur->add_waiter(CInode::WAIT_SINGLEAUTH, ondelay); + return 1; + } else { + filepath want = path.postfixpath(depth); + dout(10) << "traverse: need dir, doing discover, want " << want.get_path() + << " from " << *cur << endl; + mds->send_message_mds(new MDiscover(mds->get_nodeid(), cur->ino(), want, true), // need this dir too - cur->authority(), MDS_PORT_CACHE); - } - cur->add_waiter(CINODE_WAIT_DIR, ondelay); - if (onfinish) delete onfinish; + cur->authority().first, MDS_PORT_CACHE); + dir_discovers[cur->ino()].insert(cur->authority().first); + } + cur->add_waiter(CInode::WAIT_DIR, ondelay); return 1; } } - + assert(curdir); + // frozen? /* - if (cur->dir->is_frozen()) { + if (curdir->is_frozen()) { // doh! // FIXME: traverse is allowed? 
- dout(7) << "traverse: " << *cur->dir << " is frozen, waiting" << endl; - cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, ondelay); + dout(7) << "traverse: " << *curdir << " is frozen, waiting" << endl; + curdir->add_waiter(CDir::WAIT_UNFREEZE, ondelay); if (onfinish) delete onfinish; return 1; } */ // must read directory hard data (permissions, x bit) to traverse - if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) { - if (onfinish) delete onfinish; + if (!noperm && !mds->locker->simple_rdlock_try(&cur->authlock, ondelay)) { return 1; } @@ -1689,25 +2944,20 @@ int MDCache::path_traverse(filepath& origpath, // dentry - CDentry *dn = cur->dir->lookup(path[depth]); + CDentry *dn = curdir->lookup(path[depth]); // null and last_bit and xlocked by me? - if (dn && dn->is_null() && - dn->is_xlockedbyme(req) && - depth == path.depth()-1) { - dout(10) << "traverse: hit (my) xlocked dentry at tail of traverse, succeeding" << endl; + if (dn && dn->is_null() && null_okay) { + dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << endl; trace.push_back(dn); break; // done! } if (dn && !dn->is_null()) { // dentry exists. xlocked? 
- if (!noperm && dn->is_xlockedbyother(req)) { + if (!noperm && dn->lock.is_xlocked() && dn->lock.get_xlocked_by() != mdr) { dout(10) << "traverse: xlocked dentry at " << *dn << endl; - cur->dir->add_waiter(CDIR_WAIT_DNREAD, - path[depth], - ondelay); - if (onfinish) delete onfinish; + dn->lock.add_waiter(SimpleLock::WAIT_RD, ondelay); return 1; } @@ -1721,8 +2971,8 @@ int MDCache::path_traverse(filepath& origpath, dn->link_remote(in); } else { dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl; - open_remote_ino(dn->get_remote_ino(), req, - ondelay); + assert(0); // REWRITE ME + //open_remote_ino(dn->get_remote_ino(), req, ondelay); return 1; } } @@ -1762,58 +3012,51 @@ int MDCache::path_traverse(filepath& origpath, dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << endl; } continue; - } else { - // keep going. + } - // forwarder wants replicas? - if (is_client_req && ((MClientRequest*)req)->get_mds_wants_replica_in_dirino()) { - dout(30) << "traverse: REP is here, " << ((MClientRequest*)req)->get_mds_wants_replica_in_dirino() << " vs " << cur->dir->ino() << endl; - - if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() && - cur->dir->is_auth() && - cur->dir->is_rep() && - cur->dir->is_replica(req->get_source().num()) && - dn->get_inode()->is_auth() - ) { - assert(req->get_source().is_mds()); - int from = req->get_source().num(); - - if (dn->get_inode()->is_replica(from)) { - dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " - << req->get_source() << " dn " << *dn << endl; - } else { - dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl; - MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino()); - reply->add_dentry( dn->replicate_to( from ) ); - reply->add_inode( dn->inode->replicate_to( from ) ); - mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); - } - } - } - - 
trace.push_back(dn); - cur = dn->inode; - touch_inode(cur); - depth++; - continue; + // forwarder wants replicas? + if (is_client_req && ((MClientRequest*)req)->get_mds_wants_replica_in_dirino()) { + dout(30) << "traverse: REP is here, " << ((MClientRequest*)req)->get_mds_wants_replica_in_dirino() << " vs " << curdir->dirfrag() << endl; + + if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == curdir->ino() && + curdir->is_auth() && + curdir->is_rep() && + curdir->is_replica(req->get_source().num()) && + dn->is_auth() + ) { + assert(req->get_source().is_mds()); + int from = req->get_source().num(); + + if (dn->is_replica(from)) { + dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " + << req->get_source() << " dn " << *dn << endl; + } else { + dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl; + MDiscoverReply *reply = new MDiscoverReply(curdir->ino()); + reply->add_dentry( dn->replicate_to( from ) ); + if (dn->is_primary()) + reply->add_inode( dn->inode->replicate_to( from ) ); + mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); + } + } } + + // add to trace, continue. + trace.push_back(dn); + cur = dn->inode; + touch_inode(cur); + depth++; + continue; } - // MISS. don't have it. - - int dauth = cur->dir->dentry_authority( path[depth] ); - dout(12) << "traverse: miss on dentry " << path[depth] << " dauth " << dauth << " in " << *cur->dir << endl; + // MISS. dentry doesn't exist. + dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << endl; - - if (dauth == whoami) { + if (curdir->is_auth()) { // dentry is mine. 
- if (cur->dir->is_complete()) { + if (curdir->is_complete()) { // file not found delete ondelay; - if (onfinish) { - onfinish->finish(-ENOENT); - delete onfinish; - } return -ENOENT; } else { @@ -1824,79 +3067,75 @@ int MDCache::path_traverse(filepath& origpath, // directory isn't complete; reload dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl; touch_inode(cur); - mds->mdstore->fetch_dir(cur->dir, ondelay); + curdir->fetch(ondelay); if (mds->logger) mds->logger->inc("cmiss"); - if (onfinish) delete onfinish; return 1; } } else { - // dentry is not mine. - - /* no, let's let auth handle the discovery/replication .. - if (onfail == MDS_TRAVERSE_FORWARD && - onfinish == 0 && // no funnyness - cur->dir->is_rep()) { - dout(5) << "trying to discover in popular dir " << *cur->dir << endl; - onfail = MDS_TRAVERSE_DISCOVER; - } - */ + // dirfrag/dentry is not mine. + pair dauth = curdir->authority(); if ((onfail == MDS_TRAVERSE_DISCOVER || onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - // discover - + // discover? 
filepath want = path.postfixpath(depth); - if (cur->dir->waiting_for(CDIR_WAIT_DENTRY, path[depth])) { - dout(7) << "traverse: already waiting for discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; - } else { - dout(7) << "traverse: discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; - + + if (curdir->is_waiting_for_dentry(path[depth])) { + dout(7) << "traverse: already waiting for discover " << want.get_path() + << " from " << *curdir << endl; + } + else if (curdir->is_ambiguous_auth()) { + dout(7) << "traverse: waiting for single auth on " << *curdir << endl; + curdir->add_waiter(CDir::WAIT_SINGLEAUTH, ondelay); + return 1; + } else { + dout(7) << "traverse: discover " << want << " from " << *curdir << endl; touch_inode(cur); - + mds->send_message_mds(new MDiscover(mds->get_nodeid(), cur->ino(), want, false), - dauth, MDS_PORT_CACHE); + dauth.first, MDS_PORT_CACHE); if (mds->logger) mds->logger->inc("dis"); } // delay processing of current request. - // delay finish vs ondelay until result of traverse, so that ENOENT can be - // passed to onfinish if necessary - cur->dir->add_waiter(CDIR_WAIT_DENTRY, - path[depth], - new C_MDC_TraverseDiscover(onfinish, ondelay)); - + curdir->add_dentry_waiter(path[depth], ondelay); if (mds->logger) mds->logger->inc("cmiss"); return 1; } if (onfail == MDS_TRAVERSE_FORWARD) { // forward - dout(7) << "traverse: not auth for " << path << " at " << path[depth] << ", fwd to mds" << dauth << endl; - - if (is_client_req && cur->dir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << " req " << *(MClientRequest*)req << endl; - ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(cur->dir->ino()); - req->clear_payload(); // reencode! 
- } - - mds->send_message_mds(req, dauth, req->get_dest_port()); - //show_imports(); - - if (mds->logger) mds->logger->inc("cfw"); - if (onfinish) delete onfinish; - delete ondelay; - return 2; + dout(7) << "traverse: not auth for " << path << " in " << *curdir << endl; + + if (curdir->is_ambiguous_auth()) { + // wait + dout(7) << "traverse: waiting for single auth in " << *curdir << endl; + curdir->add_waiter(CDir::WAIT_SINGLEAUTH, ondelay); + return 1; + } else { + dout(7) << "traverse: forwarding, not auth for " << *curdir << endl; + + // request replication? + if (is_client_req && curdir->is_rep()) { + dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " + << *curdir << " req " << *(MClientRequest*)req << endl; + ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(curdir->ino()); + req->clear_payload(); // reencode! + } + + mds->forward_message_mds(req, dauth.first, req->get_dest_port()); + + if (mds->logger) mds->logger->inc("cfw"); + delete ondelay; + return 2; + } } if (onfail == MDS_TRAVERSE_FAIL) { delete ondelay; - if (onfinish) { - onfinish->finish(-ENOENT); // -ENOENT, but only because i'm not the authority! - delete onfinish; - } return -ENOENT; // not necessarily exactly true.... } } @@ -1906,54 +3145,81 @@ int MDCache::path_traverse(filepath& origpath, // success. 
delete ondelay; - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } return 0; } -void MDCache::open_remote_dir(CInode *diri, - Context *fin) +void MDCache::open_remote_dir(CInode *diri, frag_t fg, Context *fin) { dout(10) << "open_remote_dir on " << *diri << endl; assert(diri->is_dir()); - assert(!diri->dir_is_auth()); assert(!diri->is_auth()); - assert(diri->dir == 0); + assert(diri->get_dirfrag(fg) == 0); + + int auth = diri->authority().first; + + if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { + // discover it + filepath want; // no dentries, i just want the dir open + MDiscover *dis = new MDiscover(mds->get_nodeid(), + diri->ino(), + want, + true); // need the base dir open + dis->set_base_dir_frag(fg); + mds->send_message_mds(dis, auth, MDS_PORT_CACHE); + dir_discovers[diri->ino()].insert(auth); + diri->add_waiter(CInode::WAIT_DIR, fin); + } else { + // mds is down or recovering. forge a replica! + forge_replica_dir(diri, fg, auth); + } +} - filepath want; // no dentries, i just want the dir open - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - diri->ino(), - want, - true), // need the dir open - diri->authority(), MDS_PORT_CACHE); - diri->add_waiter(CINODE_WAIT_DIR, fin); +/** get_dentry_inode + * will return inode for primary, or link up/open up remote link's inode as necessary. 
+ */ +CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequest *mdr) +{ + assert(!dn->is_null()); + + if (dn->is_primary()) + return dn->inode; + + assert(dn->is_remote()); + CInode *in = get_inode(dn->get_remote_ino()); + if (in) { + dout(7) << "get_dentry_inode linking in remote in " << *in << endl; + dn->link_remote(in); + return in; + } else { + dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << endl; + open_remote_ino(dn->get_remote_ino(), mdr, new C_MDS_RetryRequest(this, mdr)); + return 0; + } } - -class C_MDC_OpenRemoteInoLookup : public Context { - MDCache *mdc; +class C_MDC_OpenRemoteIno : public Context { + MDCache *mdcache; inodeno_t ino; - Message *req; + MDRequest *mdr; Context *onfinish; public: - vector anchortrace; - C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, Message *req, Context *onfinish) { - this->mdc = mdc; - this->ino = ino; - this->req = req; - this->onfinish = onfinish; - } + vector anchortrace; + + C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : + mdcache(mdc), ino(i), mdr(r), onfinish(c) {} + C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, vector& at, + MDRequest *r, Context *c) : + mdcache(mdc), ino(i), mdr(r), onfinish(c), anchortrace(at) {} + void finish(int r) { assert(r == 0); if (r == 0) - mdc->open_remote_ino_2(ino, req, anchortrace, onfinish); + mdcache->open_remote_ino_2(ino, mdr, anchortrace, onfinish); else { onfinish->finish(r); delete onfinish; @@ -1962,103 +3228,94 @@ public: }; void MDCache::open_remote_ino(inodeno_t ino, - Message *req, + MDRequest *mdr, Context *onfinish) { dout(7) << "open_remote_ino on " << ino << endl; - C_MDC_OpenRemoteInoLookup *c = new C_MDC_OpenRemoteInoLookup(this, ino, req, onfinish); + C_MDC_OpenRemoteIno *c = new C_MDC_OpenRemoteIno(this, ino, mdr, onfinish); mds->anchorclient->lookup(ino, c->anchortrace, c); } void MDCache::open_remote_ino_2(inodeno_t ino, - Message *req, - vector& anchortrace, + MDRequest *mdr, + vector& 
anchortrace, Context *onfinish) { - dout(7) << "open_remote_ino_2 on " << ino << ", trace depth is " << anchortrace.size() << endl; - - // construct path - filepath path; - for (unsigned i=0; iref_dn); - - dout(7) << " path is " << path << endl; - - vector trace; - int r = path_traverse(path, trace, false, - req, - onfinish, // delay actually - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; + dout(7) << "open_remote_ino_2 on " << ino + << ", trace depth is " << anchortrace.size() << endl; - onfinish->finish(r); - delete onfinish; -} - - - - -// path pins - -bool MDCache::path_pin(vector& trace, - Message *m, - Context *c) -{ - // verify everything is pinnable - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - if (!dn->is_pinnable(m)) { - // wait - if (c) { - dout(10) << "path_pin can't pin " << *dn << ", waiting" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNPINNABLE, - dn->name, - c); - } else { - dout(10) << "path_pin can't pin, no waiter, failing." << endl; - } - return false; + // find deepest cached inode in prefix + unsigned i = anchortrace.size(); // i := array index + 1 + CInode *in = 0; + while (1) { + // inode? + in = get_inode(anchortrace[i-1].ino); // assign the outer 'in'; redeclaring it here would shadow it and leave it null after the loop + if (in) break; + i--; + if (!i) { + in = root; + break; } } + dout(10) << "deepest cached inode at " << i << " is " << *in << endl; - // pin! 
- for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - (*it)->pin(m); - dout(11) << "path_pinned " << *(*it) << endl; - } - - delete c; - return true; -} + if (in->ino() == ino) { + // success + dout(10) << "open_remote_ino_2 have " << *in << endl; + onfinish->finish(0); + delete onfinish; + return; + } + // open dirfrag beneath *in + frag_t frag = anchortrace[i].dirfrag.frag; -void MDCache::path_unpin(vector& trace, - Message *m) -{ - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - dn->unpin(m); - dout(11) << "path_unpinned " << *dn << endl; + if (!in->dirfragtree.contains(frag)) { + dout(10) << "frag " << frag << " not valid, requerying anchortable" << endl; + open_remote_ino(ino, mdr, onfinish); + return; + } - // did we completely unpin a waiter? - if (dn->lockstate == DN_LOCK_UNPINNING && !dn->get_num_ref()) { - // return state to sync, in case the unpinner flails - dn->lockstate = DN_LOCK_SYNC; + if (!in->is_auth()) { + dout(10) << "opening remote dirfrag " << frag << " under " << *in << endl; + open_remote_dir(in, frag, + new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); + return; + } - // run finisher right now to give them a fair shot. - dn->dir->finish_waiting(CDIR_WAIT_DNUNPINNED, dn->name); + CDir *dir = in->get_or_open_dirfrag(this, frag); + assert(dir); + if (dir->is_auth()) { + if (dir->is_complete()) { + // hrm. requery anchor table. + dout(10) << "expected ino " << anchortrace[i].ino + << " in complete dir " << *dir + << ", requerying anchortable" + << endl; + open_remote_ino(ino, mdr, onfinish); + } else { + dout(10) << "need ino " << anchortrace[i].ino + << ", fetching incomplete dir " << *dir + << endl; + dir->fetch(new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); } + } else { + // hmm, discover. 
+ dout(10) << "have remote dirfrag " << *dir << ", discovering " + << anchortrace[i].ino << endl; + + MDiscover *dis = new MDiscover(mds->get_nodeid(), + dir->dirfrag(), + anchortrace[i].ino, + true); // being conservative here. + mds->send_message_mds(dis, dir->authority().first, MDS_PORT_CACHE); } } + + void MDCache::make_trace(vector& trace, CInode *in) { CInode *parent = in->get_parent_inode(); @@ -2072,119 +3329,124 @@ void MDCache::make_trace(vector& trace, CInode *in) } -bool MDCache::request_start(Message *req, - CInode *ref, - vector& trace) +MDRequest *MDCache::request_start(metareqid_t ri) { - assert(active_requests.count(req) == 0); + MDRequest *mdr = new MDRequest(ri); + active_requests[mdr->reqid] = mdr; + dout(7) << "request_start " << *mdr << endl; + return mdr; +} - // pin path - if (trace.size()) { - if (!path_pin(trace, req, new C_MDS_RetryMessage(mds,req))) return false; - } +MDRequest *MDCache::request_start(MClientRequest *req) +{ + MDRequest *mdr = new MDRequest(req->get_reqid(), req); + active_requests[mdr->reqid] = mdr; + dout(7) << "request_start " << *mdr << endl; + return mdr; +} - dout(7) << "request_start " << *req << endl; +MDRequest *MDCache::request_start(MLock *req) +{ + MDRequest *mdr = new MDRequest(req->get_reqid(), req); + mdr->by_mds = req->get_source().num(); + active_requests[mdr->reqid] = mdr; + dout(7) << "request_start " << *mdr << endl; + return mdr; +} + +MDRequest *MDCache::request_get(metareqid_t rid) +{ + assert(active_requests.count(rid)); + dout(7) << "request_get " << rid << " " << *active_requests[rid] << endl; + return active_requests[rid]; +} - // add to map - active_requests[req].ref = ref; - if (trace.size()) active_requests[req].traces[trace[trace.size()-1]] = trace; +void MDCache::request_finish(MDRequest *mdr) +{ + dout(7) << "request_finish " << *mdr << endl; - // request pins - request_pin_inode(req, ref); + delete mdr->request; + request_cleanup(mdr); - if (mds->logger) mds->logger->inc("req"); - - 
return true; + if (mds->logger) mds->logger->inc("reply"); } -void MDCache::request_pin_inode(Message *req, CInode *in) +void MDCache::request_forward(MDRequest *mdr, int who, int port) { - if (active_requests[req].request_pins.count(in) == 0) { - in->request_pin_get(); - active_requests[req].request_pins.insert(in); - } + if (!port) port = MDS_PORT_SERVER; + + dout(7) << "request_forward to " << who << " req " << *mdr << endl; + + mds->forward_message_mds(mdr->request, who, port); + request_cleanup(mdr); + + if (mds->logger) mds->logger->inc("fw"); } -void MDCache::request_pin_dir(Message *req, CDir *dir) + +void MDCache::dispatch_request(MDRequest *mdr) { - if (active_requests[req].request_dir_pins.count(dir) == 0) { - dir->request_pin_get(); - active_requests[req].request_dir_pins.insert(dir); + assert(mdr->request); + + switch (mdr->request->get_type()) { + case MSG_CLIENT_REQUEST: + mds->server->dispatch_request(mdr); + break; + + case MSG_MDS_LOCK: + mds->locker->handle_lock((MLock*)mdr->request); + break; + + default: + assert(0); // shouldn't get here } } -void MDCache::request_cleanup(Message *req) + + +void MDCache::request_drop_locks(MDRequest *mdr) { - assert(active_requests.count(req) == 1); + // leftover locks + while (!mdr->xlocks.empty()) + mds->locker->xlock_finish(*mdr->xlocks.begin(), mdr); + while (!mdr->rdlocks.empty()) + mds->locker->rdlock_finish(*mdr->rdlocks.begin(), mdr); + while (!mdr->wrlocks.empty()) + mds->locker->wrlock_finish(*mdr->wrlocks.begin(), mdr); + + // make sure ref and trace are empty + // if we are doing our own locking, we can't use them! + assert(mdr->ref == 0); + assert(mdr->trace.empty()); +} - // leftover xlocks? 
- if (active_requests[req].xlocks.size()) { - set dns = active_requests[req].xlocks; - for (set::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - - dout(7) << "request_cleanup leftover xlock " << *dn << endl; - - mds->locker->dentry_xlock_finish(dn); - - // queue finishers - dn->dir->take_waiting(CDIR_WAIT_ANY, dn->name, mds->finished_queue); +void MDCache::request_cleanup(MDRequest *mdr) +{ + metareqid_t ri = mdr->reqid; - // remove clean, null dentry? (from a failed rename or whatever) - if (dn->is_null() && dn->is_sync() && !dn->is_dirty()) { - dn->dir->remove_dentry(dn); - } - } - - assert(active_requests[req].xlocks.empty()); // we just finished finished them - } + // clear ref, trace + mdr->ref = 0; + mdr->trace.clear(); - // foreign xlocks? - if (active_requests[req].foreign_xlocks.size()) { - set dns = active_requests[req].foreign_xlocks; - active_requests[req].foreign_xlocks.clear(); - - for (set::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - - dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl; - assert(dn->is_xlocked()); - int dauth = dn->dir->dentry_authority(dn->name); - MLock *m = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - mds->send_message_mds(m, dauth, MDS_PORT_CACHE); - } - } + // drop locks + request_drop_locks(mdr); - // unpin paths - for (map< CDentry*, vector >::iterator it = active_requests[req].traces.begin(); - it != active_requests[req].traces.end(); - it++) { - path_unpin(it->second, req); - } - - // request pins - for (set::iterator it = active_requests[req].request_pins.begin(); - it != active_requests[req].request_pins.end(); - it++) { - (*it)->request_pin_put(); - } - for (set::iterator it = active_requests[req].request_dir_pins.begin(); - it != active_requests[req].request_dir_pins.end(); - it++) { - (*it)->request_pin_put(); - } + // drop auth pins + mdr->drop_auth_pins(); - // remove from map - 
active_requests.erase(req); + // drop cache pins + for (set::iterator it = mdr->pins.begin(); + it != mdr->pins.end(); + it++) + (*it)->put(MDSCacheObject::PIN_REQUEST); + mdr->pins.clear(); + // remove from map + active_requests.erase(mdr->reqid); + delete mdr; // log some stats ***** if (mds->logger) { @@ -2199,7 +3461,7 @@ void MDCache::request_cleanup(Message *req) if (g_conf.log_pins) { // pin /* -for (int i=0; ilogger2) mds->logger2->set(cinode_pin_names[i], cinode_pins[i]); } @@ -2219,137 +3481,397 @@ for (int i=0; i_anchor_create_prepared(in, atid); + } +}; + +void MDCache::anchor_create(CInode *in, Context *onfinish) +{ + assert(in->is_auth()); + + // auth pin + if (!in->can_auth_pin()) { + dout(7) << "anchor_create not authpinnable, waiting on " << *in << endl; + in->add_waiter(CInode::WAIT_AUTHPINNABLE, onfinish); + return; + } + + // wait + in->add_waiter(CInode::WAIT_ANCHORED, onfinish); + + // already anchoring? + if (in->state_test(CInode::STATE_ANCHORING)) { + dout(7) << "anchor_create already anchoring " << *in << endl; + return; + } + + dout(7) << "anchor_create " << *in << endl; + + // auth: do it + in->state_set(CInode::STATE_ANCHORING); + in->get(CInode::PIN_ANCHORING); + in->auth_pin(); + + // make trace + vector trace; + in->make_anchor_trace(trace); + + // do it + C_MDC_AnchorCreatePrepared *fin = new C_MDC_AnchorCreatePrepared(this, in); + mds->anchorclient->prepare_create(in->ino(), trace, &fin->atid, fin); +} + +class C_MDC_AnchorCreateLogged : public Context { + MDCache *cache; + CInode *in; + version_t atid; + version_t pdv; +public: + C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, version_t v) : + cache(c), in(i), atid(t), pdv(v) {} + void finish(int r) { + cache->_anchor_create_logged(in, atid, pdv); + } +}; + +void MDCache::_anchor_create_prepared(CInode *in, version_t atid) { - dout(7) << "request_finish " << *req << endl; - request_cleanup(req); - delete req; // delete req - - if (mds->logger) 
mds->logger->inc("reply"); + dout(10) << "_anchor_create_prepared " << *in << " atid " << atid << endl; + assert(in->inode.anchored == false); + + // predirty, prepare log entry + version_t pdv = in->pre_dirty(); + EUpdate *le = new EUpdate("anchor_create"); + le->metablob.add_dir_context(in->get_parent_dir()); - //dump(); + // update the logged inode copy + inode_t *pi = le->metablob.add_dentry(in->parent, true); + pi->anchored = true; + pi->version = pdv; + + // note anchor transaction + le->metablob.add_anchor_transaction(atid); + + // log + wait + mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid, pdv)); } -void MDCache::request_forward(Message *req, int who, int port) +void MDCache::_anchor_create_logged(CInode *in, version_t atid, version_t pdv) { - if (!port) port = MDS_PORT_SERVER; + dout(10) << "_anchor_create_logged pdv " << pdv << " on " << *in << endl; - dout(7) << "request_forward to " << who << " req " << *req << endl; - request_cleanup(req); - mds->send_message_mds(req, who, port); + // unpin + assert(in->state_test(CInode::STATE_ANCHORING)); + in->state_clear(CInode::STATE_ANCHORING); + in->put(CInode::PIN_ANCHORING); + in->auth_unpin(); + + // apply update to cache + in->inode.anchored = true; + in->mark_dirty(pdv); + + // tell the anchortable we've committed + mds->anchorclient->commit(atid); - if (mds->logger) mds->logger->inc("fw"); + // trigger waiters + in->finish_waiting(CInode::WAIT_ANCHORED, 0); } +// DESTROY -// ANCHORS - -class C_MDC_AnchorInode : public Context { +class C_MDC_AnchorDestroyPrepared : public Context { + MDCache *cache; CInode *in; - public: - C_MDC_AnchorInode(CInode *in) { - this->in = in; - } + version_t atid; + C_MDC_AnchorDestroyPrepared(MDCache *c, CInode *i) : cache(c), in(i) {} void finish(int r) { - if (r == 0) { - assert(in->inode.anchored == false); - in->inode.anchored = true; - - in->state_clear(CInode::STATE_ANCHORING); - in->put(CInode::PIN_ANCHORING); - - in->_mark_dirty(); // fixme - 
} - - // trigger - in->finish_waiting(CINODE_WAIT_ANCHORED, r); + cache->_anchor_destroy_prepared(in, atid); } }; -void MDCache::anchor_inode(CInode *in, Context *onfinish) +void MDCache::anchor_destroy(CInode *in, Context *onfinish) { assert(in->is_auth()); + // auth pin + if (!in->can_auth_pin()) { + dout(7) << "anchor_destroy not authpinnable, waiting on " << *in << endl; + in->add_waiter(CInode::WAIT_AUTHPINNABLE, onfinish); + return; + } + + // wait + if (onfinish) + in->add_waiter(CInode::WAIT_UNANCHORED, onfinish); + // already anchoring? - if (in->state_test(CInode::STATE_ANCHORING)) { - dout(7) << "anchor_inode already anchoring " << *in << endl; + if (in->state_test(CInode::STATE_UNANCHORING)) { + dout(7) << "anchor_destroy already unanchoring " << *in << endl; + return; + } - // wait - in->add_waiter(CINODE_WAIT_ANCHORED, - onfinish); + dout(7) << "anchor_destroy " << *in << endl; - } else { - dout(7) << "anchor_inode anchoring " << *in << endl; + // auth: do it + in->state_set(CInode::STATE_UNANCHORING); + in->get(CInode::PIN_UNANCHORING); + in->auth_pin(); + + // do it + C_MDC_AnchorDestroyPrepared *fin = new C_MDC_AnchorDestroyPrepared(this, in); + mds->anchorclient->prepare_destroy(in->ino(), &fin->atid, fin); +} - // auth: do it - in->state_set(CInode::STATE_ANCHORING); - in->get(CInode::PIN_ANCHORING); - - // wait - in->add_waiter(CINODE_WAIT_ANCHORED, - onfinish); - - // make trace - vector trace; - in->make_anchor_trace(trace); - - // do it - mds->anchorclient->create(in->ino(), trace, - new C_MDC_AnchorInode( in )); +class C_MDC_AnchorDestroyLogged : public Context { + MDCache *cache; + CInode *in; + version_t atid; + version_t pdv; +public: + C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, version_t v) : + cache(c), in(i), atid(t), pdv(v) {} + void finish(int r) { + cache->_anchor_destroy_logged(in, atid, pdv); } +}; + +void MDCache::_anchor_destroy_prepared(CInode *in, version_t atid) +{ + dout(10) << "_anchor_destroy_prepared " 
<< *in << " atid " << atid << endl; + + assert(in->inode.anchored == true); + + // predirty, prepare log entry + version_t pdv = in->pre_dirty(); + + EUpdate *le = new EUpdate("anchor_destroy"); + le->metablob.add_dir_context(in->get_parent_dir()); + + // update the logged inode copy + inode_t *pi = le->metablob.add_dentry(in->parent, true); + pi->anchored = false; // journal the cleared flag, the same value _anchor_destroy_logged applies to the cached inode + pi->version = pdv; + + // note anchor transaction + le->metablob.add_anchor_transaction(atid); + + // log + wait + mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, pdv)); } -void MDCache::handle_inode_link(MInodeLink *m) +void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, version_t pdv) { - CInode *in = get_inode(m->get_ino()); + dout(10) << "_anchor_destroy_logged pdv " << pdv << " on " << *in << endl; + + // unpin + assert(in->state_test(CInode::STATE_UNANCHORING)); + in->state_clear(CInode::STATE_UNANCHORING); + in->put(CInode::PIN_UNANCHORING); + in->auth_unpin(); + + // apply update to cache + in->inode.anchored = false; + in->inode.version = pdv; + + // tell the anchortable we've committed + mds->anchorclient->commit(atid); + + // trigger waiters + in->finish_waiting(CInode::WAIT_UNANCHORED, 0); +} + + +// ------------------------------------------------------------------------------- +// STRAYS + +void MDCache::eval_stray(CDentry *dn) +{ + dout(10) << "eval_stray " << *dn << endl; + assert(dn->is_primary()); + CInode *in = dn->inode; assert(in); - if (!in->is_auth()) { - assert(in->is_proxy()); - dout(7) << "handle_inode_link not auth for " << *in << ", fw to auth" << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE); + // purge? + if (in->inode.nlink == 0) { + if (!dn->is_replicated() && !in->is_any_caps()) + _purge_stray(dn); return; } + else if (in->inode.nlink == 1) { + // trivial reintegrate? 
+ if (!in->remote_parents.empty()) { + CDentry *rlink = *in->remote_parents.begin(); + if (rlink->is_auth() && + rlink->dir->can_auth_pin()) + reintegrate_stray(dn, rlink); + + if (!rlink->is_auth() && + !in->is_ambiguous_auth()) + migrate_stray(dn, rlink->authority().first); + } + } else { + // wait for next use. + } +} - dout(7) << "handle_inode_link on " << *in << endl; - if (!in->is_anchored()) { - assert(in->inode.nlink == 1); - dout(7) << "needs anchor, nlink=" << in->inode.nlink << ", creating anchor" << endl; - - anchor_inode(in, - new C_MDS_RetryMessage(mds, m)); - return; +class C_MDC_PurgeStray : public Context { + MDCache *cache; + CDentry *dn; + version_t pdv; +public: + C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v) : cache(c), dn(d), pdv(v) { } + void finish(int r) { + cache->_purge_stray_logged(dn, pdv); } +}; - in->inode.nlink++; - in->_mark_dirty(); // fixme +void MDCache::_purge_stray(CDentry *dn) +{ + dout(10) << "_purge_stray " << *dn << " " << *dn->inode << endl; + assert(!dn->is_replicated()); - // reply - dout(7) << " nlink++, now " << in->inode.nlink++ << endl; + // log removal + version_t pdv = dn->pre_dirty(); + + EUpdate *le = new EUpdate; + le->metablob.add_dir_context(dn->dir); + le->metablob.add_null_dentry(dn, true); + le->metablob.add_inode_truncate(dn->inode->inode, 0); + mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv)); +} + +void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv) +{ + dout(10) << "_purge_stray_logged " << *dn << " " << *dn->inode << endl; + CInode *in = dn->inode; + + // dirty+unlink dentry + dn->dir->mark_dirty(pdv); + dn->dir->unlink_inode(dn); + dn->dir->remove_dentry(dn); + + // purge+remove inode + if (in->inode.size > 0) + purge_inode(&in->inode, 0); + remove_inode(in); +} + + + +void MDCache::reintegrate_stray(CDentry *dn, CDentry *rlink) +{ + dout(10) << "reintegrate_stray " << *dn << " into " << *rlink << endl; + +} + + +void MDCache::migrate_stray(CDentry *dn, int dest) +{ 
+ dout(10) << "migrate_stray to mds" << dest << " " << *dn << endl; - mds->send_message_mds(new MInodeLinkAck(m->get_ino(), true), m->get_from(), MDS_PORT_CACHE); - delete m; } -void MDCache::handle_inode_link_ack(MInodeLinkAck *m) +// ------------------------------------------------------------------------------- +// HARD LINKS + + +class C_MDC_InodeLinkAgree : public Context { + MDS *mds; + MInodeLink *m; +public: + C_MDC_InodeLinkAgree(MDS *_mds, MInodeLink *_m) : mds(_mds), m(_m) {} + void finish(int r) { + mds->send_message_mds(new MInodeLink(MInodeLink::OP_AGREE, + m->get_ino(), + m->get_inc(), + m->get_reqid()), + m->get_source().num(), + m->get_source_port()); + delete m; + } +}; + +void MDCache::handle_inode_link(MInodeLink *m) { CInode *in = get_inode(m->get_ino()); assert(in); + dout(7) << "handle_inode_link " << *m << " on " << *in << endl; + + // get request. + // we should have this bc the inode is xlocked. + MDRequest *mdr = request_get(m->get_reqid()); + + switch (m->get_op()) { + // auth + case MInodeLink::OP_PREPARE: + assert(in->is_auth()); + { + version_t pv = in->pre_dirty(); + ESlaveUpdate *le = new ESlaveUpdate("link_prepare", m->get_reqid(), 0); + le->metablob.add_dir_context(in->get_parent_dir()); + inode_t *pi = le->metablob.add_primary_dentry(in->parent, true, in); + if (m->get_inc()) + pi->nlink++; + else + pi->nlink--; + pi->ctime = m->get_ctime(); + pi->version = pv; + mdr->projected_inode[in->ino()] = *pi; + mds->mdlog->submit_entry(le); + mds->mdlog->wait_for_sync(new C_MDC_InodeLinkAgree(mds, m)); + } + return; + + case MInodeLink::OP_COMMIT: + assert(in->is_auth()); + { + // make the update to our cache + in->inode = mdr->projected_inode[in->ino()]; + in->mark_dirty(in->inode.version); + + // journal the commit + ESlaveUpdate *le = new ESlaveUpdate("link_commit", m->get_reqid(), 1); + mds->mdlog->submit_entry(le); + } + delete m; + return; - dout(7) << "handle_inode_link_ack success = " << m->is_success() << " on " << *in << endl; 
- in->finish_waiting(CINODE_WAIT_LINK, - m->is_success() ? 1:-1); + + case MInodeLink::OP_AGREE: + assert(!in->is_auth()); + in->finish_waiting(CInode::WAIT_SLAVEAGREE); + delete m; + return; + + default: + assert(0); + } } + + // REPLICAS @@ -2357,30 +3879,20 @@ void MDCache::handle_discover(MDiscover *dis) { int whoami = mds->get_nodeid(); - // from me to me? - if (dis->get_asker() == whoami) { - dout(7) << "discover for " << dis->get_want().get_path() << " bounced back to me, dropping." << endl; - delete dis; - return; - } + assert(dis->get_asker() != whoami); CInode *cur = 0; - MDiscoverReply *reply = 0; - //filepath fullpath; + MDiscoverReply *reply = new MDiscoverReply(dis->get_base_ino()); // get started. - if (dis->get_base_ino() == 0) { + if (dis->get_base_ino() == MDS_INO_ROOT) { // wants root - dout(7) << "discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl; + dout(7) << "handle_discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl; assert(mds->get_nodeid() == 0); assert(root->is_auth()); - //fullpath = dis->get_want(); - - // add root - reply = new MDiscoverReply(0); reply->add_inode( root->replicate_to( dis->get_asker() ) ); dout(10) << "added root " << *root << endl; @@ -2389,181 +3901,170 @@ void MDCache::handle_discover(MDiscover *dis) } else { // there's a base inode cur = get_inode(dis->get_base_ino()); - assert(cur); - - if (dis->wants_base_dir()) { - dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants dir+" << dis->get_want().get_path() << endl; - } else { - dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur->dir << " wants " << dis->get_want().get_path() << endl; - } - assert(cur->is_dir()); - - // crazyness? 
- if (!cur->dir && !cur->is_auth()) { - int iauth = cur->authority(); - dout(7) << "no dir and not inode auth; fwd to auth " << iauth << endl; - mds->send_message_mds( dis, iauth, MDS_PORT_CACHE); + if (!cur) { + dout(7) << "handle_discover mds" << dis->get_asker() + << " don't have base ino " << dis->get_base_ino() + << ", dropping" << endl; + delete reply; return; } - // frozen_dir? - if (!cur->dir && cur->is_frozen_dir()) { - dout(7) << "is frozen_dir, waiting" << endl; - cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, dis)); - return; + if (dis->wants_base_dir()) { + dout(7) << "handle_discover mds" << dis->get_asker() + << " has " << *cur + << " wants basedir+" << dis->get_want().get_path() + << endl; + } else { + dout(7) << "handle_discover mds" << dis->get_asker() + << " has " << *cur + << " wants " << dis->get_want().get_path() + << endl; } - - if (!cur->dir) - cur->get_or_open_dir(this); - assert(cur->dir); - - dout(10) << "dir is " << *cur->dir << endl; - - // create reply - reply = new MDiscoverReply(cur->ino()); } assert(reply); assert(cur); - /* - // first traverse and make sure we won't have to do any waiting - dout(10) << "traversing full discover path = " << fullpath << endl; - vector trav; - int r = path_traverse(fullpath, trav, dis, MDS_TRAVERSE_FAIL); - if (r > 0) - return; // fw or delay - dout(10) << "traverse finish w/o blocking, continuing" << endl; - // ok, now we know we won't block on dentry locks or readdir. - */ - - // add content // do some fidgeting to include a dir if they asked for the base dir, or just root. - for (unsigned i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) { - // add dir - if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "they don't want the base dir" << endl; - } else { - // is it actaully a dir at all? 
- if (!cur->is_dir()) { - dout(7) << "not a dir " << *cur << endl; - reply->set_flag_error_dir(); - break; - } + for (unsigned i = 0; + i < dis->get_want().depth() || dis->get_want().depth() == 0; + i++) { + + // -- figure out the dir + + // is *cur even a dir at all? + if (!cur->is_dir()) { + dout(7) << *cur << " not a dir" << endl; + reply->set_flag_error_dir(); + break; + } - // add dir - if (!cur->dir_is_auth()) { - dout(7) << *cur << " dir auth is someone else, i'm done" << endl; - break; + // pick frag + frag_t fg; + if (dis->get_want().depth()) { + // dentry specifies + fg = cur->pick_dirfrag(dis->get_dentry(i)); + } else { + // requester explicity specified the frag + fg = dis->get_base_dir_frag(); + assert(dis->wants_base_dir() || dis->get_base_ino() == MDS_INO_ROOT); + } + CDir *curdir = cur->get_dirfrag(fg); + + // am i dir auth (or if no dir, at least the inode auth) + if ((!curdir && !cur->is_auth()) || + (curdir && !curdir->is_auth())) { + if (curdir) { + dout(7) << *curdir << " not dirfrag auth, setting dir_auth_hint" << endl; + reply->set_dir_auth_hint(curdir->authority().first); + } else { + dout(7) << *cur << " dirfrag not open, not inode auth, setting dir_auth_hint" << endl; + reply->set_dir_auth_hint(cur->authority().first); } - // did we hit a frozen_dir? - if (!cur->dir && cur->is_frozen_dir()) { - dout(7) << *cur << " is frozen_dir, stopping" << endl; - break; + // set hint (+ dentry, if there is one) + if (dis->get_want().depth() > i) + reply->set_error_dentry(dis->get_dentry(i)); + break; + } + + // open dir? + if (!curdir) + curdir = cur->get_or_open_dirfrag(this, fg); + assert(curdir); + assert(curdir->is_auth()); + + // is dir frozen? 
+ if (curdir->is_frozen()) { + if (reply->is_empty()) { + dout(7) << *curdir << " is frozen, empty reply, waiting" << endl; + curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); + delete reply; + return; + } else { + dout(7) << *curdir << " is frozen, non-empty reply, stopping" << endl; + break; } - - if (!cur->dir) cur->get_or_open_dir(this); - - reply->add_dir( new CDirDiscover( cur->dir, - cur->dir->add_replica( dis->get_asker() ) ) ); - dout(7) << "added dir " << *cur->dir << endl; } - if (dis->get_want().depth() == 0) break; - // lookup dentry - int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) ); - if (dentry_auth != mds->get_nodeid()) { - dout(7) << *cur->dir << "dentry " << dis->get_dentry(i) << " auth " << dentry_auth << ", i'm done." << endl; - break; // that's it for us! + // add dir + if (reply->is_empty() && !dis->wants_base_dir()) { + dout(7) << "not adding unwanted base dir " << *curdir << endl; + } else { + assert(!curdir->is_ambiguous_auth()); // would be frozen. + reply->add_dir( curdir->replicate_to(dis->get_asker()) ); + dout(7) << "added dir " << *curdir << endl; } - - // get inode - CDentry *dn = cur->dir->lookup( dis->get_dentry(i) ); + if (dis->get_want().depth() == 0) break; - /* - if (dn && !dn->can_read()) { // xlocked? - dout(7) << "waiting on " << *dn << endl; - cur->dir->add_waiter(CDIR_WAIT_DNREAD, - dn->name, - new C_MDS_RetryMessage(mds, dis)); - return; + // lookup inode? 
+ CDentry *dn = 0; + if (dis->get_want_ino()) { + CInode *in = get_inode(dis->get_want_ino()); + if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir) + dn = in->get_parent_dn(); + } else { + // lookup dentry + dn = curdir->lookup( dis->get_dentry(i) ); } - */ - if (dn) { - if (!dn->inode && dn->is_sync()) { - dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " null in " << *cur->dir << ", returning error" << endl; - reply->set_flag_error_dn( dis->get_dentry(i) ); - break; // don't replicate null but non-locked dentries. + // incomplete dir? + if (!dn) { + if (!curdir->is_complete()) { + // readdir + dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << endl; + if (reply->is_empty()) { + // fetch and wait + curdir->fetch(new C_MDS_RetryMessage(mds, dis)); + return; + } else { + // initiate fetch, but send what we have so far + curdir->fetch(0); + break; + } } - reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "added dentry " << *dn << endl; + // don't have wanted ino in this dir? + if (dis->get_want_ino()) { + // set error flag in reply + dout(7) << "ino " << dis->get_want_ino() << " in this dir, flagging error in " + << *curdir << endl; + reply->set_flag_error_ino(); + break; + } - if (!dn->inode) break; // we're done. + // send null dentry + dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " + << *curdir << endl; + dn = curdir->add_dentry(dis->get_dentry(i), 0); } + assert(dn); - if (dn && dn->inode) { - CInode *next = dn->inode; - assert(next->is_auth()); - - // add inode - //int nonce = next->cached_by_add(dis->get_asker()); - reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "added inode " << *next << endl;// " nonce=" << nonce<< endl; - - // descend - cur = next; - } else { - // don't have inode? 
- if (cur->dir->is_complete()) { - // set error flag in reply - dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " not found in " << *cur->dir << ", returning error" << endl; - reply->set_flag_error_dn( dis->get_dentry(i) ); - break; - } else { - // readdir - dout(7) << "mds" << whoami << " incomplete dir contents for " << *cur->dir << ", fetching" << endl; - - //mds->mdstore->fetch_dir(cur->dir, NULL); //new C_MDS_RetryMessage(mds, dis)); - //break; // send what we have so far - - mds->mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(mds, dis)); - return; - } - } + // add dentry + reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); + dout(7) << "added dentry " << *dn << endl; + + if (!dn->is_primary()) break; // stop on null or remote link. + + // add inode + CInode *next = dn->inode; + assert(next->is_auth()); + + reply->add_inode( next->replicate_to( dis->get_asker() ) ); + dout(7) << "added inode " << *next << endl; + + // descend, keep going. + cur = next; + continue; } - - // how did we do. - if (reply->is_empty()) { - // discard empty reply + // how did we do? + if (reply->is_empty()) { + dout(7) << "dropping this empty reply)." << endl; delete reply; - - if ((cur->is_auth() || cur->is_proxy() || cur->dir->is_proxy()) && - !cur->dir->is_auth()) { - // fwd to dir auth - int dirauth = cur->dir->authority(); - if (dirauth == dis->get_asker()) { - dout(7) << "from (new?) dir auth, dropping (obsolete) discover on floor." << endl; // XXX FIXME is this right? - //assert(dis->get_asker() == dis->get_source()); //might be a weird other loop. either way, asker has it. - delete dis; - } else { - dout(7) << "fwd to dir auth " << dirauth << endl; - mds->send_message_mds( dis, dirauth, MDS_PORT_CACHE ); - } - return; - } - - dout(7) << "i'm not auth or proxy, dropping (this empty reply). i bet i just exported." 
<< endl; - //assert(0); - } else { - // send back to asker dout(7) << "sending result back to asker mds" << dis->get_asker() << endl; mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); } @@ -2576,46 +4077,50 @@ void MDCache::handle_discover(MDiscover *dis) void MDCache::handle_discover_reply(MDiscoverReply *m) { // starting point - CInode *cur; list finished, error; - if (m->has_root()) { - // nowhere! - dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl; + // grab base inode + CInode *cur = get_inode(m->get_base_ino()); + + if (cur) { + dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl; + } else { + if (!m->has_root()) { + dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl; + delete m; + return; + } + + // it's the root inode. assert(!root); - assert(m->get_base_ino() == 0); + assert(m->get_base_ino() == MDS_INO_ROOT); assert(!m->has_base_dentry()); assert(!m->has_base_dir()); + dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl; + // add in root cur = new CInode(this, false); - - m->get_inode(0).update_inode(cur); + m->get_inode(0).update_inode(cur); // that thar 0 is an array index (the 0th inode in the reply). 
// root set_root( cur ); add_inode( cur ); - dout(7) << " got root: " << *cur << endl; - - // take waiters - finished.swap(waiting_for_root); - } else { - // grab inode - cur = get_inode(m->get_base_ino()); - - if (!cur) { - dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl; - delete m; - return; - } + dout(7) << "discover_reply got root " << *cur << endl; - dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl; + // take root waiters + finished.swap(waiting_for_root); } // fyi if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl; if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl; - dout(10) << "depth is " << m->get_depth() << ", has_root = " << m->has_root() << endl; + dout(10) << "depth = " << m->get_depth() + << ", has base_dir/base_dn/root = " + << m->has_base_dir() << " / " << m->has_base_dentry() << " / " << m->has_root() + << ", num dirs/dentries/inodes = " + << m->get_num_dirs() << " / " << m->get_num_dentries() << " / " << m->get_num_inodes() + << endl; // loop over discover results. // indexese follow each ([[dir] dentry] inode) @@ -2624,72 +4129,67 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) for (int i=m->has_root(); iget_depth(); i++) { dout(10) << "discover_reply i=" << i << " cur " << *cur << endl; + frag_t fg; + CDir *curdir = 0; + // dir if ((i > 0) || (i == 0 && m->has_base_dir())) { - if (cur->dir) { - // had it - /* this is strange, but it happens when: - we discover multiple dentries under a dir. - bc, no flag to indicate a dir discover is underway, (as there is w/ a dentry one). - this is actually good, since (dir aside) they're asking for different information. 
- */ - dout(7) << "had " << *cur->dir; - m->get_dir(i).update_dir(cur->dir); - dout2(7) << ", now " << *cur->dir << endl; - } else { - // add it (_replica_) - cur->set_dir( new CDir(cur, this, false) ); - m->get_dir(i).update_dir(cur->dir); - dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl; + assert(m->get_dir(i).get_dirfrag().ino == cur->ino()); + fg = m->get_dir(i).get_dirfrag().frag; - // get waiters - cur->take_waiting(CINODE_WAIT_DIR, finished); - } + // add/update the dir replica + curdir = add_replica_dir(cur, fg, m->get_dir(i), + m->get_source().num(), + finished); } - + // dentry error? if (i == m->get_depth()-1 && m->is_flag_error_dn()) { // error! assert(cur->is_dir()); - if (cur->dir) { + if (curdir) { dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << endl; - cur->dir->take_waiting(CDIR_WAIT_DENTRY, - m->get_error_dentry(), - error); + curdir->take_dentry_waiting(m->get_error_dentry(), + error); } else { dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << endl; - cur->take_waiting(CINODE_WAIT_DIR, error); + cur->take_waiting(CInode::WAIT_DIR, error); + dir_discovers.erase(cur->ino()); } break; } - if (i >= m->get_num_dentries()) break; + if (i >= m->get_last_dentry()) break; // dentry dout(7) << "i = " << i << " dentry is " << m->get_dentry(i).get_dname() << endl; + if (!curdir) { + fg = cur->pick_dirfrag(m->get_dentry(i).get_dname()); + curdir = cur->get_dirfrag(fg); + } + assert(curdir); + CDentry *dn = 0; if (i > 0 || m->has_base_dentry()) { - dn = cur->dir->lookup( m->get_dentry(i).get_dname() ); + dn = curdir->lookup( m->get_dentry(i).get_dname() ); if (dn) { dout(7) << "had " << *dn << endl; - dn->replica_nonce = m->get_dentry(i).get_nonce(); // fix nonce. 
- } else { - dn = cur->dir->add_dentry( m->get_dentry(i).get_dname(), 0, false ); m->get_dentry(i).update_dentry(dn); + } else { + dn = curdir->add_dentry( m->get_dentry(i).get_dname(), 0, false ); + m->get_dentry(i).update_new_dentry(dn); dout(7) << "added " << *dn << endl; } - cur->dir->take_waiting(CDIR_WAIT_DENTRY, - m->get_dentry(i).get_dname(), - finished); + curdir->take_dentry_waiting(m->get_dentry(i).get_dname(), finished); } - if (i >= m->get_num_inodes()) break; + if (i >= m->get_last_inode()) break; // inode dout(7) << "i = " << i << " ino is " << m->get_ino(i) << endl; @@ -2697,32 +4197,19 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) assert(dn); if (in) { - dout(7) << "had " << *in << endl; - - // fix nonce - dout(7) << " my nonce is " << in->replica_nonce << ", taking from discover, which has " << m->get_inode(i).get_replica_nonce() << endl; + dout(7) << "had " << *in << ", new nonce " << m->get_inode(i).get_replica_nonce() << endl; in->replica_nonce = m->get_inode(i).get_replica_nonce(); - if (dn && in != dn->inode) { - dout(7) << " but it's not linked via dentry " << *dn << endl; - // link - if (dn->inode) { - dout(7) << "dentry WAS linked to " << *dn->inode << endl; - assert(0); // WTF. - } - dn->dir->link_inode(dn, in); - } + assert(in == dn->inode); // if we have it, it should be already linked to *dn. } else { - assert(dn->inode == 0); // better not be something else linked to this dentry... - // didn't have it. in = new CInode(this, false); - m->get_inode(i).update_inode(in); + add_inode( in ); // link in - add_inode( in ); + assert(dn->inode == 0); // better not be something else linked to this dentry. dn->dir->link_inode(dn, in); dout(7) << "added " << *in << " nonce " << in->replica_nonce << endl; @@ -2732,25 +4219,93 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) cur = in; } - // dir error at the end there? - if (m->is_flag_error_dir()) { + // dir_auth hint? 
+ if (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN && + m->get_dir_auth_hint() != mds->get_nodeid()) { + dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << endl; + // let's try again. + int hint = m->get_dir_auth_hint(); + + // include any path fragment we were looking for at the time + filepath want; + if (m->get_error_dentry().length() > 0) + want.push_dentry(m->get_error_dentry()); + + mds->send_message_mds(new MDiscover(mds->get_nodeid(), + cur->ino(), + want, + true), // being conservative here. + hint, MDS_PORT_CACHE); + + // note the dangling discover + dir_discovers[cur->ino()].insert(hint); + } + else if (m->is_flag_error_dir()) { + // dir error at the end there? dout(7) << " flag_error on dir " << *cur << endl; assert(!cur->is_dir()); - cur->take_waiting(CINODE_WAIT_DIR, error); + cur->take_waiting(CInode::WAIT_DIR, error); + dir_discovers.erase(cur->ino()); } + // finish errors directly finish_contexts(error, -ENOENT); - mds->queue_finished(finished); + mds->queue_waiters(finished); // done delete m; } +CDir *MDCache::add_replica_dir(CInode *diri, + frag_t fg, CDirDiscover &dis, int from, + list& finished) +{ + // add it (_replica_) + CDir *dir = diri->get_dirfrag(fg); + + if (dir) { + // had replica. update w/ new nonce. + dis.update_dir(dir); + dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << endl; + } else { + // add replica. + dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); + dis.update_dir(dir); + + // is this a dir_auth delegation boundary? 
+ if (from != diri->authority().first || + diri->is_ambiguous_auth() || + diri->ino() < MDS_INO_BASE) + adjust_subtree_auth(dir, from); + + dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << endl; + + // get waiters + diri->take_waiting(CInode::WAIT_DIR, finished); + dir_discovers.erase(diri->ino()); + } + + return dir; +} + +CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, int from) +{ + assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN); + + // forge a replica. + CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); + + // i'm assuming this is a subtree root. + adjust_subtree_auth(dir, from); + dout(7) << "forge_replica_dir added " << *dir << " while mds" << from << " is down" << endl; + return dir; +} + @@ -2802,192 +4357,7 @@ void MDCache::handle_inode_update(MInodeUpdate *m) -void MDCache::handle_cache_expire(MCacheExpire *m) -{ - int from = m->get_from(); - int source = m->get_source().num(); - map proxymap; - - if (m->get_from() == source) { - dout(7) << "cache_expire from mds" << from << endl; - } else { - dout(7) << "cache_expire from mds" << from << " via " << source << endl; - } - - // inodes - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = get_inode(it->first); - int nonce = it->second; - - if (!in) { - dout(0) << "inode expire on " << it->first << " from " << from << ", don't have it" << endl; - assert(in); // i should be authority, or proxy .. 
and pinned - } - if (!in->is_auth()) { - int newauth = in->authority(); - dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl; - assert(newauth >= 0); - if (!in->state_test(CInode::STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl; - assert(in->state_test(CInode::STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_inode(it->first, it->second); - continue; - } - - // check nonce - if (from == mds->get_nodeid()) { - // my cache_expire, and the export_dir giving auth back to me crossed paths! - // we can ignore this. no danger of confusion since the two parties are both me. - dout(7) << "inode expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl; - } - else if (nonce == in->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << "inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << endl; - inode_remove_replica(in, from); - - } - else { - // this is an old nonce, ignore expire. - dout(7) << "inode expire on " << *in << " from mds" << from - << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping" - << endl; - assert(in->get_replica_nonce(from) > nonce); - } - } - - // dirs - for (map::iterator it = m->get_dirs().begin(); - it != m->get_dirs().end(); - it++) { - CInode *diri = get_inode(it->first); - assert(diri); - CDir *dir = diri->dir; - int nonce = it->second; - - if (!dir) { - dout(0) << "dir expire on " << it->first << " from " << from << ", don't have it" << endl; - assert(dir); // i should be authority, or proxy ... and pinned - } - if (!dir->is_auth()) { - int newauth = dir->authority(); - dout(7) << "proxy dir expire on " << *dir << " to " << newauth << endl; - if (!dir->is_proxy()) dout(0) << "nonproxy dir expire? " << *dir << " .. auth is " << newauth << " .. 
expire is from " << from << endl; - assert(dir->is_proxy()); - assert(newauth >= 0); - assert(dir->state_test(CDIR_STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_dir(it->first, it->second); - continue; - } - - // check nonce - if (from == mds->get_nodeid()) { - dout(7) << "dir expire on " << *dir << " from mds" << from - << " .. ME! ignoring" << endl; - } - else if (nonce == dir->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << "dir expire on " << *dir << " from mds" << from - << " replicas was " << dir->replicas << endl; - dir->remove_replica(from); - } - else { - // this is an old nonce, ignore expire. - dout(7) << "dir expire on " << *dir << " from mds" << from - << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) - << "), dropping" << endl; - assert(dir->get_replica_nonce(from) > nonce); - } - } - - // dentries - for (map >::iterator pd = m->get_dentries().begin(); - pd != m->get_dentries().end(); - ++pd) { - dout(0) << "dn expires in dir " << pd->first << endl; - CInode *diri = get_inode(pd->first); - CDir *dir = diri->dir; - assert(dir); - - if (!dir->is_auth()) { - int newauth = dir->authority(); - dout(7) << "proxy dentry expires on " << *dir << " to " << newauth << endl; - if (!dir->is_proxy()) - dout(0) << "nonproxy dentry expires? " << *dir << " .. auth is " << newauth - << " .. 
expire is from " << from << endl; - assert(dir->is_proxy()); - assert(newauth >= 0); - assert(dir->state_test(CDIR_STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_dentries(pd->first, pd->second); - continue; - } - - for (map::iterator p = pd->second.begin(); - p != pd->second.end(); - ++p) { - int nonce = p->second; - - CDentry *dn = dir->lookup(p->first); - if (!dn) - dout(0) << "missing dentry for " << p->first << " in " << *dir << endl; - assert(dn); - - if (from == mds->get_nodeid()) { - dout(7) << "dentry_expire on " << *dn << " from mds" << from - << " .. ME! ignoring" << endl; - } - else if (nonce == dn->get_replica_nonce(from)) { - dout(7) << "dentry_expire on " << *dn << " from mds" << from << endl; - dn->remove_replica(from); - } - else { - dout(7) << "dentry_expire on " << *dn << " from mds" << from - << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) - << "), dropping" << endl; - assert(dn->get_replica_nonce(from) > nonce); - } - } - } - - // send proxy forwards - for (map::iterator it = proxymap.begin(); - it != proxymap.end(); - it++) { - dout(7) << "sending proxy forward to " << it->first << endl; - mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); - } - - // done - delete m; -} -void MDCache::inode_remove_replica(CInode *in, int from) -{ - in->remove_replica(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! - // fix lock - if (in->hardlock.is_gathering(from)) { - in->hardlock.gather_set.erase(from); - if (in->hardlock.gather_set.size() == 0) - mds->locker->inode_hard_eval(in); - } - if (in->filelock.is_gathering(from)) { - in->filelock.gather_set.erase(from); - if (in->filelock.gather_set.size() == 0) - mds->locker->inode_file_eval(in); - } - - // alone now? 
- if (!in->is_replicated()) { - mds->locker->inode_hard_eval(in); - mds->locker->inode_file_eval(in); - } -} int MDCache::send_dir_updates(CDir *dir, bool bcast) @@ -3017,7 +4387,7 @@ int MDCache::send_dir_updates(CDir *dir, bool bcast) //if (*it == except) continue; dout(7) << "sending dir_update on " << *dir << " to " << *it << endl; - mds->send_message_mds(new MDirUpdate(dir->ino(), + mds->send_message_mds(new MDirUpdate(dir->dirfrag(), dir->dir_rep, dir->dir_rep_by, path, @@ -3031,9 +4401,9 @@ int MDCache::send_dir_updates(CDir *dir, bool bcast) void MDCache::handle_dir_update(MDirUpdate *m) { - CInode *in = get_inode(m->get_ino()); - if (!in || !in->dir) { - dout(5) << "dir_update on " << m->get_ino() << ", don't have it" << endl; + CDir *dir = get_dirfrag(m->get_dirfrag()); + if (!dir) { + dout(5) << "dir_update on " << m->get_dirfrag() << ", don't have it" << endl; // discover it? if (m->should_discover()) { @@ -3043,29 +4413,31 @@ void MDCache::handle_dir_update(MDirUpdate *m) dout(5) << "trying discover on dir_update for " << path << endl; - int r = path_traverse(path, trace, true, + int r = path_traverse(0, 0, + path, trace, true, m, new C_MDS_RetryMessage(mds, m), MDS_TRAVERSE_DISCOVER); if (r > 0) return; - if (r == 0) { - assert(in); - open_remote_dir(in, new C_MDS_RetryMessage(mds, m)); - return; - } - assert(0); + assert(r == 0); + + CInode *in = get_inode(m->get_dirfrag().ino); + assert(in); + open_remote_dir(in, m->get_dirfrag().frag, + new C_MDS_RetryMessage(mds, m)); + return; } - goto out; + delete m; + return; } // update - dout(5) << "dir_update on " << *in->dir << endl; - in->dir->dir_rep = m->get_dir_rep(); - in->dir->dir_rep_by = m->get_dir_rep_by(); + dout(5) << "dir_update on " << *dir << endl; + dir->dir_rep = m->get_dir_rep(); + dir->dir_rep_by = m->get_dir_rep_by(); // done - out: delete m; } @@ -3073,175 +4445,15 @@ void MDCache::handle_dir_update(MDirUpdate *m) -class C_MDC_DentryUnlink : public Context { -public: - MDCache *mdc; - 
CDentry *dn; - CDir *dir; - Context *c; - C_MDC_DentryUnlink(MDCache *mdc, CDentry *dn, CDir *dir, Context *c) { - this->mdc = mdc; - this->dn = dn; - this->dir = dir; - this->c = c; - } - void finish(int r) { - assert(r == 0); - mdc->dentry_unlink_finish(dn, dir, c); - } -}; - - -// NAMESPACE FUN - -void MDCache::dentry_unlink(CDentry *dn, Context *c) -{ - CDir *dir = dn->dir; - string dname = dn->name; - - assert(dn->lockstate == DN_LOCK_XLOCK); - - // i need the inode to do any of this properly - assert(dn->inode); - - // log it - if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? FIXME - mds->mdlog->submit_entry(new EString("unlink fixme fixme"),//EUnlink(dir, dn, dn->inode), - NULL); // FIXME FIXME FIXME - - // tell replicas - if (dir->is_replicated()) { - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - dout(7) << "inode_unlink sending DentryUnlink to mds" << it->first << endl; - - mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), it->first, MDS_PORT_CACHE); - } - - // don't need ack. - } - - - // inode deleted? - if (dn->is_primary()) { - assert(dn->inode->is_auth()); - dn->inode->inode.nlink--; - - if (dn->inode->is_dir()) assert(dn->inode->inode.nlink == 0); // no hard links on dirs - - // last link? - if (dn->inode->inode.nlink == 0) { - // truly dangling - if (dn->inode->dir) { - // mark dir clean too, since it now dne! - assert(dn->inode->dir->is_auth()); - dn->inode->dir->state_set(CDIR_STATE_DELETED); - dn->inode->dir->remove_null_dentries(); - dn->inode->dir->mark_clean(); - } - - // mark it clean, it's dead - if (dn->inode->is_dirty()) - dn->inode->mark_clean(); - - } else { - // migrate to inode file - dout(7) << "removed primary, but there are remote links, moving to inode file: " << *dn->inode << endl; - - // dangling but still linked. 
- assert(dn->inode->is_anchored()); - - // unlink locally - CInode *in = dn->inode; - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // mark it dirty! - in->_mark_dirty(); // fixme - - // update anchor to point to inode file+mds - vector atrace; - in->make_anchor_trace(atrace); - assert(atrace.size() == 1); // it's dangling - mds->anchorclient->update(in->ino(), atrace, - new C_MDC_DentryUnlink(this, dn, dir, c)); - return; - } - } - else if (dn->is_remote()) { - // need to dec nlink on primary - if (dn->inode->is_auth()) { - // awesome, i can do it - dout(7) << "remote target is local, nlink--" << endl; - dn->inode->inode.nlink--; - dn->inode->_mark_dirty(); // fixme - - if (( dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 0) || - (!dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 1)) { - dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl; - - // remove anchor (async) - mds->anchorclient->destroy(dn->inode->ino(), NULL); - } - } else { - int auth = dn->inode->authority(); - dout(7) << "remote target is remote, sending unlink request to " << auth << endl; - - mds->send_message_mds(new MInodeUnlink(dn->inode->ino(), mds->get_nodeid()), - auth, MDS_PORT_CACHE); - - // unlink locally - CInode *in = dn->inode; - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // add waiter - in->add_waiter(CINODE_WAIT_UNLINK, c); - return; - } - } - else - assert(0); // unlink on null dentry?? - - // unlink locally - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // finish! - dentry_unlink_finish(dn, dir, c); -} - - -void MDCache::dentry_unlink_finish(CDentry *dn, CDir *dir, Context *c) -{ - dout(7) << "dentry_unlink_finish on " << *dn << endl; - string dname = dn->name; - - // unpin dir / unxlock - mds->locker->dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking - - // did i empty out an imported dir? 
- if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0) - migrator->export_empty_import(dir); - - // wake up any waiters - dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue); - - c->finish(0); -} - - +// UNLINK void MDCache::handle_dentry_unlink(MDentryUnlink *m) { - CInode *diri = get_inode(m->get_dirino()); - CDir *dir = 0; - if (diri) dir = diri->dir; + CDir *dir = get_dirfrag(m->get_dirfrag()); - if (!diri || !dir) { - dout(7) << "handle_dentry_unlink don't have dir " << m->get_dirino() << endl; + if (!dir) { + dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << endl; } else { CDentry *dn = dir->lookup(m->get_dn()); @@ -3250,22 +4462,44 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m) } else { dout(7) << "handle_dentry_unlink on " << *dn << endl; - // dir? - if (dn->inode) { - if (dn->inode->dir) { - dn->inode->dir->state_set(CDIR_STATE_DELETED); - dn->inode->dir->remove_null_dentries(); - } + // move to stray? + CDentry *straydn = 0; + if (m->strayin) { + // inode + CInode *in = get_inode(MDS_INO_STRAY(m->get_source().num())); + if (!in) { + in = new CInode(this, false); + m->strayin->update_inode(in); + add_inode(in); + } else { + m->strayin->update_inode(in); + } + + // dirfrag + list finished; + CDir *dir = add_replica_dir(in, m->straydir->get_dirfrag().frag, *m->straydir, + m->get_source().num(), finished); + if (!finished.empty()) mds->queue_waiters(finished); + + // dentry + straydn = dir->add_dentry( m->straydn->get_dname(), 0, false ); + m->straydn->update_new_dentry(straydn); } + + // open inode? 
+ if (dn->is_primary()) { + CInode *in = dn->inode; + dn->dir->unlink_inode(dn); + assert(straydn); + straydn->dir->link_inode(straydn, in); + } else { + assert(dn->is_remote()); + dn->dir->unlink_inode(dn); + } + assert(dn->is_null()); - string dname = dn->name; - - // unlink - dn->dir->remove_dentry(dn); - - // wake up - //dir->finish_waiting(CDIR_WAIT_DNREAD, dname); - dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue); + // move to bottom of lru + lru.lru_bottouch(dn); } } @@ -3274,263 +4508,181 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m) } -void MDCache::handle_inode_unlink(MInodeUnlink *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - // proxy? - if (in->is_proxy()) { - dout(7) << "handle_inode_unlink proxy on " << *in << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE); - return; - } - assert(in->is_auth()); - - // do it. - dout(7) << "handle_inode_unlink nlink=" << in->inode.nlink << " on " << *in << endl; - assert(in->inode.nlink > 0); - in->inode.nlink--; - - if (in->state_test(CInode::STATE_DANGLING)) { - // already dangling. - // last link? - if (in->inode.nlink == 0) { - dout(7) << "last link, marking clean and removing anchor" << endl; - - in->mark_clean(); // mark it clean. - - // remove anchor (async) - mds->anchorclient->destroy(in->ino(), NULL); - } - else { - in->_mark_dirty(); // fixme - } - } else { - // has primary link still. 
- assert(in->inode.nlink >= 1); - in->_mark_dirty(); // fixme - - if (in->inode.nlink == 1) { - dout(7) << "nlink=1, removing anchor" << endl; - - // remove anchor (async) - mds->anchorclient->destroy(in->ino(), NULL); - } - } - - // ack - mds->send_message_mds(new MInodeUnlinkAck(m->get_ino()), m->get_from(), MDS_PORT_CACHE); -} - -void MDCache::handle_inode_unlink_ack(MInodeUnlinkAck *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_inode_unlink_ack on " << *in << endl; - in->finish_waiting(CINODE_WAIT_UNLINK, 0); -} - - - - -/* - * some import/export helpers - */ - -/** con = get_auth_container(dir) - * Returns the directory in which authority is delegated for *dir. - * This may be because a directory is an import, or because it is hashed - * and we are nested underneath an inode in that dir (that hashes to us). - * Thus do not assume result->is_auth()! It is_auth() || is_hashed(). - */ -CDir *MDCache::get_auth_container(CDir *dir) -{ - CDir *imp = dir; // might be *dir - - // find the underlying import or hash that delegates dir - while (true) { - if (imp->is_import()) break; // import - imp = imp->get_parent_dir(); - if (!imp) break; // none - if (imp->is_hashed()) break; // hash - } - - return imp; -} - -CDir *MDCache::get_export_container(CDir *dir) -{ - CDir *ex = dir; // might be *dir - assert(!ex->is_auth()); - - // find the underlying import or hash that delegates dir away - while (true) { - if (ex->is_export()) break; // import - ex = ex->get_parent_dir(); - assert(ex); - if (ex->is_hashed()) break; // hash - } - - return ex; -} +// ============================================================== +// debug crap -void MDCache::find_nested_exports(CDir *dir, set& s) +void MDCache::show_subtrees(int dbl) { - CDir *import = get_auth_container(dir); - find_nested_exports_under(import, dir, s); -} + //dout(10) << "show_subtrees" << endl; -void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set& s) -{ - dout(10) << 
"find_nested_exports for " << *dir << endl; - dout(10) << "find_nested_exports_under import " << *import << endl; + if (dbl > g_conf.debug && dbl > g_conf.debug_mds) + return; // i won't print anything. - if (import == dir) { - // yay, my job is easy! - for (set::iterator p = nested_exports[import].begin(); - p != nested_exports[import].end(); - p++) { - CDir *nested = *p; - s.insert(nested); - dout(10) << "find_nested_exports " << *dir << " " << *nested << endl; - } + if (subtrees.empty()) { + dout(dbl) << "no subtrees" << endl; return; } - // ok, my job is annoying. - for (set::iterator p = nested_exports[import].begin(); - p != nested_exports[import].end(); - p++) { - CDir *nested = *p; - - dout(12) << "find_nested_exports checking " << *nested << endl; - - // trace back to import, or dir - CDir *cur = nested->get_parent_dir(); - while (!cur->is_import() || cur == dir) { - if (cur == dir) { - s.insert(nested); - dout(10) << "find_nested_exports " << *dir << " " << *nested << endl; - break; - } else { - cur = cur->get_parent_dir(); - } + // root frags + list rootfrags; + if (root) root->get_dirfrags(rootfrags); + if (stray) stray->get_dirfrags(rootfrags); + + // queue stuff + list > q; + string indent; + set seen; + + // calc depth + for (list::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p) + q.push_back(pair(*p, 0)); + + int depth = 0; + while (!q.empty()) { + CDir *dir = q.front().first; + int d = q.front().second; + q.pop_front(); + + if (d > depth) depth = d; + + // sanity check + if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << endl; + assert(seen.count(dir) == 0); + seen.insert(dir); + + // nested items? 
+ if (!subtrees[dir].empty()) { + for (set::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) + q.push_front(pair(*p, d+1)); } } -} - - - - + // print tree + for (list::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p) + q.push_back(pair(*p, 0)); + while (!q.empty()) { + CDir *dir = q.front().first; + int d = q.front().second; + q.pop_front(); + // adjust indenter + while ((unsigned)d < indent.size()) + indent.resize(d); + + // pad + string pad = "______________________________________"; + pad.resize(depth*2+1-indent.size()); + if (!subtrees[dir].empty()) + pad[0] = '.'; // parent + string auth; + if (dir->is_auth()) + auth = "auth "; + else + auth = " rep "; - - - - - - - -// ============================================================== -// debug crap - - -void MDCache::show_imports() -{ - int db = 10; - - if (imports.empty() && - hashdirs.empty()) { - dout(db) << "show_imports: no imports/exports/hashdirs" << endl; - return; + char s[10]; + if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN) + sprintf(s, "%2d ", dir->get_dir_auth().first); + else + sprintf(s, "%2d,%2d", dir->get_dir_auth().first, dir->get_dir_auth().second); + + // print + dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << endl; + + if (dir->ino() == MDS_INO_ROOT) + assert(dir->inode == root); + if (dir->ino() == MDS_INO_STRAY(mds->get_nodeid())) + assert(dir->inode == stray); + + // nested items? + if (!subtrees[dir].empty()) { + // more at my level? 
+ if (!q.empty() && q.front().second == d) + indent += "| "; + else + indent += " "; + + for (set::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) + q.push_front(pair(*p, d+2)); + } } - dout(db) << "show_imports:" << endl; +} - set ecopy = exports; - set::iterator it = hashdirs.begin(); - while (1) { - if (it == hashdirs.end()) it = imports.begin(); - if (it == imports.end() ) break; - - CDir *im = *it; +void MDCache::show_cache() +{ + dout(7) << "show_cache" << endl; + + for (hash_map::iterator it = inode_map.begin(); + it != inode_map.end(); + it++) { + // unlinked? + if (!it->second->parent) + dout(7) << " unlinked " << *it->second << endl; - if (im->is_import()) { - dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; - //assert( im->is_auth() ); - } - else if (im->is_hashed()) { - if (im->is_import()) continue; // if import AND hash, list as import. - dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; - } - - for (set::iterator p = nested_exports[im].begin(); - p != nested_exports[im].end(); - p++) { - CDir *exp = *p; - if (exp->is_hashed()) { - //assert(0); // we don't do it this way actually - dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; - //assert( !exp->is_auth() ); - } else { - dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; - assert( exp->is_export() ); - //assert( !exp->is_auth() ); - } - - if ( get_auth_container(exp) != im ) { - dout(1) << "uh oh, auth container is " << *get_auth_container(exp) << endl; - assert( get_auth_container(exp) == im ); - } - - if (ecopy.count(exp) != 1) { - dout(1) << "***** nested_export " << *exp << " not in exports" << endl; - assert(0); + // 
dirfrags? + list dfs; + it->second->get_dirfrags(dfs); + for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { + CDir *dir = *p; + dout(7) << " dirfrag " << *dir << endl; + + for (CDir_map_t::iterator p = dir->items.begin(); + p != dir->items.end(); + ++p) { + CDentry *dn = p->second; + dout(7) << " dentry " << *dn << endl; + if (dn->is_primary() && dn->inode) + dout(7) << " inode " << *dn->inode << endl; } - ecopy.erase(exp); } - - it++; - } - - if (ecopy.size()) { - for (set::iterator it = ecopy.begin(); - it != ecopy.end(); - it++) - dout(1) << "***** stray item in exports: " << **it << endl; - assert(ecopy.size() == 0); } } -void MDCache::show_cache() +void MDCache::dump_cache() { - dout(7) << "show_cache" << endl; + char fn[20]; + sprintf(fn, "cachedump.%d.mds%d", mds->mdsmap->get_epoch(), mds->get_nodeid()); + dout(1) << "dump_cache to " << fn << endl; + + ofstream myfile; + myfile.open(fn); + for (hash_map::iterator it = inode_map.begin(); it != inode_map.end(); it++) { - dout(7) << *((*it).second) << endl; - - CDentry *dn = (*it).second->get_parent_dn(); - if (dn) - dout(7) << " dn " << *dn << endl; - if ((*it).second->dir) - dout(7) << " subdir " << *(*it).second->dir << endl; + list dfs; + it->second->get_dirfrags(dfs); + for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { + CDir *dir = *p; + myfile << *dir->inode << endl; + myfile << *dir << endl; + + for (CDir_map_t::iterator p = dir->items.begin(); + p != dir->items.end(); + ++p) { + CDentry *dn = p->second; + myfile << *dn << endl; + } + } } -} + myfile.close(); +} diff --git a/trunk/ceph/mds/MDCache.h b/trunk/ceph/mds/MDCache.h index 7b8825f073726..4d42ea6ea3634 100644 --- a/trunk/ceph/mds/MDCache.h +++ b/trunk/ceph/mds/MDCache.h @@ -28,8 +28,7 @@ #include "CInode.h" #include "CDentry.h" #include "CDir.h" -#include "Lock.h" - +#include "include/Context.h" class MDS; class Migrator; @@ -49,7 +48,7 @@ class MDirUpdate; class MDentryUnlink; class MLock; - +class Message; class 
MClientRequest; @@ -64,22 +63,83 @@ class MClientRequest; * mostly information about locks held, so that we can drop them all * the request is finished or forwarded. see request_*(). */ -typedef struct { - CInode *ref; // reference inode - set< CInode* > request_pins; - set< CDir* > request_dir_pins; - map< CDentry*, vector > traces; // path pins held - set< CDentry* > xlocks; // xlocks (local) - set< CDentry* > foreign_xlocks; // xlocks on foreign hosts -} active_request_t; - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Message *p) const { - static hash H; - return H((unsigned long)p); +struct MDRequest { + metareqid_t reqid; + Message *request; // MClientRequest, or MLock + int by_mds; // if MLock, and remote xlock attempt + + vector trace; // original path traversal. + CInode *ref; // reference inode. if there is only one, and its path is pinned. + + // cache pins (so things don't expire) + set< MDSCacheObject* > pins; + + // auth pins + set< CDir* > dir_auth_pins; + set< CInode* > inode_auth_pins; + + // held locks + set< SimpleLock* > rdlocks; // always local. + set< SimpleLock* > wrlocks; // always local. + set< SimpleLock* > xlocks; // local or remote. 
+ set< SimpleLock*, SimpleLock::ptr_lt > locks; // full ordering + + // projected updates + map< inodeno_t, inode_t > projected_inode; + + + // --------------------------------------------------- + MDRequest() : request(0), by_mds(-1), ref(0) {} + MDRequest(metareqid_t ri, Message *req=0) : reqid(ri), request(req), by_mds(-1), ref(0) {} + + // request + MClientRequest *client_request() { + return (MClientRequest*)request; + } + + // pin items in cache + void pin(MDSCacheObject *o) { + if (pins.count(o) == 0) { + o->get(MDSCacheObject::PIN_REQUEST); + pins.insert(o); + } + } + + // auth pins + bool is_auth_pinned(CInode *in) { return inode_auth_pins.count(in); } + bool is_auth_pinned(CDir *dir) { return dir_auth_pins.count(dir); } + void auth_pin(CInode *in) { + if (!is_auth_pinned(in)) { + in->auth_pin(); + inode_auth_pins.insert(in); + } + } + void auth_pin(CDir *dir) { + if (!is_auth_pinned(dir)) { + dir->auth_pin(); + dir_auth_pins.insert(dir); } - }; + } + void drop_auth_pins() { + for (set::iterator it = inode_auth_pins.begin(); + it != inode_auth_pins.end(); + it++) + (*it)->auth_unpin(); + inode_auth_pins.clear(); + for (set::iterator it = dir_auth_pins.begin(); + it != dir_auth_pins.end(); + it++) + (*it)->auth_unpin(); + dir_auth_pins.clear(); + } +}; + +inline ostream& operator<<(ostream& out, MDRequest &mdr) +{ + out << "request(" << mdr.reqid; + //if (mdr.request) out << " " << *mdr.request; + out << ")"; + return out; } class MDCache { @@ -92,88 +152,149 @@ class MDCache { protected: // the cache CInode *root; // root inode - hash_map inode_map; // map of inodes by ino + hash_map inode_map; // map of inodes by ino + CInode *stray; // my stray dir + + // root + list waiting_for_root; + map > waiting_for_stray; + +public: + int get_num_inodes() { return inode_map.size(); } + int get_num_dentries() { return lru.lru_get_size(); } + + + // -- subtrees -- +protected: + map > subtrees; // nested bounds on subtrees. 
- list inode_expire_queue; // inodes to delete + // adjust subtree auth specification + // dir->dir_auth + // imports/exports/nested_exports + // join/split subtrees as appropriate +public: + bool is_subtrees() { return !subtrees.empty(); } + void adjust_subtree_auth(CDir *root, pair auth); + void adjust_subtree_auth(CDir *root, int a, int b=CDIR_AUTH_UNKNOWN) { + adjust_subtree_auth(root, pair(a,b)); + } + void adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth); + void adjust_bounded_subtree_auth(CDir *dir, set& bounds, int a) { + adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); + } + void adjust_bounded_subtree_auth(CDir *dir, list& bounds, pair auth); + void adjust_bounded_subtree_auth(CDir *dir, list& bounds, int a) { + adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); + } + void adjust_export_state(CDir *dir); + void try_subtree_merge(CDir *root); + void try_subtree_merge_at(CDir *root); + CDir *get_subtree_root(CDir *dir); + void remove_subtree(CDir *dir); + void get_subtree_bounds(CDir *root, set& bounds); + void get_wouldbe_subtree_bounds(CDir *root, set& bounds); + void verify_subtree_bounds(CDir *root, const set& bounds); + void verify_subtree_bounds(CDir *root, const list& bounds); + void adjust_subtree_after_rename(CInode *diri, CDir *olddir); - // root - list waiting_for_root; + void get_auth_subtrees(set& s); + void get_fullauth_subtrees(set& s); + + int num_subtrees(); + int num_subtrees_fullauth(); + int num_subtrees_fullnonauth(); - // imports, exports, and hashes. 
- set imports; // includes root (on mds0) - set exports; - set hashdirs; - map > nested_exports; // exports nested under imports _or_ hashdirs - void adjust_export(int to, CDir *root, set& bounds); - void adjust_import(int from, CDir *root, set& bounds); +protected: + // delayed cache expire + map > delayed_expire; // import|export dir -> expire msg + + // -- discover -- + hash_map > dir_discovers; // dirino -> mds set i'm trying to discover. + + + // -- requests -- +public: + +protected: + hash_map active_requests; +public: + MDRequest* request_start(metareqid_t rid); + MDRequest* request_start(MClientRequest *req); + MDRequest* request_start(MLock *req); + MDRequest* request_get(metareqid_t rid); + void request_pin_ref(MDRequest *r, CInode *ref, vector& trace); + void request_finish(MDRequest *mdr); + void request_forward(MDRequest *mdr, int mds, int port=0); + void dispatch_request(MDRequest *mdr); + void request_drop_locks(MDRequest *mdr); + void request_cleanup(MDRequest *r); - // active MDS requests - hash_map active_requests; - - // inode purging - map purging; - map > waiting_for_purge; + // inode purging + map > purging; + map > > waiting_for_purge; + // shutdown crap int shutdown_commits; - bool did_shutdown_exports; bool did_shutdown_log_cap; friend class C_MDC_ShutdownCommit; - // recovery + // -- recovery -- protected: + set recovery_set; + // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; + map > my_ambiguous_imports; // from MMDSImportMaps - map > > other_ambiguous_imports; + map > > other_ambiguous_imports; - set recovery_set; set wants_import_map; // nodes i need to send my import map to - set got_import_map; // nodes i need to send my import map to (when exports finish) - set rejoin_ack_gather; // nodes i need a rejoin ack from + set got_import_map; // nodes i got import_maps from void handle_import_map(MMDSImportMap *m); - void handle_cache_rejoin(MMDSCacheRejoin *m); - void 
handle_cache_rejoin_ack(MMDSCacheRejoinAck *m); void disambiguate_imports(); + + set rejoin_gather; // nodes from whom i need a rejoin + set rejoin_ack_gather; // nodes from whom i need a rejoin ack + set want_rejoin_ack; // nodes to whom i need to send a rejoin ack + void cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); + void handle_cache_rejoin(MMDSCacheRejoin *m); + void handle_cache_rejoin_rejoin(MMDSCacheRejoin *m); + void handle_cache_rejoin_ack(MMDSCacheRejoin *m); + void handle_cache_rejoin_missing(MMDSCacheRejoin *m); + void handle_cache_rejoin_full(MMDSCacheRejoin *m); void send_cache_rejoin_acks(); + void recalc_auth_bits(); + public: + void set_recovery_set(set& s); + void handle_mds_failure(int who); + void handle_mds_recovery(int who); void send_import_map(int who); void send_import_map_now(int who); - void send_import_map_later(int who) { - wants_import_map.insert(who); - } + void send_import_map_later(int who); void send_pending_import_maps(); // maybe. void send_cache_rejoins(); + void log_import_map(Context *onsync=0); - void set_recovery_set(set& s) { - recovery_set = s; - } // ambiguous imports - void add_ambiguous_import(inodeno_t base, set& bounds) { - my_ambiguous_imports[base].swap(bounds); - } - void cancel_ambiguous_import(inodeno_t dirino); - void finish_ambiguous_import(inodeno_t dirino); - - void finish_ambiguous_export(inodeno_t dirino, set& bounds); - + void add_ambiguous_import(dirfrag_t base, list& bounds); + void add_ambiguous_import(CDir *base, const set& bounds); + void cancel_ambiguous_import(dirfrag_t dirino); + void finish_ambiguous_import(dirfrag_t dirino); - - friend class CInode; friend class Locker; friend class Migrator; friend class Renamer; friend class MDBalancer; - friend class EImportMap; public: @@ -192,19 +313,16 @@ public: // root inode CInode *get_root() { return root; } void set_root(CInode *r); + CInode *get_stray() { return stray; } - int get_num_imports() { return imports.size(); } - void 
add_import(CDir *dir); - void remove_import(CDir *dir); - void recalc_auth_bits(); - - void log_import_map(Context *onsync=0); - - // cache void set_cache_size(size_t max) { lru.lru_set_max(max); } size_t get_cache_size() { return lru.lru_get_size(); } bool trim(int max = -1); // trim cache + void trim_dirfrag(CDir *dir, CDir *con, + map& expiremap); + void trim_inode(CDentry *dn, CInode *in, CDir *con, + map& expiremap); void trim_non_auth(); // trim out trimmable non-auth items // shutdown @@ -217,10 +335,16 @@ public: bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? true:false; } CInode* get_inode( inodeno_t ino ) { if (have_inode(ino)) - return inode_map[ ino ]; + return inode_map[ino]; return NULL; } - + CDir* get_dir(inodeno_t dirino) { // deprecated + return get_dirfrag(dirfrag_t(dirino, frag_t())); + } + CDir* get_dirfrag(dirfrag_t df) { + if (!have_inode(df.ino)) return NULL; + return inode_map[df.ino]->get_dirfrag(df.frag); + } int hash_dentry(inodeno_t ino, const string& s) { return 0; // fixme @@ -231,9 +355,8 @@ public: CInode *create_inode(); void add_inode(CInode *in); - protected: void remove_inode(CInode *in); - void destroy_inode(CInode *in); + protected: void touch_inode(CInode *in) { if (in->get_parent_dn()) touch_dentry(in->get_parent_dn()); @@ -256,10 +379,18 @@ public: public: // inode purging - void purge_inode(inode_t& inode); - void purge_inode_finish(inodeno_t ino); - void purge_inode_finish_2(inodeno_t ino); - void waitfor_purge(inodeno_t ino, Context *c); + void purge_inode(inode_t *inode, off_t newsize); + void purge_inode_finish(inodeno_t ino, off_t newsize); + void purge_inode_finish_2(inodeno_t ino, off_t newsize); + bool is_purging(inodeno_t ino, off_t newsize) { + return purging.count(ino) && purging[ino].count(newsize); + } + void wait_for_purge(inodeno_t ino, off_t newsize, Context *c) { + waiting_for_purge[ino][newsize].push_back(c); + } + + void add_recovered_purge(const inode_t& inode, off_t newsize); + void 
remove_recovered_purge(inodeno_t ino, off_t newsize); void start_recovered_purges(); @@ -269,40 +400,53 @@ public: void find_nested_exports(CDir *dir, set& s); void find_nested_exports_under(CDir *import, CDir *dir, set& s); - public: CInode *create_root_inode(); - int open_root(Context *c); - int path_traverse(filepath& path, vector& trace, bool follow_trailing_sym, + void open_root(Context *c); + CInode *create_stray_inode(int whose=-1); + void open_local_stray(); + void open_foreign_stray(int who, Context *c); + int path_traverse(MDRequest *mdr, + CInode *base, + filepath& path, vector& trace, bool follow_trailing_sym, Message *req, Context *ondelay, - int onfail, - Context *onfinish=0, - bool is_client_req = false); - void open_remote_dir(CInode *diri, Context *fin); - void open_remote_ino(inodeno_t ino, Message *req, Context *fin); - void open_remote_ino_2(inodeno_t ino, Message *req, - vector& anchortrace, + int onfail, + bool is_client_req = false, + bool null_okay = false); + void open_remote_dir(CInode *diri, frag_t fg, Context *fin); + CInode *get_dentry_inode(CDentry *dn, MDRequest *mdr); + void open_remote_ino(inodeno_t ino, MDRequest *mdr, Context *fin); + void open_remote_ino_2(inodeno_t ino, MDRequest *mdr, + vector& anchortrace, Context *onfinish); - bool path_pin(vector& trace, Message *m, Context *c); - void path_unpin(vector& trace, Message *m); void make_trace(vector& trace, CInode *in); - bool request_start(Message *req, - CInode *ref, - vector& trace); - void request_cleanup(Message *req); - void request_finish(Message *req); - void request_forward(Message *req, int mds, int port=0); - void request_pin_inode(Message *req, CInode *in); - void request_pin_dir(Message *req, CDir *dir); - - // anchors - void anchor_inode(CInode *in, Context *onfinish); - //void unanchor_inode(CInode *in, Context *c); + // -- anchors -- +public: + void anchor_create(CInode *in, Context *onfinish); + void anchor_destroy(CInode *in, Context *onfinish); +protected: + 
void _anchor_create_prepared(CInode *in, version_t atid); + void _anchor_create_logged(CInode *in, version_t atid, version_t pdv); + void _anchor_destroy_prepared(CInode *in, version_t atid); + void _anchor_destroy_logged(CInode *in, version_t atid, version_t pdv); + + friend class C_MDC_AnchorCreatePrepared; + friend class C_MDC_AnchorCreateLogged; + friend class C_MDC_AnchorDestroyPrepared; + friend class C_MDC_AnchorDestroyLogged; + + // -- stray -- +public: + void eval_stray(CDentry *dn); +protected: + void _purge_stray(CDentry *dn); + void _purge_stray_logged(CDentry *dn, version_t pdv); + friend class C_MDC_PurgeStray; + void reintegrate_stray(CDentry *dn, CDentry *rlink); + void migrate_stray(CDentry *dn, int dest); - void handle_inode_link(class MInodeLink *m); - void handle_inode_link_ack(class MInodeLinkAck *m); // == messages == public: @@ -313,29 +457,18 @@ public: void handle_discover(MDiscover *dis); void handle_discover_reply(MDiscoverReply *m); + CDir* add_replica_dir(CInode *diri, + frag_t fg, CDirDiscover& dis, + int from, + list& finished); + CDir* forge_replica_dir(CInode *diri, frag_t fg, int from); + + + // -- hard links -- + void handle_inode_link(class MInodeLink *m); // -- namespace -- - // these handle logging, cache sync themselves. 
- // UNLINK - public: - void dentry_unlink(CDentry *in, Context *c); - protected: - void dentry_unlink_finish(CDentry *in, CDir *dir, Context *c); void handle_dentry_unlink(MDentryUnlink *m); - void handle_inode_unlink(class MInodeUnlink *m); - void handle_inode_unlink_ack(class MInodeUnlinkAck *m); - friend class C_MDC_DentryUnlink; - - - - // -- misc auth -- - int ino_proxy_auth(inodeno_t ino, - int frommds, - map >& inomap); - void do_ino_proxy(CInode *in, Message *m); - void do_dir_proxy(CDir *dir, Message *m); - - // -- updates -- @@ -345,20 +478,36 @@ public: int send_dir_updates(CDir *in, bool bcast=false); void handle_dir_update(MDirUpdate *m); + // -- cache expiration -- void handle_cache_expire(MCacheExpire *m); - + void process_delayed_expire(CDir *dir); + void discard_delayed_expire(CDir *dir); // == crap fns == public: - void dump() { - if (root) root->dump(); - } - - void show_imports(); void show_cache(); + void dump_cache(); + void show_subtrees(int dbl=10); + + CInode *hack_pick_random_inode() { + assert(!inode_map.empty()); + int n = rand() % inode_map.size(); + hash_map::iterator p = inode_map.begin(); + while (n--) p++; + return p->second; + } }; +class C_MDS_RetryRequest : public Context { + MDCache *cache; + MDRequest *mdr; + public: + C_MDS_RetryRequest(MDCache *c, MDRequest *r) : cache(c), mdr(r) {} + virtual void finish(int r) { + cache->dispatch_request(mdr); + } +}; #endif diff --git a/trunk/ceph/mds/MDLog.cc b/trunk/ceph/mds/MDLog.cc index ba2011e092b08..87a68882b4181 100644 --- a/trunk/ceph/mds/MDLog.cc +++ b/trunk/ceph/mds/MDLog.cc @@ -30,7 +30,8 @@ LogType mdlog_logtype; -MDLog::MDLog(MDS *m) +/* +MDLog::MDLog(MDS *m) : replay_thread(this) { mds = m; num_events = 0; @@ -49,6 +50,7 @@ MDLog::MDLog(MDS *m) journaler = 0; logger = 0; } +*/ MDLog::~MDLog() @@ -133,8 +135,7 @@ off_t MDLog::get_write_pos() -void MDLog::submit_entry( LogEvent *le, - Context *c ) +void MDLog::submit_entry( LogEvent *le, Context *c ) { if (g_conf.mds_log) { 
dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl; @@ -242,8 +243,14 @@ void MDLog::_did_read() void MDLog::_trimmed(LogEvent *le) { + // successful trim? + if (!le->has_expired(mds)) { + dout(7) << "retrimming : " << le->get_start_off() << " : " << *le << endl; + le->expire(mds, new C_MDL_Trimmed(this, le)); + return; + } + dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << endl; - assert(le->has_expired(mds)); if (trimming.begin()->first == le->_end_off) { // we trimmed off the front! @@ -347,6 +354,8 @@ void MDLog::trim(Context *c) } + + void MDLog::replay(Context *c) { assert(journaler->is_active()); @@ -374,18 +383,95 @@ void MDLog::replay(Context *c) assert(num_events == 0); - _replay(); + replay_thread.create(); + //_replay(); } class C_MDL_Replay : public Context { MDLog *mdlog; public: C_MDL_Replay(MDLog *l) : mdlog(l) {} - void finish(int r) { mdlog->_replay(); } + void finish(int r) { + mdlog->replay_cond.Signal(); + //mdlog->_replay(); + } }; + + +// i am a separate thread +void MDLog::_replay_thread() +{ + mds->mds_lock.Lock(); + dout(10) << "_replay_thread start" << endl; + + // loop + while (1) { + // wait for read? + while (!journaler->is_readable() && + journaler->get_read_pos() < journaler->get_write_pos()) { + journaler->wait_for_readable(new C_MDL_Replay(this)); + replay_cond.Wait(mds->mds_lock); + } + + if (!journaler->is_readable() && + journaler->get_read_pos() == journaler->get_write_pos()) + break; + + assert(journaler->is_readable()); + + // read it + off_t pos = journaler->get_read_pos(); + bufferlist bl; + bool r = journaler->try_read_entry(bl); + assert(r); + + // unpack event + LogEvent *le = LogEvent::decode(bl); + num_events++; + + // have we seen an import map yet? + if (!seen_import_map && + le->get_type() != EVENT_IMPORTMAP) { + dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() + << " -- waiting for import_map. 
(skipping " << *le << ")" << endl; + } else { + dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() + << " : " << *le << endl; + le->replay(mds); + + if (le->get_type() == EVENT_IMPORTMAP) + seen_import_map = true; + } + delete le; + + // drop lock for a second, so other events/messages (e.g. beacon timer!) can go off + mds->mds_lock.Unlock(); + mds->mds_lock.Lock(); + } + + // done! + assert(journaler->get_read_pos() == journaler->get_write_pos()); + dout(10) << "_replay - complete" << endl; + + // move read pointer _back_ to expire pos, for eventual trimming + journaler->set_read_pos(journaler->get_expire_pos()); + + // kick waiter(s) + list ls; + ls.swap(waitfor_replay); + finish_contexts(ls,0); + + dout(10) << "_replay_thread finish" << endl; + mds->mds_lock.Unlock(); +} + + + void MDLog::_replay() { + mds->mds_lock.Lock(); + // read what's buffered while (journaler->is_readable() && journaler->get_read_pos() < journaler->get_write_pos()) { @@ -413,6 +499,10 @@ void MDLog::_replay() seen_import_map = true; } delete le; + + // drop lock for a second, so other events (e.g. beacon timer!) can go off + mds->mds_lock.Unlock(); + mds->mds_lock.Lock(); } // wait for read? 
diff --git a/trunk/ceph/mds/MDLog.h b/trunk/ceph/mds/MDLog.h index 384b72d02a4ff..114d6c630749f 100644 --- a/trunk/ceph/mds/MDLog.h +++ b/trunk/ceph/mds/MDLog.h @@ -18,6 +18,9 @@ #include "include/types.h" #include "include/Context.h" +#include "common/Thread.h" +#include "common/Cond.h" + #include //#include @@ -53,6 +56,9 @@ class MDLog { inode_t log_inode; Journaler *journaler; + Logger *logger; + + // -- trimming -- map trimming; std::list trim_waiters; // contexts waiting for trim bool trim_reading; @@ -60,11 +66,32 @@ class MDLog { bool waiting_for_read; friend class C_MDL_Reading; - Logger *logger; - + + + // -- replay -- + Cond replay_cond; + + class ReplayThread : public Thread { + MDLog *log; + public: + ReplayThread(MDLog *l) : log(l) {} + void* entry() { + log->_replay_thread(); + return 0; + } + } replay_thread; + + friend class ReplayThread; + friend class C_MDL_Replay; + list waitfor_replay; - // importmaps + void _replay(); // old way + void _replay_thread(); // new way + + + + // -- importmaps -- off_t last_import_map; // offsets of last committed importmap. constrains trimming. 
list import_map_expire_waiters; bool writing_import_map; // one is being written now @@ -75,16 +102,30 @@ class MDLog { friend class MDCache; void init_journaler(); + public: + void add_import_map_expire_waiter(Context *c) { + import_map_expire_waiters.push_back(c); + } + - public: // replay state map > pending_exports; public: - MDLog(MDS *m); + MDLog(MDS *m) : mds(m), + num_events(0), max_events(g_conf.mds_log_max_len), + unflushed(0), + capped(false), + journaler(0), + logger(0), + trim_reading(false), waiting_for_read(false), + replay_thread(this), + last_import_map(0), + writing_import_map(false), seen_import_map(false) { + } ~MDLog(); @@ -122,7 +163,6 @@ class MDLog { void write_head(Context *onfinish); void replay(Context *onfinish); - void _replay(); }; #endif diff --git a/trunk/ceph/mds/MDS.cc b/trunk/ceph/mds/MDS.cc index 6d66b77cd95f2..fb62840da6830 100644 --- a/trunk/ceph/mds/MDS.cc +++ b/trunk/ceph/mds/MDS.cc @@ -28,12 +28,11 @@ #include "Server.h" #include "Locker.h" #include "MDCache.h" -#include "MDStore.h" #include "MDLog.h" #include "MDBalancer.h" #include "IdAllocator.h" #include "Migrator.h" -#include "Renamer.h" +//#include "Renamer.h" #include "AnchorTable.h" #include "AnchorClient.h" @@ -43,6 +42,8 @@ #include "common/Timer.h" +#include "events/EClientMap.h" + #include "messages/MMDSMap.h" #include "messages/MMDSBeacon.h" @@ -53,6 +54,9 @@ #include "messages/MOSDMap.h" #include "messages/MOSDGetMap.h" +#include "messages/MClientRequest.h" +#include "messages/MClientRequestForward.h" + LogType mds_logtype, mds_cache_logtype; @@ -79,18 +83,20 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { filer = new Filer(objecter); mdcache = new MDCache(this); - mdstore = new MDStore(this); mdlog = new MDLog(this); balancer = new MDBalancer(this); - anchorclient = new AnchorClient(messenger, mdsmap); + anchorclient = new AnchorClient(this); idalloc = new IdAllocator(this); - anchormgr = new AnchorTable(this); + anchortable = new 
AnchorTable(this); server = new Server(this); locker = new Locker(this, mdcache); + + // clients + last_client_mdsmap_bcast = 0; // beacon beacon_last_seq = 0; @@ -104,7 +110,6 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { want_state = state = MDSMap::STATE_DNE; - logger = logger2 = 0; // i'm ready! @@ -113,11 +118,10 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { MDS::~MDS() { if (mdcache) { delete mdcache; mdcache = NULL; } - if (mdstore) { delete mdstore; mdstore = NULL; } if (mdlog) { delete mdlog; mdlog = NULL; } if (balancer) { delete balancer; balancer = NULL; } if (idalloc) { delete idalloc; idalloc = NULL; } - if (anchormgr) { delete anchormgr; anchormgr = NULL; } + if (anchortable) { delete anchortable; anchortable = NULL; } if (anchorclient) { delete anchorclient; anchorclient = NULL; } if (osdmap) { delete osdmap; osdmap = 0; } if (mdsmap) { delete mdsmap; mdsmap = 0; } @@ -214,6 +218,27 @@ void MDS::send_message_mds(Message *m, int mds, int port, int fromport) messenger->send_message(m, mdsmap->get_inst(mds), port, fromport); } +void MDS::forward_message_mds(Message *req, int mds, int port) +{ + // client request? 
+ if (req->get_type() == MSG_CLIENT_REQUEST) { + MClientRequest *creq = (MClientRequest*)req; + creq->inc_num_fwd(); // inc forward counter + + // tell the client where it should go + messenger->send_message(new MClientRequestForward(creq->get_tid(), mds, creq->get_num_fwd()), + creq->get_client_inst()); + + if (!creq->is_idempotent()) { + delete req; + return; // don't actually forward if non-idempotent + } + } + + // forward + send_message_mds(req, mds, port); +} + class C_MDS_Tick : public Context { MDS *mds; @@ -416,7 +441,7 @@ void MDS::beacon_kill(utime_t lab) void MDS::handle_mds_map(MMDSMap *m) { version_t epoch = m->get_epoch(); - dout(1) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl; + dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl; // note source's map version if (m->get_source().is_mds() && @@ -429,7 +454,7 @@ void MDS::handle_mds_map(MMDSMap *m) // is it new? if (epoch <= mdsmap->get_epoch()) { - dout(1) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() + dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() << ", discarding" << endl; delete m; return; @@ -443,10 +468,16 @@ void MDS::handle_mds_map(MMDSMap *m) bool wasrejoining = mdsmap->is_rejoining(); set oldfailed; mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); + set oldactive; + mdsmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); + set oldcreating; + mdsmap->get_mds_set(oldcreating, MDSMap::STATE_CREATING); + set oldout; + mdsmap->get_mds_set(oldout, MDSMap::STATE_OUT); // decode and process mdsmap->decode(m->get_encoded()); - + // see who i am whoami = mdsmap->get_inst_rank(messenger->get_myaddr()); if (oldwhoami != whoami) { @@ -474,6 +505,10 @@ void MDS::handle_mds_map(MMDSMap *m) objecter->set_client_incarnation(mdsmap->get_inc(whoami)); } + // for debug + if (g_conf.mds_dump_cache_on_map) + mdcache->dump_cache(); + // update my state state = mdsmap->get_state(whoami); @@ -486,14 +521,45 @@ 
void MDS::handle_mds_map(MMDSMap *m) << ", although i wanted " << mdsmap->get_state_name(want_state) << endl; want_state = state; + } + + // contemplate suicide + if (mdsmap->get_inst(whoami) != messenger->get_myinst()) { + dout(1) << "apparently i've been replaced by " << mdsmap->get_inst(whoami) << ", committing suicide." << endl; + exit(-1); + } + if (mdsmap->is_down(whoami)) { + dout(1) << "apparently i'm down. committing suicide." << endl; + exit(-1); } // now active? if (is_active()) { + // did i just recover? + if (oldstate == MDSMap::STATE_REJOIN) { + dout(1) << "successful recovery!" << endl; + + // kick anchortable (resent AGREEs) + if (mdsmap->get_anchortable() == whoami) + anchortable->finish_recovery(); + + // kick anchorclient (resent COMMITs) + anchorclient->finish_recovery(); + + mdcache->start_recovered_purges(); + + // tell connected clients + bcast_mds_map(); + } + dout(1) << "now active" << endl; finish_contexts(waitfor_active); // kick waiters } + else if (is_reconnect()) { + server->reconnect_clients(); + } + else if (is_replay()) { // initialize gather sets set rs; @@ -508,14 +574,11 @@ void MDS::handle_mds_map(MMDSMap *m) assert(oldstate == MDSMap::STATE_ACTIVE); dout(1) << "now stopping" << endl; + // start cache shutdown mdcache->shutdown_start(); - - // save anchor table - if (mdsmap->get_anchortable() == whoami) - anchormgr->save(0); // FIXME? or detect completion via filer? - - if (idalloc) - idalloc->save(0); // FIXME? or detect completion via filer? + + // terminate client sessions + server->terminate_sessions(); // flush log mdlog->set_max_events(0); @@ -531,25 +594,60 @@ void MDS::handle_mds_map(MMDSMap *m) } - // is anyone resolving? - if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { + // RESOLVE + // am i newly resolving? 
+ if (is_resolve() && oldstate == MDSMap::STATE_REPLAY) { + // send to all resolve, active, stopping + dout(10) << "i am newly resolving, sharing import map" << endl; + set who; + mdsmap->get_mds_set(who, MDSMap::STATE_RESOLVE); + mdsmap->get_mds_set(who, MDSMap::STATE_ACTIVE); + mdsmap->get_mds_set(who, MDSMap::STATE_STOPPING); + mdsmap->get_mds_set(who, MDSMap::STATE_REJOIN); // hrm. FIXME. + for (set::iterator p = who.begin(); p != who.end(); ++p) { + if (*p == whoami) continue; + mdcache->send_import_map(*p); // now. + } + } + // is someone else newly resolving? + else if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { set resolve; mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); - if (oldresolve != resolve) + if (oldresolve != resolve) { dout(10) << "resolve set is " << resolve << ", was " << oldresolve << endl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) { - if (*p == whoami) continue; - if (oldresolve.count(*p) == 0 || // if other guy newly resolve, or - oldstate == MDSMap::STATE_REPLAY) // if i'm newly resolve, - mdcache->send_import_map(*p); // share my import map (now or later) + for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) { + if (*p == whoami) continue; + if (oldresolve.count(*p)) continue; + mdcache->send_import_map(*p); // now or later. + } } } + // REJOIN // is everybody finally rejoining? if (is_rejoin() || is_active() || is_stopping()) { + // did we start? if (!wasrejoining && mdsmap->is_rejoining()) { mdcache->send_cache_rejoins(); } + // did we finish? + if (wasrejoining && !mdsmap->is_rejoining()) { + mdcache->dump_cache(); + } + } + + // did someone go active? + if (is_active() || is_stopping()) { + set active; + mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE); + for (set::iterator p = active.begin(); p != active.end(); ++p) { + if (*p == whoami) continue; // not me + if (oldactive.count(*p)) continue; // newly so? 
+ mdcache->handle_mds_recovery(*p); + if (anchortable) + anchortable->handle_mds_recovery(*p); + anchorclient->handle_mds_recovery(*p); + } } // did anyone go down? @@ -560,29 +658,42 @@ void MDS::handle_mds_map(MMDSMap *m) // newly so? if (oldfailed.count(*p)) continue; - mdcache->migrator->handle_mds_failure(*p); + mdcache->handle_mds_failure(*p); } } + // inst set changed? + /* + if (state >= MDSMap::STATE_ACTIVE && // only if i'm active+. otherwise they'll get map during reconnect. + mdsmap->get_same_inst_since() > last_client_mdsmap_bcast) { + bcast_mds_map(); + } + */ + delete m; } +void MDS::bcast_mds_map() +{ + dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << endl; + + // share the map with mounted clients + for (set::const_iterator p = clientmap.get_session_set().begin(); + p != clientmap.get_session_set().end(); + ++p) { + messenger->send_message(new MMDSMap(mdsmap), + clientmap.get_inst(*p)); + } + last_client_mdsmap_bcast = mdsmap->get_epoch(); +} + + void MDS::handle_osd_map(MOSDMap *m) { version_t had = osdmap->get_epoch(); dout(10) << "handle_osd_map had " << had << endl; - // pass on to clients - for (set::iterator it = clientmap.get_mount_set().begin(); - it != clientmap.get_mount_set().end(); - it++) { - MOSDMap *n = new MOSDMap; - n->maps = m->maps; - n->incremental_maps = m->incremental_maps; - messenger->send_message(n, clientmap.get_inst(*it)); - } - // process locally objecter->handle_osd_map(m); @@ -622,12 +733,23 @@ void MDS::boot_create() assert(root); // force empty root dir - CDir *dir = root->dir; + CDir *dir = root->get_dirfrag(frag_t()); dir->mark_complete(); dir->mark_dirty(dir->pre_dirty()); // save it - mdstore->commit_dir(dir, fin->new_sub()); + dir->commit(0, fin->new_sub()); + } + + // create my stray dir + { + dout(10) << "boot_create creating local stray dir" << endl; + mdcache->open_local_stray(); + CInode *stray = mdcache->get_stray(); + CDir *dir = stray->get_dirfrag(frag_t()); + dir->mark_complete(); + 
dir->mark_dirty(dir->pre_dirty()); + dir->commit(0, fin->new_sub()); } // start with a fresh journal @@ -646,8 +768,8 @@ void MDS::boot_create() // fixme: fake out anchortable if (mdsmap->get_anchortable() == whoami) { dout(10) << "boot_create creating fresh anchortable" << endl; - anchormgr->reset(); - anchormgr->save(fin->new_sub()); + anchortable->create_fresh(); + anchortable->save(fin->new_sub()); } } @@ -662,7 +784,7 @@ void MDS::boot_start() if (mdsmap->get_anchortable() == whoami) { dout(2) << "boot_start opening anchor table" << endl; - anchormgr->load(fin->new_sub()); + anchortable->load(fin->new_sub()); } else { dout(2) << "boot_start i have no anchor table" << endl; } @@ -674,6 +796,9 @@ void MDS::boot_start() dout(2) << "boot_start opening root directory" << endl; mdcache->open_root(fin->new_sub()); } + + dout(2) << "boot_start opening local stray directory" << endl; + mdcache->open_local_stray(); } void MDS::boot_finish() @@ -711,7 +836,7 @@ void MDS::boot_replay(int step) case 2: if (mdsmap->get_anchortable() == whoami) { dout(2) << "boot_replay " << step << ": opening anchor table" << endl; - anchormgr->load(new C_MDS_BootRecover(this, 3)); + anchortable->load(new C_MDS_BootRecover(this, 3)); break; } dout(2) << "boot_replay " << step << ": i have no anchor table" << endl; @@ -728,21 +853,10 @@ void MDS::boot_replay(int step) break; case 5: - dout(2) << "boot_replay " << step << ": restarting any recovered purges" << endl; - mdcache->start_recovered_purges(); - - step++; // fall-thru - - case 6: // done with replay! 
- if (mdsmap->get_num_mds(MDSMap::STATE_ACTIVE) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_STOPPING) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_RESOLVE) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_REJOIN) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_REPLAY) == 1 && // me - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { - dout(2) << "boot_replay " << step << ": i am alone, moving to state active" << endl; - set_want_state(MDSMap::STATE_ACTIVE); + if (mdsmap->get_num_in_mds() == 1) { // me + dout(2) << "boot_replay " << step << ": i am alone, moving to state reconnect" << endl; + set_want_state(MDSMap::STATE_RECONNECT); } else { dout(2) << "boot_replay " << step << ": i am not alone, moving to state resolve" << endl; set_want_state(MDSMap::STATE_RESOLVE); @@ -855,10 +969,11 @@ void MDS::my_dispatch(Message *m) if (m->get_source().is_mds()) { int from = m->get_source().num(); if (!mdsmap->have_inst(from) || - mdsmap->get_inst(from) != m->get_source_inst()) { + mdsmap->get_inst(from) != m->get_source_inst() || + mdsmap->is_down(from)) { // bogus mds? 
if (m->get_type() != MSG_MDS_MAP) { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() + dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source() << ", dropping" << endl; delete m; return; @@ -872,8 +987,8 @@ void MDS::my_dispatch(Message *m) switch (m->get_dest_port()) { - case MDS_PORT_ANCHORMGR: - anchormgr->dispatch(m); + case MDS_PORT_ANCHORTABLE: + anchortable->dispatch(m); break; case MDS_PORT_ANCHORCLIENT: anchorclient->dispatch(m); @@ -890,7 +1005,7 @@ void MDS::my_dispatch(Message *m) mdcache->migrator->dispatch(m); break; case MDS_PORT_RENAMER: - mdcache->renamer->dispatch(m); + //mdcache->renamer->dispatch(m); break; case MDS_PORT_BALANCER: @@ -909,6 +1024,16 @@ void MDS::my_dispatch(Message *m) dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << endl; assert(0); } + + // finish any triggered contexts + if (finished_queue.size()) { + dout(7) << "mds has " << finished_queue.size() << " queued contexts" << endl; + dout(10) << finished_queue << endl; + list ls; + ls.splice(ls.begin(), finished_queue); + assert(finished_queue.empty()); + finish_contexts(ls); + } // HACK FOR NOW @@ -919,19 +1044,40 @@ void MDS::my_dispatch(Message *m) // trim cache mdcache->trim(); } + - // finish any triggered contexts - if (finished_queue.size()) { - dout(7) << "mds has " << finished_queue.size() << " queued contexts" << endl; - list ls; - ls.splice(ls.begin(), finished_queue); - assert(finished_queue.empty()); - finish_contexts(ls); + // hack: thrash exports + for (int i=0; i s; + mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE); + if (s.size() < 2 || mdcache->get_num_inodes() < 10) + break; // need peers for this to work. + + dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf.mds_thrash_exports << endl; + + // pick a random dir inode + CInode *in = mdcache->hack_pick_random_inode(); + + list ls; + in->get_dirfrags(ls); + if (ls.empty()) continue; // must be an open dir. 
+ CDir *dir = ls.front(); + if (!dir->get_parent_dir()) continue; // must be linked. + if (!dir->is_auth()) continue; // must be auth. + + int dest; + do { + int k = rand() % s.size(); + set::iterator p = s.begin(); + while (k--) p++; + dest = *p; + } while (dest == whoami); + mdcache->migrator->export_dir(dir,dest); } - // hack: force hash root? + /* if (false && mdcache->get_root() && mdcache->get_root()->dir && @@ -940,12 +1086,13 @@ void MDS::my_dispatch(Message *m) dout(0) << "hashing root" << endl; mdcache->migrator->hash_dir(mdcache->get_root()->dir); } - + */ // HACK to force export to test foreign renames if (false && whoami == 0) { + /* static bool didit = false; // 7 to 1 @@ -958,6 +1105,7 @@ void MDS::my_dispatch(Message *m) didit = true; } } + */ } @@ -1017,6 +1165,19 @@ void MDS::proc_message(Message *m) +void MDS::ms_handle_failure(Message *m, const entity_inst_t& inst) +{ + mds_lock.Lock(); + dout(10) << "handle_ms_failure to " << inst << " on " << *m << endl; + + if (m->get_type() == MSG_CLIENT_RECONNECT) + server->client_reconnect_failure(m->get_dest().num()); + + delete m; + mds_lock.Unlock(); +} + + @@ -1030,3 +1191,33 @@ void MDS::handle_ping(MPing *m) delete m; } + + + +class C_LogClientmap : public Context { + ClientMap *clientmap; + version_t cmapv; +public: + C_LogClientmap(ClientMap *cm, version_t v) : + clientmap(cm), cmapv(v) {} + void finish(int r) { + clientmap->set_committed(cmapv); + list ls; + clientmap->take_commit_waiters(cmapv, ls); + finish_contexts(ls); + } +}; + +void MDS::log_clientmap(Context *c) +{ + dout(10) << "log_clientmap " << clientmap.get_version() << endl; + + bufferlist bl; + clientmap.encode(bl); + + clientmap.set_committing(clientmap.get_version()); + clientmap.add_commit_waiter(c); + + mdlog->submit_entry(new EClientMap(bl, clientmap.get_version()), + new C_LogClientmap(&clientmap, clientmap.get_version())); +} diff --git a/trunk/ceph/mds/MDS.h b/trunk/ceph/mds/MDS.h index 8b3ff1e4aa430..d7645c322a518 100644 --- 
a/trunk/ceph/mds/MDS.h +++ b/trunk/ceph/mds/MDS.h @@ -43,31 +43,6 @@ using namespace __gnu_cxx; #include "ClientMap.h" -#define MDS_PORT_MAIN 0 -#define MDS_PORT_SERVER 1 -#define MDS_PORT_CACHE 2 -#define MDS_PORT_LOCKER 3 -#define MDS_PORT_STORE 4 -#define MDS_PORT_BALANCER 5 -#define MDS_PORT_MIGRATOR 6 -#define MDS_PORT_RENAMER 7 - -#define MDS_PORT_ANCHORCLIENT 10 -#define MDS_PORT_ANCHORMGR 11 - - -#define MDS_INO_ROOT 1 -#define MDS_INO_PGTABLE 2 -#define MDS_INO_LOG_OFFSET 0x100 -#define MDS_INO_IDS_OFFSET 0x200 -#define MDS_INO_INODEFILE_OFFSET 0x300 -#define MDS_INO_ANCHORTABLE 0x400 -#define MDS_INO_BASE 0x1000 - -#define MDS_TRAVERSE_FORWARD 1 -#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. -#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. -#define MDS_TRAVERSE_FAIL 4 class filepath; @@ -81,7 +56,6 @@ class Locker; class AnchorTable; class AnchorClient; class MDCache; -class MDStore; class MDLog; class MDBalancer; class IdAllocator; @@ -118,19 +92,16 @@ class MDS : public Dispatcher { Objecter *objecter; Filer *filer; // for reading/writing to/from osds - ClientMap clientmap; - // sub systems Server *server; MDCache *mdcache; Locker *locker; - MDStore *mdstore; MDLog *mdlog; MDBalancer *balancer; IdAllocator *idalloc; - AnchorTable *anchormgr; + AnchorTable *anchortable; AnchorClient *anchorclient; Logger *logger, *logger2; @@ -155,6 +126,7 @@ class MDS : public Dispatcher { bool is_standby() { return state == MDSMap::STATE_STANDBY; } bool is_replay() { return state == MDSMap::STATE_REPLAY; } bool is_resolve() { return state == MDSMap::STATE_RESOLVE; } + bool is_reconnect() { return state == MDSMap::STATE_RECONNECT; } bool is_rejoin() { return state == MDSMap::STATE_REJOIN; } bool is_active() { return state == MDSMap::STATE_ACTIVE; } bool is_stopping() { return state == MDSMap::STATE_STOPPING; } @@ -166,10 +138,10 @@ class MDS : public Dispatcher { // -- waiters -- list finished_queue; - void 
queue_finished(Context *c) { + void queue_waiter(Context *c) { finished_queue.push_back(c); } - void queue_finished(list& ls) { + void queue_waiters(list& ls) { finished_queue.splice( finished_queue.end(), ls ); } @@ -184,7 +156,11 @@ class MDS : public Dispatcher { Context *tick_event; void reset_tick(); - + // -- client map -- + ClientMap clientmap; + epoch_t last_client_mdsmap_bcast; + void log_clientmap(Context *c); + // shutdown crap int req_rate; @@ -194,10 +170,6 @@ class MDS : public Dispatcher { int get_req_rate() { return req_rate; } - protected: - - friend class MDStore; - public: MDS(int whoami, Messenger *m, MonMap *mm); @@ -209,6 +181,8 @@ class MDS : public Dispatcher { OSDMap *get_osd_map() { return osdmap; } void send_message_mds(Message *m, int mds, int port=0, int fromport=0); + void forward_message_mds(Message *req, int mds, int port=0); + // start up, shutdown int init(bool standby=false); @@ -219,6 +193,8 @@ class MDS : public Dispatcher { void boot_replay(int step=0); // i am recovering existing (down:failed) mds. void boot_finish(); + void bcast_mds_map(); // to mounted clients + int shutdown_start(); int shutdown_final(); @@ -235,11 +211,11 @@ class MDS : public Dispatcher { virtual void dispatch(Message *m); void my_dispatch(Message *m); + void ms_handle_failure(Message *m, const entity_inst_t& inst); + // special message types void handle_ping(class MPing *m); - void handle_mds_map(class MMDSMap *m); - void handle_shutdown_start(Message *m); // osds diff --git a/trunk/ceph/mds/MDSMap.h b/trunk/ceph/mds/MDSMap.h index 66b086e5ea39f..1b7dcd7a95379 100644 --- a/trunk/ceph/mds/MDSMap.h +++ b/trunk/ceph/mds/MDSMap.h @@ -28,36 +28,38 @@ using namespace std; class MDSMap { public: // mds states - static const int STATE_DNE = 0; // down, never existed. - static const int STATE_OUT = 1; // down, once existed, but no imports, empty log. - static const int STATE_FAILED = 2; // down, holds (er, held) metadata; needs to be recovered. 
- - static const int STATE_STANDBY = 3; // up, but inactive. waiting for assignment by monitor. - static const int STATE_CREATING = 4; // up, creating MDS instance (new journal, idalloc..) - static const int STATE_STARTING = 5; // up, starting prior out MDS instance. - static const int STATE_REPLAY = 6; // up, scanning journal, recoverying any shared state - static const int STATE_RESOLVE = 7; // up, disambiguating partial distributed operations (import/export, ...rename?) - static const int STATE_REJOIN = 8; // up, replayed journal, rejoining distributed cache - static const int STATE_ACTIVE = 9; // up, active - static const int STATE_STOPPING = 10; // up, exporting metadata (-> standby or out) - static const int STATE_STOPPED = 11; // up, finished stopping. like standby, but not avail to takeover. + static const int STATE_DNE = 0; // down, never existed. + static const int STATE_OUT = 1; // down, once existed, but no subtrees, empty log. + static const int STATE_FAILED = 2; // down, active subtrees; needs to be recovered. + + static const int STATE_STANDBY = 3; // up, but inactive. waiting for assignment by monitor. + static const int STATE_CREATING = 4; // up, creating MDS instance (new journal, idalloc..) + static const int STATE_STARTING = 5; // up, starting prior out MDS instance. + static const int STATE_REPLAY = 6; // up, scanning journal, recoverying any shared state + static const int STATE_RESOLVE = 7; // up, disambiguating partial distributed operations (import/export, ...rename?) + static const int STATE_RECONNECT = 8; // up, reconnect to clients + static const int STATE_REJOIN = 9; // up, replayed journal, rejoining distributed cache + static const int STATE_ACTIVE = 10; // up, active + static const int STATE_STOPPING = 11; // up, exporting metadata (-> standby or out) + static const int STATE_STOPPED = 12; // up, finished stopping. like standby, but not avail to takeover. 
static const char *get_state_name(int s) { switch (s) { // down - case STATE_DNE: return "down:dne"; - case STATE_OUT: return "down:out"; - case STATE_FAILED: return "down:failed"; + case STATE_DNE: return "down:dne"; + case STATE_OUT: return "down:out"; + case STATE_FAILED: return "down:failed"; // up - case STATE_STANDBY: return "up:standby"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; - case STATE_STOPPED: return "up:stopped"; + case STATE_STANDBY: return "up:standby"; + case STATE_CREATING: return "up:creating"; + case STATE_STARTING: return "up:starting"; + case STATE_REPLAY: return "up:replay"; + case STATE_RESOLVE: return "up:resolve"; + case STATE_RECONNECT: return "up:reconnect"; + case STATE_REJOIN: return "up:rejoin"; + case STATE_ACTIVE: return "up:active"; + case STATE_STOPPING: return "up:stopping"; + case STATE_STOPPED: return "up:stopped"; default: assert(0); } return 0; @@ -65,7 +67,8 @@ class MDSMap { protected: epoch_t epoch; - utime_t ctime; + utime_t created; + epoch_t same_inst_since; int anchortable; // which MDS has anchortable (fixme someday) int root; // which MDS has root directory @@ -79,12 +82,13 @@ class MDSMap { friend class MDSMonitor; public: - MDSMap() : epoch(0), anchortable(0), root(0) {} + MDSMap() : epoch(0), same_inst_since(0), anchortable(0), root(0) {} epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } - const utime_t& get_ctime() const { return ctime; } + const utime_t& get_create() const { return created; } + epoch_t get_same_inst_since() const { return same_inst_since; } int get_anchortable() const { return anchortable; } int get_root() const { return root; } @@ -99,6 +103,9 @@ class MDSMap { if (p->second == state) ++n; return n; } + int 
get_num_in_mds() { + return get_num_up_mds() - get_num_mds(STATE_STANDBY) - get_num_mds(STATE_STOPPED); + } int get_num_up_mds() { int n = 0; for (map::const_iterator p = mds_state.begin(); @@ -119,14 +126,12 @@ class MDSMap { // sets void get_mds_set(set& s) { - s.clear(); for (map::const_iterator p = mds_state.begin(); p != mds_state.end(); p++) s.insert(p->first); } void get_up_mds_set(set& s) { - s.clear(); for (map::const_iterator p = mds_state.begin(); p != mds_state.end(); p++) @@ -134,7 +139,6 @@ class MDSMap { s.insert(p->first); } void get_mds_set(set& s, int state) { - s.clear(); for (map::const_iterator p = mds_state.begin(); p != mds_state.end(); p++) @@ -148,13 +152,11 @@ class MDSMap { get_mds_set(s, MDSMap::STATE_FAILED); } void get_recovery_mds_set(set& s) { - s.clear(); for (map::const_iterator p = mds_state.begin(); p != mds_state.end(); p++) if (is_failed(p->first) || - is_replay(p->first) || is_resolve(p->first) || is_rejoin(p->first) || - is_active(p->first) || is_stopping(p->first)) + (p->second >= STATE_REPLAY && p->second <= STATE_STOPPING)) s.insert(p->first); } @@ -172,9 +174,11 @@ class MDSMap { bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; } + bool is_reconnect(int m) { return mds_state.count(m) && mds_state[m] == STATE_RECONNECT; } bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; } bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; } + bool is_active_or_stopping(int m) { return is_active(m) || is_stopping(m); } bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } bool has_created(int m) { return mds_created.count(m); } @@ -183,6 +187,7 @@ 
class MDSMap { bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. return get_num_mds(STATE_REPLAY) + get_num_mds(STATE_RESOLVE) + + get_num_mds(STATE_RECONNECT) + get_num_mds(STATE_REJOIN) + get_num_mds(STATE_FAILED); } @@ -192,6 +197,7 @@ class MDSMap { bool is_rejoining() { // nodes are rejoining cache state return get_num_mds(STATE_REJOIN) > 0 && + get_num_mds(STATE_RECONNECT) == 0 && get_num_mds(STATE_RESOLVE) == 0 && get_num_mds(STATE_REPLAY) == 0 && get_num_mds(STATE_FAILED) == 0; @@ -252,7 +258,8 @@ class MDSMap { // serialize, unserialize void encode(bufferlist& blist) { blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&ctime, sizeof(ctime)); + blist.append((char*)&created, sizeof(created)); + blist.append((char*)&same_inst_since, sizeof(same_inst_since)); blist.append((char*)&anchortable, sizeof(anchortable)); blist.append((char*)&root, sizeof(root)); @@ -266,8 +273,10 @@ class MDSMap { int off = 0; blist.copy(off, sizeof(epoch), (char*)&epoch); off += sizeof(epoch); - blist.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); + blist.copy(off, sizeof(created), (char*)&created); + off += sizeof(created); + blist.copy(off, sizeof(same_inst_since), (char*)&same_inst_since); + off += sizeof(same_inst_since); blist.copy(off, sizeof(anchortable), (char*)&anchortable); off += sizeof(anchortable); blist.copy(off, sizeof(root), (char*)&root); diff --git a/trunk/ceph/mds/MDStore.cc b/trunk/ceph/mds/MDStore.cc deleted file mode 100644 index 13aa270a2ee6c..0000000000000 --- a/trunk/ceph/mds/MDStore.cc +++ /dev/null @@ -1,752 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - - -#include "MDStore.h" -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "MDSMap.h" - -#include "osd/OSDMap.h" -#include "osdc/Filer.h" - -#include "msg/Message.h" - -#include -#include -using namespace std; - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".store " - - -/* - * separate hashed dir slices into "regions" - */ -size_t get_hash_offset(int hashcode) { - if (hashcode < 0) - return 0; // not hashed - else - return (size_t)(1<<30) * (size_t)(1+hashcode); -} - - - - -// ========================================================================== -// FETCH - - -class C_MDS_Fetch : public Context { - protected: - MDStore *ms; - inodeno_t ino; - - public: - C_MDS_Fetch(MDStore *ms, inodeno_t ino) : Context() { - this->ms = ms; - this->ino = ino; - } - - void finish(int result) { - ms->fetch_dir_2( result, ino ); - } -}; - -/** fetch_dir(dir, context) - * public call to fetch a dir. - */ -void MDStore::fetch_dir( CDir *dir, - Context *c ) -{ - dout(7) << "fetch_dir " << *dir << " context is " << c << endl; - assert(dir->is_auth() || - dir->is_hashed()); - - // wait - if (c) dir->add_waiter(CDIR_WAIT_COMPLETE, c); - - // already fetching? - if (dir->state_test(CDIR_STATE_FETCHING)) { - dout(7) << "already fetching " << *dir << "; waiting" << endl; - return; - } - - // state - dir->state_set(CDIR_STATE_FETCHING); - - // stats - if (mds->logger) mds->logger->inc("fdir"); - - // create return context - Context *fin = new C_MDS_Fetch( this, dir->ino() ); - if (dir->is_hashed()) - fetch_dir_hash( dir, fin, mds->get_nodeid()); // hashed - else - fetch_dir_hash( dir, fin ); // normal -} - -/* - * called by low level fn when it's fetched. - * fix up dir state. 
- */ -void MDStore::fetch_dir_2( int result, - inodeno_t ino) -{ - CInode *idir = mds->mdcache->get_inode(ino); - - if (!idir || result < 0) return; // hmm! nevermind i guess. - - assert(idir); - CDir *dir = idir->dir; - assert(dir); - - // dir is now complete - dir->state_set(CDIR_STATE_COMPLETE); - dir->state_clear(CDIR_STATE_FETCHING); - - // finish - list finished; - dir->take_waiting(CDIR_WAIT_COMPLETE|CDIR_WAIT_DENTRY, finished); - finish_contexts(finished, result); -} - - -/** low level methods **/ - -class C_MDS_FetchHash : public Context { -protected: - MDS *mds; - inode_t inode; - int hashcode; - Context *context; - -public: - bufferlist bl; - bufferlist bl2; - - C_MDS_FetchHash(MDS *mds, inode_t inode, Context *c, int hashcode) : Context() { - this->mds = mds; - this->inode = inode; - this->hashcode = hashcode; - this->context = c; - } - - void finish(int result) { - assert(result>0); - - // combine bufferlists bl + bl2 -> bl - bl.claim_append(bl2); - - // did i get the whole thing? - size_t size; - bl.copy(0, sizeof(size_t), (char*)&size); - size_t got = bl.length() - sizeof(size); - size_t left = size - got; - size_t from = bl.length(); - - // what part of dir are we getting? - from += get_hash_offset(hashcode); - - if (got >= size) { - // done. - mds->mdstore->fetch_dir_hash_2( bl, inode, context, hashcode ); - } - else { - // read the rest! - dout(12) << "fetch_dir_hash_2 dir size is " << size << ", got " << got << ", reading remaniing " << left << " from off " << from << endl; - - // create return context - C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, inode, context, hashcode ); - fin->bl.claim( bl ); - mds->filer->read(inode, - from, left, - &fin->bl2, - fin ); - return; - } - } -}; - -/** fetch_dir_hash - * low level method. - * fetch part of a dir. either the whole thing if hashcode is -1, or a specific - * hash segment. 
- */ -void MDStore::fetch_dir_hash( CDir *dir, - Context *c, - int hashcode) -{ - dout(11) << "fetch_dir_hash hashcode " << hashcode << " " << *dir << endl; - - // create return context - C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, dir->get_inode()->inode, c, hashcode ); - - // grab first stripe bit (which had better be more than 16 bytes!) - assert(dir->get_inode()->inode.layout.stripe_size >= 16); - mds->filer->read(dir->get_inode()->inode, - get_hash_offset(hashcode), dir->get_inode()->inode.layout.stripe_size, - &fin->bl, - fin ); -} - -void MDStore::fetch_dir_hash_2( bufferlist& bl, - inode_t& inode, - Context *c, - int hashcode) -{ - CInode *idir = mds->mdcache->get_inode(inode.ino); - if (!idir) { - dout(7) << "fetch_dir_hash_2 on ino " << inode.ino << " but no longer in our cache!" << endl; - c->finish(-1); - delete c; - return; - } - - if (!idir->dir_is_auth() || - !idir->dir) { - dout(7) << "fetch_dir_hash_2 on " << *idir << ", but i'm not auth, or dir not open" << endl; - c->finish(-1); - delete c; - return; - } - - // make sure we have a CDir - CDir *dir = idir->get_or_open_dir(mds->mdcache); - - // do it - dout(7) << "fetch_dir_hash_2 hashcode " << hashcode << " dir " << *dir << endl; - - // parse buffer contents into cache - dout(15) << "bl is " << bl << endl; - - int off = 0; - size_t size; - __uint32_t num; - version_t got_version; - int got_hashcode; - bl.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - assert(bl.length() >= size + sizeof(size)); - bl.copy(off, sizeof(num), (char*)&num); - off += sizeof(num); - bl.copy(off, sizeof(got_version), (char*)&got_version); - off += sizeof(got_version); - bl.copy(off, sizeof(got_hashcode), (char*)&got_hashcode); - off += sizeof(got_hashcode); - - assert(got_hashcode == hashcode); - - int buflen = bl.length(); - - dout(10) << " " << num << " items in " << size << " bytes" << endl; - - unsigned parsed = 0; - while (parsed < num) { - assert(off < buflen && num > 0); - parsed++; - - 
dout(24) << " " << parsed << "/" << num << " pos " << off << endl; - - // dentry - string dname; - ::_decode(dname, bl, off); - dout(24) << "parse filename '" << dname << "'" << endl; - - CDentry *dn = dir->lookup(dname); // existing dentry? - - char type = bl[off]; - ++off; - if (type == 'L') { - // hard link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // what to do? - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname ); - assert(dentryhashcode == hashcode); - } - - if (dn) { - if (dn->get_inode() == 0) { - // negative dentry? - dout(12) << "readdir had NEG dentry " << dname << endl; - } else { - // had dentry - dout(12) << "readdir had dentry " << dname << endl; - } - continue; - } - - // (remote) link - CDentry *dn = dir->add_dentry( dname, ino ); - - // link to inode? - CInode *in = mds->mdcache->get_inode(ino); // we may or may not have it. - if (in) { - dn->link_remote(in); - dout(12) << "readdir got remote link " << ino << " which we have " << *in << endl; - } else { - dout(12) << "readdir got remote link " << ino << " (dont' have it)" << endl; - } - } - else if (type == 'I') { - // inode - - // parse out inode - inode_t inode; - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - - string symlink; - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - - // what to do? - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname ); - assert(dentryhashcode == hashcode); - } - - if (dn) { - if (dn->get_inode() == 0) { - // negative dentry? - dout(12) << "readdir had NEG dentry " << dname << endl; - } else { - // had dentry - dout(12) << "readdir had dentry " << dname << endl; - - // under water? 
- if (dn->get_version() <= got_version) { - assert(dn->get_inode()->get_version() <= got_version); - dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl; - dn->mark_clean(); - dn->get_inode()->mark_clean(); - } - } - continue; - } - - // add inode - CInode *in = 0; - if (mds->mdcache->have_inode(inode.ino)) { - in = mds->mdcache->get_inode(inode.ino); - dout(12) << "readdir got (but i already had) " << *in - << " mode " << in->inode.mode - << " mtime " << in->inode.mtime << endl; - } else { - // inode - in = new CInode(mds->mdcache); - in->inode = inode; - - // symlink? - if (in->is_symlink()) { - in->symlink = symlink; - } - - // add - mds->mdcache->add_inode( in ); - } - - // link - dir->add_dentry( dname, in ); - dout(12) << "readdir got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; - } - else { - dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) - << " at pos " << off << endl; - assert(0); - } - } - dout(15) << "parsed " << parsed << endl; - - if (c) { - c->finish(0); - delete c; - } -} - - - - -// ================================================================== -// COMMIT - -class C_MDS_CommitDirVerify : public Context { -public: - MDS *mds; - inodeno_t ino; - version_t version; - Context *c; - - C_MDS_CommitDirVerify( MDS *mds, - inodeno_t ino, - version_t version, - Context *c) { - this->mds = mds; - this->c = c; - this->version = version; - this->ino = ino; - } - - virtual void finish(int r) { - - if (r >= 0) { - CInode *in = mds->mdcache->get_inode(ino); - assert(in && in->dir); - if (in && in->dir && in->dir->is_auth()) { - dout(7) << "CommitDirVerify: current = " << in->dir->get_version() - << ", last committed = " << in->dir->get_last_committed_version() - << ", required = " << version << endl; - - if (in->dir->get_last_committed_version() >= version) { - dout(7) << "my required version is safe, done." 
<< endl; - if (c) { - c->finish(0); - delete c; - } - } else { - dout(7) << "my required version is still not safe, committing again." << endl; - - // what was requested isn't committed yet. - mds->mdstore->commit_dir(in->dir, - version, - c); - } - return; - } - } - - // must have exported ors omethign! - dout(7) << "can't retry commit dir on " << ino << ", must have exported?" << endl; - - // finish. - if (c) { - c->finish(-1); - delete c; - } - } -}; - -class C_MDS_CommitDirFinish : public Context { - protected: - MDStore *ms; - CDir *dir; - version_t version; - - public: - - C_MDS_CommitDirFinish(MDStore *ms, CDir *dir) : Context() { - this->ms = ms; - this->dir = dir; - this->version = dir->get_version(); // just for sanity check later - } - - void finish(int result) { - ms->commit_dir_2( result, dir, version ); - } -}; - - -void MDStore::commit_dir( CDir *dir, - Context *c ) -{ - assert(dir->is_dirty()); - - // commit thru current version - commit_dir(dir, dir->get_version(), c); -} - -void MDStore::commit_dir( CDir *dir, - version_t version, - Context *c ) -{ - assert(dir->is_auth() || - dir->is_hashed()); - - // already committing? - if (dir->state_test(CDIR_STATE_COMMITTING)) { - // already mid-commit! - dout(7) << "commit_dir " << *dir << " mid-commit of " << dir->get_committing_version() << endl; - dout(7) << " current version = " << dir->get_version() << endl; - dout(7) << "requested version = " << version << endl; - - assert(version >= dir->get_last_committed_version()); // why would we request _old_ one? - - dir->add_waiter(CDIR_WAIT_COMMITTED, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - if (!dir->can_auth_pin()) { - // something must be frozen up the hiearchy! - dout(7) << "commit_dir " << *dir << " can't auth_pin, waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - - // is it complete? 
- if (!dir->is_complete()) { - dout(7) << "commit_dir " << *dir << " not complete, fetching first" << endl; - // fetch dir first - fetch_dir(dir, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - - // ok go - dout(7) << "commit_dir " << *dir << " version " << dir->get_version() << endl; - - // add waiter - if (c) dir->add_waiter(CDIR_WAIT_COMMITTED, c); - - // get continuation ready - Context *fin = new C_MDS_CommitDirFinish(this, dir); - - // state - dir->state_set(CDIR_STATE_COMMITTING); - dir->set_committing_version(); - - // stats - if (mds->logger) mds->logger->inc("cdir"); - - if (dir->is_hashed()) { - // hashed - commit_dir_slice( dir, fin, mds->get_nodeid() ); - } else { - // non-hashed - commit_dir_slice( dir, fin ); - } -} - -void MDStore::commit_dir_2( int result, - CDir *dir, - version_t committed_version) -{ - dout(5) << "commit_dir_2 " << *dir << " committed " << committed_version << ", current version " << dir->get_version() << endl; - assert(committed_version == dir->get_committing_version()); - - // remember which version is now safe - dir->set_last_committed_version(committed_version); - - // is the dir now clean? 
- if (committed_version == dir->get_version()) - dir->mark_clean(); - - dir->state_clear(CDIR_STATE_COMMITTING); - - // finish - dir->finish_waiting(CDIR_WAIT_COMMITTED); -} - - - - -// low-level committer (hashed or normal) - -class C_MDS_CommitSlice : public Context { - protected: - MDStore *ms; - CDir *dir; - Context *c; - int hashcode; - version_t version; - -public: - bufferlist bl; - - C_MDS_CommitSlice(MDStore *ms, CDir *dir, Context *c, int w) : Context() { - this->ms = ms; - this->dir = dir; - this->c = c; - this->hashcode = w; - version = dir->get_version(); - } - - void finish(int result) { - ms->commit_dir_slice_2( result, dir, c, version, hashcode ); - } -}; - - -void MDStore::commit_dir_slice( CDir *dir, - Context *c, - int hashcode) -{ - if (hashcode >= 0) { - assert(dir->is_hashed()); - dout(10) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl; - } else { - assert(dir->is_auth()); - dout(10) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl; - } - - // get continuation ready - C_MDS_CommitSlice *fin = new C_MDS_CommitSlice(this, dir, c, hashcode); - - // fill buffer - __uint32_t num = 0; - - bufferlist dirdata; - - version_t v = dir->get_version(); - dirdata.append((char*)&v, sizeof(v)); - dirdata.append((char*)&hashcode, sizeof(hashcode)); - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != hashcode) continue; - } - - if (dn->is_null()) continue; // skipping negative entry - - // primary or remote? 
- if (dn->is_remote()) { - - inodeno_t ino = dn->get_remote_ino(); - dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' remote ino " << ino << endl; - - // name, marker, ion - dirdata.append( it->first.c_str(), it->first.length() + 1); - dirdata.append( "L", 1 ); // remote link - dirdata.append((char*)&ino, sizeof(ino)); - - } else { - // primary link - CInode *in = dn->get_inode(); - assert(in); - - dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' inode " << *in << endl; - - // name, marker, inode, [symlink string] - dirdata.append( it->first.c_str(), it->first.length() + 1); - dirdata.append( "I", 1 ); // inode - dirdata.append( (char*) &in->inode, sizeof(inode_t)); - - if (in->is_symlink()) { - // include symlink destination! - dout(18) << " inlcuding symlink ptr " << in->symlink << endl; - dirdata.append( (char*) in->symlink.c_str(), in->symlink.length() + 1); - } - } - - num++; - } - dout(14) << "num " << num << endl; - - // put count in buffer - //bufferlist bl; - size_t size = sizeof(num) + dirdata.length(); - fin->bl.append((char*)&size, sizeof(size)); - fin->bl.append((char*)&num, sizeof(num)); - fin->bl.claim_append(dirdata); //.c_str(), dirdata.length()); - assert(fin->bl.length() == size + sizeof(size)); - - // pin inode - dir->auth_pin(); - - // submit to osd - mds->filer->write( dir->get_inode()->inode, - 0, fin->bl.length(), - fin->bl, - 0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write - NULL, fin ); // on safe -} - - -void MDStore::commit_dir_slice_2( int result, - CDir *dir, - Context *c, - version_t committed_version, - int hashcode ) -{ - dout(11) << "commit_dir_slice_2 hashcode " << hashcode << " " << *dir << " v " << committed_version << endl; - - // mark inodes and dentries clean too (if we committed them!) 
- list null_clean; - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); ) { - CDentry *dn = it->second; - it++; - - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dn->get_name() ); - if (dentryhashcode != hashcode) continue; - } - - // dentry - if (committed_version >= dn->get_version()) { - if (dn->is_dirty()) { - dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << endl; - dn->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << endl; - } - - // only do primary... - if (!dn->is_primary()) - continue; - - CInode *in = dn->get_inode(); - assert(in); - assert(in->is_auth()); - - if (committed_version >= in->get_version()) { - if (in->is_dirty()) { - dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << endl; - in->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << endl; - assert(in->is_dirty()); - } - } - - // unpin - dir->auth_unpin(); - - // finish - if (c) { - c->finish(0); - delete c; - } -} - - - - - - - - - - - - diff --git a/trunk/ceph/mds/MDStore.h b/trunk/ceph/mds/MDStore.h deleted file mode 100644 index fe7553608a975..0000000000000 --- a/trunk/ceph/mds/MDStore.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDSTORE_H -#define __MDSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -class MDS; -class CDir; -class Context; - -class MDStore { - protected: - MDS *mds; - - - public: - MDStore(MDS *m) { - mds = m; - } - - - // fetch - public: - void fetch_dir( CDir *dir, Context *c ); - protected: - void fetch_dir_2( int result, inodeno_t ino ); - - void fetch_dir_hash( CDir *dir, - Context *c, - int hashcode = -1); - void fetch_dir_hash_2( bufferlist &bl, - inode_t& inode, - Context *c, - int which); - friend class C_MDS_Fetch; - friend class C_MDS_FetchHash; - - // commit - public: - void commit_dir( CDir *dir, Context *c ); // commit current dir version to disk. - void commit_dir( CDir *dir, __uint64_t version, Context *c ); // commit specified version to disk - protected: - void commit_dir_2( int result, CDir *dir, __uint64_t committed_version ); - - // low level committers - void commit_dir_slice( CDir *dir, - Context *c, - int hashcode = -1); - void commit_dir_slice_2( int result, - CDir *dir, - Context *c, - __uint64_t version, - int hashcode ); - - friend class C_MDS_CommitDirFinish; - friend class C_MDS_CommitSlice; -}; - - -#endif diff --git a/trunk/ceph/mds/Migrator.cc b/trunk/ceph/mds/Migrator.cc index 5d14bfbee4283..9319b78dcd4bd 100644 --- a/trunk/ceph/mds/Migrator.cc +++ b/trunk/ceph/mds/Migrator.cc @@ -11,7 +11,6 @@ * */ - #include "MDS.h" #include "MDCache.h" #include "CInode.h" @@ -19,7 +18,6 @@ #include "CDentry.h" #include "Migrator.h" #include "Locker.h" -#include "MDStore.h" #include "Migrator.h" #include "MDBalancer.h" @@ -29,8 +27,7 @@ #include "include/filepath.h" #include "events/EString.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" +#include "events/EExport.h" #include "events/EImportStart.h" #include "events/EImportFinish.h" @@ -40,10 +37,11 @@ #include "messages/MExportDirDiscover.h" #include "messages/MExportDirDiscoverAck.h" +#include "messages/MExportDirCancel.h" #include 
"messages/MExportDirPrep.h" #include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" #include "messages/MExportDir.h" +#include "messages/MExportDirAck.h" #include "messages/MExportDirNotify.h" #include "messages/MExportDirNotifyAck.h" #include "messages/MExportDirFinish.h" @@ -75,79 +73,35 @@ void Migrator::dispatch(Message *m) switch (m->get_type()) { // import case MSG_MDS_EXPORTDIRDISCOVER: - handle_export_dir_discover((MExportDirDiscover*)m); + handle_export_discover((MExportDirDiscover*)m); break; case MSG_MDS_EXPORTDIRPREP: - handle_export_dir_prep((MExportDirPrep*)m); + handle_export_prep((MExportDirPrep*)m); break; case MSG_MDS_EXPORTDIR: handle_export_dir((MExportDir*)m); break; case MSG_MDS_EXPORTDIRFINISH: - handle_export_dir_finish((MExportDirFinish*)m); + handle_export_finish((MExportDirFinish*)m); break; // export case MSG_MDS_EXPORTDIRDISCOVERACK: - handle_export_dir_discover_ack((MExportDirDiscoverAck*)m); + handle_export_discover_ack((MExportDirDiscoverAck*)m); break; case MSG_MDS_EXPORTDIRPREPACK: - handle_export_dir_prep_ack((MExportDirPrepAck*)m); + handle_export_prep_ack((MExportDirPrepAck*)m); + break; + case MSG_MDS_EXPORTDIRACK: + handle_export_ack((MExportDirAck*)m); break; case MSG_MDS_EXPORTDIRNOTIFYACK: - handle_export_dir_notify_ack((MExportDirNotifyAck*)m); + handle_export_notify_ack((MExportDirNotifyAck*)m); break; - // export 3rd party (inode authority) - case MSG_MDS_EXPORTDIRWARNING: - handle_export_dir_warning((MExportDirWarning*)m); - break; + // export 3rd party (dir_auth adjustments) case MSG_MDS_EXPORTDIRNOTIFY: - handle_export_dir_notify((MExportDirNotify*)m); - break; - - - // hashing - case MSG_MDS_HASHDIRDISCOVER: - handle_hash_dir_discover((MHashDirDiscover*)m); - break; - case MSG_MDS_HASHDIRDISCOVERACK: - handle_hash_dir_discover_ack((MHashDirDiscoverAck*)m); - break; - case MSG_MDS_HASHDIRPREP: - handle_hash_dir_prep((MHashDirPrep*)m); - break; - case MSG_MDS_HASHDIRPREPACK: - 
handle_hash_dir_prep_ack((MHashDirPrepAck*)m); - break; - case MSG_MDS_HASHDIR: - handle_hash_dir((MHashDir*)m); - break; - case MSG_MDS_HASHDIRACK: - handle_hash_dir_ack((MHashDirAck*)m); - break; - case MSG_MDS_HASHDIRNOTIFY: - handle_hash_dir_notify((MHashDirNotify*)m); - break; - - // unhashing - case MSG_MDS_UNHASHDIRPREP: - handle_unhash_dir_prep((MUnhashDirPrep*)m); - break; - case MSG_MDS_UNHASHDIRPREPACK: - handle_unhash_dir_prep_ack((MUnhashDirPrepAck*)m); - break; - case MSG_MDS_UNHASHDIR: - handle_unhash_dir((MUnhashDir*)m); - break; - case MSG_MDS_UNHASHDIRACK: - handle_unhash_dir_ack((MUnhashDirAck*)m); - break; - case MSG_MDS_UNHASHDIRNOTIFY: - handle_unhash_dir_notify((MUnhashDirNotify*)m); - break; - case MSG_MDS_UNHASHDIRNOTIFYACK: - handle_unhash_dir_notify_ack((MUnhashDirNotifyAck*)m); + handle_export_notify((MExportDirNotify*)m); break; default: @@ -171,32 +125,31 @@ void Migrator::export_empty_import(CDir *dir) { dout(7) << "export_empty_import " << *dir << endl; - return; // hack fixme - - if (!dir->is_import()) { - dout(7) << "not import (anymore?)" << endl; + if (dir->inode->is_auth()) return; + if (!dir->is_auth()) return; + + if (dir->inode->is_freezing() || dir->inode->is_frozen()) return; + if (dir->is_freezing() || dir->is_frozen()) return; + + if (dir->get_size() > 0) { + dout(7) << "not actually empty" << endl; return; } + if (dir->inode->is_root()) { dout(7) << "root" << endl; return; } - - if (dir->get_size() > 0) { - dout(7) << "not actually empty" << endl; - return; - } - + // is it really empty? if (!dir->is_complete()) { dout(7) << "not complete, fetching." << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_EmptyImport(this,dir)); + dir->fetch(new C_MDC_EmptyImport(this,dir)); return; } - - int dest = dir->inode->authority(); - + + int dest = dir->inode->authority().first; + // comment this out ot wreak havoc? //if (mds->is_shutting_down()) dest = 0; // this is more efficient. 
@@ -225,73 +178,118 @@ void Migrator::handle_mds_failure(int who) next++; CDir *dir = p->first; - if (export_peer[dir] == who) { - // the guy i'm exporting to failed. - // clean up. + // abort exports: + // - that are going to the failed node + // - that aren't frozen yet (to about auth_pin deadlock) + if (export_peer[dir] == who || + p->second == EXPORT_DISCOVERING || p->second == EXPORT_FREEZING) { + // the guy i'm exporting to failed, or we're just freezing. dout(10) << "cleaning up export state " << p->second << " of " << *dir << endl; switch (p->second) { case EXPORT_DISCOVERING: - dout(10) << "state discovering : canceling freeze and removing auth_pin" << endl; + dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << endl; dir->unfreeze_tree(); // cancel the freeze - dir->auth_unpin(); // remove the auth_pin (that was holding up the freeze) + dir->auth_unpin(); + export_state.erase(dir); // clean up + dir->state_clear(CDir::STATE_EXPORTING); + if (export_peer[dir] != who) // tell them. + mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), who, MDS_PORT_MIGRATOR); break; - + case EXPORT_FREEZING: - dout(10) << "state freezing : canceling freeze" << endl; + dout(10) << "export state=freezing : canceling freeze" << endl; dir->unfreeze_tree(); // cancel the freeze + export_state.erase(dir); // clean up + dir->state_clear(CDir::STATE_EXPORTING); + if (export_peer[dir] != who) // tell them. + mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), who, MDS_PORT_MIGRATOR); break; - case EXPORT_LOGGINGSTART: + // NOTE: state order reversal, warning comes after loggingstart+prepping + case EXPORT_WARNING: + dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << endl; + // fall-thru + + //case EXPORT_LOGGINGSTART: case EXPORT_PREPPING: - dout(10) << "state loggingstart|prepping : logging EExportFinish(false)" << endl; - mds->mdlog->submit_entry(new EExportFinish(dir,false)); - // logger will unfreeze. 
+ if (p->second != EXPORT_WARNING) + dout(10) << "export state=loggingstart|prepping : unpinning bounds, unfreezing" << endl; + // unpin bounds + for (set::iterator p = export_bounds[dir].begin(); + p != export_bounds[dir].end(); + ++p) { + CDir *bd = *p; + bd->put(CDir::PIN_EXPORTBOUND); + bd->state_clear(CDir::STATE_EXPORTBOUND); + } + dir->unfreeze_tree(); + cache->adjust_subtree_auth(dir, mds->get_nodeid()); + cache->try_subtree_merge(dir); + export_state.erase(dir); // clean up + dir->state_clear(CDir::STATE_EXPORTING); break; - + case EXPORT_EXPORTING: - dout(10) << "state exporting : logging EExportFinish(false), reversing, and unfreezing" << endl; - mds->mdlog->submit_entry(new EExportFinish(dir,false)); - reverse_export(dir); - dir->unfreeze_tree(); + dout(10) << "export state=exporting : reversing, and unfreezing" << endl; + export_reverse(dir); + export_state.erase(dir); // clean up + dir->state_clear(CDir::STATE_EXPORTING); break; case EXPORT_LOGGINGFINISH: - dout(10) << "state loggingfinish : doing nothing, we were successful." << endl; + case EXPORT_NOTIFYING: + dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << endl; + // leave export_state, don't clean up now. break; default: assert(0); } - export_state.erase(dir); - export_peer.erase(dir); - - // unpin the path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // wake up any waiters - mds->queue_finished(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // send pending import_maps? - mds->mdcache->send_pending_import_maps(); - - mds->mdcache->show_imports(); - mds->mdcache->show_cache(); + // finish clean-up? 
+ if (export_state.count(dir) == 0) { + export_peer.erase(dir); + export_warning_ack_waiting.erase(dir); + export_notify_ack_waiting.erase(dir); + + // unpin the path + vector trace; + cache->make_trace(trace, dir->inode); + mds->locker->dentry_anon_rdlock_trace_finish(trace); + + // wake up any waiters + mds->queue_waiters(export_finish_waiters[dir]); + export_finish_waiters.erase(dir); + + // send pending import_maps? (these need to go out when all exports have finished.) + cache->send_pending_import_maps(); + + cache->show_subtrees(); + } } else { - // third party failed. potential peripheral damage? - if (p->second == EXPORT_EXPORTING) { - // yeah, i'm waiting for acks, let's fake theirs. + // bystander failed. + if (p->second == EXPORT_WARNING) { + // exporter waiting for warning acks, let's fake theirs. + if (export_warning_ack_waiting[dir].count(who)) { + dout(10) << "faking export_warning_ack from mds" << who + << " on " << *dir << " to mds" << export_peer[dir] + << endl; + export_warning_ack_waiting[dir].erase(who); + export_notify_ack_waiting[dir].erase(who); // they won't get a notify either. 
+ if (export_warning_ack_waiting[dir].empty()) + export_go(dir); + } + } + if (p->second == EXPORT_NOTIFYING) { + // exporter is waiting for notify acks, fake it if (export_notify_ack_waiting[dir].count(who)) { - dout(10) << "faking export_dir_notify_ack from mds" << who + dout(10) << "faking export_notify_ack from mds" << who << " on " << *dir << " to mds" << export_peer[dir] << endl; export_notify_ack_waiting[dir].erase(who); - if (export_notify_ack_waiting[dir].empty()) - export_dir_acked(dir); + if (export_notify_ack_waiting[dir].empty()) + export_finish(dir); } } } @@ -302,41 +300,70 @@ void Migrator::handle_mds_failure(int who) // check my imports - map::iterator q = import_state.begin(); + map::iterator q = import_state.begin(); while (q != import_state.end()) { - map::iterator next = q; + map::iterator next = q; next++; - inodeno_t dirino = q->first; - CInode *diri = mds->mdcache->get_inode(dirino); - CDir *dir = 0; - if (diri) - dir = diri->dir; - - if (import_peer[dirino] == who) { - switch (import_peer[dirino]) { - case IMPORT_DISCOVERED: + dirfrag_t df = q->first; + CInode *diri = mds->mdcache->get_inode(df.ino); + CDir *dir = mds->mdcache->get_dirfrag(df); + + if (import_peer[df] == who) { + switch (import_state[df]) { + case IMPORT_DISCOVERING: + dout(10) << "import state=discovering : clearing state" << endl; + import_state.erase(df); + import_peer.erase(df); + break; + case IMPORT_DISCOVERED: + dout(10) << "import state=discovered : unpinning inode " << *diri << endl; + assert(diri); + // unpin base + diri->put(CInode::PIN_IMPORTING); + import_state.erase(df); + import_peer.erase(df); break; case IMPORT_PREPPING: - + if (import_state[df] == IMPORT_PREPPING) { + dout(10) << "import state=prepping : unpinning base+bounds " << *dir << endl; + } + assert(dir); + import_reverse_unpin(dir); // unpin break; case IMPORT_PREPPED: - + dout(10) << "import state=prepping : unpinning base+bounds, unfreezing " << *dir << endl; + assert(dir); + + // adjust auth 
back to me + cache->adjust_subtree_auth(dir, import_peer[df]); + cache->try_subtree_merge(dir); + + // bystanders? + if (import_bystanders[dir].empty()) { + import_reverse_unfreeze(dir); + } else { + // notify them; wait in aborting state + import_notify_abort(dir); + import_state[df] = IMPORT_ABORTING; + } break; case IMPORT_LOGGINGSTART: - + dout(10) << "import state=loggingstart : reversing import on " << *dir << endl; + import_reverse(dir); break; case IMPORT_ACKING: // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate - // ... + dout(10) << "import state=acking : noting ambiguous import " << *dir << endl; + cache->add_ambiguous_import(dir, import_bounds[dir]); break; - case IMPORT_LOGGINGFINISH: - // do nothing, exporter is no longer involved. + case IMPORT_ABORTING: + dout(10) << "import state=aborting : ignoring repeat failure " << *dir << endl; break; } } @@ -350,6 +377,50 @@ void Migrator::handle_mds_failure(int who) +void Migrator::audit() +{ + if (g_conf.debug_mds < 5) return; // hrm. 
+ + // import_state + for (map::iterator p = import_state.begin(); + p != import_state.end(); + p++) { + if (p->second == IMPORT_DISCOVERING) + continue; + if (p->second == IMPORT_DISCOVERED) { + CInode *in = cache->get_inode(p->first.ino); + assert(in); + continue; + } + CDir *dir = cache->get_dirfrag(p->first); + assert(dir); + if (p->second == IMPORT_PREPPING) + continue; + assert(dir->is_ambiguous_dir_auth()); + assert(dir->authority().first == mds->get_nodeid() || + dir->authority().second == mds->get_nodeid()); + } + + // export_state + for (map::iterator p = export_state.begin(); + p != export_state.end(); + p++) { + CDir *dir = p->first; + if (p->second == EXPORT_DISCOVERING || + p->second == EXPORT_FREEZING) continue; + assert(dir->is_ambiguous_dir_auth()); + assert(dir->authority().first == mds->get_nodeid() || + dir->authority().second == mds->get_nodeid()); + } + + // ambiguous+me subtrees should be importing|exporting + + // write me +} + + + + // ========================================================== // EXPORT @@ -358,29 +429,26 @@ void Migrator::handle_mds_failure(int who) class C_MDC_ExportFreeze : public Context { Migrator *mig; CDir *ex; // dir i'm exporting - int dest; public: - C_MDC_ExportFreeze(Migrator *m, CDir *e, int d) : - mig(m), ex(e), dest(d) {} + C_MDC_ExportFreeze(Migrator *m, CDir *e) : + mig(m), ex(e) {} virtual void finish(int r) { if (r >= 0) - mig->export_dir_frozen(ex, dest); + mig->export_frozen(ex); } }; - /** export_dir(dir, dest) * public method to initiate an export. * will fail if the directory is freezing, frozen, unpinnable, or root. 
*/ -void Migrator::export_dir(CDir *dir, - int dest) +void Migrator::export_dir(CDir *dir, int dest) { dout(7) << "export_dir " << *dir << " to " << dest << endl; + assert(dir->is_auth()); assert(dest != mds->get_nodeid()); - assert(!dir->is_hashed()); if (mds->mdsmap->is_degraded()) { dout(7) << "cluster degraded, no exports for now" << endl; @@ -389,7 +457,7 @@ void Migrator::export_dir(CDir *dir, if (dir->inode->is_root()) { dout(7) << "i won't export root" << endl; - assert(0); + //assert(0); return; } @@ -398,34 +466,33 @@ void Migrator::export_dir(CDir *dir, dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl; return; } - if (dir->is_hashed()) { - dout(7) << "can't export hashed dir right now. implement me carefully later." << endl; + if (dir->state_test(CDir::STATE_EXPORTING)) { + dout(7) << "already exporting" << endl; return; } - // pin path? vector trace; cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { + if (!mds->locker->dentry_can_rdlock_trace(trace, 0)) { dout(7) << "export_dir couldn't pin path, failing." << endl; return; } - // ok, let's go. + // ok. + mds->locker->dentry_anon_rdlock_trace_start(trace); assert(export_state.count(dir) == 0); export_state[dir] = EXPORT_DISCOVERING; export_peer[dir] = dest; + dir->state_set(CDir::STATE_EXPORTING); + // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR); - dir->auth_pin(); // pin dir, to hang up our freeze (unpin on prep ack) + mds->send_message_mds(new MExportDirDiscover(dir), export_peer[dir], MDS_PORT_MIGRATOR); - // take away the popularity we're sending. FIXME: do this later? - mds->balancer->subtract_export(dir); - - // freeze the subtree - dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest)); + // start the freeze, but hold it up with an auth_pin. 
+ dir->auth_pin(); + dir->freeze_tree(new C_MDC_ExportFreeze(this, dir)); } @@ -433,99 +500,102 @@ void Migrator::export_dir(CDir *dir, * called on receipt of MExportDirDiscoverAck * the importer now has the directory's _inode_ in memory, and pinned. */ -void Migrator::handle_export_dir_discover_ack(MExportDirDiscoverAck *m) +void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m) { - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); - dout(7) << "export_dir_discover_ack from " << m->get_source() - << " on " << *dir << ", releasing auth_pin" << endl; - - export_state[dir] = EXPORT_FREEZING; + dout(7) << "export_discover_ack from " << m->get_source() + << " on " << *dir << endl; - dir->auth_unpin(); // unpin to allow freeze to complete + if (export_state.count(dir) == 0 || + export_state[dir] != EXPORT_DISCOVERING || + export_peer[dir] != m->get_source().num()) { + dout(7) << "must have aborted" << endl; + } else { + // freeze the subtree + export_state[dir] = EXPORT_FREEZING; + dir->auth_unpin(); + } delete m; // done } -class C_MDC_ExportStartLogged : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - int dest; - MExportDirPrep *prep; - -public: - C_MDC_ExportStartLogged(Migrator *m, CDir *e, int d, MExportDirPrep *p) : - mig(m), ex(e), dest(d), prep(p) {} - virtual void finish(int r) { - mig->export_dir_frozen_logged(ex, prep, dest); - } -}; - -void Migrator::export_dir_frozen(CDir *dir, - int dest) +void Migrator::export_frozen(CDir *dir) { - // subtree is now frozen! - dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl; - export_state[dir] = EXPORT_LOGGINGSTART; - - show_imports(); - - EExportStart *le = new EExportStart(dir, dest); - MExportDirPrep *prep = new MExportDirPrep(dir->inode); + dout(7) << "export_frozen on " << *dir << endl; + assert(dir->is_frozen()); + int dest = export_peer[dir]; + + // ok! 
+ cache->show_subtrees(); + + // note the bounds. + // force it into a subtree by listing auth as . + cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid()); + cache->get_subtree_bounds(dir, export_bounds[dir]); + set &bounds = export_bounds[dir]; + + // generate prep message, log entry. + MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag()); + + // include list of bystanders + for (map::iterator p = dir->replicas_begin(); + p != dir->replicas_end(); + p++) { + if (p->first != dest) { + dout(10) << "bystander mds" << p->first << endl; + prep->add_bystander(p->first); + } + } // include spanning tree for all nested exports. // these need to be on the destination _before_ the final export so that // dir_auth updates on any nested exports are properly absorbed. - + // this includes inodes and dirfrags included in the subtree, but + // only the inodes at the bounds. set inodes_added; - // include base dir - prep->add_dir( new CDirDiscover(dir, dir->add_replica(dest)) ); - le->metablob.add_dir( dir, false ); + // include base dirfrag + prep->add_dirfrag( new CDirDiscover(dir, dir->add_replica(dest)) ); - // also include traces to all nested exports. - set my_nested; - cache->find_nested_exports(dir, my_nested); - for (set::iterator it = my_nested.begin(); - it != my_nested.end(); + // check bounds + for (set::iterator it = bounds.begin(); + it != bounds.end(); it++) { - CDir *exp = *it; + CDir *bound = *it; + + // pin it. 
+ bound->get(CDir::PIN_EXPORTBOUND); + bound->state_set(CDir::STATE_EXPORTBOUND); - dout(7) << " including nested export " << *exp << " in prep" << endl; + dout(7) << " export bound " << *bound << endl; - prep->add_export( exp->ino() ); - le->get_bounds().insert(exp->ino()); - le->metablob.add_dir_context( exp ); - le->metablob.add_dir( exp, false ); + prep->add_export( bound->dirfrag() ); /* first assemble each trace, in trace order, and put in message */ list inode_trace; // trace to dir - CDir *cur = exp; + CDir *cur = bound; while (cur != dir) { // don't repeat ourselves if (inodes_added.count(cur->ino())) break; // did already! inodes_added.insert(cur->ino()); - - CDir *parent_dir = cur->get_parent_dir(); - // inode? + // inode assert(cur->inode->is_auth()); inode_trace.push_front(cur->inode); dout(7) << " will add " << *cur->inode << endl; - // include dir? note: this'll include everything except the nested exports themselves, - // since someone else is obviously auth. - if (cur->is_auth()) { - prep->add_dir( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay! + // include the dirfrag? only if it's not the bounding subtree root. + if (cur != bound) { + assert(cur->is_auth()); + prep->add_dirfrag( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay! 
dout(7) << " added " << *cur << endl; } - cur = parent_dir; + cur = cur->get_parent_dir(); } for (list::iterator it = inode_trace.begin(); @@ -533,195 +603,126 @@ void Migrator::export_dir_frozen(CDir *dir, it++) { CInode *in = *it; dout(7) << " added " << *in << endl; - prep->add_inode( in->parent->get_dir()->ino(), + prep->add_inode( in->parent->get_dir()->dirfrag(), in->parent->get_name(), in->replicate_to(dest) ); } } - - // log our intentions - dout(7) << " logging EExportStart" << endl; - mds->mdlog->submit_entry(le, new C_MDC_ExportStartLogged(this, dir, dest, prep)); -} - -void Migrator::export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest) -{ - dout(7) << "export_dir_frozen_logged " << *dir << endl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_LOGGINGSTART) { - // export must have aborted. - dout(7) << "export must have aborted, unfreezing and deleting me old prep message" << endl; - delete prep; - dir->unfreeze_tree(); // cancel the freeze - return; - } + // send. export_state[dir] = EXPORT_PREPPING; mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR); } -void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m) +void Migrator::handle_export_prep_ack(MExportDirPrepAck *m) { - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); - dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl; - + dout(7) << "export_prep_ack " << *dir << endl; + if (export_state.count(dir) == 0 || export_state[dir] != EXPORT_PREPPING) { // export must have aborted. - dout(7) << "export must have aborted, unfreezing" << endl; - dir->unfreeze_tree(); + dout(7) << "export must have aborted" << endl; + delete m; return; } - // start export. 
- export_state[dir] = EXPORT_EXPORTING; - export_dir_go(dir, m->get_source().num()); + // send warnings + assert(export_peer.count(dir)); + int dest = export_peer[dir]; + assert(export_warning_ack_waiting.count(dir) == 0); + assert(export_notify_ack_waiting.count(dir) == 0); + for (map::iterator p = dir->replicas_begin(); + p != dir->replicas_end(); + ++p) { + if (p->first == dest) continue; + if (!mds->mdsmap->is_active_or_stopping(p->first)) + continue; // only if active + export_warning_ack_waiting[dir].insert(p->first); + export_notify_ack_waiting[dir].insert(p->first); // we'll eventually get a notifyack, too! + + MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), true, + pair(mds->get_nodeid(),CDIR_AUTH_UNKNOWN), + pair(mds->get_nodeid(),export_peer[dir])); + notify->copy_bounds(export_bounds[dir]); + mds->send_message_mds(notify, p->first, MDS_PORT_MIGRATOR); + + } + export_state[dir] = EXPORT_WARNING; - // done + // nobody to warn? + if (export_warning_ack_waiting.count(dir) == 0) + export_go(dir); // start export. + + // done. 
delete m; } -void Migrator::export_dir_go(CDir *dir, - int dest) +void Migrator::export_go(CDir *dir) { - dout(7) << "export_dir_go " << *dir << " to " << dest << endl; + assert(export_peer.count(dir)); + int dest = export_peer[dir]; + dout(7) << "export_go " << *dir << " to " << dest << endl; - show_imports(); + cache->show_subtrees(); + + export_warning_ack_waiting.erase(dir); + export_state[dir] = EXPORT_EXPORTING; - assert(export_bounds.count(dir) == 0); + assert(export_bounds.count(dir) == 1); assert(export_data.count(dir) == 0); - // update imports/exports - CDir *containing_import = cache->get_auth_container(dir); - - if (containing_import == dir) { - dout(7) << " i'm rexporting a previous import" << endl; - assert(dir->is_import()); - cache->imports.erase(dir); - dir->state_clear(CDIR_STATE_IMPORT); - dir->put(CDir::PIN_IMPORT); // unpin, no longer an import - - // discard nested exports (that we're handing off - for (set::iterator p = cache->nested_exports[dir].begin(); - p != cache->nested_exports[dir].end(); ) { - CDir *nested = *p; - p++; - - // add to export message - export_bounds[dir].insert(nested); - - // nested beneath our new export *in; remove! - dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl; - assert(cache->exports.count(nested) == 1); - cache->nested_exports[dir].erase(nested); - } - - } else { - dout(7) << " i'm a subdir nested under import " << *containing_import << endl; - cache->exports.insert(dir); - cache->nested_exports[containing_import].insert(dir); - - dir->state_set(CDIR_STATE_EXPORT); - dir->get(CDir::PIN_EXPORT); // i must keep it pinned - - // discard nested exports (that we're handing off) - for (set::iterator p = cache->nested_exports[containing_import].begin(); - p != cache->nested_exports[containing_import].end(); ) { - CDir *nested = *p; - p++; - if (nested == dir) continue; // ignore myself - - // container of parent; otherwise we get ourselves. 
- CDir *containing_export = nested->get_parent_dir(); - while (containing_export && !containing_export->is_export()) - containing_export = containing_export->get_parent_dir(); - if (!containing_export) continue; - - if (containing_export == dir) { - // nested beneath our new export *in; remove! - dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl; - cache->nested_exports[containing_import].erase(nested); - // exports.erase(nested); _walk does this - - // add to msg - export_bounds[dir].insert(nested); - } else { - dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl; - assert(cache->get_auth_container(containing_export) != containing_import); - } - } - } - - // note new authority (locally) - if (dir->inode->authority() == dest) - dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - dir->set_dir_auth( dest ); - - - // make list of nodes i expect an export_dir_notify_ack from - // (everyone w/ this dir open, but me!) 
- assert(export_notify_ack_waiting[dir].empty()); - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - if (it->first == mds->get_nodeid()) continue; - export_notify_ack_waiting[dir].insert( it->first ); - - // send warning to all but dest - if (it->first != dest) { - dout(10) << " sending export_dir_warning to mds" << it->first << endl; - mds->send_message_mds(new MExportDirWarning( dir->ino() ), it->first, MDS_PORT_MIGRATOR); - } - } - assert(export_notify_ack_waiting[dir].count( dest )); + assert(dir->get_cum_auth_pins() == 0); + // set ambiguous auth + cache->adjust_subtree_auth(dir, dest, mds->get_nodeid()); + cache->verify_subtree_bounds(dir, export_bounds[dir]); + // fill export message with cache data C_Contexts *fin = new C_Contexts; // collect all the waiters int num_exported_inodes = encode_export_dir( export_data[dir], - fin, - dir, // base - dir, // recur start point - dest ); + fin, + dir, // base + dir, // recur start point + dest ); // send the export data! - MExportDir *req = new MExportDir(dir->ino()); + MExportDir *req = new MExportDir(dir->dirfrag()); // export state req->set_dirstate( export_data[dir] ); - // add bounds + // add bounds to message for (set::iterator p = export_bounds[dir].begin(); p != export_bounds[dir].end(); ++p) - req->add_export((*p)->ino()); + req->add_export((*p)->dirfrag()); //s end mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); // queue up the finisher - dir->add_waiter( CDIR_WAIT_UNFREEZE, fin ); + dir->add_waiter( CDir::WAIT_UNFREEZE, fin ); + // take away the popularity we're sending. FIXME: do this later? + mds->balancer->subtract_export(dir); // stats if (mds->logger) mds->logger->inc("ex"); if (mds->logger) mds->logger->inc("iex", num_exported_inodes); - show_imports(); + cache->show_subtrees(); } /** encode_export_inode * update our local state for this inode to export. * encode relevant state to be sent over the wire. 
- * used by: export_dir_walk, file_rename (if foreign) + * used by: encode_export_dir, file_rename (if foreign) */ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth) { @@ -734,7 +735,7 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_au it->second.get_last_seq(), it->second.pending(), it->second.wanted(), - MClientFileCaps::FILECAP_STALE); + MClientFileCaps::OP_STALE); mds->messenger->send_message(m, mds->clientmap.get_inst(it->first), 0, MDS_PORT_CACHE); } @@ -757,10 +758,20 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_au in->clear_replicas(); // twiddle lock states for auth -> replica transition - // hard - in->hardlock.clear_gather(); - if (in->hardlock.get_state() == LOCK_GLOCKR) - in->hardlock.set_state(LOCK_LOCK); + // auth + in->authlock.clear_gather(); + if (in->authlock.get_state() == LOCK_GLOCKR) + in->authlock.set_state(LOCK_LOCK); + + // link + in->linklock.clear_gather(); + if (in->linklock.get_state() == LOCK_GLOCKR) + in->linklock.set_state(LOCK_LOCK); + + // dirfragtree + in->dirfragtreelock.clear_gather(); + if (in->dirfragtreelock.get_state() == LOCK_GLOCKR) + in->dirfragtreelock.set_state(LOCK_LOCK); // file : we lost all our caps, so move to stable state! in->filelock.clear_gather(); @@ -784,8 +795,8 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_au // mark auth assert(in->is_auth()); - in->set_auth(false); - in->replica_nonce = CINODE_EXPORT_NONCE; + in->state_clear(CInode::STATE_AUTH); + in->replica_nonce = CInode::EXPORT_NONCE; // *** other state too? 
@@ -803,7 +814,7 @@ int Migrator::encode_export_dir(list& dirstatelist, { int num_exported = 0; - dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl; + dout(7) << "encode_export_dir " << *dir << " " << dir->nitems << " items" << endl; assert(dir->get_projected_version() == dir->get_version()); @@ -818,109 +829,78 @@ int Migrator::encode_export_dir(list& dirstatelist, // mark assert(dir->is_auth()); - dir->state_clear(CDIR_STATE_AUTH); - dir->replica_nonce = CDIR_NONCE_EXPORT; - - // proxy - dir->state_set(CDIR_STATE_PROXY); - dir->get(CDir::PIN_PROXY); - export_proxy_dirinos[basedir].push_back(dir->ino()); + dir->state_clear(CDir::STATE_AUTH); + dir->replica_nonce = CDir::NONCE_EXPORT; list subdirs; - if (dir->is_hashed()) { - // fix state - dir->state_clear( CDIR_STATE_AUTH ); - - } else { + if (dir->is_dirty()) + dir->mark_clean(); + + // discard most dir state + dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. + + // suck up all waiters + list waiting; + dir->take_waiting(CDir::WAIT_ANY, waiting); // all dir waiters + fin->take(waiting); + + // dentries + CDir_map_t::iterator it; + for (it = dir->begin(); it != dir->end(); it++) { + CDentry *dn = it->second; + CInode *in = dn->get_inode(); - if (dir->is_dirty()) - dir->mark_clean(); + num_exported++; - // discard most dir state - dir->state &= CDIR_MASK_STATE_EXPORT_KEPT; // i only retain a few things. 
+ // -- dentry + dout(7) << "encode_export_dir exporting " << *dn << endl; - // suck up all waiters - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); + // name + _encode(it->first, enc_dir); - // inodes + // state + it->second->encode_export_state(enc_dir); - CDir_map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - num_exported++; - - // -- dentry - dout(7) << "export_dir_walk exporting " << *dn << endl; - _encode(it->first, enc_dir); - - if (dn->is_dirty()) - enc_dir.append("D", 1); // dirty - else - enc_dir.append("C", 1); // clean - - version_t dnv = dn->get_version(); - enc_dir.append((char*)&dnv, sizeof(dnv)); - - // null dentry? - if (dn->is_null()) { - enc_dir.append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - enc_dir.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - enc_dir.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - enc_dir.append("I", 1); // inode dentry - - encode_export_inode(in, enc_dir, newauth); // encode, and (update state for) export + // points to... + + // null dentry? + if (dn->is_null()) { + enc_dir.append("N", 1); // null dentry + continue; + } + + if (dn->is_remote()) { + // remote link + enc_dir.append("L", 1); // remote link - // directory? 
- if (in->is_dir() && in->dir) { - if (in->dir->is_auth()) { - // nested subdir - assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT); - subdirs.push_back(in->dir); // it's ours, recurse (later) - - } else { - // nested export - assert(in->dir->get_dir_auth() >= 0); - dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl; - assert(cache->exports.count(in->dir) == 1); - cache->exports.erase(in->dir); // discard nested export (nested_exports updated above) - - in->dir->state_clear(CDIR_STATE_EXPORT); - in->dir->put(CDir::PIN_EXPORT); - - // simplify dir_auth? - if (in->dir->get_dir_auth() == newauth) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } + inodeno_t ino = dn->get_remote_ino(); + enc_dir.append((char*)&ino, sizeof(ino)); + continue; + } + + // primary link + // -- inode + enc_dir.append("I", 1); // inode dentry + + encode_export_inode(in, enc_dir, newauth); // encode, and (update state for) export + + // directory? 
+ list dfs; + in->get_dirfrags(dfs); + for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { + CDir *dir = *p; + if (!dir->state_test(CDir::STATE_EXPORTBOUND)) { + // include nested dirfrag + assert(dir->get_dir_auth().first == CDIR_AUTH_PARENT); + subdirs.push_back(dir); // it's ours, recurse (later) } - - // add to proxy - export_proxy_inos[basedir].push_back(in->ino()); - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); } + + // waiters + list waiters; + in->take_waiting(CInode::WAIT_ANY, waiters); + fin->take(waiters); } // add to dirstatelist @@ -942,36 +922,44 @@ class C_MDS_ExportFinishLogged : public Context { public: C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {} void finish(int r) { - migrator->export_dir_finish(dir); + migrator->export_logged_finish(dir); } }; /* - * i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack) + * i should get an export_ack from the export target. */ -void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m) +void Migrator::handle_export_ack(MExportDirAck *m) { - CInode *diri = cache->get_inode(m->get_ino()); - CDir *dir = diri->dir; + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); assert(dir->is_frozen_tree_root()); // i'm exporting! - // remove from waiting list - int from = m->get_source().num(); - assert(export_notify_ack_waiting[dir].count(from)); - export_notify_ack_waiting[dir].erase(from); + // yay! + dout(7) << "handle_export_ack " << *dir << endl; - dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from - << ", still need (" << export_notify_ack_waiting[dir] << ")" << endl; + export_warning_ack_waiting.erase(dir); - // done? 
- if (export_notify_ack_waiting[dir].empty()) { - export_dir_acked(dir); - } else { - dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from - << ", still waiting for " << export_notify_ack_waiting[dir] << endl; + export_state[dir] = EXPORT_LOGGINGFINISH; + export_data.erase(dir); + + // log completion + EExport *le = new EExport(dir); + le->metablob.add_dir( dir, false ); + for (set::iterator p = export_bounds[dir].begin(); + p != export_bounds[dir].end(); + ++p) { + CDir *bound = *p; + le->get_bounds().insert(bound->dirfrag()); + le->metablob.add_dir_context(bound); + le->metablob.add_dir(bound, false); } + + // log export completion, then finish (unfreeze, trigger finish context, etc.) + dir->get(CDir::PIN_LOGGINGEXPORTFINISH); + mds->mdlog->submit_entry(le, + new C_MDS_ExportFinishLogged(this, dir)); delete m; } @@ -983,77 +971,29 @@ void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m) * that is, we don't know they safely received and logged it, so we reverse our changes * and go on. */ -void Migrator::reverse_export(CDir *dir) +void Migrator::export_reverse(CDir *dir) { - dout(7) << "reverse_export " << *dir << endl; + dout(7) << "export_reverse " << *dir << endl; assert(export_state[dir] == EXPORT_EXPORTING); assert(export_bounds.count(dir)); assert(export_data.count(dir)); - // re-import it. - set bounds; - bounds.swap(export_bounds[dir]); - export_bounds.erase(dir); - - // -- adjust dir_auth -- - // base - CDir *im = dir; - if (dir->get_inode()->authority() == mds->get_nodeid()) { - // parent is already me. was export, adding back to existing import. - im = mds->mdcache->get_auth_container(dir); - assert(im); - mds->mdcache->nested_exports[im].erase(dir); - mds->mdcache->exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. 
- mds->mdcache->imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } + // adjust auth, with possible subtree merge. + cache->verify_subtree_bounds(dir, export_bounds[dir]); + cache->adjust_subtree_auth(dir, mds->get_nodeid()); + cache->try_subtree_merge(dir); - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds - for (set::iterator p = bounds.begin(); - p != bounds.end(); + // unpin bounds + for (set::iterator p = export_bounds[dir].begin(); + p != export_bounds[dir].end(); ++p) { CDir *bd = *p; - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - mds->mdcache->imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. - for (set::iterator q = mds->mdcache->nested_exports[bd].begin(); - q != mds->mdcache->nested_exports[bd].end(); - ++q) - mds->mdcache->nested_exports[im].insert(*q); - mds->mdcache->nested_exports.erase(bd); - } else { - // not me anymore. now an export. 
- mds->mdcache->exports.insert(bd); - mds->mdcache->nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - dout(10) << " bound " << *bd << endl; + bd->put(CDir::PIN_EXPORTBOUND); + bd->state_clear(CDir::STATE_EXPORTBOUND); } - - // reimport the dirs - list imported_subdirs; + // re-import the metadata int num_imported_inodes = 0; for (list::iterator p = export_data[dir].begin(); @@ -1063,123 +1003,187 @@ void Migrator::reverse_export(CDir *dir) decode_import_dir(*p, export_peer[dir], dir, // import root - imported_subdirs, 0); } - // remove proxy bits - clear_export_proxy_pins(dir); + // process delayed expires + cache->process_delayed_expire(dir); + + // unfreeze + dir->unfreeze_tree(); // some clean up export_data.erase(dir); export_bounds.erase(dir); + export_warning_ack_waiting.erase(dir); export_notify_ack_waiting.erase(dir); -} - -void Migrator::export_dir_acked(CDir *dir) -{ - dout(7) << "export_dir_acked " << *dir << endl; - export_notify_ack_waiting.erase(dir); - - export_state[dir] = EXPORT_LOGGINGFINISH; - export_data.erase(dir); - export_bounds.erase(dir); - - // log export completion, then finish (unfreeze, trigger finish context, etc.) - mds->mdlog->submit_entry(new EExportFinish(dir, true), - new C_MDS_ExportFinishLogged(this, dir)); -} + cache->show_cache(); +} /* - * once i get all teh notify_acks i can finish + * once i get the ack, and logged the EExportFinish(true), + * send notifies (if any), otherwise go straight to finish. 
+ * */ -void Migrator::export_dir_finish(CDir *dir) +void Migrator::export_logged_finish(CDir *dir) { - dout(7) << "export_dir_finish " << *dir << endl; + dout(7) << "export_logged_finish " << *dir << endl; + dir->put(CDir::PIN_LOGGINGEXPORTFINISH); - if (export_state.count(dir)) { - // send finish/commit to new auth - mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); + cache->verify_subtree_bounds(dir, export_bounds[dir]); - // remove from exporting list - export_state.erase(dir); - export_peer.erase(dir); - } else { - dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << endl; - } + // send notifies + int dest = export_peer[dir]; + + for (set::iterator p = export_notify_ack_waiting[dir].begin(); + p != export_notify_ack_waiting[dir].end(); + ++p) { + MExportDirNotify *notify; + if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) + // dest is still alive. + notify = new MExportDirNotify(dir->dirfrag(), true, + pair(mds->get_nodeid(), dest), + pair(dest, CDIR_AUTH_UNKNOWN)); + else + // dest is dead. bystanders will think i am only auth, as per mdcache->handle_mds_failure() + notify = new MExportDirNotify(dir->dirfrag(), true, + pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), + pair(dest, CDIR_AUTH_UNKNOWN)); + + notify->copy_bounds(export_bounds[dir]); - // unfreeze - dout(7) << "export_dir_finish unfreezing" << endl; - dir->unfreeze_tree(); - - // unpin path - dout(7) << "export_dir_finish unpinning path" << endl; - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); + mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); + } - // unpin proxies - clear_export_proxy_pins(dir); + // wait for notifyacks + export_state[dir] = EXPORT_NOTIFYING; + + // no notifies to wait for? + if (export_notify_ack_waiting[dir].empty()) + export_finish(dir); // skip notify/notify_ack stage. +} + +/* + * warning: + * i'll get an ack from each bystander. 
+ * when i get them all, do the export. + * notify: + * i'll get an ack from each bystander. + * when i get them all, unfreeze and send the finish. + */ +void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) +{ + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); + assert(dir); + int from = m->get_source().num(); + + if (export_state.count(dir) && export_state[dir] == EXPORT_WARNING) { + // exporting. process warning. + dout(7) << "handle_export_notify_ack from " << m->get_source() + << ": exporting, processing warning on " + << *dir << endl; + assert(export_warning_ack_waiting.count(dir)); + export_warning_ack_waiting[dir].erase(from); + + if (export_warning_ack_waiting[dir].empty()) + export_go(dir); // start export. + } + else if (export_state.count(dir) && export_state[dir] == EXPORT_NOTIFYING) { + // exporting. process notify. + dout(7) << "handle_export_notify_ack from " << m->get_source() + << ": exporting, processing notify on " + << *dir << endl; + assert(export_notify_ack_waiting.count(dir)); + export_notify_ack_waiting[dir].erase(from); + + if (export_notify_ack_waiting[dir].empty()) + export_finish(dir); + } + else if (import_state.count(dir->dirfrag()) && import_state[dir->dirfrag()] == IMPORT_ABORTING) { + // reversing import + dout(7) << "handle_export_notify_ack from " << m->get_source() + << ": aborting import on " + << *dir << endl; + assert(import_bystanders[dir].count(from)); + import_bystanders[dir].erase(from); + if (import_bystanders[dir].empty()) { + import_bystanders.erase(dir); + import_reverse_unfreeze(dir); + } + } + + delete m; +} + + +void Migrator::export_finish(CDir *dir) +{ + dout(7) << "export_finish " << *dir << endl; + + if (export_state.count(dir) == 0) { + dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." 
<< endl; + return; + } + + // send finish/commit to new auth + if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) { + mds->send_message_mds(new MExportDirFinish(dir->dirfrag()), + export_peer[dir], MDS_PORT_MIGRATOR); + } else { + dout(7) << "not sending MExportDirFinish, dest has failed" << endl; + } + + // unfreeze + dout(7) << "export_finish unfreezing" << endl; + dir->unfreeze_tree(); + + // unpin bounds + for (set::iterator p = export_bounds[dir].begin(); + p != export_bounds[dir].end(); + ++p) { + CDir *bd = *p; + bd->put(CDir::PIN_EXPORTBOUND); + bd->state_clear(CDir::STATE_EXPORTBOUND); + } + + // adjust auth, with possible subtree merge. + // (we do this _after_ removing EXPORTBOUND pins, to allow merges) + cache->adjust_subtree_auth(dir, export_peer[dir]); + cache->try_subtree_merge(dir); + + // unpin path + dout(7) << "export_finish unpinning path" << endl; + vector trace; + cache->make_trace(trace, dir->inode); + mds->locker->dentry_anon_rdlock_trace_finish(trace); + + // discard delayed expires + cache->discard_delayed_expire(dir); + + // remove from exporting list, clean up state + dir->state_clear(CDir::STATE_EXPORTING); + export_state.erase(dir); + export_peer.erase(dir); + export_bounds.erase(dir); + export_notify_ack_waiting.erase(dir); // queue finishers - mds->queue_finished(export_finish_waiters[dir]); + mds->queue_waiters(export_finish_waiters[dir]); export_finish_waiters.erase(dir); // stats - if (mds->logger) mds->logger->set("nex", cache->exports.size()); + //if (mds->logger) mds->logger->set("nex", cache->exports.size()); - show_imports(); + cache->show_subtrees(); + audit(); // send pending import_maps? 
mds->mdcache->send_pending_import_maps(); } -void Migrator::clear_export_proxy_pins(CDir *dir) -{ - dout(10) << "clear_export_proxy_pins " << *dir << endl; - - // inodes - for (list::iterator it = export_proxy_inos[dir].begin(); - it != export_proxy_inos[dir].end(); - it++) { - CInode *in = cache->get_inode(*it); - dout(15) << " " << *in << endl; - in->put(CInode::PIN_PROXY); - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - } - export_proxy_inos.erase(dir); - - // dirs - for (list::iterator it = export_proxy_dirinos[dir].begin(); - it != export_proxy_dirinos[dir].end(); - it++) { - CDir *dir = cache->get_inode(*it)->dir; - dout(15) << " " << *dir << endl; - dir->put(CDir::PIN_PROXY); - assert(dir->state_test(CDIR_STATE_PROXY)); - dir->state_clear(CDIR_STATE_PROXY); - - // hose neg dentries, too, since we're no longer auth - CDir_map_t::iterator it; - for (it = dir->begin(); it != dir->end(); ) { - CDentry *dn = it->second; - it++; - if (dn->is_null()) { - assert(dn->is_sync()); - dir->remove_dentry(dn); - } else { - //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl; - if (dn->is_dirty()) - dn->mark_clean(); - } - } - } - export_proxy_dirinos.erase(dir); -} @@ -1189,113 +1193,123 @@ void Migrator::clear_export_proxy_pins(CDir *dir) // ========================================================== // IMPORT - -class C_MDC_ExportDirDiscover : public Context { - Migrator *mig; - MExportDirDiscover *m; -public: - vector trace; - C_MDC_ExportDirDiscover(Migrator *mig_, MExportDirDiscover *m_) : - mig(mig_), m(m_) {} - void finish(int r) { - CInode *in = 0; - if (r >= 0) in = trace[trace.size()-1]->get_inode(); - mig->handle_export_dir_discover_2(m, in, r); - } -}; - -void Migrator::handle_export_dir_discover(MExportDirDiscover *m) +void Migrator::handle_export_discover(MExportDirDiscover *m) { assert(m->get_source().num() != mds->get_nodeid()); - dout(7) << "handle_export_dir_discover on " << m->get_path() << 
endl; - - // must discover it! - C_MDC_ExportDirDiscover *onfinish = new C_MDC_ExportDirDiscover(this, m); - filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} + dout(7) << "handle_export_discover on " << m->get_path() << endl; -void Migrator::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_export_dir_discover_2 has " << *in << endl; + // note import state + dirfrag_t df = m->get_dirfrag(); + + // only start discovering on this message once. + if (!m->started) { + m->started = true; + import_state[df] = IMPORT_DISCOVERING; + import_peer[df] = m->get_source().num(); } - if (r < 0 || !in->is_dir()) { - dout(7) << "handle_export_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - - mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false), - m->get_source().num(), MDS_PORT_MIGRATOR); + // am i retrying after ancient path_traverse results? + if (import_state.count(df) == 0 && + import_state[df] != IMPORT_DISCOVERING) { + dout(7) << "hmm import_state is off, i must be obsolete lookup" << endl; delete m; return; } - - assert(in->is_dir()); - if (in->is_frozen()) { - dout(7) << "frozen, waiting." << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, - new C_MDS_RetryMessage(mds,m)); - return; + // do we have it? + CInode *in = cache->get_inode(m->get_dirfrag().ino); + if (!in) { + // must discover it! 
+ filepath fpath(m->get_path()); + vector trace; + int r = cache->path_traverse(0, + 0, + fpath, trace, true, + m, new C_MDS_RetryMessage(mds, m), // on delay/retry + MDS_TRAVERSE_DISCOVER); + if (r > 0) return; // wait + if (r < 0) { + dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; + assert(0); // this shouldn't happen if the auth pins his path properly!!!! + } + + CInode *in; + if (trace.empty()) { + in = cache->get_root(); + if (!in) { + cache->open_root(new C_MDS_RetryMessage(mds, m)); + return; + } + } else { + in = trace[trace.size()-1]->inode; + } } + + // yay + import_discovered(in, df); + delete m; +} + +void Migrator::import_discovered(CInode *in, dirfrag_t df) +{ + dout(7) << "import_discovered " << df << " inode " << *in << endl; // pin inode in the cache (for now) + assert(in->is_dir()); in->get(CInode::PIN_IMPORTING); - // pin auth too, until the import completes. - in->auth_pin(); + // reply + dout(7) << " sending export_discover_ack on " << *in << endl; + mds->send_message_mds(new MExportDirDiscoverAck(df), + import_peer[df], MDS_PORT_MIGRATOR); +} + +void Migrator::handle_export_cancel(MExportDirCancel *m) +{ + dout(7) << "handle_export_cancel on " << m->get_dirfrag() << endl; + + if (import_state[m->get_dirfrag()] == IMPORT_DISCOVERED) { + CInode *in = cache->get_inode(m->get_dirfrag().ino); + assert(in); + in->put(CInode::PIN_IMPORTING); + } else { + assert(import_state[m->get_dirfrag()] == IMPORT_DISCOVERING); + } - import_state[in->ino()] = IMPORT_DISCOVERED; - import_peer[in->ino()] = m->get_source().num(); + import_state.erase(m->get_dirfrag()); + import_peer.erase(m->get_dirfrag()); - - // reply - dout(7) << " sending export_dir_discover_ack on " << *in << endl; - mds->send_message_mds(new MExportDirDiscoverAck(in->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); delete m; } - -void Migrator::handle_export_dir_prep(MExportDirPrep *m) +void 
Migrator::handle_export_prep(MExportDirPrep *m) { - assert(m->get_source().num() != mds->get_nodeid()); - - CInode *diri = cache->get_inode(m->get_ino()); + CInode *diri = cache->get_inode(m->get_dirfrag().ino); assert(diri); + int oldauth = m->get_source().num(); + assert(oldauth != mds->get_nodeid()); + list finished; // assimilate root dir. - CDir *dir = diri->dir; - if (dir) { - dout(7) << "handle_export_dir_prep on " << *dir << " (had dir)" << endl; + CDir *dir; - if (!m->did_assim()) - m->get_dir(diri->ino())->update_dir(dir); + if (!m->did_assim()) { + dir = cache->add_replica_dir(diri, + m->get_dirfrag().frag, *m->get_dirfrag_discover(m->get_dirfrag()), + oldauth, finished); + dout(7) << "handle_export_prep on " << *dir << " (first pass)" << endl; } else { - assert(!m->did_assim()); - - // open dir i'm importing. - diri->set_dir( new CDir(diri, mds->mdcache, false) ); - dir = diri->dir; - m->get_dir(diri->ino())->update_dir(dir); - - dout(7) << "handle_export_dir_prep on " << *dir << " (opening dir)" << endl; - - diri->take_waiting(CINODE_WAIT_DIR, finished); + dir = cache->get_dirfrag(m->get_dirfrag()); + assert(dir); + dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << endl; } assert(dir->is_auth() == false); - show_imports(); + cache->show_subtrees(); // assimilate contents? 
if (!m->did_assim()) { @@ -1305,14 +1319,15 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) // move pin to dir diri->put(CInode::PIN_IMPORTING); dir->get(CDir::PIN_IMPORTING); - - // auth pin too - dir->auth_pin(); - diri->auth_unpin(); + dir->state_set(CDir::STATE_IMPORTING); // change import state - import_state[diri->ino()] = IMPORT_PREPPING; + import_state[dir->dirfrag()] = IMPORT_PREPPING; + // bystander list + import_bystanders[dir] = m->get_bystanders(); + dout(7) << "bystanders are " << import_bystanders[dir] << endl; + // assimilate traces to exports for (list::iterator it = m->get_inodes().begin(); it != m->get_inodes().end(); @@ -1327,49 +1342,44 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) (*it)->update_inode(in); // link to the containing dir - CInode *condiri = cache->get_inode( m->get_containing_dirino(in->ino()) ); - assert(condiri && condiri->dir); - cache->add_inode( in ); - condiri->dir->add_dentry( m->get_dentry(in->ino()), in ); + CDir *condir = cache->get_dirfrag( m->get_containing_dirfrag(in->ino()) ); + assert(condir); + cache->add_inode( in ); + condir->add_dentry( m->get_dentry(in->ino()), in ); dout(7) << " added " << *in << endl; } - assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) ); + assert( in->get_parent_dir()->dirfrag() == m->get_containing_dirfrag(in->ino()) ); - // dir - if (m->have_dir(in->ino())) { - if (in->dir) { - m->get_dir(in->ino())->update_dir(in->dir); - dout(7) << " updated " << *in->dir << endl; - } else { - in->set_dir( new CDir(in, mds->mdcache, false) ); - m->get_dir(in->ino())->update_dir(in->dir); - dout(7) << " added " << *in->dir << endl; - in->take_waiting(CINODE_WAIT_DIR, finished); - } + // dirs + for (list::iterator pf = m->get_inode_dirfrags(in->ino()).begin(); + pf != m->get_inode_dirfrags(in->ino()).end(); + ++pf) { + // add/update + cache->add_replica_dir(in, *pf, *m->get_dirfrag_discover(dirfrag_t(in->ino(), *pf)), + oldauth, finished); } } - // open 
export dirs? - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); + // open export dirs/bounds? + assert(import_bound_inos.count(dir->dirfrag()) == 0); + import_bound_inos[dir->dirfrag()].clear(); + for (list::iterator it = m->get_bounds().begin(); + it != m->get_bounds().end(); it++) { - dout(7) << " checking dir " << hex << *it << dec << endl; - CInode *in = cache->get_inode(*it); + dout(7) << " checking bound " << hex << *it << dec << endl; + CInode *in = cache->get_inode(it->ino); assert(in); // note bound. - import_bounds[dir->ino()].insert(*it); + import_bound_inos[dir->dirfrag()].push_back(*it); - if (!in->dir) { + CDir *dir = cache->get_dirfrag(*it); + if (!dir) { dout(7) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, + cache->open_remote_dir(in, it->frag, new C_MDS_RetryMessage(mds, m)); - - // pin it! - in->get(CInode::PIN_OPENINGDIR); - in->state_set(CInode::STATE_OPENINGDIR); } } } else { @@ -1377,43 +1387,49 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) } - // verify we have all exports + // verify we have all bounds int waiting_for = 0; - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); + for (list::iterator it = m->get_bounds().begin(); + it != m->get_bounds().end(); it++) { - inodeno_t ino = *it; - CInode *in = cache->get_inode(ino); - if (!in) dout(0) << "** missing ino " << hex << ino << dec << endl; - assert(in); - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(7) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - - if (in->state_test(CInode::STATE_OPENINGDIR)) { - in->put(CInode::PIN_OPENINGDIR); - in->state_clear(CInode::STATE_OPENINGDIR); - } + dirfrag_t df = *it; + CDir *bound = cache->get_dirfrag(df); + if (bound) { + if (!bound->state_test(CDir::STATE_IMPORTBOUND)) { + dout(7) << " pinning import bound " << 
*bound << endl; + bound->get(CDir::PIN_IMPORTBOUND); + bound->state_set(CDir::STATE_IMPORTBOUND); + import_bounds[dir].insert(bound); } else { - dout(7) << " already pinned nested export " << *in << endl; + dout(7) << " already pinned import bound " << *bound << endl; } } else { - dout(7) << " waiting for nested export dir on " << *in << endl; + dout(7) << " waiting for nested export dir on " << *cache->get_inode(df.ino) << endl; waiting_for++; } } + if (waiting_for) { dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl; } else { + dout(7) << " all ready, noting auth and freezing import region" << endl; + + // note that i am an ambiguous auth for this subtree. + // specify bounds, since the exporter explicitly defines the region. + cache->adjust_bounded_subtree_auth(dir, import_bounds[dir], + pair(oldauth, mds->get_nodeid())); + cache->verify_subtree_bounds(dir, import_bounds[dir]); + + // freeze. + dir->_freeze_tree(); + // ok! - dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl; - mds->send_message_mds(new MExportDirPrepAck(dir->ino()), + dout(7) << " sending export_prep_ack on " << *dir << endl; + mds->send_message_mds(new MExportDirPrepAck(dir->dirfrag()), m->get_source().num(), MDS_PORT_MIGRATOR); // note new state - import_state[diri->ino()] = IMPORT_PREPPED; + import_state[dir->dirfrag()] = IMPORT_PREPPED; // done delete m; @@ -1426,143 +1442,39 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) -/* this guy waits for the pre-import discovers on hashed directory dir inodes to finish. - * if it's the last one on the dir, it reprocessed the import. 
- */ -/* -class C_MDS_ImportPrediscover : public Context { -public: - MDS *mds; - MExportDir *m; - inodeno_t dir_ino; - string dentry; - C_MDS_ImportPrediscover(MDS *mds, MExportDir *m, inodeno_t dir_ino, const string& dentry) { - this->mds = mds; - this->m = m; - this->dir_ino = dir_ino; - this->dentry = dentry; - } - virtual void finish(int r) { - assert(r == 0); // should never fail! - - m->remove_prediscover(dir_ino, dentry); - - if (!m->any_prediscovers()) - mds->mdcache->handle_export_dir(m); - } -}; -*/ - class C_MDS_ImportDirLoggedStart : public Context { Migrator *migrator; CDir *dir; int from; - list imported_subdirs; - list exports; public: - C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f, - list& is, list& e) : + C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f) : migrator(m), dir(d), from(f) { - imported_subdirs.swap(is); - exports.swap(e); } void finish(int r) { - migrator->import_dir_logged_start(dir, from, imported_subdirs, exports); + migrator->import_logged_start(dir, from); } }; void Migrator::handle_export_dir(MExportDir *m) { - CInode *diri = cache->get_inode(m->get_ino()); - assert(diri); - CDir *dir = diri->dir; + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); int oldauth = m->get_source().num(); dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << endl; assert(dir->is_auth() == false); - show_imports(); - + cache->show_subtrees(); + // start the journal entry - EImportStart *le = new EImportStart(dir->ino(), m->get_exports()); + EImportStart *le = new EImportStart(dir->dirfrag(), m->get_bounds()); le->metablob.add_dir_context(dir); - // note new authority (locally) - CDir *im = dir; - if (dir->inode->is_auth()) { - // parent is already me. was export, adding back to existing import. 
- im = mds->mdcache->get_auth_container(dir); - assert(im); - mds->mdcache->nested_exports[im].erase(dir); - mds->mdcache->exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. - mds->mdcache->imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } + // adjust auth (list us _first_) + cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth); + cache->verify_subtree_bounds(dir, import_bounds[dir]); - // take out my temp pin - dir->put(CDir::PIN_IMPORTING); - - // mark import point frozen - // (note: this is a manual freeze.. hack hack hack!) - dir->get_inode()->auth_pin(); - dir->state_set(CDIR_STATE_FROZENTREE); - - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - CInode *bdi = cache->get_inode(*it); - CDir *bd = bdi->dir; - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - assert(bd->is_import()); - mds->mdcache->imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. - for (set::iterator q = mds->mdcache->nested_exports[bd].begin(); - q != mds->mdcache->nested_exports[bd].end(); - ++q) - mds->mdcache->nested_exports[im].insert(*q); - mds->mdcache->nested_exports.erase(bd); - } else { - // not me anymore. now an export. 
- mds->mdcache->exports.insert(bd); - mds->mdcache->nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - // mark export point frozenleaf - bd->get(CDir::PIN_FREEZELEAF); - bd->state_set(CDIR_STATE_FROZENTREELEAF); - assert(import_bounds[dir->ino()].count(*it)); // we took note during prep stage - - // remove our pin - bd->put(CDir::PIN_IMPORTINGEXPORT); - bd->state_clear(CDIR_STATE_IMPORTINGEXPORT); - - dout(10) << " bound " << *bd << endl; - } - // add this crap to my cache - list imported_subdirs; int num_imported_inodes = 0; for (list::iterator p = m->get_dirstate().begin(); @@ -1572,12 +1484,20 @@ void Migrator::handle_export_dir(MExportDir *m) decode_import_dir(*p, oldauth, dir, // import root - imported_subdirs, le); } - dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl; - dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl; + dout(10) << " " << m->get_bounds().size() << " imported bounds" << endl; + // include bounds in EImportStart + for (set::iterator it = import_bounds[dir].begin(); + it != import_bounds[dir].end(); + it++) { + CDir *bd = *it; + + // include bounding dirs in EImportStart + // (now that the interior metadata is already in the event) + le->metablob.add_dir(bd, false); + } // adjust popularity mds->balancer->add_import(dir); @@ -1586,120 +1506,235 @@ void Migrator::handle_export_dir(MExportDir *m) // log it mds->mdlog->submit_entry(le, - new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num(), - imported_subdirs, m->get_exports())); + new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num())); // note state - import_state[dir->ino()] = IMPORT_LOGGINGSTART; + import_state[dir->dirfrag()] = IMPORT_LOGGINGSTART; // some stats if (mds->logger) { mds->logger->inc("im"); mds->logger->inc("iim", num_imported_inodes); - mds->logger->set("nim", 
cache->imports.size()); + //mds->logger->set("nim", cache->imports.size()); } delete m; } -void Migrator::import_dir_logged_start(CDir *dir, int from, - list &imported_subdirs, - list &exports) +/* + * note: this does teh full work of reversing and import and cleaning up + * state. + * called by both handle_mds_failure and by handle_import_map (if we are + * a survivor coping with an exporter failure+recovery). + */ +void Migrator::import_reverse(CDir *dir, bool fix_dir_auth) { - dout(7) << "import_dir_logged " << *dir << endl; + dout(7) << "import_reverse " << *dir << endl; - // note state - import_state[dir->ino()] = IMPORT_ACKING; + // update auth, with possible subtree merge. + if (fix_dir_auth) { + assert(dir->is_subtree_root()); + cache->adjust_subtree_auth(dir, import_peer[dir->dirfrag()]); + cache->try_subtree_merge(dir); + } - // send notify's etc. - dout(7) << "sending notifyack for " << *dir << " to old auth mds" << from << endl; - mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()), - from, MDS_PORT_MIGRATOR); - - dout(7) << "sending notify to others" << endl; - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - assert( it->first != mds->get_nodeid() ); - if ( it->first == from ) continue; // not to old auth. + // adjust auth bits. 
+ list q; + q.push_back(dir); + while (!q.empty()) { + CDir *cur = q.front(); + q.pop_front(); - MExportDirNotify *notify = new MExportDirNotify(dir->ino(), from, mds->get_nodeid()); - notify->copy_exports(exports); + // dir + assert(cur->is_auth()); + cur->state_clear(CDir::STATE_AUTH); + cur->clear_replicas(); + if (cur->is_dirty()) + cur->mark_clean(); - if (g_conf.mds_verify_export_dirauth) - notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG) - - mds->send_message_mds(notify, it->first, MDS_PORT_MIGRATOR); + CDir_map_t::iterator it; + for (it = cur->begin(); it != cur->end(); it++) { + CDentry *dn = it->second; + + // dentry + dn->state_clear(CDentry::STATE_AUTH); + dn->clear_replicas(); + if (dn->is_dirty()) + dn->mark_clean(); + + // inode? + if (dn->is_primary()) { + CInode *in = dn->get_inode(); + in->state_clear(CDentry::STATE_AUTH); + in->clear_replicas(); + if (in->is_dirty()) + in->mark_clean(); + in->authlock.clear_gather(); + in->linklock.clear_gather(); + in->dirfragtreelock.clear_gather(); + in->filelock.clear_gather(); + + // non-bounding dir? + list dfs; + in->get_dirfrags(dfs); + for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) + if (!(*p)->state_test(CDir::STATE_IMPORTBOUND)) + q.push_back(*p); + } + } + } + + // log our failure + mds->mdlog->submit_entry(new EImportFinish(dir,false)); // log failure + + // bystanders? 
+ if (import_bystanders[dir].empty()) { + dout(7) << "no bystanders, finishing reverse now" << endl; + import_reverse_unfreeze(dir); + } else { + // notify them; wait in aborting state + dout(7) << "notifying bystanders of abort" << endl; + import_notify_abort(dir); + import_state[dir->dirfrag()] = IMPORT_ABORTING; } +} + +void Migrator::import_notify_abort(CDir *dir) +{ + dout(7) << "import_notify_abort " << *dir << endl; - show_imports(); + for (set::iterator p = import_bystanders[dir].begin(); + p != import_bystanders[dir].end(); + ++p) { + // NOTE: the bystander will think i am _only_ auth, because they will have seen + // the exporter's failure and updated the subtree auth. see mdcache->handle_mds_failure(). + MExportDirNotify *notify = + new MExportDirNotify(dir->dirfrag(), true, + pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), + pair(import_peer[dir->dirfrag()], CDIR_AUTH_UNKNOWN)); + notify->copy_bounds(import_bounds[dir]); + mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); + } } +void Migrator::import_reverse_unfreeze(CDir *dir) +{ + dout(7) << "import_reverse_unfreeze " << *dir << endl; -class C_MDS_ImportDirLoggedFinish : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ImportDirLoggedFinish(Migrator *m, CDir *d) : migrator(m), dir(d) { } - void finish(int r) { - migrator->import_dir_logged_finish(dir); - } -}; + // unfreeze + dir->unfreeze_tree(); -void Migrator::handle_export_dir_finish(MExportDirFinish *m) + // discard expire crap + cache->discard_delayed_expire(dir); + + import_reverse_unpin(dir); +} + +void Migrator::import_reverse_unpin(CDir *dir) { - CInode *diri = cache->get_inode(m->get_ino()); - CDir *dir = diri->dir; - assert(dir); + dout(7) << "import_reverse_unpin " << *dir << endl; - dout(7) << "handle_export_dir_finish logging import_finish on " << *dir << endl; - assert(dir->is_auth()); + // remove importing pin + dir->put(CDir::PIN_IMPORTING); + dir->state_clear(CDir::STATE_IMPORTING); + + // remove bound pins + for 
(set::iterator it = import_bounds[dir].begin(); + it != import_bounds[dir].end(); + it++) { + CDir *bd = *it; + bd->put(CDir::PIN_IMPORTBOUND); + bd->state_clear(CDir::STATE_IMPORTBOUND); + } + + // clean up + import_state.erase(dir->dirfrag()); + import_peer.erase(dir->dirfrag()); + import_bound_inos.erase(dir->dirfrag()); + import_bounds.erase(dir); + import_bystanders.erase(dir); + + cache->show_subtrees(); + audit(); +} + + +void Migrator::import_logged_start(CDir *dir, int from) +{ + dout(7) << "import_logged " << *dir << endl; // note state - import_state[dir->ino()] = IMPORT_LOGGINGFINISH; + import_state[dir->dirfrag()] = IMPORT_ACKING; + + // send notify's etc. + dout(7) << "sending ack for " << *dir << " to old auth mds" << from << endl; + mds->send_message_mds(new MExportDirAck(dir->dirfrag()), + from, MDS_PORT_MIGRATOR); + + cache->show_subtrees(); +} - // log - mds->mdlog->submit_entry(new EImportFinish(dir, true), - new C_MDS_ImportDirLoggedFinish(this,dir)); + +void Migrator::handle_export_finish(MExportDirFinish *m) +{ + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); + assert(dir); + dout(7) << "handle_export_finish on " << *dir << endl; + import_finish(dir); delete m; } -void Migrator::import_dir_logged_finish(CDir *dir) +void Migrator::import_finish(CDir *dir, bool now) { - dout(7) << "import_dir_logged_finish " << *dir << endl; + dout(7) << "import_finish on " << *dir << endl; - // un auth pin (other exports can now proceed) - dir->auth_unpin(); - - // unfreeze! 
- for (set::iterator p = import_bounds[dir->ino()].begin(); - p != import_bounds[dir->ino()].end(); - ++p) { - CInode *diri = mds->mdcache->get_inode(*p); - CDir *dir = diri->dir; - assert(dir->state_test(CDIR_STATE_FROZENTREELEAF)); - dir->put(CDir::PIN_FREEZELEAF); - dir->state_clear(CDIR_STATE_FROZENTREELEAF); + // log finish + mds->mdlog->submit_entry(new EImportFinish(dir, true)); + + // remove pins + dir->put(CDir::PIN_IMPORTING); + dir->state_clear(CDir::STATE_IMPORTING); + + for (set::iterator it = import_bounds[dir].begin(); + it != import_bounds[dir].end(); + it++) { + CDir *bd = *it; + + // remove bound pin + bd->put(CDir::PIN_IMPORTBOUND); + bd->state_clear(CDir::STATE_IMPORTBOUND); } + // unfreeze dir->unfreeze_tree(); - + + // adjust auth, with possible subtree merge. + cache->verify_subtree_bounds(dir, import_bounds[dir]); + cache->adjust_subtree_auth(dir, mds->get_nodeid()); + cache->try_subtree_merge(dir); + // clear import state (we're done!) - import_state.erase(dir->ino()); - import_peer.erase(dir->ino()); - import_bounds.erase(dir->ino()); + import_state.erase(dir->dirfrag()); + import_peer.erase(dir->dirfrag()); + import_bound_inos.erase(dir->dirfrag()); + import_bounds.erase(dir); + import_bystanders.erase(dir); + + // process delayed expires + cache->process_delayed_expire(dir); // ok now finish contexts dout(5) << "finishing any waiters on imported data" << endl; - dir->finish_waiting(CDIR_WAIT_IMPORTED); + dir->finish_waiting(CDir::WAIT_IMPORTED); // log it if (mds->logger) { - mds->logger->set("nex", cache->exports.size()); - mds->logger->set("nim", cache->imports.size()); + //mds->logger->set("nex", cache->exports.size()); + //mds->logger->set("nim", cache->imports.size()); } - show_imports(); + cache->show_subtrees(); + audit(); // is it empty? 
if (dir->get_size() == 0 && @@ -1712,9 +1747,10 @@ void Migrator::import_dir_logged_finish(CDir *dir) void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth) { + dout(15) << "decode_import_inode on " << *dn << endl; + CInodeExport istate; off = istate._decode(bl, off); - dout(15) << "got a cinodeexport " << endl; bool added = false; CInode *in = cache->get_inode(istate.get_ino()); @@ -1722,7 +1758,7 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol in = new CInode(mds->mdcache); added = true; } else { - in->set_auth(true); + in->state_set(CInode::STATE_AUTH); } // state after link -- or not! -sage @@ -1746,18 +1782,17 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol // adjust replica list //assert(!in->is_replica(oldauth)); // not true on failed export - in->add_replica( oldauth, CINODE_EXPORT_NONCE ); + in->add_replica( oldauth, CInode::EXPORT_NONCE ); if (in->is_replica(mds->get_nodeid())) in->remove_replica(mds->get_nodeid()); // twiddle locks - // hard - if (in->hardlock.get_state() == LOCK_GLOCKR) { - in->hardlock.gather_set.erase(mds->get_nodeid()); - in->hardlock.gather_set.erase(oldauth); - if (in->hardlock.gather_set.empty()) - mds->locker->inode_hard_eval(in); - } + if (in->authlock.do_import(oldauth, mds->get_nodeid())) + mds->locker->simple_eval(&in->authlock); + if (in->linklock.do_import(oldauth, mds->get_nodeid())) + mds->locker->simple_eval(&in->linklock); + if (in->dirfragtreelock.do_import(oldauth, mds->get_nodeid())) + mds->locker->simple_eval(&in->dirfragtreelock); // caps for (set::iterator it = merged_client_caps.begin(); @@ -1767,7 +1802,7 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol in->client_caps[*it].get_last_seq(), in->client_caps[*it].pending(), in->client_caps[*it].wanted(), - MClientFileCaps::FILECAP_REAP); + MClientFileCaps::OP_REAP); caps->set_mds( oldauth ); // reap from whom? 
mds->messenger->send_message(caps, mds->clientmap.get_inst(*it), @@ -1775,21 +1810,15 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol } // filelock - if (!in->filelock.is_stable()) { - // take me and old auth out of gather set - in->filelock.gather_set.erase(mds->get_nodeid()); - in->filelock.gather_set.erase(oldauth); - if (in->filelock.gather_set.empty()) // necessary but not suffient... - mds->locker->inode_file_eval(in); - } + if (in->filelock.do_import(oldauth, mds->get_nodeid())) + mds->locker->simple_eval(&in->filelock); } int Migrator::decode_import_dir(bufferlist& bl, - int oldauth, - CDir *import_root, - list& imported_subdirs, - EImportStart *le) + int oldauth, + CDir *import_root, + EImportStart *le) { int off = 0; @@ -1797,23 +1826,19 @@ int Migrator::decode_import_dir(bufferlist& bl, CDirExport dstate; off = dstate._decode(bl, off); - CInode *diri = cache->get_inode(dstate.get_ino()); + CInode *diri = cache->get_inode(dstate.get_dirfrag().ino); assert(diri); - CDir *dir = diri->get_or_open_dir(mds->mdcache); + CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, dstate.get_dirfrag().frag); assert(dir); dout(7) << "decode_import_dir " << *dir << endl; - // add to list - if (dir != import_root) - imported_subdirs.push_back(dir->ino()); - // assimilate state dstate.update_dir( dir ); // mark (may already be marked from get_or_open_dir() above) if (!dir->is_auth()) - dir->state_set(CDIR_STATE_AUTH); + dir->state_set(CDir::STATE_AUTH); // adjust replica list //assert(!dir->is_replica(oldauth)); // not true on failed export @@ -1827,235 +1852,46 @@ int Migrator::decode_import_dir(bufferlist& bl, int num_imported = 0; - if (dir->is_hashed()) { - - // do nothing; dir is hashed - } else { - // take all waiters on this dir - // NOTE: a pass of imported data is guaranteed to get all of my waiters because - // a replica's presense in my cache implies/forces it's presense in authority's. 
- list waiters; + // take all waiters on this dir + // NOTE: a pass of imported data is guaranteed to get all of my waiters because + // a replica's presense in my cache implies/forces it's presense in authority's. + list waiters; + + dir->take_waiting(CDir::WAIT_ANY, waiters); + for (list::iterator it = waiters.begin(); + it != waiters.end(); + it++) + import_root->add_waiter(CDir::WAIT_IMPORTED, *it); + + dout(15) << "doing contents" << endl; + + // contents + long nden = dstate.get_nden(); + + for (; nden>0; nden--) { - dir->take_waiting(CDIR_WAIT_ANY, waiters); - for (list::iterator it = waiters.begin(); - it != waiters.end(); - it++) - import_root->add_waiter(CDIR_WAIT_IMPORTED, *it); + num_imported++; - dout(15) << "doing contents" << endl; + // dentry + string dname; + _decode(dname, bl, off); - // contents - long nden = dstate.get_nden(); - - for (; nden>0; nden--) { - - num_imported++; - - // dentry - string dname; - _decode(dname, bl, off); - dout(15) << "dname is " << dname << endl; - - char dirty; - bl.copy(off, 1, &dirty); - off++; - - version_t dnv; - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - - char icode; - bl.copy(off, 1, &icode); - off++; - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // mark dentry dirty? - if (dirty == 'D') - dn->_mark_dirty(); - - dn->set_version( dnv ); - dn->set_projected_version( dnv ); - - if (icode == 'N') { - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - dir->link_inode(dn, ino); - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, bl, off, oldauth); - } - - // add dentry to journal entry - if (le) - le->metablob.add_dentry(dn, true); // Hmm: might we do dn->is_dirty() here instead? 
- } - - } - - dout(7) << "decode_import_dir done " << *dir << endl; - return num_imported; -} - - - - - -// authority bystander - -void Migrator::handle_export_dir_warning(MExportDirWarning *m) -{ - // add to warning list - stray_export_warnings.insert( m->get_ino() ); - - // did i already see the notify? - if (stray_export_notifies.count(m->get_ino())) { - // i did, we're good. - dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". already got notify." << endl; - - // process the notify - map::iterator it = stray_export_notifies.find(m->get_ino()); - handle_export_dir_notify(it->second); - stray_export_notifies.erase(it); - } else { - dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". waiting for notify." << endl; - } - - // done - delete m; -} - - -void Migrator::handle_export_dir_notify(MExportDirNotify *m) -{ - CDir *dir = 0; - CInode *in = cache->get_inode(m->get_ino()); - if (in) dir = in->dir; - - // did i see the warning yet? - if (!stray_export_warnings.count(m->get_ino())) { - // wait for it. - dout(7) << "export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl; - stray_export_notifies.insert(pair( m->get_ino(), m )); - return; - } - - // i did, we're all good. - dout(7) << "export_dir_notify on " << m->get_ino() << ", already saw warning." << endl; - - // update dir_auth! 
- if (dir) { - dout(7) << "export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << " (old_auth " << m->get_old_auth() << ")" << endl; - - // update bounds first - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - CInode *n = cache->get_inode(*it); - if (!n) continue; - CDir *ndir = n->dir; - if (!ndir) continue; - - int boundauth = ndir->authority(); - dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl; - if (ndir->get_dir_auth() == CDIR_AUTH_PARENT) { - if (boundauth != m->get_new_auth()) - ndir->set_dir_auth( boundauth ); - else assert(dir->authority() == m->get_new_auth()); // apparently we already knew! - } else { - if (boundauth == m->get_new_auth()) - ndir->set_dir_auth( CDIR_AUTH_PARENT ); - } - } - - // update dir_auth - if (in->authority() == m->get_new_auth()) { - dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl; - dir->set_dir_auth( CDIR_AUTH_PARENT ); - assert(!in->is_auth()); - assert(!dir->is_auth()); - } else { - dir->set_dir_auth( m->get_new_auth() ); - } - assert(dir->authority() != mds->get_nodeid()); - assert(!dir->is_auth()); - - // DEBUG: verify subdirs - if (g_conf.mds_verify_export_dirauth) { - - dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl; - for (list::iterator it = m->subdirs_begin(); - it != m->subdirs_end(); - it++) { - CInode *diri = cache->get_inode(*it); - if (!diri) continue; // don't have it, don't care - if (!diri->dir) continue; - dout(10) << "handle_export_dir_notify checking subdir " << *diri->dir << " is auth " << diri->dir->get_dir_auth() << endl; - assert(diri->dir != dir); // base shouldn't be in subdir list - if (diri->dir->get_dir_auth() != CDIR_AUTH_PARENT) { - dout(7) << "*** weird value for dir_auth " << diri->dir->get_dir_auth() << " on " << *diri->dir << ", should 
have been -1 probably??? ******************" << endl; - assert(0); // bad news! - //dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - assert(diri->dir->authority() == m->get_new_auth()); - } - } - } - - // send notify ack to old auth - dout(7) << "handle_export_dir_notify sending ack to old_auth " << m->get_old_auth() << endl; - mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()), - m->get_old_auth(), MDS_PORT_MIGRATOR); - - - // done - stray_export_warnings.erase( m->get_ino() ); - delete m; -} - - - - - -// ======================================================================= -// HASHING - - -void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth) -{ - int off = 0; - - for (; nden>0; nden--) { - // dentry - string dname; - _decode(dname, bl, off); - dout(15) << "dname is " << dname << endl; - - char icode; - bl.copy(off, 1, &icode); - off++; - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // mark dn dirty _after_ we link the inode (scroll down) - - if (icode == 'N') { - - // null dentry - assert(dn->is_null()); + CDentry *dn = dir->lookup(dname); + if (!dn) + dn = dir->add_dentry(dname); // null + + // decode state + dn->decode_import_state(bl, off, oldauth, mds->get_nodeid()); + dout(15) << "decode_import_dir got " << *dn << endl; + + // points to... + char icode; + bl.copy(off, 1, &icode); + off++; + + if (icode == 'N') { + // null dentry + assert(dn->is_null()); // fall thru } @@ -2064,1553 +1900,68 @@ void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int ol inodeno_t ino; bl.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); - dir->link_inode(dn, ino); + if (dn->is_remote()) { + assert(dn->get_remote_ino() == ino); + } else { + dir->link_inode(dn, ino); + } } else if (icode == 'I') { // inode decode_import_inode(dn, bl, off, oldauth); - - // fix up subdir export? 
- if (dn->inode->dir) { - assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)); - dn->inode->dir->put(CDir::PIN_IMPORTINGEXPORT); - dn->inode->dir->state_clear(CDIR_STATE_IMPORTINGEXPORT); - - if (dn->inode->dir->is_auth()) { - // mine. must have been an import. - assert(dn->inode->dir->is_import()); - dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl; - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - cache->imports.erase(dn->inode->dir); - dn->inode->dir->put(CDir::PIN_IMPORT); - dn->inode->dir->state_clear(CDIR_STATE_IMPORT); - - // move nested under hashdir - for (set::iterator it = cache->nested_exports[dn->inode->dir].begin(); - it != cache->nested_exports[dn->inode->dir].end(); - it++) - cache->nested_exports[dir].insert(*it); - cache->nested_exports.erase(dn->inode->dir); - - // now it matches the inode - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - else { - // not mine. make it an export. - dout(7) << "making subdir into export " << *dn->inode->dir << endl; - dn->inode->dir->get(CDir::PIN_EXPORT); - dn->inode->dir->state_set(CDIR_STATE_EXPORT); - cache->exports.insert(dn->inode->dir); - cache->nested_exports[dir].insert(dn->inode->dir); - - if (dn->inode->dir->get_dir_auth() == CDIR_AUTH_PARENT) - dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode - assert(dn->inode->dir->get_dir_auth() >= 0); - } - } } - // mark dentry dirty? (only _after_ we link the inode!) - dn->_mark_dirty(); // fixme + // add dentry to journal entry + if (le) + le->metablob.add_dentry(dn, dn->is_dirty()); } + + dout(7) << "decode_import_dir done " << *dir << endl; + return num_imported; } -/* - - notes on interaction of hashing and export/import: - - - dir->is_auth() is completely independent of hashing. 
for a hashed dir, - - all nodes are partially authoritative - - all nodes dir->is_hashed() == true - - all nodes dir->inode->dir_is_hashed() == true - - one node dir->is_auth() == true, the rest == false - - dir_auth for all subdirs in a hashed dir will (likely?) be explicit. - - remember simple rule: dir auth follows inode, unless dir_auth is explicit. - - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export) - - on export, -1 is changed to mds->get_nodeid() - - on import, nothing special, actually. - - - hashed dir files aren't included in export; subdirs are converted to imports - or exports as necessary. - - hashed dir subdirs are discovered on export. this is important - because dirs are needed to tie together auth hierarchy, for auth to know about - imports/exports, etc. - - - dir state is maintained on auth. - - COMPLETE and HASHED are transfered to importers. - - DIRTY is set everywhere. - - - hashed dir is like an import: hashed dir used for nested_exports map. - - nested_exports is updated appropriately on auth and replicas. - - a subtree terminates as a hashed dir, since the hashing explicitly - redelegates all inodes. thus export_dir_walk includes hashed dirs, but - not their inodes. -*/ - -// HASH on auth - -class C_MDC_HashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->hash_dir_frozen(dir); - } -}; -class C_MDC_HashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashComplete(Migrator *mig, CDir *dir) { - this->mig = mig; - this->dir = dir; - } - virtual void finish(int r) { - mig->hash_dir_complete(dir); - } -}; +// authority bystander -/** hash_dir(dir) - * start hashing a directory. 
- */ -void Migrator::hash_dir(CDir *dir) +void Migrator::handle_export_notify(MExportDirNotify *m) { - dout(-7) << "hash_dir " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't hash, freezing|frozen." << endl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "hash_dir couldn't pin path, failing." << endl; - return; - } - - // ok, go - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // discover on all mds - assert(hash_gather.count(dir) == 0); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; // except me - hash_gather[dir].insert(i); - mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR); - } - dir->auth_pin(); // pin until discovers are all acked. - - // start freeze - dir->freeze_dir(new C_MDC_HashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_HashComplete(this, dir)); - } else - hash_dir_complete(dir); -} - + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); -/* - * wait for everybody to discover and open the hashing dir - * then auth_unpin, to let the freeze happen - */ -void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; - dir->auth_unpin(); // unpin to allow freeze to complete + pair old_auth = m->get_old_auth(); + pair new_auth = m->get_new_auth(); + 
+ if (!dir) { + dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth + << " on missing dir " << m->get_dirfrag() << endl; + } else if (dir->authority() != old_auth) { + dout(7) << "handle_export_notify old_auth was " << dir->authority() + << " != " << old_auth << " -> " << new_auth + << " on " << *dir << endl; } else { - dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl; - } - - delete m; // done -} - - - -/* - * once the dir is completely in memory, - * mark all migrating inodes dirty (to pin in cache) - */ -void Migrator::hash_dir_complete(CDir *dir) -{ - dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - in->_mark_dirty(); // fixme - } - - if (dir->is_frozen_dir()) - hash_dir_go(dir); -} - - -/* - * once the dir is frozen, - * make sure it's complete - * send the prep messages! 
- */ -void Migrator::hash_dir_frozen(CDir *dir) -{ - dout(7) << "hash_dir_frozen " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl; - return; - } - - // send prep messages w/ export directories to open - vector msgs(mds->get_mds_map()->get_num_mds()); - - // check for subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; + dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth + << " on " << *dir << endl; + // adjust auth + cache->adjust_bounded_subtree_auth(dir, m->get_bounds(), new_auth); - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) continue; - - // msg? - if (msgs[dentryhashcode] == 0) { - msgs[dentryhashcode] = new MHashDirPrep(dir->ino()); - } - msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode)); - } - - // send them! - assert(hash_gather[dir].empty()); - for (unsigned i=0; isend_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - hash_gather[dir].insert(i); - } + // induce a merge? + cache->try_subtree_merge(dir); } - if (hash_gather[dir].empty()) { - // no subdirs! continue! - hash_gather.erase(dir); - hash_dir_go(dir); - } else { - // wait! 
- } -} - -/* - * wait for peers to open all subdirs - */ -void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl; - hash_dir_go(dir); + // send ack + if (m->wants_ack()) { + mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag()), + from, MDS_PORT_MIGRATOR); } else { - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * once the dir is frozen, - * make sure it's complete - * do the hashing! - */ -void Migrator::hash_dir_go(CDir *dir) -{ - dout(7) << "hash_dir_go " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - // get messages to other nodes ready - vector msgs(mds->get_mds_map()->get_num_mds()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - msgs[i] = new MHashDir(dir->ino()); + // aborted. no ack. + dout(7) << "handle_export_notify no ack requested" << endl; } - - // pick a hash seed. - dir->inode->inode.hash_seed = 1;//dir->ino(); - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - // get containing import. might be me. 
- CDir *containing_import = cache->get_auth_container(dir); - assert(containing_import != dir || dir->is_import()); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) { - continue; // still mine! - } - - bufferlist *bl = msgs[dentryhashcode]->get_state_ptr(); - assert(bl); - - // -- dentry - dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, *bl); - - // null dentry? - if (dn->is_null()) { - bl->append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl->append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl->append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl->append("I", 1); // inode dentry - - encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export - msgs[dentryhashcode]->inc_nden(); - - if (dn->is_dirty()) - dn->mark_clean(); - - // add to proxy - hash_proxy_inos[dir].push_back(in); - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // fix up subdirs - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. 
- dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - - // fix nested bits - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == in->dir) { - dout(10) << "moving nested export " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[in->dir].insert(ex); - } - } - } - else { - // not mine. - dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[containing_import].erase(in->dir); - if (in->dir->authority() == dentryhashcode) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( in->dir->authority() ); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // inode state - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // fix up nested_exports? 
- if (containing_import != dir) { - dout(7) << "moving nested exports under hashed dir" << endl; - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == dir) { - dout(7) << " moving nested export under hashed dir: " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[dir].insert(ex); - } else { - dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl; - } - } - } - - // send hash messages - assert(hash_gather[dir].empty()); - assert(hash_notify_gather[dir].empty()); - assert(dir->hashed_subset.empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - // all nodes hashed locally.. - dir->hashed_subset.insert(i); - - if (i == mds->get_nodeid()) continue; - - // init hash_gather and hash_notify_gather sets - hash_gather[dir].insert(i); - - assert(hash_notify_gather[dir][i].empty()); - for (int j=0; jget_mds_map()->get_num_mds(); j++) { - if (j == mds->get_nodeid()) continue; - if (j == i) continue; - hash_notify_gather[dir][i].insert(j); - } - - mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - } - - // wait for all the acks. 
+ delete m; } -void Migrator::handle_hash_dir_ack(MHashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl; - if (hash_notify_gather[dir].empty()) { - dout(7) << "got notifies too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "waiting on notifies " << endl; - } - - } else { - dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -void Migrator::hash_dir_finish(CDir *dir) -{ - dout(7) << "hash_dir_finish finishing " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - // dir state - hash_gather.erase(dir); - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - - // unproxy inodes - // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds. 
- for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - hash_proxy_inos.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // unfreeze - dir->unfreeze_dir(); - - show_imports(); - assert(hash_gather.count(dir) == 0); - - // stats - //if (mds->logger) mds->logger->inc("nh", 1); - -} - - - - -// HASH on auth and non-auth - -void Migrator::handle_hash_dir_notify(MHashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir_notify " << *dir << endl; - int from = m->get_from(); - - int source = m->get_source().num(); - if (dir->is_auth()) { - // gather notifies - assert(dir->is_hashed()); - - assert( hash_notify_gather[dir][from].count(source) ); - hash_notify_gather[dir][from].erase(source); - - if (hash_notify_gather[dir][from].empty()) { - dout(7) << "last notify from " << from << endl; - hash_notify_gather[dir].erase(from); - - if (hash_notify_gather[dir].empty()) { - dout(7) << "last notify!" 
<< endl; - hash_notify_gather.erase(dir); - - if (hash_gather[dir].empty()) { - dout(7) << "got acks too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "still waiting on acks from " << hash_gather[dir] << endl; - } - } else { - dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl; - } - } else { - dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl; - } - - // delete msg - delete m; - } else { - // update dir hashed_subset - assert(dir->hashed_subset.count(from) == 0); - dir->hashed_subset.insert(from); - - // update open subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - if (!in) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != from) continue; // we'll import these in a minute - - if (in->dir->authority() != dentryhashcode) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - - // remove from notify gather set - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - // last notify? 
- if (hash_gather[dir].empty()) { - dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl; - hash_gather.erase(dir); - - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - } else { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - } - - // fw notify to auth - mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR); - } -} - - - - -// HASH on non-auth - -/* - * discover step: - * each peer needs to open up the directory and pin it before we start - */ -class C_MDC_HashDirDiscover : public Context { - Migrator *mig; - MHashDirDiscover *m; -public: - vector trace; - C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) { - this->mig = mig; - this->m = m; - } - void finish(int r) { - CInode *in = 0; - if (r >= 0) { - if (trace.size()) - in = trace[trace.size()-1]->get_inode(); - else - in = mig->cache->get_root(); - } - mig->handle_hash_dir_discover_2(m, in, r); - } -}; - -void Migrator::handle_hash_dir_discover(MHashDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl; - - // must discover it! - C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m); - filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} - -void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_hash_dir_discover_2 has " << *in << endl; - } - - if (r < 0 || !in->is_dir()) { - dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - assert(in->is_dir()); - - // is dir open? 
- if (!in->dir) { - dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - return; - } - CDir *dir = in->dir; - - // pin dir, set hashing flag - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // inode state - dir->inode->inode.hash_seed = 1;// dir->ino(); - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // get gather set ready for notifies - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == dir->authority()) continue; - hash_gather[dir].insert(i); - } - - // reply - dout(7) << " sending hash_dir_discover_ack on " << *dir << endl; - mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; -} - -/* - * prep step: - * peers need to open up all subdirs of the hashed dir - */ - -void Migrator::handle_hash_dir_prep(MHashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_hash_dir_prep " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! - - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! 
- if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ack! - mds->send_message_mds(new MHashDirPrepAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; -} - - -/* - * hash step: - */ - -void Migrator::handle_hash_dir(MHashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(!dir->is_auth()); - assert(!dir->is_hashed()); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir " << *dir << endl; - int oldauth = m->get_source().num(); - - // content - import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth); - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->hashed_subset.insert(mds->get_nodeid()); - - // dir is complete - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // commit - mds->mdstore->commit_dir(dir, 0); - - // send notifies - dout(7) << "sending notifies" << endl; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == 
m->get_source().num()) continue; - mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()), - i, MDS_PORT_MIGRATOR); - } - - // ack - dout(7) << "acking" << endl; - mds->send_message_mds(new MHashDirAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; - - show_imports(); -} - - - - - -// UNHASH on auth - -class C_MDC_UnhashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_frozen(dir); - } -}; - -class C_MDC_UnhashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_complete(dir); - } -}; - - -void Migrator::unhash_dir(CDir *dir) -{ - dout(-7) << "unhash_dir " << *dir << endl; - - assert(dir->is_hashed()); - assert(!dir->is_unhashing()); - assert(dir->is_auth()); - assert(hash_gather.count(dir)==0); - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "unhash_dir couldn't pin path, failing." << endl; - return; - } - - // twiddle state - dir->state_set(CDIR_STATE_UNHASHING); - - // first, freeze the dir. 
- dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else - unhash_dir_complete(dir); - -} - -void Migrator::unhash_dir_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep(dir); -} - - -/* - * ask peers to freeze and complete hashed dir - */ -void Migrator::unhash_dir_prep(CDir *dir) -{ - dout(7) << "unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDirPrep(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * wait for peers to freeze and complete hashed dirs - */ -void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! 
- - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! - if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ok, done with this PrepAck - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl; - unhash_dir_go(dir); - } else { - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * auth: - * send out MHashDir's to peers - */ -void Migrator::unhash_dir_go(CDir *dir) -{ - dout(7) << "unhash_dir_go " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - 
assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDir(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * auth: - * assimilate unhashing content - */ -void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(dir->is_hashed()); - - // assimilate content - int from = m->get_source().num(); - import_hashed_content(dir, m->get_state(), m->get_nden(), from); - delete m; - - // done? - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl; - return; - } - - // done! - - // fix up nested_exports - CDir *containing_import = cache->get_auth_container(dir); - if (containing_import != dir) { - for (set::iterator it = cache->nested_exports[dir].begin(); - it != cache->nested_exports[dir].end(); - it++) { - dout(7) << "moving nested export out from under hashed dir : " << **it << endl; - cache->nested_exports[containing_import].insert(*it); - } - cache->nested_exports.erase(dir); - } - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); //later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - - // commit! 
- assert(dir->is_complete()); - //dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdstore->commit_dir(dir, 0); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash inode dirty fixme")); - } - - // notify - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - - hash_gather[dir].insert(i); - - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - - -/* - * sent by peer to flush mds links. unfreeze when all gathered. - */ -void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(dir->is_frozen_dir()); - - // done? - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - delete m; - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl; - } else { - unhash_dir_finish(dir); - } -} - - -/* - * all mds links are flushed. unfreeze dir! - */ -void Migrator::unhash_dir_finish(CDir *dir) -{ - dout(7) << "unhash_dir_finish " << *dir << endl; - hash_gather.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // state - dir->state_clear(CDIR_STATE_UNHASHING); - - // unfreeze - dir->unfreeze_dir(); - -} - - - -// UNHASH on all - -/* - * hashed dir is complete. 
- * mark all migrating inodes dirty (to pin in cache) - * if frozen too, then go to next step (depending on auth) - */ -void Migrator::unhash_dir_complete(CDir *dir) -{ - dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(dir->is_hashed()); - assert(dir->is_complete()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - if (in->is_auth()) { - in->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash dirty fixme")); - } - } - - if (!dir->is_frozen_dir()) { - dout(7) << "dir complete but !frozen, waiting " << *dir << endl; - } else { - if (dir->is_auth()) - unhash_dir_prep(dir); // auth - else - unhash_dir_prep_finish(dir); // nonauth - } -} - - -// UNHASH on non-auth - -class C_MDC_UnhashPrepFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_prep_frozen(dir); - } -}; - - -/* - * peers need to freeze their dir and make them complete - */ -void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - - // freeze - dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else { - unhash_dir_complete(dir); - } - - delete m; -} - -/* - * peer has hashed dir frozen. - * complete too? 
- */ -void Migrator::unhash_dir_prep_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_prep_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_frozen_dir()); - assert(!dir->is_auth()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep_finish(dir); -} - -/* - * peer has hashed dir complete and frozen. ack. - */ -void Migrator::unhash_dir_prep_finish(CDir *dir) -{ - dout(7) << "unhash_dir_prep_finish " << *dir << endl; - assert(dir->is_hashed()); - assert(!dir->is_auth()); - assert(dir->is_frozen()); - assert(dir->is_complete()); - - // twiddle state - if (dir->is_unhashing()) - return; // already replied. - dir->state_set(CDIR_STATE_UNHASHING); - - // send subdirs back to auth - MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino()); - int auth = dir->authority(); - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) continue; - - // msg? - ack->add_inode(it->first, in->replicate_to(auth)); - } - - // ack - mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR); -} - - - -/* - * peer needs to send hashed dir content back to auth. - * unhash dir. - */ -void Migrator::handle_unhash_dir(MUnhashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir " << *dir << endl;//" .. 
hash_seed is " << dir->inode->inode.hash_seed << endl; - assert(dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - // get message ready - bufferlist bl; - int nden = 0; - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) { - // not mine! - // twiddle dir_auth? - if (in->dir) { - if (in->dir->authority() != dir->authority()) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - continue; - } - - // -- dentry - dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, bl); - - // null dentry? - if (dn->is_null()) { - bl.append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl.append("I", 1); // inode dentry - - encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export - nden++; - - if (dn->is_dirty()) - dn->mark_clean(); - - // proxy - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - hash_proxy_inos[dir].push_back(in); - - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. - dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - } - else { - // not mine. 
- dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[dir].erase(in->dir); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // we should have no nested exports; we're not auth for the dir! - assert(cache->nested_exports[dir].empty()); - cache->nested_exports.erase(dir); - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); // later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - dir->mark_clean(); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash inode dirty fixme")); - } - - // init gather set - mds->get_mds_map()->get_active_mds_set( hash_gather[dir] ); - hash_gather[dir].erase(mds->get_nodeid()); - - // send unhash message - mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden), - dir->authority(), MDS_PORT_MIGRATOR); -} - - -/* - * first notify comes from auth. - * send notifies to all other peers, with peer = self - * if we get notify from peer=other, remove from our gather list. - * when we've gotten notifies from everyone, - * unpin proxies, - * send notify_ack to auth. - * this ensures that all mds links are flushed of cache_expire type messages. - */ -void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_finish " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - delete m; - - // did we send our shout out? 
- if (from == dir->authority()) { - // send notify to everyone else in weird chatter storm - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == from) continue; - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR); - } - } - - // are we done? - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - return; - } - hash_gather.erase(dir); - - // all done! - dout(7) << "all mds links flushed, unpinning unhash proxies" << endl; - - // unpin proxies - for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - - // unfreeze - dir->unfreeze_dir(); - - // ack - dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl; - mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); - -} - - - - -void Migrator::show_imports() -{ - mds->balancer->show_imports(); -} diff --git a/trunk/ceph/mds/Migrator.h b/trunk/ceph/mds/Migrator.h index dd2886008d163..391ece980674f 100644 --- a/trunk/ceph/mds/Migrator.h +++ b/trunk/ceph/mds/Migrator.h @@ -29,31 +29,17 @@ class CDir; class CInode; class CDentry; -class MExportDir; class MExportDirDiscover; class MExportDirDiscoverAck; +class MExportDirCancel; class MExportDirPrep; class MExportDirPrepAck; -class MExportDirWarning; +class MExportDir; +class MExportDirAck; class MExportDirNotify; class MExportDirNotifyAck; class MExportDirFinish; -class MHashDirDiscover; -class MHashDirDiscoverAck; -class MHashDirPrep; -class MHashDirPrepAck; -class MHashDir; -class MHashDirAck; -class MHashDirNotify; - -class MUnhashDirPrep; -class MUnhashDirPrepAck; -class MUnhashDir; -class MUnhashDirAck; -class MUnhashDirNotify; -class MUnhashDirNotifyAck; - class EImportStart; class Migrator { @@ 
-62,48 +48,56 @@ private: MDCache *cache; // -- exports -- +public: // export stages. used to clean up intelligently if there's a failure. const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - const static int EXPORT_LOGGINGSTART = 3; // we're logging EExportStart + //const static int EXPORT_LOGGINGSTART = 3; // we're logging EExportStart const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds - const static int EXPORT_EXPORTING = 5; // sent actual export, waiting for acks - const static int EXPORT_LOGGINGFINISH = 6; // logging EExportFinish - - // export fun - map export_state; - map export_peer; - map > export_bounds; - map > export_data; // only during EXPORTING state - map > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from - map > export_proxy_inos; - map > export_proxy_dirinos; - - map > export_finish_waiters; + const static int EXPORT_WARNING = 5; // warning bystanders of dir_auth_pending + const static int EXPORT_EXPORTING = 6; // sent actual export, waiting for ack + const static int EXPORT_LOGGINGFINISH = 7; // logging EExportFinish + const static int EXPORT_NOTIFYING = 8; // waiting for notifyacks + const static int EXPORT_ABORTING = 9; // notifying bystanders of abort - set stray_export_warnings; // notifies i haven't seen - map stray_export_notifies; +protected: + // export fun + map export_state; + map export_peer; + map > export_bounds; + map > export_data; // only during EXPORTING state + map > export_warning_ack_waiting; + map > export_notify_ack_waiting; + + map > export_finish_waiters; // -- imports -- - const static int IMPORT_DISCOVERED = 1; // waiting for prep - const static int IMPORT_PREPPING = 2; // opening dirs on bounds - const static int IMPORT_PREPPED = 3; // opened bounds, waiting for import - const static int IMPORT_LOGGINGSTART = 4; // got import, logging EImportStart - const static 
int IMPORT_ACKING = 5; // logged, sent acks - const static int IMPORT_LOGGINGFINISH = 6; - - map import_state; - map import_peer; - map > import_bounds; - - +public: + const static int IMPORT_DISCOVERING = 1; // waiting for prep + const static int IMPORT_DISCOVERED = 2; // waiting for prep + const static int IMPORT_PREPPING = 3; // opening dirs on bounds + const static int IMPORT_PREPPED = 4; // opened bounds, waiting for import + const static int IMPORT_LOGGINGSTART = 5; // got import, logging EImportStart + const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish + //const static int IMPORT_LOGGINGFINISH = 7; // logging EImportFinish + const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing + +protected: + map import_state; // FIXME make these dirfrags + map import_peer; + map > import_bound_inos; + map > import_bounds; + map > import_bystanders; + + + /* // -- hashing madness -- multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir // maps frozen_dir_ino's to waiting-for-discover ino's. 
multimap import_hashed_frozen_waiting; // dirs i froze (for the above) - + */ public: @@ -119,27 +113,53 @@ public: return 0; } bool is_exporting() { return !export_state.empty(); } - int is_importing(inodeno_t dirino) { - if (import_state.count(dirino)) return import_state[dirino]; + int is_importing(dirfrag_t df) { + if (import_state.count(df)) return import_state[df]; return 0; } bool is_importing() { return !import_state.empty(); } - const set& get_import_bounds(inodeno_t base) { + const list& get_import_bound_inos(dirfrag_t base) { + assert(import_bound_inos.count(base)); + return import_bound_inos[base]; + } + const set& get_import_bounds(CDir *base) { assert(import_bounds.count(base)); return import_bounds[base]; } + int get_import_state(dirfrag_t df) { + assert(import_state.count(df)); + return import_state[df]; + } + int get_import_peer(dirfrag_t df) { + assert(import_peer.count(df)); + return import_peer[df]; + } + + int get_export_state(CDir *dir) { + assert(export_state.count(dir)); + return export_state[dir]; + } + // this returns true if we are export @dir, + // and are not waiting for @who to be + // be warned of ambiguous auth. + // only returns meaningful results during EXPORT_WARNING state. 
+ bool export_has_warned(CDir *dir, int who) { + assert(is_exporting(dir)); + assert(export_state[dir] == EXPORT_WARNING); + return (export_warning_ack_waiting[dir].count(who) == 0); + } + // -- misc -- void handle_mds_failure(int who); - void show_imports(); + void audit(); // -- import/export -- // exporter public: - void export_dir(CDir *dir, - int mds); + void export_dir(CDir *dir, int dest); void export_empty_import(CDir *dir); void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth); @@ -151,113 +171,54 @@ public: void clear_export_proxy_pins(CDir *dir); protected: - void handle_export_dir_discover_ack(MExportDirDiscoverAck *m); - void export_dir_frozen(CDir *dir, int dest); - void export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest); - void handle_export_dir_prep_ack(MExportDirPrepAck *m); - void export_dir_go(CDir *dir, - int dest); + void handle_export_discover_ack(MExportDirDiscoverAck *m); + void export_frozen(CDir *dir); + void handle_export_prep_ack(MExportDirPrepAck *m); + void export_go(CDir *dir); int encode_export_dir(list& dirstatelist, class C_Contexts *fin, CDir *basedir, CDir *dir, int newauth); - void handle_export_dir_notify_ack(MExportDirNotifyAck *m); - void reverse_export(CDir *dir); - void export_dir_acked(CDir *dir); - void export_dir_finish(CDir *dir); + void export_reverse(CDir *dir); + void handle_export_ack(MExportDirAck *m); + void export_logged_finish(CDir *dir); + void handle_export_notify_ack(MExportDirNotifyAck *m); + void export_finish(CDir *dir); friend class C_MDC_ExportFreeze; - friend class C_MDC_ExportStartLogged; friend class C_MDS_ExportFinishLogged; + + // importer - void handle_export_dir_discover(MExportDirDiscover *m); - void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r); - void handle_export_dir_prep(MExportDirPrep *m); + void handle_export_discover(MExportDirDiscover *m); + void handle_export_cancel(MExportDirCancel *m); + void import_discovered(CInode *in, 
dirfrag_t df); + void handle_export_prep(MExportDirPrep *m); void handle_export_dir(MExportDir *m); - void import_dir_logged_start(CDir *dir, int from, - list &imported_subdirs, - list &exports); - void import_dir_logged_finish(CDir *dir); - void handle_export_dir_finish(MExportDirFinish *m); int decode_import_dir(bufferlist& bl, int oldauth, CDir *import_root, - list& imported_subdirs, EImportStart *le); - void got_hashed_replica(CDir *import, - inodeno_t dir_ino, - inodeno_t replica_ino); - friend class C_MDC_ExportDirDiscover; + +public: + void import_reverse(CDir *dir, bool fix_dir_auth=true); +protected: + void import_reverse_unfreeze(CDir *dir); + void import_reverse_unpin(CDir *dir); + void import_notify_abort(CDir *dir); + void import_logged_start(CDir *dir, int from); + void handle_export_finish(MExportDirFinish *m); +public: + void import_finish(CDir *dir, bool now=false); +protected: + friend class C_MDS_ImportDirLoggedStart; friend class C_MDS_ImportDirLoggedFinish; // bystander - void handle_export_dir_warning(MExportDirWarning *m); - void handle_export_dir_notify(MExportDirNotify *m); - - - // -- hashed directories -- - - // HASH - public: - void hash_dir(CDir *dir); // on auth - protected: - map< CDir*, set > hash_gather; - map< CDir*, map< int, set > > hash_notify_gather; - map< CDir*, list > hash_proxy_inos; - - // hash on auth - void handle_hash_dir_discover_ack(MHashDirDiscoverAck *m); - void hash_dir_complete(CDir *dir); - void hash_dir_frozen(CDir *dir); - void handle_hash_dir_prep_ack(MHashDirPrepAck *m); - void hash_dir_go(CDir *dir); - void handle_hash_dir_ack(MHashDirAck *m); - void hash_dir_finish(CDir *dir); - friend class C_MDC_HashFreeze; - friend class C_MDC_HashComplete; - - // auth and non-auth - void handle_hash_dir_notify(MHashDirNotify *m); - - // hash on non-auth - void handle_hash_dir_discover(MHashDirDiscover *m); - void handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r); - void handle_hash_dir_prep(MHashDirPrep 
*m); - void handle_hash_dir(MHashDir *m); - friend class C_MDC_HashDirDiscover; - - // UNHASH - public: - void unhash_dir(CDir *dir); // on auth - protected: - map< CDir*, list > unhash_content; - void import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth); - - // unhash on auth - void unhash_dir_frozen(CDir *dir); - void unhash_dir_prep(CDir *dir); - void handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m); - void unhash_dir_go(CDir *dir); - void handle_unhash_dir_ack(MUnhashDirAck *m); - void handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m); - void unhash_dir_finish(CDir *dir); - friend class C_MDC_UnhashFreeze; - friend class C_MDC_UnhashComplete; - - // unhash on all - void unhash_dir_complete(CDir *dir); - - // unhash on non-auth - void handle_unhash_dir_prep(MUnhashDirPrep *m); - void unhash_dir_prep_frozen(CDir *dir); - void unhash_dir_prep_finish(CDir *dir); - void handle_unhash_dir(MUnhashDir *m); - void handle_unhash_dir_notify(MUnhashDirNotify *m); - friend class C_MDC_UnhashPrepFreeze; - + void handle_export_notify(MExportDirNotify *m); }; diff --git a/trunk/ceph/mds/Renamer.cc b/trunk/ceph/mds/Renamer.cc index cf7d79170f479..eadc26ea89a11 100644 --- a/trunk/ceph/mds/Renamer.cc +++ b/trunk/ceph/mds/Renamer.cc @@ -12,7 +12,6 @@ */ #include "MDCache.h" -#include "MDStore.h" #include "CInode.h" #include "CDir.h" #include "MDS.h" @@ -95,6 +94,13 @@ void Renamer::fix_renamed_dir(CDir *srcdir, dout(7) << "fix_renamed_dir on " << *in << endl; dout(7) << "fix_renamed_dir on " << *in->dir << endl; + + assert(0); // rewrite . + + // 1- fix subtree tree. + // 2- adjust subtree auth. + + /* if (in->dir->is_auth()) { // dir ours dout(7) << "dir is auth" << endl; @@ -102,36 +108,16 @@ void Renamer::fix_renamed_dir(CDir *srcdir, if (in->is_auth()) { // inode now ours - if (authchanged) { // inode _was_ replica, now ours - dout(7) << "inode was replica, now ours. removing from import list." 
<< endl; - assert(in->dir->is_import()); - - // not import anymore! - cache->imports.erase(in->dir); - in->dir->state_clear(CDIR_STATE_IMPORT); - in->dir->put(CDir::PIN_IMPORT); - - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - // move my nested imports to in's containing import - CDir *con = cache->get_auth_container(in->dir); - assert(con); - for (set::iterator p = cache->nested_exports[in->dir].begin(); - p != cache->nested_exports[in->dir].end(); - p++) { - dout(7) << "moving nested export under new container " << *con << endl; - cache->nested_exports[con].insert(*p); - } - cache->nested_exports.erase(in->dir); - + dout(7) << "inode was replica, now ours." << endl; + cache->adjust_subtree_auth(dir, mds->get_nodeid()); } else { // inode was ours, still ours. dout(7) << "inode was ours, still ours." << endl; + assert(!in->dir->is_import()); - assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT); + assert(in->dir->get_dir_auth().first == CDIR_AUTH_PARENT); // move any exports nested beneath me? CDir *newcon = cache->get_auth_container(in->dir); @@ -161,7 +147,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, // i am now an import cache->imports.insert(in->dir); - in->dir->state_set(CDIR_STATE_IMPORT); + in->dir->state_set(CDir::STATE_IMPORT); in->dir->get(CDir::PIN_IMPORT); in->dir->set_dir_auth( mds->get_nodeid() ); @@ -189,7 +175,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, assert(in->dir->is_import()); // verify dir_auth - assert(in->dir->get_dir_auth() == mds->get_nodeid()); // me, because i'm auth for dir. + assert(in->dir->get_dir_auth().first == mds->get_nodeid()); // me, because i'm auth for dir. assert(in->authority() != in->dir->get_dir_auth()); // inode not me. 
} @@ -210,7 +196,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, // now export cache->exports.insert(in->dir); - in->dir->state_set(CDIR_STATE_EXPORT); + in->dir->state_set(CDir::STATE_EXPORT); in->dir->get(CDir::PIN_EXPORT); assert(dir_auth >= 0); // better be defined @@ -227,7 +213,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, // sanity assert(in->dir->is_export()); - assert(in->dir->get_dir_auth() >= 0); + assert(in->dir->get_dir_auth().first >= 0); assert(in->dir->get_dir_auth() != in->authority()); // moved under new import? @@ -251,7 +237,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, // remove from export list cache->exports.erase(in->dir); - in->dir->state_clear(CDIR_STATE_EXPORT); + in->dir->state_clear(CDir::STATE_EXPORT); in->dir->put(CDir::PIN_EXPORT); CDir *oldcon = cache->get_auth_container(srcdir); @@ -264,7 +250,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, in->dir->set_dir_auth( CDIR_AUTH_PARENT ); dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl; } else { - assert(in->dir->get_dir_auth() >= 0); // someone else's export, + assert(in->dir->get_dir_auth().first >= 0); // someone else's export, } } else { @@ -272,7 +258,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, dout(7) << "inode was replica, still replica. do nothing." << endl; // fix dir_auth? 
- if (in->authority() == dir_auth) + if (in->authority().first == dir_auth) in->dir->set_dir_auth( CDIR_AUTH_PARENT ); else in->dir->set_dir_auth( dir_auth ); @@ -284,8 +270,8 @@ void Renamer::fix_renamed_dir(CDir *srcdir, assert(!in->dir->is_export()); } } - - cache->show_imports(); + */ + cache->show_subtrees(); } /* @@ -346,8 +332,8 @@ void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) // determine the players - int srcauth = srcdir->dentry_authority(srcdn->name); - int destauth = destdir->dentry_authority(destname); + int srcauth = srcdir->dentry_authority(srcdn->name).first; + int destauth = destdir->dentry_authority(destname).first; // FOREIGN rename? @@ -372,7 +358,7 @@ void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) srcauth); // tell dest who src is (maybe even me) mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - cache->show_imports(); + cache->show_subtrees(); } @@ -394,7 +380,7 @@ void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) assert(0); // set waiter on the inode (is this the best place?) - in->add_waiter(CINODE_WAIT_RENAMEACK, + in->add_waiter(CInode::WAIT_RENAMEACK, new C_MDC_RenameAck(this, srcdir, in, onfinish)); return; @@ -437,11 +423,11 @@ void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid()); // wait for MRenameNotifyAck's - in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK, + in->add_waiter(CInode::WAIT_RENAMENOTIFYACK, new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid())); // i am initiator // wait for finish - in->add_waiter(CINODE_WAIT_RENAMEACK, + in->add_waiter(CInode::WAIT_RENAMEACK, new C_MDC_RenameAck(this, srcdir, in, onfinish)); } else { // sweet, no notify necessary, we're done! @@ -457,7 +443,7 @@ void Renamer::handle_rename_ack(MRenameAck *m) dout(7) << "handle_rename_ack on " << *in << endl; // all done! 
- in->finish_waiting(CINODE_WAIT_RENAMEACK); + in->finish_waiting(CInode::WAIT_RENAMEACK); delete m; } @@ -467,7 +453,7 @@ void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c) dout(10) << "file_rename_finish on " << *in << endl; // did i empty out an imported dir? FIXME this check should go somewhere else??? - if (srcdir->is_import() && !srcdir->inode->is_root() && srcdir->get_size() == 0) + if (srcdir->is_auth() && !srcdir->inode->is_auth() && srcdir->get_size() == 0) cache->migrator->export_empty_import(srcdir); // finish our caller @@ -516,7 +502,7 @@ void Renamer::file_rename_foreign_src(CDentry *srcdn, assert(in); assert(in->is_auth()); - if (in->is_dir()) cache->show_imports(); + if (in->is_dir()) cache->show_subtrees(); // encode and export inode state bufferlist inode_state; @@ -560,8 +546,8 @@ void Renamer::file_rename_foreign_src(CDentry *srcdn, srcdn->_mark_dirty(); // fixme // proxy! - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); + //in->state_set(CInode::STATE_PROXY); + //in->get(CInode::PIN_PROXY); // generate notify list (everybody but src|dst) and send warnings set notify; @@ -574,7 +560,7 @@ void Renamer::file_rename_foreign_src(CDentry *srcdn, // wait for MRenameNotifyAck's - in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK, + in->add_waiter(CInode::WAIT_RENAMENOTIFYACK, new C_MDC_RenameNotifyAck(this, in, initiator)); } @@ -605,7 +591,7 @@ void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m) if (rename_waiting_for_ack[in->ino()].empty()) { // last one! rename_waiting_for_ack.erase(in->ino()); - in->finish_waiting(CINODE_WAIT_RENAMENOTIFYACK, 0); + in->finish_waiting(CInode::WAIT_RENAMENOTIFYACK, 0); } else { dout(7) << "still waiting for " << rename_waiting_for_ack[in->ino()] << endl; } @@ -617,17 +603,17 @@ void Renamer::file_rename_ack(CInode *in, int initiator) // we got all our MNotifyAck's. 
// was i proxy (if not, it's cuz this was a local rename) - if (in->state_test(CInode::STATE_PROXY)) { + /*if (in->state_test(CInode::STATE_PROXY)) { dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl; in->state_clear(CInode::STATE_PROXY); in->put(CInode::PIN_PROXY); - } + }*/ // done! if (initiator == mds->get_nodeid()) { // it's me, finish dout(7) << "file_rename_ack i am initiator, finishing" << endl; - in->finish_waiting(CINODE_WAIT_RENAMEACK); + in->finish_waiting(CInode::WAIT_RENAMEACK); } else { // send ack dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl; @@ -665,8 +651,8 @@ void Renamer::handle_rename_prep(MRenamePrep *m) if (srcin->is_dir()) { if (!srcin->dir) { dout(7) << "handle_rename_prep need to open dir" << endl; - cache->open_remote_dir(srcin, - new C_MDS_RetryMessage(mds,m)); + cache->open_remote_dir(srcin, frag_t(), // FIXME dirfrag + new C_MDS_RetryMessage(mds,m)); return; } @@ -714,7 +700,7 @@ void Renamer::handle_rename(MRename *m) // note old dir auth int old_dir_auth = -1; - if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority(); + if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority().first; // rename replica into position if (destdn->inode && destdn->inode->is_dirty()) @@ -845,7 +831,7 @@ void Renamer::handle_rename_notify(MRenameNotify *m) CInode *in = srcdn->inode; int old_dir_auth = -1; - if (in && in->dir) old_dir_auth = in->dir->authority(); + if (in && in->dir) old_dir_auth = in->dir->authority().first; if (!destdn) { destdn = destdir->add_dentry(m->get_destname()); // create null dentry @@ -873,8 +859,8 @@ void Renamer::handle_rename_notify(MRenameNotify *m) if (destdiri) { dout(7) << "have destdiri, opening dir " << *destdiri << endl; - cache->open_remote_dir(destdiri, - new C_MDS_RetryMessage(mds,m)); + cache->open_remote_dir(destdiri, frag_t(), // FIXME dirfrag + new C_MDS_RetryMessage(mds,m)); } else { filepath destdirpath = m->get_destdirpath(); 
dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl; diff --git a/trunk/ceph/mds/ScatterLock.h b/trunk/ceph/mds/ScatterLock.h new file mode 100644 index 0000000000000..ec5301774efa3 --- /dev/null +++ b/trunk/ceph/mds/ScatterLock.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __SCATTERLOCK_H +#define __SCATTERLOCK_H + +#include "SimpleLock.h" + + +// lock state machine states. +#define LOCK_SYNC__ // rdlocks allowed (e.g., for stat) +#define LOCK_GSYNCS -20 // waiting for replicas to gather +#define LOCK_SCATTER 21 // mtime updates on replicas allowed, no reads. +#define LOCK_GSCATTERS 22 // waiting for rdlocks to release + +inline const char *get_scatterlock_state_name(int s) { + switch(s) { + case LOCK_SYNC: return "sync"; + case LOCK_GSYNCS: return "gsyncs"; + case LOCK_SCATTER: return "scatter"; + case LOCK_GSCATTERS: return "gscatters"; + default: assert(0); + } +} + +class ScatterLock : public SimpleLock { + int num_wrlock; + +public: + ScatterLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo) {} + + char get_replica_state() { + switch (state) { + case LOCK_SYNC: + case LOCK_GSYNCS: + case LOCK_GSCATTERS: + return LOCK_SYNC; + case LOCK_SCATTER: + return LOCK_SCATTER; + default: + assert(0); + } + } + + void replicate_relax() { + if (state == LOCK_SYNC && !is_rdlocked()) + state = LOCK_SCATTER; + } + + // rdlock + bool can_rdlock(MDRequest *mdr) { + return state == LOCK_SYNC; + } + bool can_rdlock_soon() { + return state == LOCK_SYNC || state == LOCK_GSYNCS; + } + + // wrlock + bool can_wrlock() { + return state == LOCK_SCATTER; + } + void 
get_wrlock() { + assert(state == LOCK_SCATTER); + ++num_wrlock; + } + void put_wrlock() { + --num_wrlock; + } + bool is_wrlocked() { return num_wrlock > 0; } + int get_num_wrlocks() { return num_wrlock; } + + void print(ostream& out) { + out << "("; + //out << get_lock_type_name(l.get_type()) << " "; + out << get_scatterlock_state_name(get_state()); + if (!get_gather_set().empty()) out << " g=" << get_gather_set(); + if (is_rdlocked()) + out << " r=" << get_num_rdlocks(); + //if (l.is_xlocked()) + //out << " x=" << l.get_xlocked_by(); + if (is_wrlocked()) + out << " wr=" << get_num_wrlocks(); + out << ")"; + } + +}; + +#endif diff --git a/trunk/ceph/mds/Server.cc b/trunk/ceph/mds/Server.cc index 736913f301cb1..72353f73d0436 100644 --- a/trunk/ceph/mds/Server.cc +++ b/trunk/ceph/mds/Server.cc @@ -18,24 +18,24 @@ #include "MDLog.h" #include "Migrator.h" #include "MDBalancer.h" -#include "Renamer.h" -#include "MDStore.h" +#include "AnchorClient.h" #include "msg/Messenger.h" -#include "messages/MClientMount.h" -#include "messages/MClientMountAck.h" +#include "messages/MClientSession.h" #include "messages/MClientRequest.h" #include "messages/MClientReply.h" -#include "messages/MHashReaddir.h" -#include "messages/MHashReaddirReply.h" +#include "messages/MClientReconnect.h" #include "messages/MLock.h" +#include "messages/MDentryUnlink.h" #include "messages/MInodeLink.h" #include "events/EString.h" #include "events/EUpdate.h" +#include "events/ESession.h" +#include "events/EOpen.h" #include "include/filepath.h" #include "common/Timer.h" @@ -57,6 +57,12 @@ using namespace std; void Server::dispatch(Message *m) { + switch (m->get_type()) { + case MSG_CLIENT_RECONNECT: + handle_client_reconnect((MClientReconnect*)m); + return; + } + // active? 
if (!mds->is_active()) { dout(3) << "not active yet, waiting" << endl; @@ -65,117 +71,243 @@ void Server::dispatch(Message *m) } switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - handle_client_mount((MClientMount*)m); + case MSG_CLIENT_SESSION: + handle_client_session((MClientSession*)m); return; - case MSG_CLIENT_UNMOUNT: - handle_client_unmount(m); - return; - } - - - switch (m->get_type()) { case MSG_CLIENT_REQUEST: handle_client_request((MClientRequest*)m); return; - - case MSG_MDS_HASHREADDIR: - handle_hash_readdir((MHashReaddir*)m); - return; - case MSG_MDS_HASHREADDIRREPLY: - handle_hash_readdir_reply((MHashReaddirReply*)m); - return; - } - dout(1) << " main unknown message " << m->get_type() << endl; + dout(1) << "server unknown message " << m->get_type() << endl; assert(0); } +// ---------------------------------------------------------- +// SESSION management + + +class C_MDS_session_finish : public Context { + MDS *mds; + entity_inst_t client_inst; + bool open; + version_t cmapv; +public: + C_MDS_session_finish(MDS *m, entity_inst_t ci, bool s, version_t mv) : + mds(m), client_inst(ci), open(s), cmapv(mv) { } + void finish(int r) { + assert(r == 0); + mds->server->_session_logged(client_inst, open, cmapv); + } +}; -void Server::handle_client_mount(MClientMount *m) +void Server::handle_client_session(MClientSession *m) { - int n = m->get_source().num(); - dout(3) << "mount by client" << n << endl; - mds->clientmap.add_mount(n, m->get_source_inst()); + dout(3) << "handle_client_session " << *m << " from " << m->get_source() << endl; + int from = m->get_source().num(); + bool open = m->op == MClientSession::OP_OPEN; - assert(mds->get_nodeid() == 0); // mds0 mounts/unmounts + if (open) { + if (mds->clientmap.is_opening(from)) { + dout(10) << "already opening, dropping this req" << endl; + delete m; + return; + } + mds->clientmap.add_opening(from); + } else { + if (mds->clientmap.is_closing(from)) { + dout(10) << "already closing, dropping this req" << 
endl; + delete m; + return; + } + mds->clientmap.add_closing(from); + } - // ack - messenger->send_message(new MClientMountAck(m, mds->mdsmap, mds->osdmap), - m->get_source_inst()); + // journal it + version_t cmapv = mds->clientmap.inc_projected(); + mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv), + new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv)); delete m; } -void Server::handle_client_unmount(Message *m) +void Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv) +{ + dout(10) << "_session_logged " << client_inst << " " << (open ? "open":"close") + << " " << cmapv + << endl; + + // apply + int from = client_inst.name.num(); + if (open) { + assert(mds->clientmap.is_opening(from)); + mds->clientmap.open_session(client_inst); + } else { + assert(mds->clientmap.is_closing(from)); + mds->clientmap.close_session(from); + + // purge completed requests from clientmap + mds->clientmap.trim_completed_requests(from, 0); + } + + assert(cmapv == mds->clientmap.get_version()); + + // reply + if (open) + mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN_ACK), client_inst); + else + mds->messenger->send_message(new MClientSession(MClientSession::OP_CLOSE_ACK), client_inst); +} + + +void Server::terminate_sessions() { - int n = m->get_source().num(); - dout(3) << "unmount by client" << n << endl; + dout(2) << "terminate_sessions" << endl; - assert(mds->get_nodeid() == 0); // mds0 mounts/unmounts + // kill them off. clients will retry etc. 
+ for (set::const_iterator p = mds->clientmap.get_session_set().begin(); + p != mds->clientmap.get_session_set().end(); + ++p) { + if (mds->clientmap.is_closing(*p)) + continue; + mds->clientmap.add_closing(*p); + version_t cmapv = mds->clientmap.inc_projected(); + mdlog->submit_entry(new ESession(mds->clientmap.get_inst(*p), false, cmapv), + new C_MDS_session_finish(mds, mds->clientmap.get_inst(*p), false, cmapv)); + } +} - mds->clientmap.rem_mount(n); - if (g_conf.mds_shutdown_on_last_unmount && - mds->clientmap.get_mount_set().empty()) { - dout(3) << "all clients done, initiating shutdown" << endl; - mds->shutdown_start(); +void Server::reconnect_clients() +{ + // reconnect with clients + if (mds->clientmap.get_session_set().empty()) { + dout(7) << "reconnect_clients -- no sessions, doing nothing." << endl; + reconnect_finish(); + return; } + + dout(7) << "reconnect_clients -- sending mdsmap to clients with sessions" << endl; + mds->set_want_state(MDSMap::STATE_RECONNECT); // just fyi. 
+ + // send mdsmap to all mounted clients + mds->bcast_mds_map(); - // ack by sending back to client - messenger->send_message(m, m->get_source_inst()); + // init gather list + reconnect_start = g_clock.now(); + client_reconnect_gather = mds->clientmap.get_session_set(); } +void Server::handle_client_reconnect(MClientReconnect *m) +{ + dout(7) << "handle_client_reconnect " << m->get_source() << endl; + int from = m->get_source().num(); + if (m->closed) { + dout(7) << " client had no session, removing from clientmap" << endl; -/******* - * some generic stuff for finishing off requests - */ - -/** C_MDS_CommitRequest - */ + mds->clientmap.add_closing(from); + version_t cmapv = mds->clientmap.inc_projected(); + mdlog->submit_entry(new ESession(mds->clientmap.get_inst(from), false, cmapv), + new C_MDS_session_finish(mds, mds->clientmap.get_inst(from), false, cmapv)); -class C_MDS_CommitRequest : public Context { - Server *server; - MClientRequest *req; - MClientReply *reply; - CInode *tracei; // inode to include a trace for - LogEvent *event; + } else { -public: - C_MDS_CommitRequest(Server *server, - MClientRequest *req, MClientReply *reply, CInode *tracei, - LogEvent *event=0) { - this->server = server; - this->req = req; - this->tracei = tracei; - this->reply = reply; - this->event = event; - } - void finish(int r) { - if (r != 0) { - // failure. set failure code and reply. 
- reply->set_result(r); + // caps + for (map::iterator p = m->inode_caps.begin(); + p != m->inode_caps.end(); + ++p) { + CInode *in = mdcache->get_inode(p->first); + if (!in) { + dout(0) << "missing " << p->first << ", fetching via " << m->inode_path[p->first] << endl; + assert(0); + continue; + } + + dout(10) << " client cap " << cap_string(p->second.wanted) + << " seq " << p->second.seq + << " on " << *in << endl; + Capability cap(p->second.wanted, p->second.seq); + in->add_client_cap(from, cap); + in->inode.size = MAX(in->inode.size, p->second.size); + in->inode.mtime = MAX(in->inode.mtime, p->second.mtime); + in->inode.atime = MAX(in->inode.atime, p->second.atime); + + reconnected_open_files.insert(in); } - if (event) { - server->commit_request(req, reply, tracei, event); + } + + // remove from gather set + client_reconnect_gather.erase(from); + if (client_reconnect_gather.empty()) reconnect_finish(); + + delete m; +} + +void Server::client_reconnect_failure(int from) +{ + dout(5) << "client_reconnect_failure on client" << from << endl; + client_reconnect_gather.erase(from); + if (client_reconnect_gather.empty()) + reconnect_finish(); +} + +void Server::reconnect_finish() +{ + dout(7) << "reconnect_finish" << endl; + + // adjust filelock state appropriately + for (set::iterator p = reconnected_open_files.begin(); + p != reconnected_open_files.end(); + ++p) { + CInode *in = *p; + int issued = in->get_caps_issued(); + if (in->is_auth()) { + // wr? + if (issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) { + if (issued & (CAP_FILE_RDCACHE|CAP_FILE_WRBUFFER)) { + in->filelock.set_state(LOCK_LONER); + } else { + in->filelock.set_state(LOCK_MIXED); + } + } } else { - // reply. - server->reply_request(req, reply, tracei); + // note that client should perform stale/reap cleanup during reconnect. + assert(issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0); // ???? 
+ if (in->filelock.is_xlocked()) + in->filelock.set_state(LOCK_LOCK); + else + in->filelock.set_state(LOCK_SYNC); // might have been lock, previously } + dout(10) << " issued " << cap_string(issued) + << " chose " << in->filelock + << " on " << *in << endl; } -}; + reconnected_open_files.clear(); // clean up + + // done + if (mds->mdsmap->get_num_in_mds() == 1) + mds->set_want_state(MDSMap::STATE_ACTIVE); // go active + else + mds->set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state +} + + + +/******* + * some generic stuff for finishing off requests + */ /* * send generic response (just and error code) */ -void Server::reply_request(MClientRequest *req, int r, CInode *tracei) +void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) { - reply_request(req, new MClientReply(req, r), tracei); + MClientRequest *req = mdr->client_request(); + reply_request(mdr, new MClientReply(req, r), tracei); } @@ -183,8 +315,17 @@ void Server::reply_request(MClientRequest *req, int r, CInode *tracei) * send given reply * include a trace to tracei */ -void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei) { - dout(10) << "reply_request r=" << reply->get_result() << " " << *req << endl; +void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei) +{ + MClientRequest *req = mdr->client_request(); + + dout(10) << "reply_request " << reply->get_result() + << " (" << strerror(-reply->get_result()) + << ") " << *req << endl; + + // note result code in clientmap? 
+ if (!req->is_idempotent()) + mds->clientmap.add_completed_request(mdr->reqid); // include trace if (tracei) { @@ -192,80 +333,23 @@ void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tra } // send reply - messenger->send_message(reply, - req->get_client_inst()); - - // discard request - mdcache->request_finish(req); - - // stupid stats crap (FIXME) - stat_ops++; -} - - -void Server::submit_update(MClientRequest *req, - CInode *wrlockedi, - LogEvent *event, - Context *oncommit) -{ - // log - mdlog->submit_entry(event); - - // pin - mdcache->request_pin_inode(req, wrlockedi); - - // wait - mdlog->wait_for_sync(oncommit); + messenger->send_message(reply, req->get_client_inst()); + + // finish request + mdcache->request_finish(mdr); } -/* - * commit event(s) to the metadata journal, then reply. - * or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply) - * - * NOTE: this is old and bad (write-behind!) - */ -void Server::commit_request(MClientRequest *req, - MClientReply *reply, - CInode *tracei, - LogEvent *event, - LogEvent *event2) -{ - // log - if (event) mdlog->submit_entry(event); - if (event2) mdlog->submit_entry(event2); - - if (g_conf.mds_log_before_reply && g_conf.mds_log && event) { - // SAFE mode! - - // pin inode so it doesn't go away! - if (tracei) mdcache->request_pin_inode(req, tracei); - - // wait for log sync - mdlog->wait_for_sync(new C_MDS_CommitRequest(this, req, reply, tracei)); - return; - } - else { - // just reply - reply_request(req, reply, tracei); - } -} /*** * process a client request */ - void Server::handle_client_request(MClientRequest *req) { - dout(4) << "req " << *req << endl; - - // note original client addr - if (req->get_source().is_client()) { - req->set_client_inst( req->get_source_inst() ); - req->clear_payload(); - } + dout(4) << "handle_client_request " << *req << endl; + int client = req->get_client(); if (!mds->is_active()) { dout(5) << " not active, discarding client request." 
<< endl; @@ -279,347 +363,632 @@ void Server::handle_client_request(MClientRequest *req) return; } + // active session? + if (!mds->clientmap.have_session(client)) { + dout(1) << "no session for client" << client << ", dropping" << endl; + delete req; + return; + } + + // okay, i want CInode *ref = 0; - vector trace; // might be blank, for fh guys - bool follow_trailing_symlink = false; + // retry? + if (req->get_retry_attempt()) { + if (mds->clientmap.have_completed_request(req->get_reqid())) { + dout(5) << "already completed " << req->get_reqid() << endl; + mds->messenger->send_message(new MClientReply(req, 0), + req->get_client_inst()); + delete req; + return; + } + } + // trim completed_request list + if (req->get_oldest_client_tid() > 0) + mds->clientmap.trim_completed_requests(client, + req->get_oldest_client_tid()); + - // operations on fh's or other non-files + // ----- + // some ops are on ino's switch (req->get_op()) { - /* case MDS_OP_FSTAT: - reply = handle_client_fstat(req, cur); - break; ****** fiX ME *** - */ + ref = mdcache->get_inode(req->args.fstat.ino); + assert(ref); + break; case MDS_OP_TRUNCATE: - if (!req->get_ino()) break; // can be called w/ either fh OR path + if (!req->args.truncate.ino) + break; // can be called w/ either fh OR path + ref = mdcache->get_inode(req->args.truncate.ino); + assert(ref); + break; - case MDS_OP_RELEASE: case MDS_OP_FSYNC: - ref = mdcache->get_inode(req->get_ino()); // fixme someday no ino needed? - - if (!ref) { - int next = mds->get_nodeid() + 1; - if (next >= mds->mdsmap->get_num_mds()) next = 0; - dout(10) << "got request on ino we don't have, passing buck to " << next << endl; - mds->send_message_mds(req, next, MDS_PORT_SERVER); - return; - } + ref = mdcache->get_inode(req->args.fsync.ino); // fixme someday no ino needed? 
+ assert(ref); + break; } - if (!ref) { - // we need to traverse a path - filepath refpath = req->get_filepath(); - - // ops on non-existing files --> directory paths - switch (req->get_op()) { - case MDS_OP_OPEN: - if (!(req->get_iarg() & O_CREAT)) break; - - case MDS_OP_MKNOD: - case MDS_OP_MKDIR: - case MDS_OP_SYMLINK: - case MDS_OP_LINK: - case MDS_OP_UNLINK: // also wrt parent dir, NOT the unlinked inode!! - case MDS_OP_RMDIR: - case MDS_OP_RENAME: - // remove last bit of path - refpath = refpath.prefixpath(refpath.depth()-1); - break; - } - dout(10) << "refpath = " << refpath << endl; - - Context *ondelay = new C_MDS_RetryMessage(mds, req); - - if (req->get_op() == MDS_OP_LSTAT) { - follow_trailing_symlink = false; - } - - // do trace - int r = mdcache->path_traverse(refpath, trace, follow_trailing_symlink, - req, ondelay, - MDS_TRAVERSE_FORWARD, - 0, - true); // is MClientRequest - - if (r > 0) return; // delayed - if (r == -ENOENT || - r == -ENOTDIR || - r == -EISDIR) { - // error! - dout(10) << " path traverse error " << r << ", replying" << endl; - - // send error - messenger->send_message(new MClientReply(req, r), - req->get_client_inst()); - - // - // is this a special debug command? 
- if (refpath.depth() - 1 == trace.size() && - refpath.last_bit().find(".ceph.") == 0) { - CDir *dir = 0; - if (trace.empty()) - dir = mdcache->get_root()->dir; - else - dir = trace[trace.size()-1]->get_inode()->dir; - - dout(1) << "** POSSIBLE CEPH DEBUG COMMAND '" << refpath.last_bit() << "' in " << *dir << endl; - - if (refpath.last_bit() == ".ceph.hash" && - refpath.depth() > 1) { - dout(1) << "got explicit hash command " << refpath << endl; - CDir *dir = trace[trace.size()-1]->get_inode()->dir; - if (!dir->is_hashed() && - !dir->is_hashing() && - dir->is_auth()) - mdcache->migrator->hash_dir(dir); - } - else if (refpath.last_bit() == ".ceph.commit") { - dout(1) << "got explicit commit command on " << *dir << endl; - mds->mdstore->commit_dir(dir, 0); - } - } - // - + // register + dispatch + MDRequest *mdr = mdcache->request_start(req); - delete req; - return; - } - - if (trace.size()) - ref = trace[trace.size()-1]->inode; - else - ref = mdcache->get_root(); + if (ref) { + dout(10) << "inode op on ref " << *ref << endl; + mdr->ref = ref; + mdr->pin(ref); } - - dout(10) << "ref is " << *ref << endl; - - // rename doesn't pin src path (initially) - if (req->get_op() == MDS_OP_RENAME) trace.clear(); - // register - if (!mdcache->request_start(req, ref, trace)) - return; - - // process - dispatch_request(req, ref); + dispatch_request(mdr); + return; } - -void Server::dispatch_request(Message *m, CInode *ref) +void Server::dispatch_request(MDRequest *mdr) { - MClientRequest *req = 0; - - // MLock or MClientRequest? - /* this is a little weird. - client requests and mlocks both initial dentry xlocks, path pins, etc., - and thus both make use of the context C_MDS_RetryRequest. - */ - switch (m->get_type()) { - case MSG_CLIENT_REQUEST: - req = (MClientRequest*)m; - break; // continue below! 
- - case MSG_MDS_LOCK: - mds->locker->handle_lock_dn((MLock*)m); - return; // done + MClientRequest *req = mdr->client_request(); - default: - assert(0); // shouldn't get here + if (mdr->ref) { + dout(7) << "dispatch_request " << *req << " ref " << *mdr->ref << endl; + } else { + dout(7) << "dispatch_request " << *req << endl; } - // MClientRequest. - - switch(req->get_op()) { - - // files - case MDS_OP_OPEN: - if (req->get_iarg() & O_CREAT) - handle_client_openc(req, ref); - else - handle_client_open(req, ref); - break; - case MDS_OP_TRUNCATE: - handle_client_truncate(req, ref); - break; - /* - case MDS_OP_FSYNC: - handle_client_fsync(req, ref); - break; - */ - /* - case MDS_OP_RELEASE: - handle_client_release(req, ref); - break; - */ + switch (req->get_op()) { - // inodes + // inodes ops. case MDS_OP_STAT: case MDS_OP_LSTAT: - handle_client_stat(req, ref); + handle_client_stat(mdr); break; case MDS_OP_UTIME: - handle_client_utime(req, ref); + handle_client_utime(mdr); break; case MDS_OP_CHMOD: - handle_client_chmod(req, ref); + handle_client_chmod(mdr); break; case MDS_OP_CHOWN: - handle_client_chown(req, ref); + handle_client_chown(mdr); + break; + case MDS_OP_TRUNCATE: + handle_client_truncate(mdr); break; - - // namespace case MDS_OP_READDIR: - handle_client_readdir(req, ref); + handle_client_readdir(mdr); break; + case MDS_OP_FSYNC: + //handle_client_fsync(req, ref); + break; + + // funky. + case MDS_OP_OPEN: + if ((req->args.open.flags & O_CREAT) && + !mdr->ref) + handle_client_openc(mdr); + else + handle_client_open(mdr); + break; + + // namespace. + // no prior locks. 
case MDS_OP_MKNOD: - handle_client_mknod(req, ref); + handle_client_mknod(mdr); break; case MDS_OP_LINK: - handle_client_link(req, ref); + handle_client_link(mdr); break; case MDS_OP_UNLINK: - handle_client_unlink(req, ref); + case MDS_OP_RMDIR: + handle_client_unlink(mdr); break; case MDS_OP_RENAME: - handle_client_rename(req, ref); - break; - case MDS_OP_RMDIR: - handle_client_unlink(req, ref); + handle_client_rename(mdr); break; case MDS_OP_MKDIR: - handle_client_mkdir(req, ref); + handle_client_mkdir(mdr); break; case MDS_OP_SYMLINK: - handle_client_symlink(req, ref); + handle_client_symlink(mdr); break; - default: dout(1) << " unknown client op " << req->get_op() << endl; assert(0); } - - return; } -// FIXME: this probably should go somewhere else. -bool Server::try_open_dir(CInode *in, MClientRequest *req) +// --------------------------------------- +// HELPERS + + +/** validate_dentry_dir + * + * verify that the dir exists and would own the dname. + * do not check if the dentry exists. + */ +CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname) { - if (!in->dir && in->is_frozen_dir()) { - // doh! - dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl; - assert(in->get_parent_dir()); - in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, in)); + // make sure parent is a dir? + if (!diri->is_dir()) { + dout(7) << "validate_dentry_dir: not a dir" << endl; + reply_request(mdr, -ENOTDIR); return false; } - in->get_or_open_dir(mds->mdcache); - return true; -} + // which dirfrag? + frag_t fg = diri->pick_dirfrag(dname); + CDir *dir = try_open_auth_dir(diri, fg, mdr); + if (!dir) + return 0; + // frozen? + if (dir->is_frozen()) { + dout(7) << "dir is frozen " << *dir << endl; + dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + return dir; +} +/** prepare_null_dentry + * prepare a null (or existing) dentry in given dir. 
+ * wait for any dn lock. + */ +CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist) +{ + dout(10) << "prepare_null_dentry " << dname << " in " << *dir << endl; + assert(dir->is_auth()); + + // does it already exist? + CDentry *dn = dir->lookup(dname); + if (dn) { + if (!dn->lock.can_rdlock(mdr)) { + dout(10) << "waiting on (existing!) unreadable dentry " << *dn << endl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } -// =============================================================================== -// STAT + if (!dn->is_null()) { + // name already exists + dout(10) << "dentry " << dname << " exists in " << *dir << endl; + if (!okexist) { + reply_request(mdr, -EEXIST); + return 0; + } + } -void Server::handle_client_stat(MClientRequest *req, - CInode *ref) -{ - // FIXME: this is really not the way to handle the statlite mask. - - // do I need file info? - int mask = req->get_iarg(); - if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) { - // yes. do a full stat. - if (!mds->locker->inode_file_read_start(ref, req)) - return; // syncing - mds->locker->inode_file_read_finish(ref); - } else { - // nope! easy peasy. + return dn; + } + + // make sure dir is complete + if (!dir->is_complete()) { + dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); + return 0; } - mds->balancer->hit_inode(ref, META_POP_IRD); - - // reply - //dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl; - MClientReply *reply = new MClientReply(req); - reply_request(req, reply, ref); + // create + dn = dir->add_dentry(dname, 0); + dout(10) << "prepare_null_dentry added " << *dn << endl; + + return dn; } +/** prepare_new_inode + * + * create a new inode. set c/m/atime. hit dir pop. 
+ */ +CInode* Server::prepare_new_inode(MClientRequest *req, CDir *dir) +{ + CInode *in = mdcache->create_inode(); + in->inode.uid = req->get_caller_uid(); + in->inode.gid = req->get_caller_gid(); + in->inode.ctime = in->inode.mtime = in->inode.atime = g_clock.real_now(); // now + dout(10) << "prepare_new_inode " << *in << endl; + // bump modify pop + mds->balancer->hit_dir(dir, META_POP_DWR); -// =============================================================================== -// INODE UPDATES + return in; +} -/* - * finisher: do a inode_file_write_finish and reply. - */ -class C_MDS_utime_finish : public Context { - MDS *mds; - MClientRequest *req; - CInode *in; - version_t pv; - time_t mtime, atime; -public: - C_MDS_utime_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, time_t mt, time_t at) : - mds(m), req(r), in(i), - pv(pdv), - mtime(mt), atime(at) { } - void finish(int r) { - assert(r == 0); - // apply - in->inode.mtime = mtime; - in->inode.atime = atime; - in->mark_dirty(pv); +CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath) +{ + // figure parent dir vs dname + if (refpath.depth() == 0) { + dout(7) << "can't do that to root" << endl; + reply_request(mdr, -EINVAL); + return 0; + } + string dname = refpath.last_dentry(); + refpath.pop_dentry(); + + dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << endl; + + // traverse to parent dir + Context *ondelay = new C_MDS_RetryRequest(mdcache, mdr); + int r = mdcache->path_traverse(mdr, + 0, + refpath, trace, true, + mdr->request, ondelay, + MDS_TRAVERSE_FORWARD, + true); // is MClientRequest + if (r > 0) return 0; // delayed + if (r < 0) { + reply_request(mdr, r); + return 0; + } + + // open inode + CInode *diri; + if (trace.empty()) + diri = mdcache->get_root(); + else + diri = mdcache->get_dentry_inode(trace[trace.size()-1], mdr); + if (!diri) + return 0; // opening inode. + + // is it an auth dir? 
+ CDir *dir = validate_dentry_dir(mdr, diri, dname); + if (!dir) + return 0; // forwarded or waiting for freeze + + dout(10) << "traverse_to_auth_dir " << *dir << endl; + return dir; +} + + + +CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) +{ + // already got ref? + if (mdr->ref) + return mdr->ref; + + MClientRequest *req = mdr->client_request(); + + // traverse + filepath refpath = req->get_filepath(); + Context *ondelay = new C_MDS_RetryRequest(mdcache, mdr); + vector trace; + int r = mdcache->path_traverse(mdr, 0, + refpath, trace, req->follow_trailing_symlink(), + req, ondelay, + MDS_TRAVERSE_FORWARD, + true); // is MClientRequest + if (r > 0) return false; // delayed + if (r < 0) { // error + reply_request(mdr, r); + return 0; + } + + // open ref inode + CInode *ref = 0; + if (trace.empty()) + ref = mdcache->get_root(); + else { + CDentry *dn = trace[trace.size()-1]; + + // if no inode, fw to dentry auth? + if (want_auth && + dn->is_remote() && + !dn->inode && + !dn->is_auth()) { + if (dn->is_ambiguous_auth()) { + dout(10) << "waiting for single auth on " << *dn << endl; + dn->dir->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryMessage(mds, req)); + } else { + dout(10) << "fw to auth for " << *dn << endl; + mds->forward_message_mds(req, dn->authority().first, MDS_PORT_SERVER); + } + } + + // open ref inode + ref = mdcache->get_dentry_inode(dn, mdr); + if (!ref) return 0; + } + dout(10) << "ref is " << *ref << endl; + + // fw to inode auth? + if (want_auth && !ref->is_auth()) { + if (ref->is_ambiguous_auth()) { + dout(10) << "waiting for single auth on " << *ref << endl; + ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryMessage(mds, req)); + } else { + dout(10) << "fw to auth for " << *ref << endl; + mds->forward_message_mds(req, ref->authority().first, MDS_PORT_SERVER); + } + } + + // auth_pin? 
+ if (want_auth) { + if (!ref->can_auth_pin()) { + dout(7) << "waiting for authpinnable on " << *ref << endl; + ref->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + mdr->auth_pin(ref); + } + + // lock the path + set rdlocks, empty; + + for (unsigned i=0; ilock); + + if (!mds->locker->acquire_locks(mdr, rdlocks, empty, empty)) + return 0; + + // set and pin ref + mdr->pin(ref); + mdr->ref = ref; + + // save the locked trace. + mdr->trace.swap(trace); + + return ref; +} + + +/** rdlock_path_xlock_dentry + * traverse path to the directory that could/would contain dentry. + * make sure i am auth for that dentry, forward as necessary. + * create null dentry in place (or use existing if okexist). + * get rdlocks on traversed dentries, xlock on new dentry. + */ +CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist) +{ + MClientRequest *req = mdr->client_request(); + + vector trace; + CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath()); + if (!dir) return 0; + dout(10) << "rdlock_path_xlock_dentry dir " << *dir << endl; + + // make sure we can auth_pin dir + if (!dir->can_auth_pin()) { + dout(7) << "waiting for authpinnable on " << *dir << endl; + dir->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // make a null dentry? + const string &dname = req->get_filepath().last_dentry(); + CDentry *dn; + if (mustexist) { + dn = dir->lookup(dname); + + // make sure dir is complete + if (!dn && !dir->is_complete()) { + dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // readable? + if (dn && !dn->lock.can_rdlock(mdr)) { + dout(10) << "waiting on (existing!) unreadable dentry " << *dn << endl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // exists? 
+ if (!dn || dn->is_null()) { + dout(7) << "dentry " << dname << " dne in " << *dir << endl; + reply_request(mdr, -ENOENT); + return 0; + } + } else { + dn = prepare_null_dentry(mdr, dir, dname, okexist); + if (!dn) + return 0; + } + + // -- lock -- + set rdlocks, wrlocks, xlocks; + + for (unsigned i=0; ilock); + if (dn->is_null()) { + xlocks.insert(&dn->lock); // new dn, xlock + wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime + } else + rdlocks.insert(&dn->lock); // existing dn, rdlock + + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return 0; + + // save the locked trace. + mdr->trace.swap(trace); + + return dn; +} + + + + + +CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr) +{ + CDir *dir = diri->get_dirfrag(fg); + + // not open and inode not mine? + if (!dir && !diri->is_auth()) { + int inauth = diri->authority().first; + dout(7) << "try_open_auth_dir: not open, not inode auth, fw to mds" << inauth << endl; + mdcache->request_forward(mdr, inauth); + return 0; + } + + // not open and inode frozen? + if (!dir && diri->is_frozen_dir()) { + dout(10) << "try_open_auth_dir: dir inode is frozen, waiting " << *diri << endl; + assert(diri->get_parent_dir()); + diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // invent? + if (!dir) { + assert(diri->is_auth()); + dir = diri->get_or_open_dirfrag(mds->mdcache, fg); + } + assert(dir); + + // am i auth for the dirfrag? + if (!dir->is_auth()) { + int auth = dir->authority().first; + dout(7) << "try_open_auth_dir: not auth for " << *dir + << ", fw to mds" << auth << endl; + mdcache->request_forward(mdr, auth); + return 0; + } + + return dir; +} + +/* +CDir* Server::try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr) +{ + CDir *dir = diri->get_dirfrag(fg); + if (dir) + return dir; + + if (diri->is_auth()) { + // auth + // not open and inode frozen? 
+ if (!dir && diri->is_frozen_dir()) { + dout(10) << "try_open_dir: dir inode is auth+frozen, waiting " << *diri << endl; + assert(diri->get_parent_dir()); + diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // invent? + if (!dir) { + assert(diri->is_auth()); + dir = diri->get_or_open_dirfrag(mds->mdcache, fg); + } + assert(dir); + return dir; + } else { + // not auth + mdcache->open_remote_dir(diri, fg, + new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } +} +*/ + +// =============================================================================== +// STAT + +void Server::handle_client_stat(MDRequest *mdr) +{ + MClientRequest *req = mdr->client_request(); + CInode *ref = rdlock_path_pin_ref(mdr, false); + if (!ref) return; + + // which inode locks do I want? + /* note: this works because we include existing locks in our lists, + and because all new locks are on inodes and sort to the right of + the dentry rdlocks previous acquired by rdlock_path_pin_ref(). */ + set rdlocks = mdr->rdlocks; + set wrlocks = mdr->wrlocks; + set xlocks = mdr->xlocks; + + int mask = req->args.stat.mask; + if (mask & INODE_MASK_LINK) rdlocks.insert(&ref->linklock); + if (mask & INODE_MASK_AUTH) rdlocks.insert(&ref->authlock); + if (ref->is_file() && + mask & INODE_MASK_FILE) rdlocks.insert(&ref->filelock); + if (ref->is_dir() && + mask & INODE_MASK_MTIME) rdlocks.insert(&ref->dirlock); + + mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks); + + // reply + dout(10) << "reply to stat on " << *req << endl; + MClientReply *reply = new MClientReply(req); + reply_request(mdr, reply, ref); +} + + + + +// =============================================================================== +// INODE UPDATES + + +/* + * finisher: do a inode_file_write_finish and reply. 
+ */ +class C_MDS_utime_finish : public Context { + MDS *mds; + MDRequest *mdr; + CInode *in; + version_t pv; + utime_t mtime, atime; +public: + C_MDS_utime_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t mt, utime_t at) : + mds(m), mdr(r), in(i), + pv(pdv), + mtime(mt), atime(at) { } + void finish(int r) { + assert(r == 0); - // unlock - mds->locker->inode_file_write_finish(in); + // apply + in->inode.mtime = mtime; + in->inode.atime = atime; + in->mark_dirty(pv); // reply - MClientReply *reply = new MClientReply(req, 0); + MClientReply *reply = new MClientReply(mdr->client_request(), 0); reply->set_result(0); - mds->server->reply_request(req, reply, in); + mds->server->reply_request(mdr, reply, in); } }; // utime -void Server::handle_client_utime(MClientRequest *req, - CInode *cur) +void Server::handle_client_utime(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *cur = rdlock_path_pin_ref(mdr, true); + if (!cur) return; + // write - if (!mds->locker->inode_file_write_start(cur, req)) - return; // fw or (wait for) sync + if (!mds->locker->xlock_start(&cur->filelock, mdr)) + return; mds->balancer->hit_inode(cur, META_POP_IWR); // prepare version_t pdv = cur->pre_dirty(); - time_t mtime = req->get_targ(); - time_t atime = req->get_targ2(); - C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, req, cur, pdv, + utime_t mtime = req->args.utime.mtime; + utime_t atime = req->args.utime.atime; + C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, mdr, cur, pdv, mtime, atime); // log + wait EUpdate *le = new EUpdate("utime"); + le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); inode_t *pi = le->metablob.add_dentry(cur->parent, true); pi->mtime = mtime; pi->atime = mtime; + pi->ctime = g_clock.real_now(); pi->version = pdv; mdlog->submit_entry(le); @@ -630,17 +999,17 @@ void Server::handle_client_utime(MClientRequest *req, // -------------- /* - * finisher: do a 
inode_hard_write_finish and reply. + * finisher: do a inode_hard_xlock_finish and reply. */ class C_MDS_chmod_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CInode *in; version_t pv; int mode; public: - C_MDS_chmod_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int mo) : - mds(m), req(r), in(i), pv(pdv), mode(mo) { } + C_MDS_chmod_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, int mo) : + mds(m), mdr(r), in(i), pv(pdv), mode(mo) { } void finish(int r) { assert(r == 0); @@ -649,40 +1018,42 @@ public: in->inode.mode |= (mode & 04777); in->mark_dirty(pv); - // unlock - mds->locker->inode_hard_write_finish(in); - // reply - MClientReply *reply = new MClientReply(req, 0); + MClientReply *reply = new MClientReply(mdr->client_request(), 0); reply->set_result(0); - mds->server->reply_request(req, reply, in); + mds->server->reply_request(mdr, reply, in); } }; // chmod -void Server::handle_client_chmod(MClientRequest *req, - CInode *cur) +void Server::handle_client_chmod(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *cur = rdlock_path_pin_ref(mdr, true); + if (!cur) return; + // write - if (!mds->locker->inode_hard_write_start(cur, req)) - return; // fw or (wait for) lock + if (!mds->locker->xlock_start(&cur->authlock, mdr)) + return; mds->balancer->hit_inode(cur, META_POP_IWR); // prepare version_t pdv = cur->pre_dirty(); - int mode = req->get_iarg(); - C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, req, cur, pdv, + int mode = req->args.chmod.mode; + C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, mdr, cur, pdv, mode); // log + wait EUpdate *le = new EUpdate("chmod"); + le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); inode_t *pi = le->metablob.add_dentry(cur->parent, true); pi->mode = mode; pi->version = pdv; + pi->ctime = g_clock.real_now(); mdlog->submit_entry(le); mdlog->wait_for_sync(fin); @@ -693,13 +1064,13 @@ void 
Server::handle_client_chmod(MClientRequest *req, class C_MDS_chown_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CInode *in; version_t pv; int uid, gid; public: - C_MDS_chown_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int u, int g) : - mds(m), req(r), in(i), pv(pdv), uid(u), gid(g) { } + C_MDS_chown_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, int u, int g) : + mds(m), mdr(r), in(i), pv(pdv), uid(u), gid(g) { } void finish(int r) { assert(r == 0); @@ -708,40 +1079,42 @@ public: if (gid >= 0) in->inode.gid = gid; in->mark_dirty(pv); - // unlock - mds->locker->inode_hard_write_finish(in); - // reply - MClientReply *reply = new MClientReply(req, 0); + MClientReply *reply = new MClientReply(mdr->client_request(), 0); reply->set_result(0); - mds->server->reply_request(req, reply, in); + mds->server->reply_request(mdr, reply, in); } }; -void Server::handle_client_chown(MClientRequest *req, - CInode *cur) +void Server::handle_client_chown(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *cur = rdlock_path_pin_ref(mdr, true); + if (!cur) return; + // write - if (!mds->locker->inode_hard_write_start(cur, req)) - return; // fw or (wait for) lock + if (!mds->locker->xlock_start(&cur->authlock, mdr)) + return; mds->balancer->hit_inode(cur, META_POP_IWR); // prepare version_t pdv = cur->pre_dirty(); - int uid = req->get_iarg(); - int gid = req->get_iarg2(); - C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, req, cur, pdv, + int uid = req->args.chown.uid; + int gid = req->args.chown.gid; + C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, mdr, cur, pdv, uid, gid); // log + wait EUpdate *le = new EUpdate("chown"); + le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); inode_t *pi = le->metablob.add_dentry(cur->parent, true); if (uid >= 0) pi->uid = uid; if (gid >= 0) pi->gid = gid; pi->version = pdv; + pi->ctime = g_clock.real_now(); 
mdlog->submit_entry(le); mdlog->wait_for_sync(fin); @@ -750,9 +1123,6 @@ void Server::handle_client_chown(MClientRequest *req, - - - // ================================================================= // DIRECTORY and NAMESPACE OPS @@ -769,11 +1139,6 @@ int Server::encode_dir_contents(CDir *dir, it++) { CDentry *dn = it->second; - // hashed? - if (dir->is_hashed() && - mds->get_nodeid() != mds->mdcache->hash_dentry( dir->ino(), it->first )) - continue; - if (dn->is_null()) continue; CInode *in = dn->inode; @@ -792,309 +1157,134 @@ int Server::encode_dir_contents(CDir *dir, } -/* - * note: this is pretty sloppy, but should work just fine i think... - */ -void Server::handle_hash_readdir(MHashReaddir *m) +void Server::handle_client_readdir(MDRequest *mdr) { - CInode *cur = mdcache->get_inode(m->get_ino()); - assert(cur); + MClientRequest *req = mdr->client_request(); + CInode *diri = rdlock_path_pin_ref(mdr, false); + if (!diri) return; - if (!cur->dir || - !cur->dir->is_hashed()) { - assert(0); - dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; - delete m; - return; + // it's a directory, right? + if (!diri->is_dir()) { + // not a dir + dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; + reply_request(mdr, -ENOTDIR); + return; } - CDir *dir = cur->dir; - assert(dir); - assert(dir->is_hashed()); - // complete? - if (!dir->is_complete()) { - dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryMessage(mds, m)); + // which frag? + frag_t fg = req->args.readdir.frag; + + // does it exist? + if (diri->dirfragtree[fg] != fg) { + dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << endl; + reply_request(mdr, -EAGAIN); return; - } - - // get content - list inls; - list dnls; - int num = encode_dir_contents(dir, inls, dnls); + } - // sent it back! 
- messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num), - m->get_source_inst(), MDS_PORT_CACHE); -} - + CDir *dir = try_open_auth_dir(diri, fg, mdr); + if (!dir) return; -void Server::handle_hash_readdir_reply(MHashReaddirReply *m) -{ - CInode *cur = mdcache->get_inode(m->get_ino()); - assert(cur); + // ok! + assert(dir->is_auth()); - if (!cur->dir || - !cur->dir->is_hashed()) { - assert(0); - dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; - delete m; - return; - } - CDir *dir = cur->dir; - assert(dir); - assert(dir->is_hashed()); - - // move items to hashed_readdir gather - int from = m->get_source().num(); - assert(dir->hashed_readdir.count(from) == 0); - dir->hashed_readdir[from].first.splice(dir->hashed_readdir[from].first.begin(), - m->get_in()); - dir->hashed_readdir[from].second.splice(dir->hashed_readdir[from].second.begin(), - m->get_dn()); - delete m; + // check perm + /* + if (!mds->locker->inode_hard_rdlock_start(diri, mdr)) + return; + mds->locker->inode_hard_rdlock_finish(diri, mdr); + */ - // gather finished? - if (dir->hashed_readdir.size() < (unsigned)mds->mdsmap->get_num_mds()) { - dout(7) << "still waiting for more hashed readdir bits" << endl; + if (!dir->is_complete()) { + // fetch + dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); return; } + + // build dir contents + list inls; + list dnls; + int numfiles = encode_dir_contents(dir, inls, dnls); - dout(7) << "got last bit! finishing waiters" << endl; + // . too + dnls.push_back("."); + inls.push_back(new InodeStat(diri, mds->get_nodeid())); + ++numfiles; - // do these finishers. they'll copy the results. 
- list finished; - dir->take_waiting(CDIR_WAIT_THISHASHEDREADDIR, finished); - finish_contexts(finished); + // yay, reply + MClientReply *reply = new MClientReply(req); + reply->take_dir_items(inls, dnls, numfiles); - // now discard these results - for (map, list > >::iterator it = dir->hashed_readdir.begin(); - it != dir->hashed_readdir.end(); - it++) { - for (list::iterator ci = it->second.first.begin(); - ci != it->second.first.end(); - ci++) - delete *ci; - } - dir->hashed_readdir.clear(); + dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; + reply->set_result(fg); - // unpin dir (we're done!) - dir->auth_unpin(); + //balancer->hit_dir(diri->dir); - // trigger any waiters for next hashed readdir cycle - dir->take_waiting(CDIR_WAIT_NEXTHASHEDREADDIR, mds->finished_queue); + // reply + reply_request(mdr, reply, diri); } -class C_MDS_HashReaddir : public Context { - Server *server; - MClientRequest *req; - CDir *dir; -public: - C_MDS_HashReaddir(Server *server, MClientRequest *req, CDir *dir) { - this->server = server; - this->req = req; - this->dir = dir; - } - void finish(int r) { - server->finish_hash_readdir(req, dir); - } -}; -void Server::finish_hash_readdir(MClientRequest *req, CDir *dir) -{ - dout(7) << "finish_hash_readdir on " << *dir << endl; +// ------------------------------------------------ - assert(dir->is_hashed()); - assert(dir->hashed_readdir.size() == (unsigned)mds->mdsmap->get_num_mds()); +// MKNOD - // reply! 
- MClientReply *reply = new MClientReply(req); - reply->set_result(0); +class C_MDS_mknod_finish : public Context { + MDS *mds; + MDRequest *mdr; + CDentry *dn; + CInode *newi; + version_t pv; +public: + C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : + mds(m), mdr(r), dn(d), newi(ni), + pv(d->get_projected_version()) {} + void finish(int r) { + assert(r == 0); - for (int i=0; imdsmap->get_num_mds(); i++) { - reply->copy_dir_items(dir->hashed_readdir[i].first, - dir->hashed_readdir[i].second); - } + // link the inode + dn->get_dir()->link_inode(dn, newi); + + // dirty inode, dn, dir + newi->mark_dirty(pv); - // ok! - reply_request(req, reply, dir->inode); -} + // dir inode's mtime + dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime, + newi->inode.ctime); + // hit pop + mds->balancer->hit_inode(newi, META_POP_IWR); -void Server::handle_client_readdir(MClientRequest *req, - CInode *cur) -{ - // it's a directory, right? - if (!cur->is_dir()) { - // not a dir - dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; - reply_request(req, -ENOTDIR); - return; + // reply + MClientReply *reply = new MClientReply(mdr->client_request(), 0); + reply->set_result(0); + mds->server->reply_request(mdr, reply, newi); } +}; - // auth? - if (!cur->dir_is_auth()) { - int dirauth = cur->authority(); - if (cur->dir) - dirauth = cur->dir->authority(); - assert(dirauth >= 0); - assert(dirauth != mds->get_nodeid()); - - // forward to authority - dout(10) << " forwarding readdir to authority " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } +void Server::handle_client_mknod(MDRequest *mdr) +{ + MClientRequest *req = mdr->client_request(); - if (!try_open_dir(cur, req)) - return; - assert(cur->dir->is_auth()); - - // unhashing? wait! 
- if (cur->dir->is_hashed() && - cur->dir->is_unhashing()) { - dout(10) << "unhashing, waiting" << endl; - cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, cur)); - return; - } + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); + if (!dn) return; - // check perm - if (!mds->locker->inode_hard_read_start(cur,req)) - return; - mds->locker->inode_hard_read_finish(cur); - - CDir *dir = cur->dir; - assert(dir); - - if (!dir->is_complete()) { - // fetch - dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - if (dir->is_hashed()) { - // HASHED - dout(7) << "hashed dir" << endl; - if (!dir->can_auth_pin()) { - dout(7) << "can't auth_pin dir " << *dir << " waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - if (!dir->hashed_readdir.empty()) { - dout(7) << "another readdir gather in progres, waiting" << endl; - dir->add_waiter(CDIR_WAIT_NEXTHASHEDREADDIR, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - // start new readdir gather - dout(7) << "staring new hashed readdir gather" << endl; - - // pin auth for process! - dir->auth_pin(); - - // get local bits - encode_dir_contents(cur->dir, - dir->hashed_readdir[mds->get_nodeid()].first, - dir->hashed_readdir[mds->get_nodeid()].second); - - // request other bits - for (int i=0; imdsmap->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MHashReaddir(dir->ino()), i, MDS_PORT_SERVER); - } - - // wait - dir->add_waiter(CDIR_WAIT_THISHASHEDREADDIR, - new C_MDS_HashReaddir(this, req, dir)); - } else { - // NON-HASHED - // build dir contents - list inls; - list dnls; - int numfiles = encode_dir_contents(cur->dir, inls, dnls); - - // . 
too - dnls.push_back("."); - inls.push_back(new InodeStat(cur, mds->get_nodeid())); - ++numfiles; - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(inls, dnls, numfiles); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; - reply->set_result(0); - - //balancer->hit_dir(cur->dir); - - // reply - reply_request(req, reply, cur); - } -} - - - -// ------------------------------------------------ - -// MKNOD - -class C_MDS_mknod_finish : public Context { - MDS *mds; - MClientRequest *req; - CDentry *dn; - CInode *newi; - version_t pv; -public: - C_MDS_mknod_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : - mds(m), req(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv); - - // unlock - mds->locker->dentry_xlock_finish(dn); - - // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, newi); - } -}; - -void Server::handle_client_mknod(MClientRequest *req, CInode *diri) -{ - CInode *newi = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) - return; + CInode *newi = prepare_new_inode(req, dn->dir); assert(newi); - assert(dn); // it's a file. 
- newi->inode.mode = req->get_iarg(); + dn->pre_dirty(); + newi->inode.mode = req->args.mknod.mode; newi->inode.mode &= ~INODE_TYPE_MASK; newi->inode.mode |= INODE_MODE_FILE; // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("mknod"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); + le->metablob.add_client_req(req->get_reqid()); + le->metablob.add_dir_context(dn->dir); + inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); // log + wait @@ -1104,176 +1294,39 @@ void Server::handle_client_mknod(MClientRequest *req, CInode *diri) -/* - * verify that the dir exists and would own the dname. - * do not check if the dentry exists. - */ -CDir *Server::validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& name) -{ - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "validate_new_dentry_dir: not a dir" << endl; - reply_request(req, -ENOTDIR); - return false; - } - - // am i not open, not auth? - if (!diri->dir && !diri->is_auth()) { - int dirauth = diri->authority(); - dout(7) << "validate_new_dentry_dir: don't know dir auth, not open, auth is i think mds" << dirauth << endl; - mdcache->request_forward(req, dirauth); - return false; - } - - if (!try_open_dir(diri, req)) - return false; - CDir *dir = diri->dir; - - // make sure it's my dentry - int dnauth = dir->dentry_authority(name); - if (dnauth != mds->get_nodeid()) { - // fw - dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir - << " dn " << name - << " not mine, fw to " << dnauth << endl; - mdcache->request_forward(req, dnauth); - return false; - } - - // dir auth pinnable? 
- if (!dir->can_auth_pin()) { - dout(7) << "validate_new_dentry_dir: dir " << *dir << " not pinnable, waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds, req, diri)); - return false; - } - - // frozen? - if (dir->is_frozen()) { - dout(7) << "dir is frozen " << *dir << endl; - dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, diri)); - return false; - } - - return dir; -} - -/* - * prepare a mknod-type operation (mknod, mkdir, symlink, open+create). - * create the inode and dentry, but do not link them. - * pre_dirty the dentry+dir. - * xlock the dentry. - * - * return val - * 0 - wait for something - * 1 - created - * 2 - already exists (only if okexist=true) - */ -int Server::prepare_mknod(MClientRequest *req, CInode *diri, - CInode **pin, CDentry **pdn, - bool okexist) -{ - dout(10) << "prepare_mknod " << req->get_filepath() << " in " << *diri << endl; - - // get containing directory (without last bit) - filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); - string name = req->get_filepath().last_bit(); - - CDir *dir = validate_new_dentry_dir(req, diri, name); - if (!dir) return 0; - - // make sure name doesn't already exist - *pdn = dir->lookup(name); - if (*pdn) { - if (!(*pdn)->can_read(req)) { - dout(10) << "waiting on (existing!) 
dentry " << **pdn << endl; - dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri)); - return 0; - } - - if (!(*pdn)->is_null()) { - // name already exists - if (okexist) { - dout(10) << "dentry " << name << " exists in " << *dir << endl; - *pin = (*pdn)->inode; - return 2; - } else { - dout(10) << "dentry " << name << " exists in " << *dir << endl; - reply_request(req, -EEXIST); - return 0; - } - } - } - - // make sure dir is complete - if (!dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, diri)); - return 0; - } - - // make sure dir is pinnable - - - // create inode - *pin = mdcache->create_inode(); - (*pin)->inode.uid = req->get_caller_uid(); - (*pin)->inode.gid = req->get_caller_gid(); - (*pin)->inode.ctime = (*pin)->inode.mtime = (*pin)->inode.atime = g_clock.gettime(); // now - // note: inode.version will get set by finisher's mark_dirty. - - // create dentry - if (!*pdn) - *pdn = dir->add_dentry(name, 0); - - (*pdn)->pre_dirty(); - - // xlock dentry - bool res = mds->locker->dentry_xlock_start(*pdn, req, diri); - assert(res == true); - - // bump modify pop - mds->balancer->hit_dir(dir, META_POP_DWR); - - return 1; -} - - - - - // MKDIR -void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) +void Server::handle_client_mkdir(MDRequest *mdr) { - CInode *newi = 0; - CDentry *dn = 0; + MClientRequest *req = mdr->client_request(); - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) - return; + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); + if (!dn) return; + + // new inode + CInode *newi = prepare_new_inode(req, dn->dir); assert(newi); - assert(dn); // it's a directory. 
- newi->inode.mode = req->get_iarg(); + dn->pre_dirty(); + newi->inode.mode = req->args.mkdir.mode; newi->inode.mode &= ~INODE_TYPE_MASK; newi->inode.mode |= INODE_MODE_DIR; newi->inode.layout = g_OSD_MDDirLayout; // ...and that new dir is empty. - CDir *newdir = newi->get_or_open_dir(mds->mdcache); + CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); newdir->mark_complete(); newdir->mark_dirty(newdir->pre_dirty()); // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("mkdir"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); + le->metablob.add_client_req(req->get_reqid()); + le->metablob.add_dir_context(dn->dir); + inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); - le->metablob.add_dir(newi->dir, true); + le->metablob.add_dir(newdir, true); // log + wait mdlog->submit_entry(le); @@ -1296,30 +1349,30 @@ void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) } - // SYMLINK -void Server::handle_client_symlink(MClientRequest *req, CInode *diri) +void Server::handle_client_symlink(MDRequest *mdr) { - CInode *newi = 0; - CDentry *dn = 0; + MClientRequest *req = mdr->client_request(); + + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); + if (!dn) return; - // make dentry and inode, xlock dentry. 
- if (!prepare_mknod(req, diri, &newi, &dn)) - return; + CInode *newi = prepare_new_inode(req, dn->dir); assert(newi); - assert(dn); // it's a symlink + dn->pre_dirty(); newi->inode.mode &= ~INODE_TYPE_MASK; newi->inode.mode |= INODE_MODE_SYMLINK; newi->symlink = req->get_sarg(); // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("symlink"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); + le->metablob.add_client_req(req->get_reqid()); + le->metablob.add_dir_context(dn->dir); + inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); // log + wait @@ -1333,59 +1386,213 @@ void Server::handle_client_symlink(MClientRequest *req, CInode *diri) // LINK -class C_MDS_LinkTraverse : public Context { - Server *server; - MClientRequest *req; - CInode *ref; -public: - vector trace; - C_MDS_LinkTraverse(Server *server, MClientRequest *req, CInode *ref) { - this->server = server; - this->req = req; - this->ref = ref; +void Server::handle_client_link(MDRequest *mdr) +{ + MClientRequest *req = mdr->client_request(); + + dout(7) << "handle_client_link " << req->get_filepath() + << " to " << req->get_sarg() + << endl; + + // traverse to dest dir, make sure it's ours. 
+ const filepath &linkpath = req->get_filepath(); + const string &dname = linkpath.last_dentry(); + vector linktrace; + CDir *dir = traverse_to_auth_dir(mdr, linktrace, linkpath); + if (!dir) return; + dout(7) << "handle_client_link link " << dname << " in " << *dir << endl; + + // traverse to link target + filepath targetpath = req->get_sarg(); + dout(7) << "handle_client_link discovering target " << targetpath << endl; + Context *ondelay = new C_MDS_RetryRequest(mdcache, mdr); + vector targettrace; + int r = mdcache->path_traverse(mdr, 0, + targetpath, targettrace, false, + req, ondelay, + MDS_TRAVERSE_DISCOVER); + if (r > 0) return; // wait + if (targettrace.empty()) r = -EINVAL; + if (r < 0) { + reply_request(mdr, r); + return; + } + + // identify target inode + CInode *targeti = targettrace[targettrace.size()-1]->inode; + assert(targeti); + + // dir? + dout(7) << "target is " << *targeti << endl; + if (targeti->is_dir()) { + dout(7) << "target is a dir, failing..." << endl; + reply_request(mdr, -EINVAL); + return; } + + // does the target need an anchor? + if (targeti->is_auth()) { + /*if (targeti->get_parent_dir() == dn->dir) { + dout(7) << "target is in the same dirfrag, sweet" << endl; + } + else + */ + if (targeti->is_anchored() && !targeti->is_unanchoring()) { + dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; + } + else { + dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; + + mdcache->anchor_create(targeti, + new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + } + + // can we create the dentry? 
+ CDentry *dn = 0; + + // make null link dentry + dn = prepare_null_dentry(mdr, dir, dname, false); + if (!dn) return; + + // create lock lists + set rdlocks, wrlocks, xlocks; + + for (unsigned i=0; ilock); + xlocks.insert(&dn->lock); + wrlocks.insert(&dn->dir->inode->dirlock); + for (unsigned i=0; ilock); + xlocks.insert(&targeti->linklock); + + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + // go! + + // local or remote? + if (targeti->is_auth()) + _link_local(mdr, dn, targeti); + else + _link_remote(mdr, dn, targeti); +} + + +class C_MDS_link_local_finish : public Context { + MDS *mds; + MDRequest *mdr; + CDentry *dn; + CInode *targeti; + version_t dpv; + utime_t tctime; + version_t tpv; +public: + C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, utime_t ct) : + mds(m), mdr(r), dn(d), targeti(ti), + dpv(d->get_projected_version()), + tctime(ct), + tpv(targeti->get_parent_dn()->get_projected_version()) {} void finish(int r) { - server->handle_client_link_2(r, req, ref, trace); + assert(r == 0); + mds->server->_link_local_finish(mdr, dn, targeti, dpv, tctime, tpv); } }; -void Server::handle_client_link(MClientRequest *req, CInode *ref) + +void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) { - // figure out name - string dname = req->get_filepath().last_bit(); - dout(7) << "handle_client_link dname is " << dname << endl; + dout(10) << "_link_local " << *dn << " to " << *targeti << endl; + + // ok, let's do it. 
+ // prepare log entry + EUpdate *le = new EUpdate("link_local"); + le->metablob.add_client_req(mdr->reqid); + + // predirty + dn->pre_dirty(); + version_t tpdv = targeti->pre_dirty(); - // validate dir - CDir *dir = validate_new_dentry_dir(req, ref, dname); - if (!dir) return; + // add to event + le->metablob.add_dir_context(dn->get_dir()); + le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote + le->metablob.add_dir_context(targeti->get_parent_dir()); + inode_t *pi = le->metablob.add_primary_dentry(targeti->parent, true, targeti); // update old primary + + // update journaled target inode + pi->nlink++; + pi->ctime = g_clock.real_now(); + pi->version = tpdv; + + // finisher + C_MDS_link_local_finish *fin = new C_MDS_link_local_finish(mds, mdr, dn, targeti, pi->ctime); + + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); +} - // dentry exists? - CDentry *dn = dir->lookup(dname); - if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) { - dout(7) << "handle_client_link dn exists " << *dn << endl; - reply_request(req, -EEXIST); - return; - } +void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, + version_t dpv, utime_t tctime, version_t tpv) +{ + dout(10) << "_link_local_finish " << *dn << " to " << *targeti << endl; - // xlock dentry - if (!dn->is_xlockedbyme(req)) { - if (!mds->locker->dentry_xlock_start(dn, req, ref)) - return; - } + // link and unlock the new dentry + dn->dir->link_inode(dn, targeti->ino()); + dn->set_version(dpv); + dn->mark_dirty(dpv); + + // update the target + targeti->inode.nlink++; + targeti->inode.ctime = tctime; + targeti->mark_dirty(tpv); + + // dir inode's mtime + dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime, + tctime); + + // bump target popularity + mds->balancer->hit_inode(targeti, META_POP_IWR); + + // reply + MClientReply *reply = new MClientReply(mdr->client_request(), 0); + reply_request(mdr, reply, 
dn->get_dir()->get_inode()); // FIXME: imprecise ref +} - // discover link target - filepath target = req->get_sarg(); - dout(7) << "handle_client_link discovering target " << target << endl; - C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref); - Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); + + +void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) +{ + dout(10) << "_link_remote " << *dn << " to " << *targeti << endl; - mdcache->path_traverse(target, onfinish->trace, false, - req, ondelay, - MDS_TRAVERSE_DISCOVER, //XLOCK, - onfinish); + // 1. send LinkPrepare to dest (journal nlink++ prepare) + // 2. create+journal new dentry, as with link_local. + // 3. send LinkCommit to dest (journals commit) + + // IMPLEMENT ME + reply_request(mdr, -EXDEV); } +/* +void Server::handle_client_link_finish(MClientRequest *req, CInode *ref, + CDentry *dn, CInode *targeti) +{ + // create remote link + dn->dir->link_inode(dn, targeti->ino()); + dn->link_remote( targeti ); // since we have it + dn->_mark_dirty(); // fixme + + mds->balancer->hit_dir(dn->dir, META_POP_DWR); + + // done! + commit_request(req, new MClientReply(req, 0), ref, + 0); // FIXME i should log something +} +*/ + +/* class C_MDS_RemoteLink : public Context { Server *server; MClientRequest *req; @@ -1416,280 +1623,314 @@ public: } }; -void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vector& trace) -{ - // target dne? - if (r < 0) { - dout(7) << "target " << req->get_sarg() << " dne" << endl; - reply_request(req, r); - return; - } - assert(r == 0); - - CInode *targeti = mdcache->get_root(); - if (trace.size()) targeti = trace[trace.size()-1]->inode; - assert(targeti); - // dir? - dout(7) << "target is " << *targeti << endl; - if (targeti->is_dir()) { - dout(7) << "target is a dir, failing" << endl; - reply_request(req, -EINVAL); - return; - } - - // what was the new dentry again? 
- CDir *dir = diri->dir; - assert(dir); - string dname = req->get_filepath().last_bit(); - CDentry *dn = dir->lookup(dname); - assert(dn); - assert(dn->is_xlockedbyme(req)); - - - // ok! - if (targeti->is_auth()) { - // mine - - // same dir? - if (targeti->get_parent_dir() == dn->get_dir()) { - dout(7) << "target is in the same dir, sweet" << endl; - } - else if (targeti->is_anchored()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; - } else { - assert(targeti->inode.nlink == 1); - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; - - mdcache->anchor_inode(targeti, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - - // ok, inc link! - targeti->inode.nlink++; - dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl; - targeti->_mark_dirty(); // fixme - } else { // remote: send nlink++ request, wait dout(7) << "target is remote, sending InodeLink" << endl; - mds->send_message_mds(new MInodeLink(targeti->ino(), mds->get_nodeid()), targeti->authority(), MDS_PORT_CACHE); + mds->send_message_mds(new MInodeLink(targeti->ino(), mds->get_nodeid()), targeti->authority().first, MDS_PORT_CACHE); // wait - targeti->add_waiter(CINODE_WAIT_LINK, - new C_MDS_RemoteLink(this, req, diri, dn, targeti)); + targeti->add_waiter(CInode::WAIT_LINK, new C_MDS_RemoteLink(this, req, diri, dn, targeti)); return; } - handle_client_link_finish(req, diri, dn, targeti); -} +*/ + -void Server::handle_client_link_finish(MClientRequest *req, CInode *ref, - CDentry *dn, CInode *targeti) -{ - // create remote link - dn->dir->link_inode(dn, targeti->ino()); - dn->link_remote( targeti ); // since we have it - dn->_mark_dirty(); // fixme - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - // done! 
- commit_request(req, new MClientReply(req, 0), ref, - 0); // FIXME i should log something -} // UNLINK -void Server::handle_client_unlink(MClientRequest *req, - CInode *diri) +void Server::handle_client_unlink(MDRequest *mdr) { - // rmdir or unlink - bool rmdir = false; - if (req->get_op() == MDS_OP_RMDIR) rmdir = true; - - // find it - if (req->get_filepath().depth() == 0) { - dout(7) << "can't rmdir root" << endl; - reply_request(req, -EINVAL); - return; - } - string name = req->get_filepath().last_bit(); - - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "not a dir" << endl; - reply_request(req, -ENOTDIR); - return; - } + MClientRequest *req = mdr->client_request(); - // am i not open, not auth? - if (!diri->dir && !diri->is_auth()) { - int dirauth = diri->authority(); - dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl; - mdcache->request_forward(req, dirauth); + // traverse to path + vector trace; + Context *ondelay = new C_MDS_RetryRequest(mdcache, mdr); + int r = mdcache->path_traverse(mdr, 0, + req->get_filepath(), trace, false, + req, ondelay, + MDS_TRAVERSE_FORWARD); + if (r > 0) return; + if (trace.empty()) r = -EINVAL; // can't unlink root + if (r < 0) { + reply_request(mdr, r); return; } - - if (!try_open_dir(diri, req)) return; - CDir *dir = diri->dir; - int dnauth = dir->dentry_authority(name); - // does it exist? - CDentry *dn = dir->lookup(name); - if (!dn) { - if (dnauth == mds->get_nodeid()) { - dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl; - reply_request(req, -ENOENT); - } else { - // send to authority! - dout(7) << "handle_client_rmdir/unlink fw, don't have " << name << " in " << *dir << endl; - mdcache->request_forward(req, dnauth); - } - return; - } + CDentry *dn = trace[trace.size()-1]; + assert(dn); - // have it. locked? 
- if (!dn->can_read(req)) { - dout(10) << " waiting on " << *dn << endl; - dir->add_waiter(CDIR_WAIT_DNREAD, - name, - new C_MDS_RetryRequest(mds, req, diri)); - return; + // rmdir or unlink? + bool rmdir = false; + if (req->get_op() == MDS_OP_RMDIR) rmdir = true; + + if (rmdir) { + dout(7) << "handle_client_rmdir on " << *dn << endl; + } else { + dout(7) << "handle_client_unlink on " << *dn << endl; } - // null? - if (dn->is_null()) { - dout(10) << "unlink on null dn " << *dn << endl; - reply_request(req, -ENOENT); + // readable? + if (!dn->lock.can_rdlock(mdr)) { + dout(10) << "waiting on unreadable dentry " << *dn << endl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return; } - // ok! - CInode *in = dn->inode; - assert(in); - if (rmdir) { - dout(7) << "handle_client_rmdir on dir " << *in << endl; - } else { - dout(7) << "handle_client_unlink on non-dir " << *in << endl; - } + // dn looks ok. - // dir stuff + // get/open inode. + mdr->trace.swap(trace); + CInode *in = mdcache->get_dentry_inode(dn, mdr); + if (!in) return; + dout(7) << "dn links to " << *in << endl; + + // rmdir vs is_dir if (in->is_dir()) { if (rmdir) { - // rmdir - - // open dir? - if (in->is_auth() && !in->dir) { - if (!try_open_dir(in, req)) return; - } - - // not dir auth? (or not open, which implies the same!) - if (!in->dir) { - dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to dn auth " << dnauth << endl; - mdcache->request_forward(req, dnauth); - return; - } - if (!in->dir->is_auth()) { - int dirauth = in->dir->authority(); - dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dnauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - assert(in->dir); - assert(in->dir->is_auth()); - - // dir size check on dir auth (but not necessarily dentry auth)? 
- - // should be empty - if (in->dir->get_size() == 0 && !in->dir->is_complete()) { - dout(7) << "handle_client_rmdir on dir " << *in->dir << ", empty but not complete, fetching" << endl; - mds->mdstore->fetch_dir(in->dir, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - if (in->dir->get_size() > 0) { - dout(7) << "handle_client_rmdir on dir " << *in->dir << ", not empty" << endl; - reply_request(req, -ENOTEMPTY); - return; - } - - dout(7) << "handle_client_rmdir dir is empty!" << endl; - - // export sanity check - if (!in->is_auth()) { - // i should be exporting this now/soon, since the dir is empty. - dout(7) << "handle_client_rmdir dir is auth, but not inode." << endl; - if (!in->dir->is_freezing() && in->dir->is_frozen()) { - assert(in->dir->is_import()); - mdcache->migrator->export_empty_import(in->dir); - } else { - dout(7) << "apparently already exporting" << endl; - } - in->dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - + // do empty directory checks + if (!_verify_rmdir(mdr, in)) + return; } else { - // unlink dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl; - reply_request(req, -EISDIR); + reply_request(mdr, -EISDIR); return; } } else { if (rmdir) { // unlink dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl; - reply_request(req, -ENOTDIR); + reply_request(mdr, -ENOTDIR); return; } } - // am i dentry auth? - if (dnauth != mds->get_nodeid()) { - // not auth; forward! - dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << dnauth << endl; - mdcache->request_forward(req, dnauth); + // lock + set rdlocks, wrlocks, xlocks; + + for (unsigned i=0; ilock); + xlocks.insert(&dn->lock); + wrlocks.insert(&dn->dir->inode->dirlock); + xlocks.insert(&in->linklock); + + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; + + // ok! 
+ if (dn->is_remote() && !dn->inode->is_auth()) + _unlink_remote(mdr, dn); + else + _unlink_local(mdr, dn); +} + + + +class C_MDS_unlink_local_finish : public Context { + MDS *mds; + MDRequest *mdr; + CDentry *dn; + CDentry *straydn; + version_t ipv; // referred inode + utime_t ictime; + version_t dpv; // deleted dentry +public: + C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, + version_t v, utime_t ct) : + mds(m), mdr(r), dn(d), straydn(sd), + ipv(v), ictime(ct), + dpv(d->get_projected_version()) { } + void finish(int r) { + assert(r == 0); + mds->server->_unlink_local_finish(mdr, dn, straydn, ipv, ictime, dpv); } - - dout(7) << "handle_client_unlink/rmdir on " << *in << endl; +}; + + +void Server::_unlink_local(MDRequest *mdr, CDentry *dn) +{ + dout(10) << "_unlink_local " << *dn << endl; + + // get stray dn ready? + CDentry *straydn = 0; + if (dn->is_primary()) { + string straydname; + dn->inode->name_stray_dentry(straydname); + frag_t fg = mdcache->get_stray()->pick_dirfrag(straydname); + CDir *straydir = mdcache->get_stray()->get_or_open_dirfrag(mdcache, fg); + straydn = straydir->add_dentry(straydname, 0); + dout(10) << "_unlink_local straydn is " << *straydn << endl; + } + - // xlock dentry - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; + // ok, let's do it. + // prepare log entry + EUpdate *le = new EUpdate("unlink_local"); + le->metablob.add_client_req(mdr->reqid); + + version_t ipv = 0; // dirty inode version + inode_t *pi = 0; // the inode + + if (dn->is_primary()) { + // primary link. add stray dentry. + assert(straydn); + ipv = straydn->pre_dirty(dn->inode->inode.version); + le->metablob.add_dir_context(straydn->dir); + pi = le->metablob.add_primary_dentry(straydn, true, dn->inode); + } else { + // remote link. update remote inode. 
+ ipv = dn->inode->pre_dirty(); + le->metablob.add_dir_context(dn->inode->get_parent_dir()); + pi = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode); // update primary + } + + // the unlinked dentry + dn->pre_dirty(); + le->metablob.add_dir_context(dn->get_dir()); + le->metablob.add_null_dentry(dn, true); + + // update journaled target inode + pi->nlink--; + pi->ctime = g_clock.real_now(); + pi->version = ipv; + + // finisher + C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, + ipv, pi->ctime); + + journal_opens(); // journal pending opens, just in case + + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); + + mds->balancer->hit_dir(dn->dir, META_POP_DWR); +} - // is this a remote link? - if (dn->is_remote() && !dn->inode) { - CInode *in = mdcache->get_inode(dn->get_remote_ino()); - if (in) { - dn->link_remote(in); - } else { - // open inode - dout(7) << "opening target inode first, ino is " << dn->get_remote_ino() << endl; - mdcache->open_remote_ino(dn->get_remote_ino(), req, - new C_MDS_RetryRequest(mds, req, diri)); - return; +void Server::_unlink_local_finish(MDRequest *mdr, + CDentry *dn, CDentry *straydn, + version_t ipv, utime_t ictime, version_t dpv) +{ + dout(10) << "_unlink_local " << *dn << endl; + + // unlink main dentry + CInode *in = dn->inode; + dn->dir->unlink_inode(dn); + + // relink as stray? (i.e. was primary link?) 
+ if (straydn) straydn->dir->link_inode(straydn, in); + + // nlink-- + in->inode.ctime = ictime; + in->inode.nlink--; + in->mark_dirty(ipv); // dirty inode + dn->mark_dirty(dpv); // dirty old dentry + + // dir inode's mtime + dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime, + ictime); + + // share unlink news with replicas + for (map::iterator it = dn->replicas_begin(); + it != dn->replicas_end(); + it++) { + dout(7) << "_unlink_local_finish sending MDentryUnlink to mds" << it->first << endl; + MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name); + if (straydn) { + unlink->strayin = straydn->dir->inode->replicate_to(it->first); + unlink->straydir = straydn->dir->replicate_to(it->first); + unlink->straydn = straydn->replicate_to(it->first); } + mds->send_message_mds(unlink, it->first, MDS_PORT_CACHE); } - + // bump target popularity mds->balancer->hit_dir(dn->dir, META_POP_DWR); - // it's locked, unlink! - MClientReply *reply = new MClientReply(req,0); - mdcache->dentry_unlink(dn, - new C_MDS_CommitRequest(this, req, reply, diri, - new EString("unlink fixme"))); - return; + // reply + MClientReply *reply = new MClientReply(mdr->client_request(), 0); + reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref + + if (straydn) + mdcache->eval_stray(straydn); } +void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) +{ + // IMPLEMENT ME + reply_request(mdr, -EXDEV); +} + + + + +/** _verify_rmdir + * + * verify that a directory is empty (i.e. we can rmdir it), + * and make sure it is part of the same subtree (i.e. local) + * so that rmdir will occur locally. + * + * @param in is the inode being rmdir'd. 
+ */ +bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) +{ + dout(10) << "_verify_rmdir " << *in << endl; + assert(in->is_auth()); + + list frags; + in->dirfragtree.get_leaves(frags); + + for (list::iterator p = frags.begin(); + p != frags.end(); + ++p) { + CDir *dir = in->get_dirfrag(*p); + if (!dir) + dir = in->get_or_open_dirfrag(mdcache, *p); + assert(dir); + + // dir looks empty but incomplete? + if (dir->is_auth() && + dir->get_size() == 0 && + !dir->is_complete()) { + dout(7) << "_verify_rmdir fetching incomplete dir " << *dir << endl; + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + // does the frag _look_ empty? + if (dir->get_size()) { + dout(10) << "_verify_rmdir still " << dir->get_size() << " items in frag " << *dir << endl; + reply_request(mdr, -ENOTEMPTY); + return false; + } + + // not dir auth? + if (!dir->is_auth()) { + dout(10) << "_verify_rmdir not auth for " << *dir << ", FIXME BUG" << endl; + reply_request(mdr, -ENOTEMPTY); + return false; + } + } + + return true; +} +/* + // export sanity check + if (!in->is_auth()) { + // i should be exporting this now/soon, since the dir is empty. + dout(7) << "handle_client_rmdir dir is auth, but not inode." 
<< endl; + mdcache->migrator->export_empty_import(in->dir); + in->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, diri)); + return; + } +*/ + + @@ -1697,9 +1938,8 @@ void Server::handle_client_unlink(MClientRequest *req, class C_MDS_RenameTraverseDst : public Context { Server *server; - MClientRequest *req; - CInode *ref; - CInode *srcdiri; + MDRequest *mdr; + CInode *srci; CDir *srcdir; CDentry *srcdn; filepath destpath; @@ -1707,350 +1947,507 @@ public: vector trace; C_MDS_RenameTraverseDst(Server *server, - MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, + MDRequest *r, CDentry *srcdn, filepath& destpath) { this->server = server; - this->req = req; - this->ref = ref; - this->srcdiri = srcdiri; - this->srcdir = srcdir; + this->mdr = r; this->srcdn = srcdn; this->destpath = destpath; } void finish(int r) { - server->handle_client_rename_2(req, ref, - srcdiri, srcdir, srcdn, destpath, + server->handle_client_rename_2(mdr, + srcdn, destpath, trace, r); } }; -/* - +/** handle_client_rename + * + * NOTE: caller did not path_pin the ref (srcdir) inode, as it normally does. + * + weirdness iwith rename: - - ref inode is what was originally srcdiri, but that may change by the tiem + - ref inode is what was originally srcdiri, but that may change by the time the rename actually happens. for all practical purpose, ref is useless except for C_MDS_RetryRequest */ -void Server::handle_client_rename(MClientRequest *req, - CInode *ref) + +bool Server::_rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MDRequest *mdr) { - dout(7) << "handle_client_rename on " << *req << endl; + // xlocked? 
+ if (dn && !dn->lock.can_rdlock(mdr)) { + dout(10) << "_rename_open_dn waiting on " << *dn << endl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + if (mustexist && + ((dn && dn->is_null()) || + (!dn && dir->is_complete()))) { + dout(10) << "_rename_open_dn dn dne in " << *dir << endl; + reply_request(mdr, -ENOENT); + return false; + } + + if (!dn && !dir->is_complete()) { + dout(10) << "_rename_open_dn readding incomplete dir" << endl; + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + assert(dn && !dn->is_null()); + + dout(10) << "_rename_open_dn dn is " << *dn << endl; + CInode *in = mdcache->get_dentry_inode(dn, mdr); + if (!in) return false; + dout(10) << "_rename_open_dn inode is " << *in << endl; + + return true; +} - // sanity checks - if (req->get_filepath().depth() == 0) { - dout(7) << "can't rename root" << endl; - reply_request(req, -EINVAL); +void Server::handle_client_rename(MDRequest *mdr) +{ + MClientRequest *req = mdr->client_request(); + dout(7) << "handle_client_rename " << *req << endl; + + // traverse to dest dir (not dest) + // we do this FIRST, because the rename should occur on the + // destdn's auth. 
+ const filepath &destpath = req->get_sarg(); + const string &destname = destpath.last_dentry(); + vector desttrace; + CDir *destdir = traverse_to_auth_dir(mdr, desttrace, destpath); + if (!destdir) return; // fw or error out + dout(10) << "dest will be " << destname << " in " << *destdir << endl; + assert(destdir->is_auth()); + + // traverse to src + filepath srcpath = req->get_filepath(); + vector srctrace; + Context *ondelay = new C_MDS_RetryRequest(mdcache, mdr); + int r = mdcache->path_traverse(mdr, 0, + srcpath, srctrace, false, + req, ondelay, + MDS_TRAVERSE_DISCOVER); + if (r > 0) return; + if (srctrace.empty()) r = -EINVAL; // can't rename root + if (r < 0) { + reply_request(mdr, r); return; } - // mv a/b a/b/c -- meaningless - if (req->get_sarg().compare( 0, req->get_path().length(), req->get_path()) == 0 && - req->get_sarg().c_str()[ req->get_path().length() ] == '/') { - dout(7) << "can't rename to underneath myself" << endl; - reply_request(req, -EINVAL); + CDentry *srcdn = srctrace[srctrace.size()-1]; + dout(10) << "srcdn is " << *srcdn << endl; + CInode *srci = mdcache->get_dentry_inode(srcdn, mdr); + if (!srci) return; // inode not available yet; the other get_dentry_inode() callers bail on null too (presumably a retry is queued -- confirm) + dout(10) << "srci is " << *srci << endl; + + // -- some sanity checks -- + // src == dest? + if (srcdn->get_dir() == destdir && srcdn->name == destname) { + dout(7) << "rename src=dest, noop" << endl; + reply_request(mdr, 0); return; } - // mv blah blah -- also meaningless - if (req->get_sarg() == req->get_path()) { - dout(7) << "can't rename something to itself (or into itself)" << endl; - reply_request(req, -EINVAL); + // dest a child of src? + // e.g.
mv /usr /usr/foo + CDentry *pdn = destdir->inode->parent; + while (pdn) { + if (pdn == srcdn) { + dout(7) << "cannot rename item to be a child of itself" << endl; + reply_request(mdr, -EINVAL); + return; + } + pdn = pdn->dir->inode->parent; + } + + + // identify/create dest dentry + CDentry *destdn = destdir->lookup(destname); + if (destdn && !destdn->lock.can_rdlock(mdr)) { + destdn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return; } + + CInode *oldin = 0; + if (destdn && !destdn->is_null()) { + dout(10) << "dest dn exists " << *destdn << endl; + oldin = mdcache->get_dentry_inode(destdn, mdr); + if (!oldin) return; + dout(10) << "oldin " << *oldin << endl; + + // mv /some/thing /to/some/existing_other_thing + if (oldin->is_dir() && !srci->is_dir()) { + reply_request(mdr, -EISDIR); + return; + } + if (!oldin->is_dir() && srci->is_dir()) { + reply_request(mdr, -ENOTDIR); + return; + } + + // non-empty dir? + if (oldin->is_dir() && !_verify_rmdir(mdr, oldin)) + return; + } + if (!destdn) { + // mv /some/thing /to/some/non_existent_name + destdn = prepare_null_dentry(mdr, destdir, destname); + if (!destdn) return; + } + + dout(10) << "destdn " << *destdn << endl; + + + // -- locks -- + set rdlocks, wrlocks, xlocks; + + // rdlock sourcedir path, xlock src dentry + for (unsigned i=0; ilock); + xlocks.insert(&srcdn->lock); + wrlocks.insert(&srcdn->dir->inode->dirlock); + + // rdlock destdir path, xlock dest dentry + for (unsigned i=0; ilock); + xlocks.insert(&destdn->lock); + wrlocks.insert(&destdn->dir->inode->dirlock); + + // xlock oldin + if (oldin) xlocks.insert(&oldin->linklock); - // traverse to source - /* - this is abnoraml, just for rename. since we don't pin source path - (because we don't want to screw up the lock ordering) the ref inode - (normally/initially srcdiri) may move, and this may fail. - -> so, re-traverse path. and make sure we request_finish in the case of a forward! 
- */ - filepath refpath = req->get_filepath(); - string srcname = refpath.last_bit(); - refpath = refpath.prefixpath(refpath.depth()-1); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; - dout(7) << "handle_client_rename src traversing to srcdir " << refpath << endl; - vector trace; - int r = mdcache->path_traverse(refpath, trace, true, - req, new C_MDS_RetryRequest(mds, req, ref), - MDS_TRAVERSE_FORWARD); - if (r == 2) { - dout(7) << "path traverse forwarded, ending request, doing manual request_cleanup" << endl; - dout(7) << "(pseudo) request_forward to 9999 req " << *req << endl; - mdcache->request_cleanup(req); // not _finish (deletes) or _forward (path_traverse did that) + + // ok go! + if (srcdn->is_auth() && destdn->is_auth()) + _rename_local(mdr, srcdn, destdn); + else { + // _rename_remote(mdr, srcdn, destdn); + reply_request(mdr, -EXDEV); return; } - if (r > 0) return; - if (r < 0) { // dne or something. got renamed out from under us, probably! - dout(7) << "traverse r=" << r << endl; - reply_request(req, r); - return; +} + + + + +class C_MDS_rename_local_finish : public Context { + MDS *mds; + MDRequest *mdr; + CDentry *srcdn; + CDentry *destdn; + CDentry *straydn; + version_t ipv; + version_t straypv; + version_t destpv; + version_t srcpv; + utime_t ictime; +public: + version_t atid1; + version_t atid2; + C_MDS_rename_local_finish(MDS *m, MDRequest *r, + CDentry *sdn, CDentry *ddn, CDentry *stdn, + version_t v, utime_t ct) : + mds(m), mdr(r), + srcdn(sdn), destdn(ddn), straydn(stdn), + ipv(v), + straypv(straydn ? 
straydn->get_projected_version():0), + destpv(destdn->get_projected_version()), + srcpv(srcdn->get_projected_version()), + ictime(ct), + atid1(0), atid2(0) { } + void finish(int r) { + assert(r == 0); + mds->server->_rename_local_finish(mdr, srcdn, destdn, straydn, + srcpv, destpv, straypv, ipv, ictime, + atid1, atid2); } +}; + +class C_MDS_rename_local_anchor : public Context { + Server *server; +public: + LogEvent *le; + C_MDS_rename_local_finish *fin; + version_t atid1; + version_t atid2; - CInode *srcdiri; - if (trace.size()) - srcdiri = trace[trace.size()-1]->inode; - else - srcdiri = mdcache->get_root(); + C_MDS_rename_local_anchor(Server *s) : server(s), le(0), fin(0), atid1(0), atid2(0) { } + void finish(int r) { + server->_rename_local_reanchored(le, fin, atid1, atid2); + } +}; + +void Server::_rename_local(MDRequest *mdr, + CDentry *srcdn, + CDentry *destdn) +{ + dout(10) << "_rename_local " << *srcdn << " to " << *destdn << endl; + + // let's go. + EUpdate *le = new EUpdate("rename_local"); + le->metablob.add_client_req(mdr->reqid); - dout(7) << "handle_client_rename srcdiri is " << *srcdiri << endl; + CDentry *straydn = 0; + inode_t *pi = 0; + version_t ipv = 0; + + C_MDS_rename_local_anchor *anchorfin = 0; + C_Gather *anchorgather = 0; + + // primary+remote link merge? + bool linkmerge = (srcdn->inode == destdn->inode && + (srcdn->is_primary() || destdn->is_primary())); + if (linkmerge) { + dout(10) << "will merge remote+primary links" << endl; + + // destdn -> primary + le->metablob.add_dir_context(destdn->dir); + ipv = destdn->pre_dirty(destdn->inode->inode.version); + pi = le->metablob.add_primary_dentry(destdn, true, destdn->inode); + + // do src dentry + le->metablob.add_dir_context(srcdn->dir); + srcdn->pre_dirty(); + le->metablob.add_null_dentry(srcdn, true); + + // anchor update? 
+ if (srcdn->is_primary() && srcdn->inode->is_anchored() && + srcdn->dir != destdn->dir) { + dout(10) << "reanchoring src->dst " << *srcdn->inode << endl; + vector trace; + destdn->make_anchor_trace(trace, srcdn->inode); + anchorfin = new C_MDS_rename_local_anchor(this); + mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &anchorfin->atid1, anchorfin); + } + + } else { + // move to stray? + if (destdn->is_primary()) { + // primary. + // move inode to stray dir. + string straydname; + destdn->inode->name_stray_dentry(straydname); + frag_t fg = mdcache->get_stray()->pick_dirfrag(straydname); + CDir *straydir = mdcache->get_stray()->get_or_open_dirfrag(mdcache, fg); + straydn = straydir->add_dentry(straydname, 0); + dout(10) << "straydn is " << *straydn << endl; + + // renanchor? + if (destdn->inode->is_anchored()) { + dout(10) << "reanchoring dst->stray " << *destdn->inode << endl; + vector trace; + straydn->make_anchor_trace(trace, destdn->inode); + anchorfin = new C_MDS_rename_local_anchor(this); + anchorgather = new C_Gather(anchorfin); + mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &anchorfin->atid1, + anchorgather->new_sub()); + } - dout(7) << "handle_client_rename srcname is " << srcname << endl; + // link-- inode, move to stray dir. + le->metablob.add_dir_context(straydn->dir); + ipv = straydn->pre_dirty(destdn->inode->inode.version); + pi = le->metablob.add_primary_dentry(straydn, true, destdn->inode); + } + else if (destdn->is_remote()) { + // remote. + // nlink-- targeti + le->metablob.add_dir_context(destdn->inode->get_parent_dir()); + ipv = destdn->inode->pre_dirty(); + pi = le->metablob.add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary + dout(10) << "remote targeti (nlink--) is " << *destdn->inode << endl; + } + else { + assert(destdn->is_null()); + } - // make sure parent is a dir? 
- if (!srcdiri->is_dir()) { - dout(7) << "srcdiri not a dir " << *srcdiri << endl; - reply_request(req, -EINVAL); - return; + // add dest dentry + le->metablob.add_dir_context(destdn->dir); + if (srcdn->is_primary()) { + dout(10) << "src is a primary dentry" << endl; + destdn->pre_dirty(srcdn->inode->inode.version); + le->metablob.add_primary_dentry(destdn, true, srcdn->inode); + + if (srcdn->inode->is_anchored()) { + dout(10) << "reanchoring src->dst " << *srcdn->inode << endl; + vector trace; + destdn->make_anchor_trace(trace, srcdn->inode); + if (!anchorfin) anchorfin = new C_MDS_rename_local_anchor(this); + if (!anchorgather) anchorgather = new C_Gather(anchorfin); + mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &anchorfin->atid2, + anchorgather->new_sub()); + + } + } else { + assert(srcdn->is_remote()); + dout(10) << "src is a remote dentry" << endl; + destdn->pre_dirty(); + le->metablob.add_remote_dentry(destdn, true, srcdn->get_remote_ino()); + } + + // remove src dentry + le->metablob.add_dir_context(srcdn->dir); + srcdn->pre_dirty(); + le->metablob.add_null_dentry(srcdn, true); } - // am i not open, not auth? - if (!srcdiri->dir && !srcdiri->is_auth()) { - int dirauth = srcdiri->authority(); - dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - if (!try_open_dir(srcdiri, req)) return; - CDir *srcdir = srcdiri->dir; - dout(7) << "handle_client_rename srcdir is " << *srcdir << endl; - - // make sure it's my dentry - int srcauth = srcdir->dentry_authority(srcname); - if (srcauth != mds->get_nodeid()) { - // fw - dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl; - mdcache->request_forward(req, srcauth); - return; + if (pi) { + // update journaled target inode + pi->nlink--; + pi->ctime = g_clock.real_now(); + pi->version = ipv; } - // ok, done passing buck. 
- // src dentry - CDentry *srcdn = srcdir->lookup(srcname); + C_MDS_rename_local_finish *fin = new C_MDS_rename_local_finish(mds, mdr, + srcdn, destdn, straydn, + ipv, pi ? pi->ctime:utime_t()); - // xlocked? - if (srcdn && !srcdn->can_read(req)) { - dout(10) << " waiting on " << *srcdn << endl; - srcdir->add_waiter(CDIR_WAIT_DNREAD, - srcname, - new C_MDS_RetryRequest(mds, req, srcdiri)); - return; - } - - if ((srcdn && !srcdn->inode) || - (!srcdn && srcdir->is_complete())) { - dout(10) << "handle_client_rename src dne " << endl; - reply_request(req, -EEXIST); - return; - } + journal_opens(); // journal pending opens, just in case - if (!srcdn && !srcdir->is_complete()) { - dout(10) << "readding incomplete dir" << endl; - mds->mdstore->fetch_dir(srcdir, - new C_MDS_RetryRequest(mds, req, srcdiri)); - return; + if (anchorfin) { + // doing anchor update prepare first + anchorfin->fin = fin; + anchorfin->le = le; + } else { + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); } - assert(srcdn && srcdn->inode); - +} - dout(10) << "handle_client_rename srcdn is " << *srcdn << endl; - dout(10) << "handle_client_rename srci is " << *srcdn->inode << endl; - // pin src in cache (so it won't expire) - mdcache->request_pin_inode(req, srcdn->inode); +void Server::_rename_local_reanchored(LogEvent *le, C_MDS_rename_local_finish *fin, + version_t atid1, version_t atid2) +{ + dout(10) << "_rename_local_reanchored, logging " << *le << endl; - // find the destination, normalize - // discover, etc. on the way... just get it on the local node. 
- filepath destpath = req->get_sarg(); + // note anchor transaction ids + fin->atid1 = atid1; + fin->atid2 = atid2; - C_MDS_RenameTraverseDst *onfinish = new C_MDS_RenameTraverseDst(this, req, ref, srcdiri, srcdir, srcdn, destpath); - Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); - - /* - * use DISCOVERXLOCK mode: - * the dest may not exist, and may be xlocked from a remote host - * we want to succeed if we find the xlocked dentry - * ?? - */ - mdcache->path_traverse(destpath, onfinish->trace, false, - req, ondelay, - MDS_TRAVERSE_DISCOVER, //XLOCK, - onfinish); + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); } -void Server::handle_client_rename_2(MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, - CDentry *srcdn, - filepath& destpath, - vector& trace, - int r) + +void Server::_rename_local_finish(MDRequest *mdr, + CDentry *srcdn, CDentry *destdn, CDentry *straydn, + version_t srcpv, version_t destpv, version_t straypv, version_t ipv, + utime_t ictime, + version_t atid1, version_t atid2) { - dout(7) << "handle_client_rename_2 on " << *req << endl; - dout(12) << " r = " << r << " trace depth " << trace.size() << " destpath depth " << destpath.depth() << endl; + MClientRequest *req = mdr->client_request(); + dout(10) << "_rename_local_finish " << *req << endl; - CInode *srci = srcdn->inode; - assert(srci); - CDir* destdir = 0; - string destname; - - // what is the dest? (dir or file or complete filename) - // note: trace includes root, destpath doesn't (include leading /) - if (trace.size() && trace[trace.size()-1]->inode == 0) { - dout(10) << "dropping null dentry from tail of trace" << endl; - trace.pop_back(); // drop it! 
- } + CInode *oldin = destdn->inode; - CInode *d; - if (trace.size()) - d = trace[trace.size()-1]->inode; - else - d = mdcache->get_root(); - assert(d); - dout(10) << "handle_client_rename_2 traced to " << *d << ", trace size = " << trace.size() << ", destpath = " << destpath.depth() << endl; - - // make sure i can open the dir? - if (d->is_dir() && !d->dir_is_auth() && !d->dir) { - // discover it - mdcache->open_remote_dir(d, - new C_MDS_RetryRequest(mds, req, ref)); - return; - } - - if (trace.size() == destpath.depth()) { - if (d->is_dir()) { - // mv /some/thing /to/some/dir - if (!try_open_dir(d, req)) return; - destdir = d->dir; // /to/some/dir - destname = req->get_filepath().last_bit(); // thing - destpath.add_dentry(destname); + // primary+remote link merge? + bool linkmerge = (srcdn->inode == destdn->inode && + (srcdn->is_primary() || destdn->is_primary())); + + if (linkmerge) { + assert(ipv); + if (destdn->is_primary()) { + dout(10) << "merging remote onto primary link" << endl; + + // nlink-- in place + destdn->inode->inode.nlink--; + destdn->inode->inode.ctime = ictime; + destdn->inode->mark_dirty(destpv); + + // unlink srcdn + srcdn->dir->unlink_inode(srcdn); + srcdn->mark_dirty(srcpv); } else { - // mv /some/thing /to/some/existing_filename - destdir = trace[trace.size()-1]->dir; // /to/some - destname = destpath.last_bit(); // existing_filename - } - } - else if (trace.size() == destpath.depth()-1) { - if (d->is_dir()) { - // mv /some/thing /to/some/place_that_maybe_dne (we might be replica) - if (!try_open_dir(d, req)) return; - destdir = d->dir; // /to/some - destname = destpath.last_bit(); // place_that_MAYBE_dne - } else { - dout(7) << "dest dne" << endl; - reply_request(req, -EINVAL); - return; + dout(10) << "merging primary onto remote link" << endl; + assert(srcdn->is_primary()); + + // move inode to dest + srcdn->dir->unlink_inode(srcdn); + destdn->dir->unlink_inode(destdn); + destdn->dir->link_inode(destdn, oldin); + + // nlink-- + 
destdn->inode->inode.nlink--; + destdn->inode->inode.ctime = ictime; + destdn->inode->mark_dirty(destpv); + + // mark src dirty + srcdn->mark_dirty(srcpv); } - } + } else { - assert(trace.size() < destpath.depth()-1); - // check traverse return value - if (r > 0) { - return; // discover, readdir, etc. + // unlink destdn? + if (!destdn->is_null()) + destdn->dir->unlink_inode(destdn); + + if (straydn) { + // relink oldin to stray dir + assert(oldin); + straydn->dir->link_inode(straydn, oldin); + assert(straypv == ipv); } - - // ?? - assert(r < 0 || trace.size() == 0); // musta been an error - - // error out - dout(7) << " rename dest " << destpath << " dne" << endl; - reply_request(req, -EINVAL); - return; - } - - string srcpath = req->get_path(); - dout(10) << "handle_client_rename_2 srcpath " << srcpath << endl; - dout(10) << "handle_client_rename_2 destpath " << destpath << endl; - - // src == dest? - if (srcdn->get_dir() == destdir && srcdn->name == destname) { - dout(7) << "rename src=dest, same file " << endl; - reply_request(req, -EINVAL); - return; - } - - // does destination exist? (is this an overwrite?) - CDentry *destdn = destdir->lookup(destname); - CInode *oldin = 0; - if (destdn) { - oldin = destdn->get_inode(); if (oldin) { - // make sure it's also a file! - // this can happen, e.g. "mv /some/thing /a/dir" where /a/dir/thing exists and is a dir. - if (oldin->is_dir()) { - // fail! 
- dout(7) << "dest exists and is dir" << endl; - reply_request(req, -EISDIR); - return; - } - - if (srcdn->inode->is_dir() && - !oldin->is_dir()) { - dout(7) << "cannot overwrite non-directory with directory" << endl; - reply_request(req, -EISDIR); - return; - } + // nlink-- + oldin->inode.nlink--; + oldin->inode.ctime = ictime; + oldin->mark_dirty(ipv); } - - dout(7) << "dest exists " << *destdn << endl; - if (destdn->get_inode()) { - dout(7) << "destino is " << *destdn->get_inode() << endl; + + CInode *in = srcdn->inode; + assert(in); + if (srcdn->is_remote()) { + srcdn->dir->unlink_inode(srcdn); + destdn->dir->link_inode(destdn, in->ino()); } else { - dout(7) << "dest dn is a NULL stub" << endl; + srcdn->dir->unlink_inode(srcdn); + destdn->dir->link_inode(destdn, in); } - } else { - dout(7) << "dest dn dne (yet)" << endl; + destdn->mark_dirty(destpv); + srcdn->mark_dirty(srcpv); } - - // local or remote? - int srcauth = srcdir->dentry_authority(srcdn->name); - int destauth = destdir->dentry_authority(destname); - dout(7) << "handle_client_rename_2 destname " << destname << " destdir " << *destdir << " auth " << destauth << endl; - - // - if (srcauth != mds->get_nodeid() || - destauth != mds->get_nodeid()) { - dout(7) << "rename has remote dest " << destauth << endl; - dout(7) << "FOREIGN RENAME" << endl; - - // punt? - if (false && srcdn->inode->is_dir()) { - reply_request(req, -EINVAL); - return; - } + // commit anchor updates? + if (atid1) mds->anchorclient->commit(atid1); + if (atid2) mds->anchorclient->commit(atid2); - } else { - dout(7) << "rename is local" << endl; - } + // update subtree map? 
+ if (destdn->inode->is_dir()) + mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir); - handle_client_rename_local(req, ref, - srcpath, srcdiri, srcdn, - destpath.get_path(), destdir, destdn, destname); - return; + // share news with replicas + // *** + + // reply + MClientReply *reply = new MClientReply(req, 0); + reply_request(mdr, reply, destdn->dir->get_inode()); // FIXME: imprecise ref + + // clean up? + if (straydn) + mdcache->eval_stray(straydn); } +/* void Server::handle_client_rename_local(MClientRequest *req, - CInode *ref, - string& srcpath, - CInode *srcdiri, - CDentry *srcdn, - string& destpath, - CDir *destdir, - CDentry *destdn, - string& destname) + CInode *ref, + const string& srcpath, + CInode *srcdiri, + CDentry *srcdn, + const string& destpath, + CDir *destdir, + CDentry *destdn, + const string& destname) { +*/ //bool everybody = false; //if (true || srcdn->inode->is_dir()) { /* overkill warning: lock w/ everyone for simplicity. FIXME someday! along with the foreign rename crap! @@ -2093,9 +2490,9 @@ void Server::handle_client_rename_local(MClientRequest *req, //dout(7) << "handle_client_rename_local: overkill? doing xlocks with _all_ nodes" << endl; //everybody = true; //} - - bool srclocal = srcdn->dir->dentry_authority(srcdn->name) == mds->get_nodeid(); - bool destlocal = destdir->dentry_authority(destname) == mds->get_nodeid(); +/* + bool srclocal = srcdn->dir->dentry_authority(srcdn->name).first == mds->get_nodeid(); + bool destlocal = destdir->dentry_authority(destname).first == mds->get_nodeid(); dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl; if (destdn) { @@ -2104,8 +2501,7 @@ void Server::handle_client_rename_local(MClientRequest *req, dout(7) << "handle_client_rename_local: dest local=" << destlocal << " dn dne yet" << endl; } - /* lock source and dest dentries, in lexicographic order. - */ + // lock source and dest dentries, in lexicographic order. 
bool dosrc = srcpath < destpath; for (int i=0; i<2; i++) { if (dosrc) { @@ -2135,9 +2531,8 @@ void Server::handle_client_rename_local(MClientRequest *req, } } else { if (!destdn || destdn->xlockedby != req) { - /* NOTE: require that my xlocked item be a leaf/file, NOT a dir. in case - * my traverse and determination of dest vs dest/srcfilename was out of date. - */ + // NOTE: require that my xlocked item be a leaf/file, NOT a dir. in case + // my traverse and determination of dest vs dest/srcfilename was out of date. mds->locker->dentry_xlock_request(destdir, destname, true, req, new C_MDS_RetryRequest(mds, req, ref)); return; } @@ -2194,7 +2589,7 @@ void Server::handle_client_rename_local(MClientRequest *req, - +*/ @@ -2205,90 +2600,159 @@ void Server::handle_client_rename_local(MClientRequest *req, // =================================== // TRUNCATE, FSYNC -/* - * FIXME: this truncate implemention is WRONG WRONG WRONG - */ +class C_MDS_truncate_purged : public Context { + MDS *mds; + MDRequest *mdr; + CInode *in; + version_t pv; + off_t size; + utime_t ctime; +public: + C_MDS_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : + mds(m), mdr(r), in(i), + pv(pdv), + size(sz), ctime(ct) { } + void finish(int r) { + assert(r == 0); -void Server::handle_client_truncate(MClientRequest *req, CInode *cur) -{ - // write - if (!mds->locker->inode_file_write_start(cur, req)) - return; // fw or (wait for) lock + // apply to cache + in->inode.size = size; + in->inode.ctime = ctime; + in->inode.mtime = ctime; + in->mark_dirty(pv); - // check permissions - - // do update - cur->inode.size = req->get_sizearg(); - cur->_mark_dirty(); // fixme + // hit pop + mds->balancer->hit_inode(in, META_POP_IWR); - mds->locker->inode_file_write_finish(cur); + // reply + mds->server->reply_request(mdr, 0); + } +}; - mds->balancer->hit_inode(cur, META_POP_IWR); +class C_MDS_truncate_logged : public Context { + MDS *mds; + MDRequest *mdr; + CInode *in; + 
version_t pv; + off_t size; + utime_t ctime; +public: + C_MDS_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : + mds(m), mdr(r), in(i), + pv(pdv), + size(sz), ctime(ct) { } + void finish(int r) { + assert(r == 0); - // start reply - MClientReply *reply = new MClientReply(req, 0); + // purge + mds->mdcache->purge_inode(&in->inode, size); + mds->mdcache->wait_for_purge(in->inode.ino, size, + new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime)); + } +}; - // commit - commit_request(req, reply, cur, - new EString("truncate fixme")); -} +void Server::handle_client_truncate(MDRequest *mdr) +{ + MClientRequest *req = mdr->client_request(); + CInode *cur = rdlock_path_pin_ref(mdr, true); + if (!cur) return; + + // check permissions? + + // xlock inode + if (!mds->locker->xlock_start(&cur->filelock, mdr)) + return; // fw or (wait for) lock + + // already small enough? + if (cur->inode.size >= req->args.truncate.length) { + reply_request(mdr, 0); + return; + } + // prepare + version_t pdv = cur->pre_dirty(); + utime_t ctime = g_clock.real_now(); + Context *fin = new C_MDS_truncate_logged(mds, mdr, cur, + pdv, req->args.truncate.length, ctime); + + // log + wait + EUpdate *le = new EUpdate("truncate"); + le->metablob.add_client_req(mdr->reqid); + le->metablob.add_dir_context(cur->get_parent_dir()); + le->metablob.add_inode_truncate(cur->inode, req->args.truncate.length); + inode_t *pi = le->metablob.add_dentry(cur->parent, true); + pi->mtime = ctime; + pi->ctime = ctime; + pi->version = pdv; + pi->size = req->args.truncate.length; + + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); +} // =========================== // open, openc, close -void Server::handle_client_open(MClientRequest *req, - CInode *cur) +void Server::handle_client_open(MDRequest *mdr) { - int flags = req->get_iarg(); - int mode = req->get_iarg2(); - - dout(7) << "open " << flags << " on " << *cur << endl; - dout(10) << "open flags = " << flags << " mode = " 
<< mode << endl; - - // is it a file? - if (!(cur->inode.mode & INODE_MODE_FILE)) { - dout(7) << "not a regular file" << endl; - reply_request(req, -EINVAL); // FIXME what error do we want? + MClientRequest *req = mdr->client_request(); + + int flags = req->args.open.flags; + int cmode = req->get_open_file_mode(); + bool need_auth = ((cmode != FILE_MODE_R && cmode != FILE_MODE_LAZY) || + (flags & O_TRUNC)); + dout(10) << "open flags = " << flags + << ", filemode = " << cmode + << ", need_auth = " << need_auth + << endl; + + CInode *cur = rdlock_path_pin_ref(mdr, need_auth); + if (!cur) return; + + // regular file? + if ((cur->inode.mode & INODE_TYPE_MASK) != INODE_MODE_FILE) { + dout(7) << "not a regular file " << *cur << endl; + reply_request(mdr, -EINVAL); // FIXME what error do we want? return; } - // auth for write access - if (mode != FILE_MODE_R && mode != FILE_MODE_LAZY && - !cur->is_auth()) { - int auth = cur->authority(); - assert(auth != mds->get_nodeid()); - dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl; - - mdcache->request_forward(req, auth); - return; - } + // hmm, check permissions or something. + // O_TRUNC if (flags & O_TRUNC) { - // write - if (!mds->locker->inode_file_write_start(cur, req)) - return; // fw or (wait for) lock - - // do update - cur->inode.size = req->get_sizearg(); - cur->_mark_dirty(); // fixme + assert(cur->is_auth()); + + // xlock file size + if (!mds->locker->xlock_start(&cur->filelock, mdr)) + return; - mds->locker->inode_file_write_finish(cur); + if (cur->inode.size > 0) { + handle_client_opent(mdr); + return; + } } + + // do it + _do_open(mdr, cur); +} - - // hmm, check permissions or something. - +void Server::_do_open(MDRequest *mdr, CInode *cur) +{ + MClientRequest *req = mdr->client_request(); + int cmode = req->get_open_file_mode(); // can we issue the caps they want? 
version_t fdv = mds->locker->issue_file_data_version(cur); - Capability *cap = mds->locker->issue_new_caps(cur, mode, req); + Capability *cap = mds->locker->issue_new_caps(cur, cmode, req); if (!cap) return; // can't issue (yet), so wait! - - dout(12) << "open gets caps " << cap_string(cap->pending()) << " for " << req->get_source() << " on " << *cur << endl; - + + dout(12) << "_do_open issuing caps " << cap_string(cap->pending()) + << " for " << req->get_source() + << " on " << *cur << endl; + + // hit pop mds->balancer->hit_inode(cur, META_POP_IRD); // reply @@ -2296,19 +2760,156 @@ void Server::handle_client_open(MClientRequest *req, reply->set_file_caps(cap->pending()); reply->set_file_caps_seq(cap->get_last_seq()); reply->set_file_data_version(fdv); - reply_request(req, reply, cur); + reply_request(mdr, reply, cur); + + // journal? + if (cur->last_open_journaled == 0) { + queue_journal_open(cur); + maybe_journal_opens(); + } + +} + +void Server::queue_journal_open(CInode *in) +{ + dout(10) << "queue_journal_open on " << *in << endl; + + if (journal_open_queue.count(in) == 0) { + // pin so our pointer stays valid + in->get(CInode::PIN_BATCHOPENJOURNAL); + + // queue it up for a bit + journal_open_queue.insert(in); + } +} + + +void Server::journal_opens() +{ + dout(10) << "journal_opens " << journal_open_queue.size() << " inodes" << endl; + if (journal_open_queue.empty()) return; + + EOpen *le = 0; + + // check queued inodes + for (set::iterator p = journal_open_queue.begin(); + p != journal_open_queue.end(); + ++p) { + (*p)->put(CInode::PIN_BATCHOPENJOURNAL); + if ((*p)->is_any_caps()) { + if (!le) le = new EOpen; + le->add_inode(*p); + (*p)->last_open_journaled = mds->mdlog->get_write_pos(); + } + } + journal_open_queue.clear(); + + if (le) { + // journal + mds->mdlog->submit_entry(le); + + // add waiters to journal entry + for (list::iterator p = journal_open_waiters.begin(); + p != journal_open_waiters.end(); + ++p) + mds->mdlog->wait_for_sync(*p); + 
journal_open_waiters.clear(); + } else { + // nothing worth journaling here, just kick the waiters. + mds->queue_waiters(journal_open_waiters); + } +} + + + + +class C_MDS_open_truncate_purged : public Context { + MDS *mds; + MDRequest *mdr; + CInode *in; + version_t pv; + utime_t ctime; +public: + C_MDS_open_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : + mds(m), mdr(r), in(i), + pv(pdv), + ctime(ct) { } + void finish(int r) { + assert(r == 0); + + // apply to cache + in->inode.size = 0; + in->inode.ctime = ctime; + in->inode.mtime = ctime; + in->mark_dirty(pv); + + // hit pop + mds->balancer->hit_inode(in, META_POP_IWR); + + // do the open + mds->server->_do_open(mdr, in); + } +}; + +class C_MDS_open_truncate_logged : public Context { + MDS *mds; + MDRequest *mdr; + CInode *in; + version_t pv; + utime_t ctime; +public: + C_MDS_open_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : + mds(m), mdr(r), in(i), + pv(pdv), + ctime(ct) { } + void finish(int r) { + assert(r == 0); + + // purge also... 
+ mds->mdcache->purge_inode(&in->inode, 0); + mds->mdcache->wait_for_purge(in->inode.ino, 0, + new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime)); + } +}; + + +void Server::handle_client_opent(MDRequest *mdr) +{ + CInode *cur = mdr->ref; + assert(cur); + + // prepare + version_t pdv = cur->pre_dirty(); + utime_t ctime = g_clock.real_now(); + Context *fin = new C_MDS_open_truncate_logged(mds, mdr, cur, + pdv, ctime); + + // log + wait + EUpdate *le = new EUpdate("open_truncate"); + le->metablob.add_client_req(mdr->reqid); + le->metablob.add_dir_context(cur->get_parent_dir()); + le->metablob.add_inode_truncate(cur->inode, 0); + inode_t *pi = le->metablob.add_dentry(cur->parent, true); + pi->mtime = ctime; + pi->ctime = ctime; + pi->version = pdv; + pi->size = 0; + + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); } + class C_MDS_openc_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CDentry *dn; CInode *newi; version_t pv; public: - C_MDS_openc_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : - mds(m), req(r), dn(d), newi(ni), + C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : + mds(m), mdr(r), dn(d), newi(ni), pv(d->get_projected_version()) {} void finish(int r) { assert(r == 0); @@ -2319,59 +2920,72 @@ public: // dirty inode, dn, dir newi->mark_dirty(pv); - // unlock - mds->locker->dentry_xlock_finish(dn); + // downgrade xlock to rdlock + //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); + // set/pin ref inode for open() + mdr->ref = newi; + mdr->pin(newi); + // hit pop mds->balancer->hit_inode(newi, META_POP_IWR); // ok, do the open. 
- mds->server->handle_client_open(req, newi); + mds->server->handle_client_open(mdr); } }; -void Server::handle_client_openc(MClientRequest *req, CInode *diri) +void Server::handle_client_openc(MDRequest *mdr) { - dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; + MClientRequest *req = mdr->client_request(); - CInode *in = 0; - CDentry *dn = 0; + dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; - // make dentry and inode, xlock dentry. - bool excl = req->get_iarg() & O_EXCL; - int r = prepare_mknod(req, diri, &in, &dn, !excl); - if (!r) - return; // wait on something - assert(in); - assert(dn); - - if (r == 1) { - // created. - // it's a file. - in->inode.mode = 0644; // FIXME req should have a umask - in->inode.mode |= INODE_MODE_FILE; - - // prepare finisher - C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, req, dn, in); - EUpdate *le = new EUpdate("openc"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, in); - pi->version = dn->get_projected_version(); + bool excl = (req->args.open.flags & O_EXCL); + CDentry *dn = rdlock_path_xlock_dentry(mdr, !excl, false); + if (!dn) return; + + if (!dn->is_null()) { + // it existed. + if (req->args.open.flags & O_EXCL) { + dout(10) << "O_EXCL, target exists, failing with -EEXIST" << endl; + reply_request(mdr, -EEXIST, dn->get_dir()->get_inode()); + return; + } - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - /* - FIXME. this needs to be rewritten when the write capability stuff starts - getting journaled. - */ - } else { - // exists! - // FIXME: do i need to repin path based existant inode? hmm. - handle_client_open(req, in); + // pass to regular open handler. + handle_client_open(mdr); + return; } + + // created null dn. + + // create inode. + CInode *in = prepare_new_inode(req, dn->dir); + assert(in); + + // it's a file. 
+ dn->pre_dirty(); + in->inode.mode = req->args.open.mode; + in->inode.mode |= INODE_MODE_FILE; + + // prepare finisher + C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); + EUpdate *le = new EUpdate("openc"); + le->metablob.add_client_req(req->get_reqid()); + le->metablob.add_dir_context(dn->dir); + inode_t *pi = le->metablob.add_primary_dentry(dn, true, in); + pi->version = dn->get_projected_version(); + + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); + + /* + FIXME. this needs to be rewritten when the write capability stuff starts + getting journaled. + */ } diff --git a/trunk/ceph/mds/Server.h b/trunk/ceph/mds/Server.h index d4509f1418e07..24a7d19d13922 100644 --- a/trunk/ceph/mds/Server.h +++ b/trunk/ceph/mds/Server.h @@ -17,6 +17,8 @@ #include "MDS.h" class LogEvent; +class C_MDS_rename_local_finish; +class MDRequest; class Server { MDS *mds; @@ -24,132 +26,117 @@ class Server { MDLog *mdlog; Messenger *messenger; - __uint64_t stat_ops; - - public: Server(MDS *m) : mds(m), mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger), - stat_ops(0) { + messenger(mds->messenger) { } + // message handler void dispatch(Message *m); - // generic request helpers - void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0); - void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei); + + // -- sessions and recovery -- + utime_t reconnect_start; + set client_reconnect_gather; // clients i need a reconnect msg from. 
+ set reconnected_open_files; - void submit_update(MClientRequest *req, CInode *wrlockedi, - LogEvent *event, - Context *oncommit); - - void commit_request(MClientRequest *req, - MClientReply *reply, - CInode *tracei, - LogEvent *event, - LogEvent *event2 = 0); + void handle_client_session(class MClientSession *m); + void _session_logged(entity_inst_t ci, bool open, version_t cmapv); + void reconnect_clients(); + void handle_client_reconnect(class MClientReconnect *m); + void client_reconnect_failure(int from); + void reconnect_finish(); + void terminate_sessions(); - bool try_open_dir(CInode *in, MClientRequest *req); - + // -- requests -- + void handle_client_request(MClientRequest *m); - // clients - void handle_client_mount(class MClientMount *m); - void handle_client_unmount(Message *m); + void dispatch_request(MDRequest *mdr); + void reply_request(MDRequest *mdr, int r = 0, CInode *tracei = 0); + void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei); - void handle_client_request(MClientRequest *m); - void handle_client_request_2(MClientRequest *req, - vector& trace, - int r); - - // fs ops - void handle_client_fstat(MClientRequest *req); - - // requests - void dispatch_request(Message *m, CInode *ref); - - // inode request *req, CInode *ref; - void handle_client_stat(MClientRequest *req, CInode *ref); - void handle_client_utime(MClientRequest *req, CInode *ref); - void handle_client_inode_soft_update_2(MClientRequest *req, - MClientReply *reply, - CInode *ref); - void handle_client_chmod(MClientRequest *req, CInode *ref); - void handle_client_chown(MClientRequest *req, CInode *ref); - void handle_client_inode_hard_update_2(MClientRequest *req, - MClientReply *reply, - CInode *ref); - - // readdir - void handle_client_readdir(MClientRequest *req, CInode *ref); - int encode_dir_contents(CDir *dir, - list& inls, - list& dnls); - void handle_hash_readdir(MHashReaddir *m); - void handle_hash_readdir_reply(MHashReaddirReply *m); - void 
finish_hash_readdir(MClientRequest *req, CDir *dir); + // some helpers + CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); + CDir *traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath); + CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); + CInode* prepare_new_inode(MClientRequest *req, CDir *dir); + + CInode* rdlock_path_pin_ref(MDRequest *mdr, bool want_auth); + CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist); + + CDir* try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr); + //CDir* try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr); + + // requests on existing inodes. + void handle_client_stat(MDRequest *mdr); + void handle_client_utime(MDRequest *mdr); + void handle_client_chmod(MDRequest *mdr); + void handle_client_chown(MDRequest *mdr); + void handle_client_readdir(MDRequest *mdr); + int encode_dir_contents(CDir *dir, list& inls, list& dnls); + void handle_client_truncate(MDRequest *mdr); + void handle_client_fsync(MDRequest *mdr); + + // open + void handle_client_open(MDRequest *mdr); + void handle_client_openc(MDRequest *mdr); // O_CREAT variant. + void handle_client_opent(MDRequest *mdr); // O_TRUNC variant. 
+ void _do_open(MDRequest *mdr, CInode *ref); + + set journal_open_queue; // to be journal + list journal_open_waiters; + void queue_journal_open(CInode *in); + void add_journal_open_waiter(Context *c) { + journal_open_waiters.push_back(c); + } + void maybe_journal_opens() { + if (journal_open_queue.size() >= (unsigned)g_conf.mds_log_eopen_size) + journal_opens(); + } + void journal_opens(); // namespace changes - void handle_client_mknod(MClientRequest *req, CInode *ref); - void handle_client_link(MClientRequest *req, CInode *ref); - void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector& trace); - void handle_client_link_finish(MClientRequest *req, CInode *ref, - CDentry *dn, CInode *targeti); - - void handle_client_unlink(MClientRequest *req, CInode *ref); - void handle_client_rename(MClientRequest *req, CInode *ref); - void handle_client_rename_2(MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, + void handle_client_mknod(MDRequest *mdr); + void handle_client_mkdir(MDRequest *mdr); + void handle_client_symlink(MDRequest *mdr); + + // link + void handle_client_link(MDRequest *mdr); + void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti); + void _link_local_finish(MDRequest *mdr, + CDentry *dn, CInode *targeti, + version_t, utime_t, version_t); + void _link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti); + + // unlink + void handle_client_unlink(MDRequest *mdr); + bool _verify_rmdir(MDRequest *mdr, CInode *rmdiri); + void _unlink_local(MDRequest *mdr, CDentry *dn); + void _unlink_local_finish(MDRequest *mdr, + CDentry *dn, CDentry *straydn, + version_t, utime_t, version_t); + void _unlink_remote(MDRequest *mdr, CDentry *dn); + + // rename + bool _rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MDRequest *mdr); + void handle_client_rename(MDRequest *mdr); + void handle_client_rename_2(MDRequest *mdr, CDentry *srcdn, filepath& destpath, vector& trace, int r); - void 
handle_client_rename_local(MClientRequest *req, CInode *ref, - string& srcpath, CInode *srcdiri, CDentry *srcdn, - string& destpath, CDir *destdir, CDentry *destdn, string& name); - - void handle_client_mkdir(MClientRequest *req, CInode *ref); - void handle_client_rmdir(MClientRequest *req, CInode *ref); - void handle_client_symlink(MClientRequest *req, CInode *ref); - - // file - void handle_client_open(MClientRequest *req, CInode *ref); - void handle_client_openc(MClientRequest *req, CInode *ref); - void handle_client_release(MClientRequest *req, CInode *in); - void handle_client_truncate(MClientRequest *req, CInode *in); - void handle_client_fsync(MClientRequest *req, CInode *in); - - - // some helpers - CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false); // used by mknod, symlink, mkdir, openc - - CDir *validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& dname); - int prepare_mknod(MClientRequest *req, CInode *diri, - CInode **pin, CDentry **pdn, - bool okexist=false); - - - + void _rename_local(MDRequest *mdr, CDentry *srcdn, CDentry *destdn); + void _rename_local_reanchored(LogEvent *le, C_MDS_rename_local_finish *fin, + version_t atid1, version_t atid2); + void _rename_local_finish(MDRequest *mdr, + CDentry *srcdn, CDentry *destdn, CDentry *straydn, + version_t srcpv, version_t destpv, version_t straypv, version_t ipv, + utime_t ictime, + version_t atid1, version_t atid2); }; -class C_MDS_RetryRequest : public Context { - MDS *mds; - Message *req; // MClientRequest or MLock - CInode *ref; - public: - C_MDS_RetryRequest(MDS *mds, Message *req, CInode *ref) { - assert(ref); - this->mds = mds; - this->req = req; - this->ref = ref; - } - virtual void finish(int r) { - mds->server->dispatch_request(req, ref); - } -}; diff --git a/trunk/ceph/mds/SimpleLock.h b/trunk/ceph/mds/SimpleLock.h new file mode 100644 index 0000000000000..70d65303f9d5b --- /dev/null +++ b/trunk/ceph/mds/SimpleLock.h @@ -0,0 +1,267 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __SIMPLELOCK_H +#define __SIMPLELOCK_H + +// -- lock types -- +// NOTE: this also defines the lock ordering! +#define LOCK_OTYPE_DN 1 + +#define LOCK_OTYPE_IFILE 2 +#define LOCK_OTYPE_IAUTH 3 +#define LOCK_OTYPE_ILINK 4 +#define LOCK_OTYPE_IDIRFRAGTREE 5 +#define LOCK_OTYPE_IDIR 6 + +//#define LOCK_OTYPE_DIR 7 // not used + +inline const char *get_lock_type_name(int t) { + switch (t) { + case LOCK_OTYPE_DN: return "dentry"; + case LOCK_OTYPE_IFILE: return "inode_file"; + case LOCK_OTYPE_IAUTH: return "inode_auth"; + case LOCK_OTYPE_ILINK: return "inode_link"; + case LOCK_OTYPE_IDIRFRAGTREE: return "inode_dirfragtree"; + case LOCK_OTYPE_IDIR: return "inode_dir"; + default: assert(0); + } +} + +// -- lock states -- +#define LOCK_UNDEF 0 +// auth rep +#define LOCK_SYNC 1 // AR R . R . +#define LOCK_LOCK 2 // AR R W . . +#define LOCK_GLOCKR 3 // AR R . . . 
+ +inline const char *get_simplelock_state_name(int n) { + switch (n) { + case LOCK_UNDEF: return "undef"; + case LOCK_SYNC: return "sync"; + case LOCK_LOCK: return "lock"; + case LOCK_GLOCKR: return "glockr"; + default: assert(0); + } +} + +class MDRequest; + +class SimpleLock { +public: + static const int WAIT_RD = (1<<0); // to read + static const int WAIT_WR = (1<<1); // to write + static const int WAIT_NOLOCKS = (1<<2); // for last rdlock to finish + //static const int WAIT_LOCK = (1<<3); // for locked state + static const int WAIT_STABLE = (1<<3); // for a stable state + static const int WAIT_REMOTEXLOCK = (1<<4); // for a remote xlock + static const int WAIT_BITS = 5; + +protected: + // parent (what i lock) + MDSCacheObject *parent; + int type; + int wait_offset; + + // lock state + char state; + set<__int32_t> gather_set; // auth + + // local state + int num_rdlock; + MDRequest *xlock_by; + +public: + SimpleLock(MDSCacheObject *o, int t, int wo) : + parent(o), type(t), wait_offset(wo), + state(LOCK_SYNC), + num_rdlock(0), xlock_by(0) { } + virtual ~SimpleLock() {} + + // parent + MDSCacheObject *get_parent() { return parent; } + int get_type() { return type; } + + struct ptr_lt { + bool operator()(const SimpleLock* l, const SimpleLock* r) const { + if (l->type < r->type) return true; + if (l->type == r->type) return l->parent->is_lt(r->parent); + return false; + } + }; + + void decode_locked_state(bufferlist& bl) { + parent->decode_lock_state(type, bl); + } + void encode_locked_state(bufferlist& bl) { + parent->encode_lock_state(type, bl); + } + void finish_waiters(int mask, int r=0) { + parent->finish_waiting(mask < wait_offset, r); + } + void add_waiter(int mask, Context *c) { + parent->add_waiter(mask < wait_offset, c); + } + bool is_waiter_for(int mask) { + return parent->is_waiter_for(mask < wait_offset); + } + + + + // state + char get_state() { return state; } + char set_state(char s) { + state = s; + assert(!is_stable() || gather_set.size() == 0); 
// gather should be empty in stable states. + return s; + }; + bool is_stable() { + return state >= 0; + } + + + // gather set + const set& get_gather_set() { return gather_set; } + void init_gather() { + for (map::const_iterator p = parent->replicas_begin(); + p != parent->replicas_end(); + ++p) + gather_set.insert(p->first); + } + bool is_gathering() { return !gather_set.empty(); } + bool is_gathering(int i) { + return gather_set.count(i); + } + void clear_gather() { + gather_set.clear(); + } + void remove_gather(int i) { + gather_set.erase(i); + } + + // ref counting + bool is_rdlocked() { return num_rdlock > 0; } + int get_rdlock() { return ++num_rdlock; } + int put_rdlock() { + assert(num_rdlock>0); + return --num_rdlock; + } + int get_num_rdlocks() { return num_rdlock; } + + void get_xlock(MDRequest *who) { + assert(xlock_by == 0); + xlock_by = who; + } + void put_xlock() { + assert(xlock_by); + xlock_by = 0; + } + bool is_xlocked() { return xlock_by ? true:false; } + MDRequest *get_xlocked_by() { return xlock_by; } + bool is_used() { + return is_xlocked() || is_rdlocked(); + } + + // encode/decode + void _encode(bufferlist& bl) { + ::_encode(state, bl); + ::_encode(gather_set, bl); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(state, bl, off); + ::_decode(gather_set, bl, off); + } + + + // simplelock specifics + char get_replica_state() { + switch (state) { + case LOCK_LOCK: + case LOCK_GLOCKR: + return LOCK_LOCK; + case LOCK_SYNC: + return LOCK_SYNC; + default: + assert(0); + } + return 0; + } + /** replicate_relax + * called on first replica creation. 
+ */ + void replicate_relax() { + assert(parent->is_auth()); + assert(!parent->is_replicated()); + if (state == LOCK_LOCK && !is_used()) + state = LOCK_SYNC; + } + bool remove_replica(int from) { + if (is_gathering(from)) { + remove_gather(from); + if (!is_gathering()) + return true; + } + return false; + } + bool do_import(int from, int to) { + if (!is_stable()) { + remove_gather(from); + remove_gather(to); + if (!is_gathering()) + return true; + } + return false; + } + + bool can_rdlock(MDRequest *mdr) { + if (state == LOCK_SYNC) + return true; + if (state == LOCK_LOCK && mdr && xlock_by == mdr) + return true; + return false; + } + bool can_xlock(MDRequest *mdr) { + if (!parent->is_auth()) return false; + if (state != LOCK_LOCK) return false; + if (mdr && xlock_by == mdr) return true; + return false; + } + bool can_xlock_soon() { + if (parent->is_auth()) + return (state == LOCK_GLOCKR); + else + return false; + } + + virtual void print(ostream& out) { + out << "("; + //out << get_lock_type_name(l.get_type()) << " "; + out << get_simplelock_state_name(get_state()); + if (!get_gather_set().empty()) out << " g=" << get_gather_set(); + if (is_rdlocked()) + out << " r=" << get_num_rdlocks(); + if (is_xlocked()) + out << " w=" << get_xlocked_by(); + out << ")"; + } +}; + +inline ostream& operator<<(ostream& out, SimpleLock& l) +{ + l.print(out); + return out; +} + + +#endif diff --git a/trunk/ceph/mds/events/EAnchor.h b/trunk/ceph/mds/events/EAnchor.h new file mode 100644 index 0000000000000..f65ebfb1f8ff4 --- /dev/null +++ b/trunk/ceph/mds/events/EAnchor.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MDS_EANCHOR_H +#define __MDS_EANCHOR_H + +#include +#include "config.h" +#include "include/types.h" + +#include "../LogEvent.h" +#include "../Anchor.h" + +class EAnchor : public LogEvent { +protected: + int op; + inodeno_t ino; + version_t atid; + vector trace; + version_t version; // anchor table version + int reqmds; + + public: + EAnchor() : LogEvent(EVENT_ANCHOR) { } + EAnchor(int o, inodeno_t i, version_t v, int rm) : + LogEvent(EVENT_ANCHOR), + op(o), ino(i), atid(0), version(v), reqmds(rm) { } + EAnchor(int o, version_t a, version_t v) : + LogEvent(EVENT_ANCHOR), + op(o), atid(a), version(v), reqmds(-1) { } + + void set_trace(vector& t) { trace = t; } + vector& get_trace() { return trace; } + + void encode_payload(bufferlist& bl) { + bl.append((char*)&op, sizeof(op)); + bl.append((char*)&ino, sizeof(ino)); + bl.append((char*)&atid, sizeof(atid)); + ::_encode(trace, bl); + bl.append((char*)&version, sizeof(version)); + bl.append((char*)&reqmds, sizeof(reqmds)); + } + void decode_payload(bufferlist& bl, int& off) { + bl.copy(off, sizeof(op), (char*)&op); + off += sizeof(op); + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + bl.copy(off, sizeof(atid), (char*)&atid); + off += sizeof(atid); + ::_decode(trace, bl, off); + bl.copy(off, sizeof(version), (char*)&version); + off += sizeof(version); + bl.copy(off, sizeof(reqmds), (char*)&reqmds); + off += sizeof(reqmds); + } + + void print(ostream& out) { + out << "EAnchor " << get_anchor_opname(op); + if (ino) out << " " << ino; + if (atid) out << " atid " << atid; + if (version) out << " v " << version; + if (reqmds >= 0) out << " by mds" << reqmds; + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + +}; + +#endif diff --git a/trunk/ceph/mds/events/EAnchorClient.h b/trunk/ceph/mds/events/EAnchorClient.h new file mode 100644 index 0000000000000..111a34152ff3f --- /dev/null +++ b/trunk/ceph/mds/events/EAnchorClient.h @@ -0,0 +1,57 @@ 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MDS_EANCHORCLIENT_H +#define __MDS_EANCHORCLIENT_H + +#include +#include "config.h" +#include "include/types.h" + +#include "../LogEvent.h" +#include "../Anchor.h" + +class EAnchorClient : public LogEvent { +protected: + int op; + version_t atid; + + public: + EAnchorClient() : LogEvent(EVENT_ANCHORCLIENT) { } + EAnchorClient(int o, version_t at) : + LogEvent(EVENT_ANCHORCLIENT), + op(o), atid(at) { } + + void encode_payload(bufferlist& bl) { + bl.append((char*)&op, sizeof(op)); + bl.append((char*)&atid, sizeof(atid)); + } + void decode_payload(bufferlist& bl, int& off) { + bl.copy(off, sizeof(op), (char*)&op); + off += sizeof(op); + bl.copy(off, sizeof(atid), (char*)&atid); + off += sizeof(atid); + } + + void print(ostream& out) { + out << "EAnchorClient " << get_anchor_opname(op); + if (atid) out << " atid " << atid; + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + +}; + +#endif diff --git a/trunk/ceph/mds/events/EClientMap.h b/trunk/ceph/mds/events/EClientMap.h new file mode 100644 index 0000000000000..83d4e32b76744 --- /dev/null +++ b/trunk/ceph/mds/events/EClientMap.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MDS_ECLIENTMAP_H +#define __MDS_ECLIENTMAP_H + +#include +#include "config.h" +#include "include/types.h" + +#include "../LogEvent.h" + +class EClientMap : public LogEvent { + protected: + bufferlist mapbl; + version_t cmapv; // client map version + + public: + EClientMap() : LogEvent(EVENT_CLIENTMAP) { } + EClientMap(bufferlist& bl, version_t v) : + LogEvent(EVENT_CLIENTMAP), + cmapv(v) { + mapbl.claim(bl); + } + + void encode_payload(bufferlist& bl) { + bl.append((char*)&cmapv, sizeof(cmapv)); + ::_encode(mapbl, bl); + } + void decode_payload(bufferlist& bl, int& off) { + bl.copy(off, sizeof(cmapv), (char*)&cmapv); + off += sizeof(cmapv); + ::_decode(mapbl, bl, off); + } + + + void print(ostream& out) { + out << "EClientMap v " << cmapv; + } + + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + +}; + +#endif diff --git a/trunk/ceph/mds/events/EExportStart.h b/trunk/ceph/mds/events/EExport.h similarity index 57% rename from trunk/ceph/mds/events/EExportStart.h rename to trunk/ceph/mds/events/EExport.h index 37ed92a7239c2..eaa6d8e4bcce7 100644 --- a/trunk/ceph/mds/events/EExportStart.h +++ b/trunk/ceph/mds/events/EExport.h @@ -11,8 +11,8 @@ * */ -#ifndef __EEXPORTSTART_H -#define __EEXPORTSTART_H +#ifndef __EEXPORT_H +#define __EEXPORT_H #include #include "config.h" @@ -22,40 +22,35 @@ #include "EMetaBlob.h" -class EExportStart : public LogEvent { - public: +class EExport : public LogEvent { +public: EMetaBlob metablob; // exported dir - protected: - inodeno_t dirino; - int dest; // dest mds - set bounds; - - public: - EExportStart(CDir *dir, int d) : LogEvent(EVENT_EXPORTSTART), - dirino(dir->ino()), - dest(d) { +protected: + dirfrag_t base; + set bounds; + +public: + EExport(CDir *dir) : LogEvent(EVENT_EXPORT), + base(dir->dirfrag()) { metablob.add_dir_context(dir); } - EExportStart() : LogEvent(EVENT_EXPORTSTART) { } + EExport() : LogEvent(EVENT_EXPORT) { } + + set &get_bounds() { return bounds; } 
- set &get_bounds() { return bounds; } - void print(ostream& out) { - out << "export_start " << dirino << " -> " << dest; + out << "export " << base << " " << metablob; } virtual void encode_payload(bufferlist& bl) { metablob._encode(bl); - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&dest, sizeof(dest)); + bl.append((char*)&base, sizeof(base)); ::_encode(bounds, bl); } void decode_payload(bufferlist& bl, int& off) { metablob._decode(bl, off); - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(dest), (char*)&dest); - off += sizeof(dest); + bl.copy(off, sizeof(base), (char*)&base); + off += sizeof(base); ::_decode(bounds, bl, off); } diff --git a/trunk/ceph/mds/events/EExportFinish.h b/trunk/ceph/mds/events/EExportFinish.h deleted file mode 100644 index 114d580b6a499..0000000000000 --- a/trunk/ceph/mds/events/EExportFinish.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EEXPORTFINISH_H -#define __EEXPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EExportFinish : public LogEvent { - protected: - inodeno_t dirino; // exported dir - bool success; - - public: - EExportFinish(CDir *dir, bool s) : LogEvent(EVENT_EXPORTFINISH), - dirino(dir->ino()), - success(s) { } - EExportFinish() : LogEvent(EVENT_EXPORTFINISH) { } - - void print(ostream& out) { - out << "export_finish " << dirino; - if (success) - out << " success"; - else - out << " failure"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/trunk/ceph/mds/events/EImportFinish.h b/trunk/ceph/mds/events/EImportFinish.h index 14a9ab6403af6..7d51c038f3fab 100644 --- a/trunk/ceph/mds/events/EImportFinish.h +++ b/trunk/ceph/mds/events/EImportFinish.h @@ -22,17 +22,17 @@ class EImportFinish : public LogEvent { protected: - inodeno_t dirino; // imported dir + dirfrag_t base; // imported dir bool success; public: EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), - dirino(dir->ino()), + base(dir->dirfrag()), success(s) { } EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } void print(ostream& out) { - out << "import_finish " << dirino; + out << "import_finish " << base; if (success) out << " success"; else @@ -40,12 +40,12 @@ class EImportFinish : public LogEvent { } virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); + bl.append((char*)&base, sizeof(base)); bl.append((char*)&success, sizeof(success)); } void decode_payload(bufferlist& bl, 
int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); + bl.copy(off, sizeof(base), (char*)&base); + off += sizeof(base); bl.copy(off, sizeof(success), (char*)&success); off += sizeof(success); } diff --git a/trunk/ceph/mds/events/EImportMap.h b/trunk/ceph/mds/events/EImportMap.h index 50f366faaa9fa..2bfaa0d2a21aa 100644 --- a/trunk/ceph/mds/events/EImportMap.h +++ b/trunk/ceph/mds/events/EImportMap.h @@ -20,41 +20,36 @@ class EImportMap : public LogEvent { public: EMetaBlob metablob; - set imports; - set exports; - //set hashdirs; - map > nested_exports; + set imports; + map > bounds; EImportMap() : LogEvent(EVENT_IMPORTMAP) { } void print(ostream& out) { - out << "import_map " << imports.size() << " imports, " - << exports.size() << " exports" - << " " << metablob; + out << "import_map " << imports.size() << " imports " + << metablob; } void encode_payload(bufferlist& bl) { metablob._encode(bl); ::_encode(imports, bl); - ::_encode(exports, bl); - for (set::iterator p = imports.begin(); + for (set::iterator p = imports.begin(); p != imports.end(); ++p) { - ::_encode(nested_exports[*p], bl); - if (nested_exports[*p].empty()) - nested_exports.erase(*p); + ::_encode(bounds[*p], bl); + if (bounds[*p].empty()) + bounds.erase(*p); } } void decode_payload(bufferlist& bl, int& off) { metablob._decode(bl, off); ::_decode(imports, bl, off); - ::_decode(exports, bl, off); - for (set::iterator p = imports.begin(); + for (set::iterator p = imports.begin(); p != imports.end(); ++p) { - ::_decode(nested_exports[*p], bl, off); - if (nested_exports[*p].empty()) - nested_exports.erase(*p); + ::_decode(bounds[*p], bl, off); + if (bounds[*p].empty()) + bounds.erase(*p); } } diff --git a/trunk/ceph/mds/events/EImportStart.h b/trunk/ceph/mds/events/EImportStart.h index 59c074dec6f4f..742de69860735 100644 --- a/trunk/ceph/mds/events/EImportStart.h +++ b/trunk/ceph/mds/events/EImportStart.h @@ -24,29 +24,29 @@ class EImportStart : public LogEvent { 
protected: - inodeno_t dirino; - list bounds; + dirfrag_t base; + list bounds; public: EMetaBlob metablob; - EImportStart(inodeno_t di, - list& b) : LogEvent(EVENT_IMPORTSTART), - dirino(di), bounds(b) { } + EImportStart(dirfrag_t di, + list& b) : LogEvent(EVENT_IMPORTSTART), + base(di), bounds(b) { } EImportStart() : LogEvent(EVENT_IMPORTSTART) { } void print(ostream& out) { - out << "EImportStart " << metablob; + out << "EImportStart " << base << " " << metablob; } virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); + bl.append((char*)&base, sizeof(base)); metablob._encode(bl); ::_encode(bounds, bl); } void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); + bl.copy(off, sizeof(base), (char*)&base); + off += sizeof(base); metablob._decode(bl, off); ::_decode(bounds, bl, off); } diff --git a/trunk/ceph/mds/events/EMetaBlob.h b/trunk/ceph/mds/events/EMetaBlob.h index 800c6674c91a8..f89456697b996 100644 --- a/trunk/ceph/mds/events/EMetaBlob.h +++ b/trunk/ceph/mds/events/EMetaBlob.h @@ -22,7 +22,6 @@ using namespace std; #include "../CDir.h" #include "../CDentry.h" - class MDS; /* @@ -129,11 +128,10 @@ class EMetaBlob { /* dirlump - contains metadata for any dir we have contents for. */ struct dirlump { - static const int STATE_IMPORT = (1<<0); static const int STATE_COMPLETE = (1<<1); static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! 
- dirslice_t dirslice; + dirfrag_t dirfrag; version_t dirv; int state; int nfull, nremote, nnull; @@ -148,8 +146,6 @@ class EMetaBlob { public: dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } - bool is_import() { return state & STATE_IMPORT; } - void mark_import() { state |= STATE_IMPORT; } bool is_complete() { return state & STATE_COMPLETE; } void mark_complete() { state |= STATE_COMPLETE; } bool is_dirty() { return state & STATE_DIRTY; } @@ -182,7 +178,7 @@ class EMetaBlob { } void _encode(bufferlist& bl) { - bl.append((char*)&dirslice, sizeof(dirslice)); + bl.append((char*)&dirfrag, sizeof(dirfrag)); bl.append((char*)&dirv, sizeof(dirv)); bl.append((char*)&state, sizeof(state)); bl.append((char*)&nfull, sizeof(nfull)); @@ -194,7 +190,7 @@ class EMetaBlob { ::_encode(bnull, bl); } void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirslice), (char*)&dirslice); off += sizeof(dirslice); + bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); off += sizeof(dirfrag); bl.copy(off, sizeof(dirv), (char*)&dirv); off += sizeof(dirv); bl.copy(off, sizeof(state), (char*)&state); off += sizeof(state); bl.copy(off, sizeof(nfull), (char*)&nfull); off += sizeof(nfull); @@ -209,84 +205,123 @@ class EMetaBlob { }; // my lumps. preserve the order we added them in a list. - list lump_order; - map lump_map; + list lump_order; + map lump_map; + + // anchor transactions included in this update. + list atids; + + // inodes i've destroyed. 
+ list< pair > truncated_inodes; + + // idempotent op(s) + list client_reqs; public: + + void add_client_req(metareqid_t r) { + client_reqs.push_back(r); + } + + void add_anchor_transaction(version_t atid) { + atids.push_back(atid); + } + + void add_inode_truncate(const inode_t& inode, off_t newsize) { + truncated_inodes.push_back(pair(inode, newsize)); + } - // remote pointer to to-be-journaled inode iff it's a normal (non-remote) dentry - inode_t *add_dentry(CDentry *dn, bool dirty, CInode *in=0) { - CDir *dir = dn->get_dir(); + void add_null_dentry(CDentry *dn, bool dirty) { + // add the dir + dirlump& lump = add_dir(dn->get_dir(), false); + + lump.nnull++; + if (dirty) + lump.get_dnull().push_front(nullbit(dn->get_name(), + dn->get_projected_version(), + dirty)); + else + lump.get_dnull().push_back(nullbit(dn->get_name(), + dn->get_projected_version(), + dirty)); + } + + void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino=0) { + if (!rino) + rino = dn->get_remote_ino(); + + dirlump& lump = add_dir(dn->get_dir(), false); + + lump.nremote++; + if (dirty) + lump.get_dremote().push_front(remotebit(dn->get_name(), + dn->get_projected_version(), + rino, + dirty)); + else + lump.get_dremote().push_back(remotebit(dn->get_name(), + dn->get_projected_version(), + rino, + dirty)); + } + + // return remote pointer to to-be-journaled inode + inode_t *add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0) { if (!in) in = dn->get_inode(); - // add the dir - dirlump& lump = add_dir(dir, false); + dirlump& lump = add_dir(dn->get_dir(), false); + + lump.nfull++; + if (dirty) { + lump.get_dfull().push_front(fullbit(dn->get_name(), + dn->get_projected_version(), + in->inode, in->symlink, + dirty)); + return &lump.get_dfull().front().inode; + } else { + lump.get_dfull().push_back(fullbit(dn->get_name(), + dn->get_projected_version(), + in->inode, in->symlink, + dirty)); + return &lump.get_dfull().back().inode; + } + } - // add the dirbit + // convenience: primary or 
remote? figure it out. + inode_t *add_dentry(CDentry *dn, bool dirty) { + // primary or remote if (dn->is_remote()) { - lump.nremote++; - if (dirty) - lump.get_dremote().push_front(remotebit(dn->get_name(), - dn->get_projected_version(), - dn->get_remote_ino(), - dirty)); - else - lump.get_dremote().push_back(remotebit(dn->get_name(), - dn->get_projected_version(), - dn->get_remote_ino(), - dirty)); - } - else if (!in) { - lump.nnull++; - if (dirty) - lump.get_dnull().push_front(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - else - lump.get_dnull().push_back(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); + add_remote_dentry(dn, dirty); + return 0; + } else if (dn->is_null()) { + add_null_dentry(dn, dirty); + return 0; } - else { - lump.nfull++; - if (dirty) { - lump.get_dfull().push_front(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->symlink, - dirty)); - return &lump.get_dfull().front().inode; - } else { - lump.get_dfull().push_back(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->symlink, - dirty)); - return &lump.get_dfull().back().inode; - } - } - return 0; + assert(dn->is_primary()); + return add_primary_dentry(dn, dirty); } + dirlump& add_dir(CDir *dir, bool dirty) { - if (lump_map.count(dir->ino()) == 0) { - lump_order.push_back(dir->ino()); - lump_map[dir->ino()].dirv = dir->get_projected_version(); + dirfrag_t df = dir->dirfrag(); + if (lump_map.count(df) == 0) { + lump_order.push_back(df); + lump_map[df].dirv = dir->get_projected_version(); } - dirlump& l = lump_map[dir->ino()]; + dirlump& l = lump_map[df]; if (dir->is_complete()) l.mark_complete(); - if (dir->is_import()) l.mark_import(); if (dirty) l.mark_dirty(); return l; } void add_dir_context(CDir *dir, bool toroot=false) { // already have this dir? 
(we must always add in order) - if (lump_map.count(dir->ino())) + if (lump_map.count(dir->dirfrag())) return; CInode *diri = dir->get_inode(); - if (!toroot && - (dir->is_import() || dir->is_hashed())) - return; // stop at import point + if (!toroot && dir->is_subtree_root() && dir->is_auth()) + return; // stop at subtree root if (!dir->get_inode()->get_parent_dn()) return; @@ -301,29 +336,39 @@ class EMetaBlob { void _encode(bufferlist& bl) { int n = lump_map.size(); bl.append((char*)&n, sizeof(n)); - for (list::iterator i = lump_order.begin(); + for (list::iterator i = lump_order.begin(); i != lump_order.end(); ++i) { bl.append((char*)&(*i), sizeof(*i)); lump_map[*i]._encode(bl); } + ::_encode(atids, bl); + ::_encode(truncated_inodes, bl); + ::_encode(client_reqs, bl); } void _decode(bufferlist& bl, int& off) { int n; bl.copy(off, sizeof(n), (char*)&n); off += sizeof(n); for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MDS_EOPEN_H +#define __MDS_EOPEN_H + +#include "../LogEvent.h" +#include "EMetaBlob.h" + +class EOpen : public LogEvent { +public: + EMetaBlob metablob; + list inos; + + EOpen() : LogEvent(EVENT_OPEN) { } + EOpen(CInode *in) : LogEvent(EVENT_OPEN) { + add_inode(in); + } + void print(ostream& out) { + out << "EOpen " << metablob; + } + + void add_inode(CInode *in) { + inos.push_back(in->ino()); + metablob.add_primary_dentry(in->get_parent_dn(), false); + } + + void encode_payload(bufferlist& bl) { + ::_encode(inos, bl); + metablob._encode(bl); + } + void decode_payload(bufferlist& bl, int& off) { + ::_decode(inos, bl, off); + metablob._decode(bl, off); + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); +}; + +#endif diff --git a/trunk/ceph/mds/events/EPurgeFinish.h b/trunk/ceph/mds/events/EPurgeFinish.h index b00f5f90313fc..e7714b4c3d051 100644 --- a/trunk/ceph/mds/events/EPurgeFinish.h +++ b/trunk/ceph/mds/events/EPurgeFinish.h @@ -21,22 +21,27 @@ class EPurgeFinish : public LogEvent { protected: inodeno_t ino; + off_t newsize; public: - EPurgeFinish(inodeno_t i) : + EPurgeFinish(inodeno_t i, off_t s) : LogEvent(EVENT_PURGEFINISH), - ino(i) { } + ino(i), newsize(s) { } EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { } void print(ostream& out) { - out << "purgefinish " << ino; + out << "purgefinish " << ino << " to " << newsize; } virtual void encode_payload(bufferlist& bl) { bl.append((char*)&ino, sizeof(ino)); + bl.append((char*)&newsize, sizeof(newsize)); } void decode_payload(bufferlist& bl, int& off) { bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + bl.copy(off, sizeof(newsize), (char*)&newsize); + off += sizeof(newsize); } bool has_expired(MDS *mds); diff --git a/trunk/ceph/mds/events/ESession.h b/trunk/ceph/mds/events/ESession.h new file mode 100644 index 0000000000000..7713493c1f344 --- /dev/null +++ b/trunk/ceph/mds/events/ESession.h @@ -0,0 +1,63 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MDS_ESESSION_H +#define __MDS_ESESSION_H + +#include +#include "config.h" +#include "include/types.h" + +#include "../LogEvent.h" + +class ESession : public LogEvent { + protected: + entity_inst_t client_inst; + bool open; // open or close + version_t cmapv; // client map version + + public: + ESession() : LogEvent(EVENT_SESSION) { } + ESession(entity_inst_t inst, bool o, version_t v) : + LogEvent(EVENT_SESSION), + client_inst(inst), + open(o), + cmapv(v) { + } + + void encode_payload(bufferlist& bl) { + ::_encode(client_inst, bl); + ::_encode(open, bl); + ::_encode(cmapv, bl); + } + void decode_payload(bufferlist& bl, int& off) { + ::_decode(client_inst, bl, off); + ::_decode(open, bl, off); + ::_decode(cmapv, bl, off); + } + + + void print(ostream& out) { + if (open) + out << "ESession " << client_inst << " open cmapv " << cmapv; + else + out << "ESession " << client_inst << " close cmapv " << cmapv; + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + +}; + +#endif diff --git a/trunk/ceph/mds/events/ESlaveUpdate.h b/trunk/ceph/mds/events/ESlaveUpdate.h new file mode 100644 index 0000000000000..fc673d082d12f --- /dev/null +++ b/trunk/ceph/mds/events/ESlaveUpdate.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + +#ifndef __MDS_ESLAVEUPDATE_H +#define __MDS_ESLAVEUPDATE_H + +#include "../LogEvent.h" +#include "EMetaBlob.h" + +class ESlaveUpdate : public LogEvent { +public: + string type; + metareqid_t reqid; + int op; // prepare, commit, abort + EMetaBlob metablob; + + ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } + ESlaveUpdate(const char *s, metareqid_t ri, int o) : + LogEvent(EVENT_SLAVEUPDATE), + type(s), + reqid(ri), + op(o) { } + + void print(ostream& out) { + if (type.length()) + out << type << " "; + out << " " << op; + out << " " << reqid; + out << metablob; + } + + void encode_payload(bufferlist& bl) { + ::_encode(type, bl); + ::_encode(reqid, bl); + ::_encode(op, bl); + metablob._encode(bl); + } + void decode_payload(bufferlist& bl, int& off) { + ::_decode(type, bl, off); + ::_decode(reqid, bl, off); + ::_decode(op, bl, off); + metablob._decode(bl, off); + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); +}; + +#endif diff --git a/trunk/ceph/mds/events/EUnlink.h b/trunk/ceph/mds/events/EUnlink.h deleted file mode 100644 index 7d972488dab1b..0000000000000 --- a/trunk/ceph/mds/events/EUnlink.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EUNLINK_H -#define __EUNLINK_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -#include "../CInode.h" -#include "../CDentry.h" -#include "../CDir.h" - -/// help rewrite me - -class EUnlink : public LogEvent { - protected: - version_t dirv; - string dname; - - public: - EMetaBlob metaglob; - - /* - EUnlink(CDir *dir, CDentry* dn, CInode *in) : - LogEvent(EVENT_UNLINK), - diritrace(dir->inode), - dirv(dir->get_version()), - dname(dn->get_name()), - inodetrace(in) {} - */ - EUnlink() : LogEvent(EVENT_UNLINK) { } - - virtual void encode_payload(bufferlist& bl) { - /* - diritrace.encode(bl); - bl.append((char*)&dirv, sizeof(dirv)); - ::_encode(dname, bl); - inodetrace.encode(bl); - */ - } - void decode_payload(bufferlist& bl, int& off) { - /* - diritrace.decode(bl,off); - bl.copy(off, sizeof(dirv), (char*)&dirv); - off += sizeof(dirv); - ::_decode(dname, bl, off); - inodetrace.decode(bl, off); - */ - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/journal.cc b/trunk/ceph/mds/journal.cc index 2182d33ffc878..a298ee8cb8520 100644 --- a/trunk/ceph/mds/journal.cc +++ b/trunk/ceph/mds/journal.cc @@ -12,24 +12,33 @@ */ #include "events/EString.h" +#include "events/EImportMap.h" +#include "events/ESession.h" +#include "events/EClientMap.h" #include "events/EMetaBlob.h" -#include "events/EAlloc.h" + #include "events/EUpdate.h" -#include "events/EImportMap.h" +#include "events/ESlaveUpdate.h" +#include "events/EOpen.h" +#include "events/EAlloc.h" #include "events/EPurgeFinish.h" -#include "events/EUnlink.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" + +#include "events/EExport.h" #include "events/EImportStart.h" #include "events/EImportFinish.h" +#include "events/EAnchor.h" +#include "events/EAnchorClient.h" + #include "MDS.h" #include "MDLog.h" #include "MDCache.h" -#include 
"MDStore.h" +#include "Server.h" #include "Migrator.h" +#include "AnchorTable.h" +#include "AnchorClient.h" #include "config.h" #undef dout @@ -63,110 +72,211 @@ void EString::replay(MDS *mds) * * - been safely committed to its dirslice. * - * - has been safely exported. note that !is_auth() && !is_proxy() - * implies safely exported. if !is_auth() && is_proxy(), we need to - * add a waiter for the export to complete. + * - has been safely exported. i.e., authority().first != us. + * in particular, auth of is not enough, we need to + * wait for . + * + * note that this check is overly conservative, in that we'll + * try to flush the dir again if we reimport the subtree, even though + * later journal entries contain the same dirty data (from the import). * */ bool EMetaBlob::has_expired(MDS *mds) { // examine dirv's for my lumps - for (map::iterator lp = lump_map.begin(); + for (map::iterator lp = lump_map.begin(); lp != lump_map.end(); ++lp) { - CInode *diri = mds->mdcache->get_inode(lp->first); - if (!diri) - continue; // we expired it - CDir *dir = diri->dir; + CDir *dir = mds->mdcache->get_dirfrag(lp->first); if (!dir) continue; // we expired it // FIXME: check the slice only - if (dir->is_proxy()) { - dout(10) << "EMetaBlob.has_expired am proxy, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - return false; // we need to wait until the export flushes! 
- } - if (!dir->is_auth()) { + if (dir->authority().first != mds->get_nodeid()) { dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv << " for " << *dir << endl; continue; // not our problem } + if (dir->get_committed_version() >= lp->second.dirv) { + dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv + << " for " << *dir << endl; + continue; // yay + } + + if (dir->is_ambiguous_dir_auth()) { + CDir *ex = mds->mdcache->get_subtree_root(dir); + if (ex->is_exporting()) { + // wait until export is acked (logged on remote) and committed (logged locally) + dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir + << ", exporting on " << *ex << endl; + return false; + } else { + dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir + << ", importing on " << *ex << endl; + return false; + } + } - if (dir->get_last_committed_version() < lp->second.dirv) { + if (dir->get_committed_version() < lp->second.dirv) { dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv << " for " << *dir << endl; return false; // not committed. - } else { - dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv - << " for " << *dir << endl; + } + + assert(0); // i goofed the logic + } + + // have my anchortable ops committed? + for (list::iterator p = atids.begin(); + p != atids.end(); + ++p) { + if (!mds->anchorclient->has_committed(*p)) { + dout(10) << "EMetaBlob.has_expired anchor transaction " << *p + << " not yet acked" << endl; + return false; } } - return true; // all dirlumps expired. 
+ // truncated inodes + for (list< pair >::iterator p = truncated_inodes.begin(); + p != truncated_inodes.end(); + ++p) { + if (mds->mdcache->is_purging(p->first.ino, p->second)) { + dout(10) << "EMetaBlob.has_expired still purging inode " << p->first.ino + << " to " << p->second << endl; + return false; + } + } + + // client requests + for (list::iterator p = client_reqs.begin(); + p != client_reqs.end(); + ++p) { + if (mds->clientmap.have_completed_request(*p)) { + dout(10) << "EMetaBlob.has_expired still have completed request " << *p + << endl; + return false; + } + } + + + return true; // all dirlumps expired, etc. } + void EMetaBlob::expire(MDS *mds, Context *c) { - list commit; + map commit; // dir -> version needed list waitfor_export; + list waitfor_import; int ncommit = 0; // examine dirv's for my lumps // make list of dir slices i need to commit - for (map::iterator lp = lump_map.begin(); + for (map::iterator lp = lump_map.begin(); lp != lump_map.end(); ++lp) { - CInode *diri = mds->mdcache->get_inode(lp->first); - if (!diri) - continue; // we expired it - CDir *dir = diri->dir; + CDir *dir = mds->mdcache->get_dirfrag(lp->first); if (!dir) continue; // we expired it // FIXME: check the slice only - if (dir->is_proxy()) { - // wait until export is acked (logged on remote) and committed (logged locally) - CDir *ex = mds->mdcache->get_export_container(dir); - dout(10) << "EMetaBlob.expire proxy for " << *dir - << ", waiting for export finish on " << *ex << endl; - waitfor_export.push_back(ex); - continue; - } - if (!dir->is_auth()) { + if (dir->authority().first != mds->get_nodeid()) { dout(10) << "EMetaBlob.expire not auth, needed dirv " << lp->second.dirv << " for " << *dir << endl; continue; // not our problem } - if (dir->get_last_committed_version() < lp->second.dirv) { + if (dir->get_committed_version() >= lp->second.dirv) { + dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv + << " on " << *dir << endl; + continue; // yay + } + + if 
(dir->is_ambiguous_dir_auth()) { + CDir *ex = mds->mdcache->get_subtree_root(dir); + if (ex->is_exporting()) { + // wait until export is acked (logged on remote) and committed (logged locally) + dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir + << ", waiting for export finish on " << *ex << endl; + waitfor_export.push_back(ex); + continue; + } else { + dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir + << ", waiting for import finish on " << *ex << endl; + waitfor_import.push_back(ex); + continue; + } + } + if (dir->get_committed_version() < lp->second.dirv) { dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv << ", committing " << *dir << endl; - commit.push_back(dir); + commit[dir] = MAX(commit[dir], lp->second.dirv); ncommit++; - } else { - dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv - << " on " << *dir << endl; + continue; } + + assert(0); // hrm } - // commit - assert(!commit.empty()); + // set up gather context + C_Gather *gather = new C_Gather(c); - if (ncommit == 1) { - mds->mdstore->commit_dir(commit.front(), c); - } else { - C_Gather *gather = new C_Gather(c); - for (list::iterator p = commit.begin(); - p != commit.end(); - ++p) - mds->mdstore->commit_dir(*p, gather->new_sub()); - for (list::iterator p = waitfor_export.begin(); - p != waitfor_export.end(); - ++p) - mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); + // do or wait for exports and commits + for (map::iterator p = commit.begin(); + p != commit.end(); + ++p) { + if (p->first->can_auth_pin()) + p->first->commit(p->second, gather->new_sub()); + else + // pbly about to export|split|merge. 
+ // just wait for it to unfreeze, then retry + p->first->add_waiter(CDir::WAIT_AUTHPINNABLE, gather->new_sub()); + } + for (list::iterator p = waitfor_export.begin(); + p != waitfor_export.end(); + ++p) + mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); + for (list::iterator p = waitfor_import.begin(); + p != waitfor_import.end(); + ++p) + (*p)->add_waiter(CDir::WAIT_IMPORTED, gather->new_sub()); + + + // have my anchortable ops committed? + for (list::iterator p = atids.begin(); + p != atids.end(); + ++p) { + if (!mds->anchorclient->has_committed(*p)) { + dout(10) << "EMetaBlob.expire anchor transaction " << *p + << " not yet acked, waiting" << endl; + mds->anchorclient->wait_for_ack(*p, gather->new_sub()); + } + } + + // truncated inodes + for (list< pair >::iterator p = truncated_inodes.begin(); + p != truncated_inodes.end(); + ++p) { + if (mds->mdcache->is_purging(p->first.ino, p->second)) { + dout(10) << "EMetaBlob.expire waiting for purge of inode " << p->first.ino + << " to " << p->second << endl; + mds->mdcache->wait_for_purge(p->first.ino, p->second, gather->new_sub()); + } + } + + // client requests + for (list::iterator p = client_reqs.begin(); + p != client_reqs.end(); + ++p) { + if (mds->clientmap.have_completed_request(*p)) { + dout(10) << "EMetaBlob.expire waiting on completed request " << *p + << endl; + mds->clientmap.add_trim_waiter(*p, gather->new_sub()); + } } + } void EMetaBlob::replay(MDS *mds) @@ -174,27 +284,33 @@ void EMetaBlob::replay(MDS *mds) dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << endl; // walk through my dirs (in order!) 
- for (list::iterator lp = lump_order.begin(); + for (list::iterator lp = lump_order.begin(); lp != lump_order.end(); ++lp) { dout(10) << "EMetaBlob.replay dir " << *lp << endl; dirlump &lump = lump_map[*lp]; // the dir - CInode *diri = mds->mdcache->get_inode(*lp); - CDir *dir; - if (!diri) { - assert(*lp == 1); - diri = mds->mdcache->create_root_inode(); - dout(10) << "EMetaBlob.replay created root " << *diri << endl; - } - if (diri->dir) { - dir = diri->dir; - dout(20) << "EMetaBlob.replay had dir " << *dir << endl; - } else { - dir = diri->get_or_open_dir(mds->mdcache); - if (*lp == 1) - dir->set_dir_auth(CDIR_AUTH_UNKNOWN); + CDir *dir = mds->mdcache->get_dirfrag(*lp); + if (!dir) { + // hmm. do i have the inode? + CInode *diri = mds->mdcache->get_inode((*lp).ino); + if (!diri) { + if ((*lp).ino == MDS_INO_ROOT) { + diri = mds->mdcache->create_root_inode(); + dout(10) << "EMetaBlob.replay created root " << *diri << endl; + } else if (MDS_INO_IS_STRAY((*lp).ino)) { + int whose = (*lp).ino - MDS_INO_STRAY_OFFSET; + diri = mds->mdcache->create_stray_inode(whose); + dout(10) << "EMetaBlob.replay created stray " << *diri << endl; + } else { + assert(0); + } + } + // create the dirfrag + dir = diri->get_or_open_dirfrag(mds->mdcache, (*lp).frag); + if ((*lp).ino == 1) + dir->set_dir_auth(CDIR_AUTH_UNKNOWN); // FIXME: can root dir be fragmented? hrm. 
dout(10) << "EMetaBlob.replay added dir " << *dir << endl; } dir->set_version( lump.dirv ); @@ -210,27 +326,37 @@ void EMetaBlob::replay(MDS *mds) for (list::iterator p = lump.get_dfull().begin(); p != lump.get_dfull().end(); p++) { + CDentry *dn = dir->lookup(p->dn); + if (!dn) { + dn = dir->add_dentry( p->dn ); + dn->set_version(p->dnv); + if (p->dirty) dn->_mark_dirty(); + dout(10) << "EMetaBlob.replay added " << *dn << endl; + } else { + dn->set_version(p->dnv); + if (p->dirty) dn->_mark_dirty(); + dout(10) << "EMetaBlob.replay had " << *dn << endl; + } + CInode *in = mds->mdcache->get_inode(p->inode.ino); if (!in) { - // inode in = new CInode(mds->mdcache); in->inode = p->inode; if (in->inode.is_symlink()) in->symlink = p->symlink; mds->mdcache->add_inode(in); - // dentry - CDentry *dn = dir->add_dentry( p->dn, in ); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << " " << *in << endl; + dir->link_inode(dn, in); + if (p->dirty) in->_mark_dirty(); + dout(10) << "EMetaBlob.replay added " << *in << endl; } else { - // inode + if (in->get_parent_dn()) { + dout(10) << "EMetaBlob.replay unlinking " << *in << endl; + in->get_parent_dn()->get_dir()->unlink_inode(in->get_parent_dn()); + } in->inode = p->inode; if (in->inode.is_symlink()) in->symlink = p->symlink; - // dentry - CDentry *dn = in->get_parent_dn(); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *in->get_parent_dn() << " " << *in << endl; + dir->link_inode(dn, in); + if (p->dirty) in->_mark_dirty(); + dout(10) << "EMetaBlob.replay linked " << *in << endl; } } @@ -243,12 +369,16 @@ void EMetaBlob::replay(MDS *mds) dn = dir->add_dentry(p->dn, p->ino); dn->set_remote_ino(p->ino); dn->set_version(p->dnv); - dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(); dout(10) << "EMetaBlob.replay added " << *dn << endl; } else { + if (!dn->is_null()) { + dout(10) << "EMetaBlob.replay unlinking " << *dn << endl; + 
dir->unlink_inode(dn); + } dn->set_remote_ino(p->ino); dn->set_version(p->dnv); - dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(); dout(10) << "EMetaBlob.replay had " << *dn << endl; } } @@ -261,15 +391,124 @@ void EMetaBlob::replay(MDS *mds) if (!dn) { dn = dir->add_dentry(p->dn); dn->set_version(p->dnv); - dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(); dout(10) << "EMetaBlob.replay added " << *dn << endl; } else { + if (!dn->is_null()) { + dout(10) << "EMetaBlob.replay unlinking " << *dn << endl; + dir->unlink_inode(dn); + } dn->set_version(p->dnv); - dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(); dout(10) << "EMetaBlob.replay had " << *dn << endl; } } } + + // anchor transactions + for (list::iterator p = atids.begin(); + p != atids.end(); + ++p) { + dout(10) << "EMetaBlob.replay noting anchor transaction " << *p << endl; + mds->anchorclient->got_journaled_agree(*p); + } + + // truncated inodes + for (list< pair >::iterator p = truncated_inodes.begin(); + p != truncated_inodes.end(); + ++p) { + dout(10) << "EMetaBlob.replay will purge truncated inode " << p->first.ino + << " to " << p->second << endl; + mds->mdcache->add_recovered_purge(p->first, p->second); + } + + // client requests + for (list::iterator p = client_reqs.begin(); + p != client_reqs.end(); + ++p) + mds->clientmap.add_completed_request(*p); +} + +// ----------------------- +// EClientMap + +bool EClientMap::has_expired(MDS *mds) +{ + if (mds->clientmap.get_committed() >= cmapv) { + dout(10) << "EClientMap.has_expired newer clientmap " << mds->clientmap.get_committed() + << " >= " << cmapv << " has committed" << endl; + return true; + } else if (mds->clientmap.get_committing() >= cmapv) { + dout(10) << "EClientMap.has_expired newer clientmap " << mds->clientmap.get_committing() + << " >= " << cmapv << " is still committing" << endl; + return false; + } else { + dout(10) << "EClientMap.has_expired clientmap " << mds->clientmap.get_version() + << " not empty" << endl; + return 
false; + } +} + +void EClientMap::expire(MDS *mds, Context *c) +{ + if (mds->clientmap.get_committing() >= cmapv) { + dout(10) << "EClientMap.expire logging clientmap" << endl; + assert(mds->clientmap.get_committing() > mds->clientmap.get_committed()); + mds->clientmap.add_commit_waiter(c); + } else { + dout(10) << "EClientMap.expire logging clientmap" << endl; + mds->log_clientmap(c); + } +} + +void EClientMap::replay(MDS *mds) +{ + dout(10) << "EClientMap.replay v " << cmapv << endl; + int off = 0; + mds->clientmap.decode(mapbl, off); + mds->clientmap.set_committed(mds->clientmap.get_version()); + mds->clientmap.set_committing(mds->clientmap.get_version()); +} + + +// ESession +bool ESession::has_expired(MDS *mds) +{ + if (mds->clientmap.get_committed() >= cmapv) { + dout(10) << "ESession.has_expired newer clientmap " << mds->clientmap.get_committed() + << " >= " << cmapv << " has committed" << endl; + return true; + } else if (mds->clientmap.get_committing() >= cmapv) { + dout(10) << "ESession.has_expired newer clientmap " << mds->clientmap.get_committing() + << " >= " << cmapv << " is still committing" << endl; + return false; + } else { + dout(10) << "ESession.has_expired clientmap " << mds->clientmap.get_version() + << " not empty" << endl; + return false; + } +} + +void ESession::expire(MDS *mds, Context *c) +{ + if (mds->clientmap.get_committing() >= cmapv) { + dout(10) << "ESession.expire logging clientmap" << endl; + assert(mds->clientmap.get_committing() > mds->clientmap.get_committed()); + mds->clientmap.add_commit_waiter(c); + } else { + dout(10) << "ESession.expire logging clientmap" << endl; + mds->log_clientmap(c); + } +} + +void ESession::replay(MDS *mds) +{ + dout(10) << "ESession.replay" << endl; + if (open) + mds->clientmap.open_session(client_inst); + else + mds->clientmap.close_session(client_inst.name.num()); + mds->clientmap.reset_projected(); // make it follow version. 
} @@ -322,6 +561,91 @@ void EAlloc::replay(MDS *mds) } +// ----------------------- +// EAnchor + +bool EAnchor::has_expired(MDS *mds) +{ + version_t cv = mds->anchortable->get_committed_version(); + if (cv < version) { + dout(10) << "EAnchor.has_expired v " << version << " > " << cv + << ", still dirty" << endl; + return false; // still dirty + } else { + dout(10) << "EAnchor.has_expired v " << version << " <= " << cv + << ", already flushed" << endl; + return true; // already flushed + } +} + +void EAnchor::expire(MDS *mds, Context *c) +{ + dout(10) << "EAnchor.expire saving anchor table" << endl; + mds->anchortable->save(c); +} + +void EAnchor::replay(MDS *mds) +{ + if (mds->anchortable->get_version() >= version) { + dout(10) << "EAnchor.replay event " << version + << " <= table " << mds->anchortable->get_version() << endl; + } else { + dout(10) << " EAnchor.replay event " << version + << " - 1 == table " << mds->anchortable->get_version() << endl; + assert(version-1 == mds->anchortable->get_version()); + + switch (op) { + // anchortable + case ANCHOR_OP_CREATE_PREPARE: + mds->anchortable->create_prepare(ino, trace, reqmds); + break; + case ANCHOR_OP_DESTROY_PREPARE: + mds->anchortable->destroy_prepare(ino, reqmds); + break; + case ANCHOR_OP_UPDATE_PREPARE: + mds->anchortable->update_prepare(ino, trace, reqmds); + break; + case ANCHOR_OP_COMMIT: + mds->anchortable->commit(atid); + break; + + default: + assert(0); + } + + assert(version == mds->anchortable->get_version()); + } +} + + +// EAnchorClient + +bool EAnchorClient::has_expired(MDS *mds) +{ + return true; +} + +void EAnchorClient::expire(MDS *mds, Context *c) +{ + assert(0); +} + +void EAnchorClient::replay(MDS *mds) +{ + dout(10) << " EAnchorClient.replay op " << op << " atid " << atid << endl; + + switch (op) { + // anchorclient + case ANCHOR_OP_ACK: + mds->anchorclient->got_journaled_ack(atid); + break; + + default: + assert(0); + } +} + + // ----------------------- // EUpdate @@ -341,6 +665,74 @@ void 
EUpdate::replay(MDS *mds) } +// ------------------------ +// EOpen + +bool EOpen::has_expired(MDS *mds) +{ + for (list::iterator p = inos.begin(); p != inos.end(); ++p) { + CInode *in = mds->mdcache->get_inode(*p); + if (in && + in->is_any_caps() && + !(in->last_open_journaled > get_start_off() || + in->last_open_journaled == 0)) { + dout(10) << "EOpen.has_expired still refer to caps on " << *in << endl; + return false; + } + } + return true; +} + +void EOpen::expire(MDS *mds, Context *c) +{ + dout(10) << "EOpen.expire " << endl; + + if (mds->mdlog->is_capped()) { + dout(0) << "uh oh, log is capped, but i have unexpired opens." << endl; + assert(0); + } + + for (list::iterator p = inos.begin(); p != inos.end(); ++p) { + CInode *in = mds->mdcache->get_inode(*p); + if (!in) continue; + if (!in->is_any_caps()) continue; + + dout(10) << "EOpen.expire " << in->ino() + << " last_open_journaled " << in->last_open_journaled << endl; + + mds->server->queue_journal_open(in); + } + mds->server->add_journal_open_waiter(c); + mds->server->maybe_journal_opens(); +} + +void EOpen::replay(MDS *mds) +{ + dout(10) << "EOpen.replay " << endl; + metablob.replay(mds); +} + + +// ----------------------- +// ESlaveUpdate + +bool ESlaveUpdate::has_expired(MDS *mds) +{ + return true; + //return metablob.has_expired(mds); +} + +void ESlaveUpdate::expire(MDS *mds, Context *c) +{ + metablob.expire(mds, c); +} + +void ESlaveUpdate::replay(MDS *mds) +{ + //metablob.replay(mds); +} + + // ----------------------- // EImportMap @@ -384,63 +776,23 @@ void EImportMap::expire(MDS *mds, Context *c) void EImportMap::replay(MDS *mds) { - dout(10) << "EImportMap.replay -- reconstructing import/export spanning tree" << endl; - assert(mds->mdcache->imports.empty()); - - // first, stick the spanning tree in my cache - metablob.replay(mds); - - // restore import/export maps - for (set::iterator p = imports.begin(); - p != imports.end(); - ++p) { - mds->mdcache->add_ambiguous_import(*p, nested_exports[*p]); - 
mds->mdcache->finish_ambiguous_import(*p); - } - - mds->mdcache->show_imports(); -} - - - -// ----------------------- -// EUnlink - -bool EUnlink::has_expired(MDS *mds) -{ - /* - // dir - CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); - CDir *dir = 0; - if (diri) dir = diri->dir; - - if (dir && dir->get_last_committed_version() < dirv) return false; - - if (!inodetrace.trace.empty()) { - // inode - CInode *in = mds->mdcache->get_inode( inodetrace.back().inode.ino ); - if (in && in->get_last_committed_version() < inodetrace.back().inode.version) - return false; + if (mds->mdcache->is_subtrees()) { + dout(10) << "EImportMap.replay -- ignoring, already have import map" << endl; + } else { + dout(10) << "EImportMap.replay -- reconstructing (auth) subtree spanning tree" << endl; + + // first, stick the spanning tree in my cache + metablob.replay(mds); + + // restore import/export maps + for (set::iterator p = imports.begin(); + p != imports.end(); + ++p) { + CDir *dir = mds->mdcache->get_dirfrag(*p); + mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid()); + } } - */ - return true; -} - -void EUnlink::expire(MDS *mds, Context *c) -{ - /* - CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); - CDir *dir = diri->dir; - assert(dir); - - // okay! 
- dout(7) << "commiting dirty (from unlink) dir " << *dir << endl; - mds->mdstore->commit_dir(dir, dirv, c); - */ -} - -void EUnlink::replay(MDS *mds) -{ + mds->mdcache->show_subtrees(); } @@ -457,10 +809,13 @@ bool EPurgeFinish::has_expired(MDS *mds) void EPurgeFinish::expire(MDS *mds, Context *c) { + assert(0); } void EPurgeFinish::replay(MDS *mds) { + dout(10) << "EPurgeFinish.replay " << ino << " to " << newsize << endl; + mds->mdcache->remove_recovered_purge(ino, newsize); } @@ -470,67 +825,48 @@ void EPurgeFinish::replay(MDS *mds) // ========================================================================= // ----------------------- -// EExportStart +// EExport -bool EExportStart::has_expired(MDS *mds) +bool EExport::has_expired(MDS *mds) { - CInode *diri = mds->mdcache->get_inode(dirino); - if (!diri) return true; - CDir *dir = diri->dir; + CDir *dir = mds->mdcache->get_dirfrag(base); if (!dir) return true; if (!mds->mdcache->migrator->is_exporting(dir)) return true; - dout(10) << "EExportStart.has_expired still exporting " << *dir << endl; + dout(10) << "EExport.has_expired still exporting " << *dir << endl; return false; } -void EExportStart::expire(MDS *mds, Context *c) +void EExport::expire(MDS *mds, Context *c) { - CInode *diri = mds->mdcache->get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; + CDir *dir = mds->mdcache->get_dirfrag(base); assert(dir); assert(mds->mdcache->migrator->is_exporting(dir)); - dout(10) << "EExportStart.expire waiting for export of " << *dir << endl; + dout(10) << "EExport.expire waiting for export of " << *dir << endl; mds->mdcache->migrator->add_export_finish_waiter(dir, c); } -void EExportStart::replay(MDS *mds) +void EExport::replay(MDS *mds) { - dout(10) << "EExportStart.replay " << dirino << " -> " << dest << endl; + dout(10) << "EExport.replay " << base << endl; metablob.replay(mds); - // put in pending_exports lists - mds->mdlog->pending_exports[dirino] = bounds; -} - -// ----------------------- -// 
EExportFinish - -bool EExportFinish::has_expired(MDS *mds) -{ - // we can always expire. - return true; -} - -void EExportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // should never happen. -} - -void EExportFinish::replay(MDS *mds) -{ - dout(10) << "EExportFinish.replay " << dirino << " success=" << success << endl; - - assert(mds->mdlog->pending_exports.count(dirino)); - - // finish? - if (success) - mds->mdcache->finish_ambiguous_export(dirino, mds->mdlog->pending_exports[dirino]); + CDir *dir = mds->mdcache->get_dirfrag(base); + assert(dir); + + set realbounds; + for (set::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *bd = mds->mdcache->get_dirfrag(*p); + assert(bd); + realbounds.insert(bd); + } - // remove from pending_exports list - mds->mdlog->pending_exports.erase(dirino); + // adjust auth away + mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, pair(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)); + mds->mdcache->try_subtree_merge(dir); } @@ -544,22 +880,17 @@ bool EImportStart::has_expired(MDS *mds) void EImportStart::expire(MDS *mds, Context *c) { - dout(10) << "EImportStart.expire " << dirino << endl; + dout(10) << "EImportStart.expire " << base << endl; metablob.expire(mds, c); } void EImportStart::replay(MDS *mds) { - dout(10) << "EImportStart.replay " << dirino << endl; + dout(10) << "EImportStart.replay " << base << endl; metablob.replay(mds); - // convert list -> set - set b; - for (list::iterator p = bounds.begin(); p != bounds.end(); ++p) - b.insert(*p); - // put in ambiguous import list - mds->mdcache->add_ambiguous_import(dirino, b); + mds->mdcache->add_ambiguous_import(base, bounds); } // ----------------------- @@ -576,11 +907,11 @@ void EImportFinish::expire(MDS *mds, Context *c) void EImportFinish::replay(MDS *mds) { - dout(10) << "EImportFinish.replay " << dirino << " success=" << success << endl; + dout(10) << "EImportFinish.replay " << base << " success=" << success << endl; if (success) - 
mds->mdcache->finish_ambiguous_import(dirino); + mds->mdcache->finish_ambiguous_import(base); else - mds->mdcache->cancel_ambiguous_import(dirino); + mds->mdcache->cancel_ambiguous_import(base); } diff --git a/trunk/ceph/mds/mdstypes.h b/trunk/ceph/mds/mdstypes.h index 1ac4525e76559..41b7f69e2e51b 100644 --- a/trunk/ceph/mds/mdstypes.h +++ b/trunk/ceph/mds/mdstypes.h @@ -10,37 +10,105 @@ using namespace std; #include "config.h" #include "common/DecayCounter.h" +#include "include/Context.h" #include +#include "include/frag.h" -// md ops -#define MDS_OP_STATFS 1 +#define MDS_PORT_MAIN 0 +#define MDS_PORT_SERVER 1 +#define MDS_PORT_CACHE 2 +#define MDS_PORT_LOCKER 3 +#define MDS_PORT_STORE 4 +#define MDS_PORT_BALANCER 5 +#define MDS_PORT_MIGRATOR 6 +#define MDS_PORT_RENAMER 7 +#define MDS_PORT_ANCHORCLIENT 10 +#define MDS_PORT_ANCHORTABLE 11 -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_UTIME 102 -#define MDS_OP_CHMOD 103 -#define MDS_OP_CHOWN 104 +#define MAX_MDS 0x100 +#define MDS_INO_ROOT 1 +#define MDS_INO_PGTABLE 2 +#define MDS_INO_ANCHORTABLE 3 +#define MDS_INO_LOG_OFFSET 0x100 +#define MDS_INO_IDS_OFFSET 0x200 +#define MDS_INO_STRAY_OFFSET 0x300 +#define MDS_INO_BASE 0x1000 -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 201 -#define MDS_OP_LINK 202 -#define MDS_OP_UNLINK 203 -#define MDS_OP_RENAME 204 +#define MDS_INO_STRAY(x) (MDS_INO_STRAY_OFFSET+((unsigned)x)) +#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < MDS_INO_STRAY_OFFSET+MAX_MDS) -#define MDS_OP_MKDIR 220 -#define MDS_OP_RMDIR 221 -#define MDS_OP_SYMLINK 222 +#define MDS_TRAVERSE_FORWARD 1 +#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. +#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. 
+#define MDS_TRAVERSE_FAIL 4 -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 306 -#define MDS_OP_FSYNC 307 -//#define MDS_OP_CLOSE 310 -#define MDS_OP_RELEASE 308 +struct metareqid_t { + int client; + tid_t tid; + metareqid_t() : client(-1), tid(0) {} + metareqid_t(int c, tid_t t) : client(c), tid(t) {} +}; + +inline ostream& operator<<(ostream& out, const metareqid_t& r) { + return out << "client" << r.client << ":" << r.tid; +} + +inline bool operator==(const metareqid_t& l, const metareqid_t& r) { + return (l.client == r.client) && (l.tid == r.tid); +} +inline bool operator!=(const metareqid_t& l, const metareqid_t& r) { + return (l.client != r.client) || (l.tid != r.tid); +} +inline bool operator<(const metareqid_t& l, const metareqid_t& r) { + return (l.client < r.client) || + (l.client == r.client && l.tid < r.tid); +} +inline bool operator<=(const metareqid_t& l, const metareqid_t& r) { + return (l.client < r.client) || + (l.client == r.client && l.tid <= r.tid); +} +inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); } +inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); } + +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const metareqid_t &r) const { + hash<__uint64_t> H; + return H(r.client) ^ H(r.tid); + } + }; +} + + + + +// ================================================================ +// dir frag + +struct dirfrag_t { + inodeno_t ino; + frag_t frag; + + dirfrag_t() { } + dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { } +}; + +inline ostream& operator<<(ostream& out, const dirfrag_t& df) { + return out << df.ino << "#" << df.frag; +} +inline bool operator<(dirfrag_t l, dirfrag_t r) { + if (l.ino < r.ino) return true; + if (l.ino == r.ino && l.frag < r.frag) return true; + return false; +} +inline bool operator==(dirfrag_t l, dirfrag_t r) { + return l.ino == r.ino && l.frag == r.frag; +} // ================================================================ 
@@ -165,62 +233,131 @@ inline mds_load_t operator/( mds_load_t& a, double d ) */ + + // ================================================================ -// dir slices -struct dirslice_t { - short hash_mask; - short hash_val; -}; +//#define MDS_PIN_REPLICATED 1 +//#define MDS_STATE_AUTH (1<<0) +class MLock; +class SimpleLock; +class MDSCacheObject; -// ================================================================ +// -- authority delegation -- +// directory authority types +// >= 0 is the auth mds +#define CDIR_AUTH_PARENT -1 // default +#define CDIR_AUTH_UNKNOWN -2 +#define CDIR_AUTH_DEFAULT pair(-1, -2) +#define CDIR_AUTH_UNDEF pair(-2, -2) +#define CDIR_AUTH_ROOTINODE pair( 0, -2) + + + +// print hack +struct mdsco_db_line_prefix { + MDSCacheObject *object; + mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} +}; +ostream& operator<<(ostream& out, mdsco_db_line_prefix o); + +// printer +ostream& operator<<(ostream& out, MDSCacheObject &o); -#define MDS_PIN_REPLICATED 1 class MDSCacheObject { - protected: - unsigned state; // state bits + public: + // -- pins -- + const static int PIN_REPLICATED = 1000; + const static int PIN_DIRTY = 1001; + const static int PIN_RDLOCK = -1002; + const static int PIN_XLOCK = 1003; + const static int PIN_REQUEST = -1004; + const static int PIN_WAITER = 1005; - int ref; // reference count - set ref_set; + const char *generic_pin_name(int p) { + switch (p) { + case PIN_REPLICATED: return "replicated"; + case PIN_DIRTY: return "dirty"; + case PIN_RDLOCK: return "rdlock"; + case PIN_XLOCK: return "xlock"; + case PIN_REQUEST: return "request"; + case PIN_WAITER: return "waiter"; + default: assert(0); + } + } - map replicas; // [auth] mds -> nonce - int replica_nonce; // [replica] defined on replica + // -- state -- + const static int STATE_AUTH = (1<<30); + const static int STATE_DIRTY = (1<<29); + + // -- wait -- + const static int WAIT_SINGLEAUTH = (1<<30); + + // ============================================ + // cons public: 
MDSCacheObject() : state(0), ref(0), replica_nonce(0) {} virtual ~MDSCacheObject() {} + + // printing + virtual void print(ostream& out) = 0; + virtual ostream& print_db_line_prefix(ostream& out) { + return out << "mdscacheobject(" << this << ") "; + } // -------------------------------------------- // state + protected: + unsigned state; // state bits + + public: unsigned get_state() { return state; } void state_clear(unsigned mask) { state &= ~mask; } void state_set(unsigned mask) { state |= mask; } unsigned state_test(unsigned mask) { return state & mask; } void state_reset(unsigned s) { state = s; } + bool is_auth() { return state_test(STATE_AUTH); } + bool is_dirty() { return state & STATE_DIRTY; } + bool is_clean() { return !is_dirty(); } + + // -------------------------------------------- + // authority + virtual pair authority() = 0; + bool is_ambiguous_auth() { + return authority().second != CDIR_AUTH_UNKNOWN; + } + // -------------------------------------------- // pins +protected: + int ref; // reference count + multiset ref_set; + + public: int get_num_ref() { return ref; } bool is_pinned_by(int by) { return ref_set.count(by); } - set& get_ref_set() { return ref_set; } + multiset& get_ref_set() { return ref_set; } + virtual const char *pin_name(int by) = 0; virtual void last_put() {} virtual void bad_put(int by) { - assert(ref_set.count(by) == 1); + assert(ref_set.count(by) > 0); assert(ref > 0); } void put(int by) { - if (ref == 0 || ref_set.count(by) != 1) { + if (ref == 0 || ref_set.count(by) == 0) { bad_put(by); } else { ref--; - ref_set.erase(by); + ref_set.erase(ref_set.find(by)); assert(ref == (int)ref_set.size()); if (ref == 0) last_put(); @@ -229,11 +366,11 @@ class MDSCacheObject { virtual void first_get() {} virtual void bad_get(int by) { - assert(ref_set.count(by) == 0); + assert(by < 0 || ref_set.count(by) == 0); assert(0); } void get(int by) { - if (ref_set.count(by)) { + if (by >= 0 && ref_set.count(by)) { bad_get(by); } else { if (ref == 
0) @@ -244,10 +381,29 @@ class MDSCacheObject { } } + void print_pin_set(ostream& out) { + multiset::iterator it = ref_set.begin(); + while (it != ref_set.end()) { + out << " " << pin_name(*it); + int last = *it; + int c = 1; + do { + it++; + if (it == ref_set.end()) break; + } while (*it == last); + if (c > 1) + out << "*" << c; + } + } // -------------------------------------------- // replication + protected: + map replicas; // [auth] mds -> nonce + int replica_nonce; // [replica] defined on replica + + public: bool is_replicated() { return !replicas.empty(); } bool is_replica(int mds) { return replicas.count(mds); } int num_replicas() { return replicas.size(); } @@ -255,12 +411,12 @@ class MDSCacheObject { if (replicas.count(mds)) return ++replicas[mds]; // inc nonce if (replicas.empty()) - get(MDS_PIN_REPLICATED); + get(PIN_REPLICATED); return replicas[mds] = 1; } void add_replica(int mds, int nonce) { if (replicas.empty()) - get(MDS_PIN_REPLICATED); + get(PIN_REPLICATED); replicas[mds] = nonce; } int get_replica_nonce(int mds) { @@ -271,11 +427,11 @@ class MDSCacheObject { assert(replicas.count(mds)); replicas.erase(mds); if (replicas.empty()) - put(MDS_PIN_REPLICATED); + put(PIN_REPLICATED); } void clear_replicas() { if (!replicas.empty()) - put(MDS_PIN_REPLICATED); + put(PIN_REPLICATED); replicas.clear(); } map::iterator replicas_begin() { return replicas.begin(); } @@ -284,7 +440,89 @@ class MDSCacheObject { int get_replica_nonce() { return replica_nonce;} void set_replica_nonce(int n) { replica_nonce = n; } + + + // --------------------------------------------- + // waiting + protected: + multimap waiting; + + public: + bool is_waiter_for(int mask) { + return waiting.count(mask) > 0; // FIXME: not quite right. 
+ } + void add_waiter(int mask, Context *c) { + if (waiting.empty()) + get(PIN_WAITER); + waiting.insert(pair(mask, c)); + dout(10) << (mdsco_db_line_prefix(this)) + << "add_waiter " << mask << " " << c + << " on " << *this + << endl; + + } + void take_waiting(int mask, list& ls) { + if (waiting.empty()) return; + multimap::iterator it = waiting.begin(); + while (it != waiting.end()) { + if (it->first & mask) { + ls.push_back(it->second); + dout(10) << (mdsco_db_line_prefix(this)) + << "take_waiting mask " << mask << " took " << it->second + << " tag " << it->first + << " on " << *this + << endl; + waiting.erase(it++); + } else { + dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second + << " tag " << it->first + << " on " << *this + << endl; + it++; + } + } + if (waiting.empty()) + put(PIN_WAITER); + } + void finish_waiting(int mask, int result = 0) { + list finished; + take_waiting(mask, finished); + finish_contexts(finished, result); + } + + + // --------------------------------------------- + // locking + // noop unless overloaded. 
+ virtual SimpleLock* get_lock(int type) { assert(0); } + virtual void set_mlock_info(MLock *m) { assert(0); } + virtual void encode_lock_state(int type, bufferlist& bl) { assert(0); } + virtual void decode_lock_state(int type, bufferlist& bl) { assert(0); } + virtual void finish_lock_waiters(int type, int mask, int r=0) { assert(0); } + virtual void add_lock_waiter(int type, int mask, Context *c) { assert(0); } + virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; } + + + // --------------------------------------------- + // ordering + virtual bool is_lt(const MDSCacheObject *r) const = 0; + struct ptr_lt { + bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { + return l->is_lt(r); + } + }; + }; +inline ostream& operator<<(ostream& out, MDSCacheObject &o) { + o.print(out); + return out; +} + +inline ostream& operator<<(ostream& out, mdsco_db_line_prefix o) { + o.object->print_db_line_prefix(out); + return out; +} + #endif diff --git a/trunk/ceph/messages/MAnchorRequest.h b/trunk/ceph/messages/MAnchor.h similarity index 50% rename from trunk/ceph/messages/MAnchorRequest.h rename to trunk/ceph/messages/MAnchor.h index 2a2d0088978b4..1347aca5e697e 100644 --- a/trunk/ceph/messages/MAnchorRequest.h +++ b/trunk/ceph/messages/MAnchor.h @@ -18,34 +18,38 @@ #include #include "msg/Message.h" -#include "mds/AnchorTable.h" +#include "mds/Anchor.h" -#define ANCHOR_OP_CREATE 1 -#define ANCHOR_OP_DESTROY 2 -#define ANCHOR_OP_LOOKUP 3 -#define ANCHOR_OP_UPDATE 4 -class MAnchorRequest : public Message { +class MAnchor : public Message { int op; inodeno_t ino; - vector trace; + vector trace; + version_t atid; // anchor table version. 
public: - MAnchorRequest() {} - MAnchorRequest(int op, inodeno_t ino) : Message(MSG_MDS_ANCHORREQUEST) { - this->op = op; - this->ino = ino; + MAnchor() {} + MAnchor(int o, inodeno_t i, version_t v=0) : + Message(MSG_MDS_ANCHOR), + op(o), ino(i), atid(v) { } + + virtual char *get_type_name() { return "anchor"; } + void print(ostream& o) { + o << "anchor(" << get_anchor_opname(op); + if (ino) o << " " << ino; + if (atid) o << " atid " << atid; + if (!trace.empty()) o << ' ' << trace; + o << ")"; } - ~MAnchorRequest() { - for (unsigned i=0; i& trace) { this->trace = trace; } + void set_trace(vector& trace) { + this->trace = trace; + } int get_op() { return op; } inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } + vector& get_trace() { return trace; } + version_t get_atid() { return atid; } virtual void decode_payload() { int off = 0; @@ -53,23 +57,16 @@ class MAnchorRequest : public Message { off += sizeof(op); payload.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - trace.push_back(a); - } + payload.copy(off, sizeof(atid), (char*)&atid); + off += sizeof(atid); + ::_decode(trace, payload, off); } virtual void encode_payload() { payload.append((char*)&op, sizeof(op)); payload.append((char*)&ino, sizeof(ino)); - int n = trace.size(); - payload.append((char*)&n, sizeof(int)); - for (int i=0; i_encode(payload); + payload.append((char*)&atid, sizeof(atid)); + ::_encode(trace, payload); } }; diff --git a/trunk/ceph/messages/MAnchorReply.h b/trunk/ceph/messages/MAnchorReply.h deleted file mode 100644 index 0186118f53260..0000000000000 --- a/trunk/ceph/messages/MAnchorReply.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or 
- * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MANCHORREPLY_H -#define __MANCHORREPLY_H - -#include - -#include "msg/Message.h" -#include "mds/AnchorTable.h" - -#include "MAnchorRequest.h" - - -class MAnchorReply : public Message { - int op; - inodeno_t ino; - vector trace; - - public: - MAnchorReply() {} - MAnchorReply(MAnchorRequest *req) : Message(MSG_MDS_ANCHORREPLY) { - this->op = req->get_op(); - this->ino = req->get_ino(); - } - ~MAnchorReply() { - for (unsigned i=0; i& trace) { this->trace = trace; } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - trace.push_back(a); - } - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - int n = trace.size(); - payload.append((char*)&n, sizeof(int)); - for (int i=0; i_encode(payload); - } -}; - -#endif diff --git a/trunk/ceph/messages/MCacheExpire.h b/trunk/ceph/messages/MCacheExpire.h index 461d283c23072..9a6b3f1497920 100644 --- a/trunk/ceph/messages/MCacheExpire.h +++ b/trunk/ceph/messages/MCacheExpire.h @@ -11,21 +11,25 @@ * */ - #ifndef __MCACHEEXPIRE_H #define __MCACHEEXPIRE_H class MCacheExpire : public Message { int from; - map inodes; - map dirs; - map > dentries; - public: +public: + /* + group things by realm (auth delgation root), since that's how auth is determined. + that makes it less work to process when exports are in progress. 
+ */ + struct realm { + map inodes; + map dirs; + map > dentries; + }; + map realms; + int get_from() { return from; } - map& get_inodes() { return inodes; } - map& get_dirs() { return dirs; } - map >& get_dentries() { return dentries; } MCacheExpire() {} MCacheExpire(int f) : @@ -34,17 +38,33 @@ class MCacheExpire : public Message { virtual char *get_type_name() { return "CEx";} - void add_inode(inodeno_t ino, int nonce) { - inodes[ino] = nonce; + void add_inode(dirfrag_t r, inodeno_t ino, int nonce) { + realms[r].inodes[ino] = nonce; } - void add_dir(inodeno_t ino, int nonce) { - dirs[ino] = nonce; + void add_dir(dirfrag_t r, dirfrag_t df, int nonce) { + realms[r].dirs[df] = nonce; } - void add_dentry(inodeno_t dirino, const string& dn, int nonce) { - dentries[dirino][dn] = nonce; + void add_dentry(dirfrag_t r, dirfrag_t df, const string& dn, int nonce) { + realms[r].dentries[df][dn] = nonce; } - void add_dentries(inodeno_t dirino, map& dmap) { - dentries[dirino] = dmap; + + void add_realm(dirfrag_t df, realm& r) { + realm& myr = realms[df]; + for (map::iterator p = r.inodes.begin(); + p != r.inodes.end(); + ++p) + myr.inodes[p->first] = p->second; + for (map::iterator p = r.dirs.begin(); + p != r.dirs.end(); + ++p) + myr.dirs[p->first] = p->second; + for (map >::iterator p = r.dentries.begin(); + p != r.dentries.end(); + ++p) + for (map::iterator q = p->second.begin(); + q != p->second.end(); + ++q) + myr.dentries[p->first][q->first] = q->second; } void decode_payload() { @@ -53,32 +73,52 @@ class MCacheExpire : public Message { payload.copy(off, sizeof(from), (char*)&from); off += sizeof(from); - ::_decode(inodes, payload, off); - ::_decode(dirs, payload, off); - - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i >::iterator p = dentries.begin(); - p != dentries.end(); - ++p) { - payload.append((char*)&p->first, sizeof(p->first)); - ::_encode(p->second, payload); + + int nr = realms.size(); + payload.append((char*)&nr, 
sizeof(nr)); + + for (map::iterator q = realms.begin(); + q != realms.end(); + ++q) { + payload.append((char*)&q->first, sizeof(q->first)); + + ::_encode(q->second.inodes, payload); + ::_encode(q->second.dirs, payload); + + int n = q->second.dentries.size(); + payload.append((char*)&n, sizeof(n)); + for (map >::iterator p = q->second.dentries.begin(); + p != q->second.dentries.end(); + ++p) { + payload.append((char*)&p->first, sizeof(p->first)); + ::_encode(p->second, payload); + } } } }; diff --git a/trunk/ceph/messages/MClientFileCaps.h b/trunk/ceph/messages/MClientFileCaps.h index 7fde047b02655..8b1a88cd848f3 100644 --- a/trunk/ceph/messages/MClientFileCaps.h +++ b/trunk/ceph/messages/MClientFileCaps.h @@ -11,27 +11,23 @@ * */ - #ifndef __MCLIENTFILECAPS_H #define __MCLIENTFILECAPS_H -#define CLIENT_FILECAP_RELEASE 1 // mds closed the cap -#define CLIENT_FILECAP_STALE 2 // mds has exported the cap -#define CLIENT_FILECAP_REAP 3 // mds has imported the cap from get_mds() +#include "msg/Message.h" class MClientFileCaps : public Message { public: - static const int FILECAP_RELEASE = 1; - static const int FILECAP_STALE = 2; - static const int FILECAP_REAP = 3; - + static const int OP_ACK = 0; // mds->client or client->mds update. FIXME? 
+ static const int OP_RELEASE = 1; // mds closed the cap + static const int OP_STALE = 2; // mds has exported the cap + static const int OP_REAP = 3; // mds has imported the cap from get_mds() private: inode_t inode; int caps; long seq; int wanted; - //int client; int special; // stale || reap; in conjunction w/ mds value int mds; @@ -42,13 +38,11 @@ class MClientFileCaps : public Message { int get_caps() { return caps; } int get_wanted() { return wanted; } long get_seq() { return seq; } - //int get_client() { return client; } // for cap migration int get_mds() { return mds; } int get_special() { return special; } - //void set_client(int c) { client = c; } void set_caps(int c) { caps = c; } void set_wanted(int w) { wanted = w; } @@ -60,7 +54,7 @@ class MClientFileCaps : public Message { long seq, int caps, int wanted, - int special=0, + int special = OP_ACK, int mds=0) : Message(MSG_CLIENT_FILECAPS) { this->inode = inode; @@ -70,32 +64,32 @@ class MClientFileCaps : public Message { this->special = special; this->mds = mds; } - virtual char *get_type_name() { return "Cfcap";} + + char *get_type_name() { return "Cfcap";} + void print(ostream& out) { + out << "client_file_caps(" << inode.ino + << " seq " << seq + << " caps " << cap_string(caps) + << " wanted " << cap_string(wanted) // trailing space keeps the label apart from the cap string, like " seq " and " caps " + << ")"; + } - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(seq), (char*)&seq); - off += sizeof(seq); - s.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - s.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - s.copy(off, sizeof(wanted), (char*)&wanted); - off += sizeof(wanted); - //s.copy(off, sizeof(client), (char*)&client); - //off += sizeof(client); - s.copy(off, sizeof(mds), (char*)&mds); - off += sizeof(mds); - s.copy(off, sizeof(special), (char*)&special); - off += sizeof(special); + void decode_payload() { + int off = 0; + ::_decode(seq, payload, off); + ::_decode(inode, payload, off); + ::_decode(caps, payload, off); + 
::_decode(wanted, payload, off); + ::_decode(mds, payload, off); + ::_decode(special, payload, off); } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); - s.append((char*)&inode, sizeof(inode)); - s.append((char*)&caps, sizeof(caps)); - s.append((char*)&wanted, sizeof(wanted)); - //s.append((char*)&client, sizeof(client)); - s.append((char*)&mds,sizeof(mds)); - s.append((char*)&special,sizeof(special)); + void encode_payload() { + ::_encode(seq, payload); + ::_encode(inode, payload); + ::_encode(caps, payload); + ::_encode(wanted, payload); + ::_encode(mds, payload); + ::_encode(special, payload); } }; diff --git a/trunk/ceph/messages/MClientInodeAuthUpdate.h b/trunk/ceph/messages/MClientInodeAuthUpdate.h deleted file mode 100644 index e9083f6abc575..0000000000000 --- a/trunk/ceph/messages/MClientInodeAuthUpdate.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTINODEAUTHUPDATE_H -#define __MCLIENTINODEAUTHUPDATE_H - -class MClientInodeAuthUpdate : public Message { - inodeno_t ino; - int newauth; - - public: - inodeno_t get_ino() { return ino; } - int get_auth() { return newauth; } - - MClientInodeAuthUpdate() {} - MClientInodeAuthUpdate(inodeno_t ino, int newauth) : - Message(MSG_CLIENT_INODEAUTHUPDATE) { - this->ino = ino; - this->newauth = newauth; - } - virtual char *get_type_name() { return "Ciau";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(newauth), (char*)&newauth); - off += sizeof(newauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - s.append((char*)&newauth,sizeof(newauth)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MClientMount.h b/trunk/ceph/messages/MClientMount.h index 0684cea8d95c2..e03ce7e97c180 100644 --- a/trunk/ceph/messages/MClientMount.h +++ b/trunk/ceph/messages/MClientMount.h @@ -11,24 +11,19 @@ * */ - #ifndef __MCLIENTMOUNT_H #define __MCLIENTMOUNT_H #include "msg/Message.h" class MClientMount : public Message { +public: + MClientMount() : Message(MSG_CLIENT_MOUNT) { } - public: - MClientMount() : Message(MSG_CLIENT_MOUNT) { - } - - char *get_type_name() { return "Cmnt"; } + char *get_type_name() { return "client_mount"; } - virtual void decode_payload(crope& s, int& off) { - } - virtual void encode_payload(crope& s) { - } + void decode_payload() { } + void encode_payload() { } }; #endif diff --git a/trunk/ceph/messages/MClientMountAck.h b/trunk/ceph/messages/MClientMountAck.h deleted file mode 100644 index 6b1b7cb2a901b..0000000000000 --- a/trunk/ceph/messages/MClientMountAck.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it 
and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTMOUNTACK_H -#define __MCLIENTMOUNTACK_H - -#include "msg/Message.h" -#include "MClientMount.h" -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" - - -class MClientMountAck : public Message { - long pcid; - bufferlist osd_map_state; - bufferlist mds_map_state; - - public: - MClientMountAck() {} - MClientMountAck(MClientMount *mnt, MDSMap *mdsmap, OSDMap *osdmap) : Message(MSG_CLIENT_MOUNTACK) { - this->pcid = mnt->get_pcid(); - mdsmap->encode( mds_map_state ); - osdmap->encode( osd_map_state ); - } - - bufferlist& get_mds_map_state() { return mds_map_state; } - bufferlist& get_osd_map_state() { return osd_map_state; } - - void set_pcid(long pcid) { this->pcid = pcid; } - long get_pcid() { return pcid; } - - char *get_type_name() { return "CmntA"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(pcid), (char*)&pcid); - off += sizeof(pcid); - ::_decode( mds_map_state, payload, off); - ::_decode( osd_map_state, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&pcid, sizeof(pcid)); - ::_encode( mds_map_state, payload ); - ::_encode( osd_map_state, payload ); - } -}; - -#endif diff --git a/trunk/ceph/messages/MClientReconnect.h b/trunk/ceph/messages/MClientReconnect.h new file mode 100644 index 0000000000000..22f42660d2978 --- /dev/null +++ b/trunk/ceph/messages/MClientReconnect.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MCLIENTRECONNECT_H +#define __MCLIENTRECONNECT_H + +#include "msg/Message.h" +#include "mds/mdstypes.h" + +class MClientReconnect : public Message { +public: + struct inode_caps_t { + __int32_t caps; + __int32_t seq; + __int32_t wanted; + off_t size; + utime_t mtime, atime; + inode_caps_t() : caps(0), seq(0), wanted(0), size(0) {} // zero the plain fields so a default-built entry is never indeterminate + inode_caps_t(int c, int s, int w) : + caps(c), seq(s), wanted(w), size(0) {} + inode_caps_t(int c, int s, int w, off_t sz, utime_t mt, utime_t at) : + caps(c), seq(s), wanted(w), size(sz), mtime(mt), atime(at) {} + }; + + map inode_caps; + map inode_path; + bool closed; + + MClientReconnect() : Message(MSG_CLIENT_RECONNECT), + closed(false) { } + + char *get_type_name() { return "client_reconnect"; } + void print(ostream& out) { + out << "client_reconnect(" << inode_caps.size() << " caps)"; + } + + void add_inode_caps(inodeno_t ino, + int havecaps, long seq, int wanted, + off_t sz, utime_t mt, utime_t at) { + inode_caps[ino] = inode_caps_t(havecaps, seq, wanted, sz, mt, at); + } + void add_inode_path(inodeno_t ino, const string& path) { + inode_path[ino] = path; + } + + void encode_payload() { + ::_encode(closed, payload); + ::_encode(inode_caps, payload); + ::_encode(inode_path, payload); + } + void decode_payload() { + int off = 0; + ::_decode(closed, payload, off); + ::_decode(inode_caps, payload, off); + ::_decode(inode_path, payload, off); + } + +}; + + +#endif diff --git a/trunk/ceph/messages/MClientReply.h b/trunk/ceph/messages/MClientReply.h index 6206b909b0c05..874cedbd8bb32 100644 --- a/trunk/ceph/messages/MClientReply.h +++ b/trunk/ceph/messages/MClientReply.h @@ -17,6 +17,8 @@ #include "include/types.h" +#include "MClientRequest.h" + #include "msg/Message.h" #include "mds/CInode.h" #include "mds/CDir.h" @@ -36,12 +38,12 @@ class CInode; * int result - error code, or fh if it was open * * for most requests: - * trace is a vector of c_inoe_info's tracing from root to the file/dir/whatever + * trace is a vector of InodeStat's tracing from root 
to the file/dir/whatever * the operation referred to, so that the client can update it's info about what * metadata lives on what MDS. * * for readdir replies: - * dir_contents is a vector c_inode_info*'s. + * dir_contents is a vector of InodeStat*'s. * * that's mostly it, i think! * @@ -52,13 +54,12 @@ class InodeStat { public: inode_t inode; string symlink; // symlink content (if symlink) - + fragtree_t dirfragtree; // mds distribution hints - int dir_auth; - bool hashed, replicated; - bool spec_defined; - set dist; // where am i replicated? + map dirfrag_auth; + map > dirfrag_dist; + set dirfrag_rep; public: InodeStat() {} @@ -67,77 +68,65 @@ class InodeStat { { // inode.mask inode.mask = INODE_MASK_BASE; - if (in->filelock.can_read(in->is_auth())) - inode.mask |= INODE_MASK_PERM; - if (in->hardlock.can_read(in->is_auth())) - inode.mask |= INODE_MASK_SIZE | INODE_MASK_MTIME; // fixme when we separate this out. + if (in->authlock.can_rdlock(0)) inode.mask |= INODE_MASK_AUTH; + if (in->linklock.can_rdlock(0)) inode.mask |= INODE_MASK_LINK; + if (in->filelock.can_rdlock(0)) inode.mask |= INODE_MASK_FILE; // symlink content? if (in->is_symlink()) symlink = in->symlink; + + // dirfragtree + dirfragtree = in->dirfragtree; - // replicated where? - if (in->dir && in->dir->is_auth()) { - spec_defined = true; - in->dir->get_dist_spec(this->dist, whoami); - } else - spec_defined = false; - - if (in->dir) - dir_auth = in->dir->get_dir_auth(); - else - dir_auth = -1; - - // dir info - hashed = (in->dir && in->dir->is_hashed()); // FIXME not quite right. 
- replicated = (in->dir && in->dir->is_rep()); + // dirfrag info + list ls; + in->get_dirfrags(ls); + for (list::iterator p = ls.begin(); + p != ls.end(); + ++p) { + CDir *dir = *p; + dirfrag_auth[dir->dirfrag().frag] = dir->get_dir_auth().first; + if (dir->is_auth()) + dir->get_dist_spec(dirfrag_dist[dir->dirfrag().frag], whoami); + if (dir->is_rep()) + dirfrag_rep.insert(dir->dirfrag().frag); + } } void _encode(bufferlist &bl) { - bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&spec_defined, sizeof(spec_defined)); - bl.append((char*)&dir_auth, sizeof(dir_auth)); - bl.append((char*)&hashed, sizeof(hashed)); - bl.append((char*)&replicated, sizeof(replicated)); - + ::_encode(inode, bl); + ::_encode(dirfrag_auth, bl); + ::_encode(dirfrag_dist, bl); + ::_encode(dirfrag_rep, bl); ::_encode(symlink, bl); - ::_encode(dist, bl); // distn + dirfragtree._encode(bl); } void _decode(bufferlist &bl, int& off) { - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - bl.copy(off, sizeof(spec_defined), (char*)&spec_defined); - off += sizeof(spec_defined); - bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); - off += sizeof(dir_auth); - bl.copy(off, sizeof(hashed), (char*)&hashed); - off += sizeof(hashed); - bl.copy(off, sizeof(replicated), (char*)&replicated); - off += sizeof(replicated); - + ::_decode(inode, bl, off); + ::_decode(dirfrag_auth, bl, off); + ::_decode(dirfrag_dist, bl, off); + ::_decode(dirfrag_rep, bl, off); ::_decode(symlink, bl, off); - ::_decode(dist, bl, off); + dirfragtree._decode(bl, off); } }; -typedef struct { - long pcid; - long tid; - int op; - int result; // error code - unsigned char file_caps; // for open - long file_caps_seq; - __uint64_t file_data_version; // for client buffercache consistency - - int _num_trace_in; - int _dir_size; -} MClientReply_st; - class MClientReply : public Message { // reply data - MClientReply_st st; + struct { + long tid; + int op; + int result; // error code + unsigned char file_caps; // for 
open + long file_caps_seq; + __uint64_t file_data_version; // for client buffercache consistency + + int _num_trace_in; + int _dir_size; + } st; string path; list trace_in; @@ -147,9 +136,6 @@ class MClientReply : public Message { list dir_dn; public: - void set_pcid(long pcid) { this->st.pcid = pcid; } - long get_pcid() { return st.pcid; } - long get_tid() { return st.tid; } int get_op() { return st.op; } @@ -178,7 +164,6 @@ class MClientReply : public Message { MClientReply(MClientRequest *req, int result = 0) : Message(MSG_CLIENT_REPLY) { memset(&st, 0, sizeof(st)); - this->st.pcid = req->get_pcid(); // match up procedure call id!!! this->st.tid = req->get_tid(); this->st.op = req->get_op(); this->path = req->get_path(); @@ -197,7 +182,11 @@ class MClientReply : public Message { delete *it; } virtual char *get_type_name() { return "creply"; } - + void print(ostream& o) { + o << "creply(" << env.dst.name << "." << st.tid; + if (st.result) o << " = " << st.result; + o << ")"; + } // serialization virtual void decode_payload() { diff --git a/trunk/ceph/messages/MClientRequest.h b/trunk/ceph/messages/MClientRequest.h index 9b9ac4e115cac..c26e78520fcf5 100644 --- a/trunk/ceph/messages/MClientRequest.h +++ b/trunk/ceph/messages/MClientRequest.h @@ -15,145 +15,251 @@ #ifndef __MCLIENTREQUEST_H #define __MCLIENTREQUEST_H -#include - -#include "msg/Message.h" -#include "include/filepath.h" -#include "mds/mdstypes.h" -#include "mds/MDS.h" - /** * * MClientRequest - container for a client METADATA request. created/sent by clients. * can be forwarded around between MDS's. * * int client - the originating client - * long pcid - procedure call id, used to match request+response. * long tid - transaction id, unique among requests for that client. probably just a counter! * -> the MDS passes the Request to the Reply constructor, so this always matches. * * int op - the metadata op code. MDS_OP_RENAME, etc. 
* int caller_uid, _gid - guess * - * arguments: one or more of these are defined, depending on the metadata op: - * inodeno ino - used by close(), along with fh. not strictly necessary except MDS is currently coded lame. - * filepath path - main file argument (almost everything) - * string sarg - string argument (if a second arg is needed, e.g. rename, symlink) - * int iarg - int arg... file mode for open, fh for close, mode for mkdir, etc. - * int iarg2 - second int arg... gid for chown (iarg is uid) - * time_t targ, targ2 - time args, used by utime - * - * That's basically it! + * fixed size arguments are in a union. + * there's also a string argument, for e.g. symlink(). * */ +#include "msg/Message.h" +#include "include/filepath.h" +#include "mds/mdstypes.h" -typedef struct { - long tid; - int client; - int op; - - entity_inst_t client_inst; +#include +#include +#include +#include +#include + + +// metadata ops. +// >=1000 --> an update, non-idempotent (i.e. an update) +#define MDS_OP_STATFS 1 + +#define MDS_OP_STAT 100 +#define MDS_OP_LSTAT 101 +#define MDS_OP_FSTAT 102 +#define MDS_OP_UTIME 1102 +#define MDS_OP_CHMOD 1104 +#define MDS_OP_CHOWN 1105 - int caller_uid, caller_gid; - inodeno_t ino; +#define MDS_OP_READDIR 200 +#define MDS_OP_MKNOD 1201 +#define MDS_OP_LINK 1202 +#define MDS_OP_UNLINK 1203 +#define MDS_OP_RENAME 1204 - int iarg, iarg2; - time_t targ, targ2; +#define MDS_OP_MKDIR 1220 +#define MDS_OP_RMDIR 1221 +#define MDS_OP_SYMLINK 1222 - inodeno_t mds_wants_replica_in_dirino; +#define MDS_OP_OPEN 301 +#define MDS_OP_TRUNCATE 1306 +#define MDS_OP_FSYNC 307 - size_t sizearg; -} MClientRequest_st; +#define MDS_OP_RELEASE 308 // used only by SyntheticClient op_dist thinger class MClientRequest : public Message { - MClientRequest_st st; + struct { + tid_t tid; + tid_t oldest_client_tid; + int num_fwd; + int retry_attempt; + inodeno_t mds_wants_replica_in_dirino; + + entity_inst_t client_inst; + + int op; + int caller_uid, caller_gid; + } st; + + // 
path arguments filepath path; string sarg; - string sarg2; - public: - MClientRequest() {} - MClientRequest(int op, int client) : Message(MSG_CLIENT_REQUEST) { + // fixed size arguments. in a union. + // note: nothing with a constructor can go here; use underlying base + // types for _inodeno_t, _frag_t. + union { + struct { + int mask; + } stat; + struct { + _inodeno_t ino; + int mask; + } fstat; + struct { + _frag_t frag; + } readdir; + struct { + _utime_t mtime; + _utime_t atime; + } utime; + struct { + mode_t mode; + } chmod; + struct { + uid_t uid; + gid_t gid; + } chown; + struct { + mode_t mode; + } mknod; + struct { + mode_t mode; + } mkdir; + struct { + int flags; + mode_t mode; + } open; + struct { + _inodeno_t ino; // optional + off_t length; + } truncate; + struct { + _inodeno_t ino; + } fsync; + } args; + + // cons + MClientRequest() : Message(MSG_CLIENT_REQUEST) {} + MClientRequest(int op, entity_inst_t ci) : Message(MSG_CLIENT_REQUEST) { memset(&st, 0, sizeof(st)); + memset(&args, 0, sizeof(args)); this->st.op = op; - this->st.client = client; - this->st.iarg = 0; + this->st.client_inst = ci; + } + + metareqid_t get_reqid() { + // FIXME: for now, assume clients always have 1 incarnation + return metareqid_t(st.client_inst.name.num(), st.tid); + } + + int get_open_file_mode() { + if (args.open.flags & O_LAZY) + return FILE_MODE_LAZY; + if (args.open.flags & O_WRONLY) + return FILE_MODE_W; + if (args.open.flags & O_RDWR) + return FILE_MODE_RW; + if (args.open.flags & O_APPEND) + return FILE_MODE_W; + return FILE_MODE_R; + } + bool open_file_mode_is_readonly() { + return get_open_file_mode() == FILE_MODE_R; + } + bool is_idempotent() { + if (st.op == MDS_OP_OPEN) + return open_file_mode_is_readonly(); + return (st.op < 1000); + } + bool auth_is_best() { + if (!is_idempotent()) return true; + if (st.op == MDS_OP_READDIR) return true; + return false; + } + bool follow_trailing_symlink() { + switch (st.op) { + case MDS_OP_LSTAT: + case MDS_OP_LINK: + case 
MDS_OP_UNLINK: + case MDS_OP_RENAME: + return false; + + case MDS_OP_STAT: + case MDS_OP_UTIME: + case MDS_OP_CHMOD: + case MDS_OP_CHOWN: + case MDS_OP_READDIR: + case MDS_OP_OPEN: + return true; + + default: + assert(0); + } } - virtual char *get_type_name() { return "creq"; } - // keep a pcid (procedure call id) to match up request+reply - //void set_pcid(long pcid) { this->st.pcid = pcid; } - //long get_pcid() { return st.pcid; } + // normal fields - void set_tid(long t) { st.tid = t; } + void set_tid(tid_t t) { st.tid = t; } + void set_oldest_client_tid(tid_t t) { st.oldest_client_tid = t; } + void inc_num_fwd() { st.num_fwd++; } + void set_retry_attempt(int a) { st.retry_attempt = a; } void set_path(string& p) { path.set_path(p); } void set_path(const char *p) { path.set_path(p); } void set_path(const filepath& fp) { path = fp; } void set_caller_uid(int u) { st.caller_uid = u; } void set_caller_gid(int g) { st.caller_gid = g; } - void set_ino(inodeno_t ino) { st.ino = ino; } - void set_iarg(int i) { st.iarg = i; } - void set_iarg2(int i) { st.iarg2 = i; } - void set_targ(time_t& t) { st.targ = t; } - void set_targ2(time_t& t) { st.targ2 = t; } void set_sarg(string& arg) { this->sarg = arg; } void set_sarg(const char *arg) { this->sarg = arg; } - void set_sarg2(string& arg) { this->sarg2 = arg; } - void set_sizearg(size_t s) { st.sizearg = s; } void set_mds_wants_replica_in_dirino(inodeno_t dirino) { st.mds_wants_replica_in_dirino = dirino; } void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } const entity_inst_t& get_client_inst() { return st.client_inst; } - int get_client() { return st.client; } - long get_tid() { return st.tid; } + int get_client() { return st.client_inst.name.num(); } + tid_t get_tid() { return st.tid; } + tid_t get_oldest_client_tid() { return st.oldest_client_tid; } + int get_num_fwd() { return st.num_fwd; } + int get_retry_attempt() { return st.retry_attempt; } int get_op() { return st.op; } int get_caller_uid() { 
return st.caller_uid; } int get_caller_gid() { return st.caller_gid; } - inodeno_t get_ino() { return st.ino; } - string& get_path() { return path.get_path(); } + //inodeno_t get_ino() { return st.ino; } + const string& get_path() { return path.get_path(); } filepath& get_filepath() { return path; } - int get_iarg() { return st.iarg; } - int get_iarg2() { return st.iarg2; } - time_t get_targ() { return st.targ; } - time_t get_targ2() { return st.targ2; } string& get_sarg() { return sarg; } - string& get_sarg2() { return sarg2; } - size_t get_sizearg() { return st.sizearg; } inodeno_t get_mds_wants_replica_in_dirino() { return st.mds_wants_replica_in_dirino; } - virtual void decode_payload() { + void decode_payload() { int off = 0; payload.copy(off, sizeof(st), (char*)&st); off += sizeof(st); + payload.copy(off, sizeof(args), (char*)&args); + off += sizeof(args); path._decode(payload, off); - _decode(sarg, payload, off); - _decode(sarg2, payload, off); + ::_decode(sarg, payload, off); } - virtual void encode_payload() { + void encode_payload() { payload.append((char*)&st, sizeof(st)); + payload.append((char*)&args, sizeof(args)); path._encode(payload); - _encode(sarg, payload); - _encode(sarg2, payload); + ::_encode(sarg, payload); } + char *get_type_name() { return "creq"; } void print(ostream& out) { out << "clientreq(client" << get_client() << "." 
<< get_tid() - //<< ".pcid=" << get_pcid() << ":"; switch(get_op()) { + case MDS_OP_STATFS: + out << "statfs"; break; + case MDS_OP_STAT: out << "stat"; break; case MDS_OP_LSTAT: out << "lstat"; break; + case MDS_OP_FSTAT: + out << "fstat"; break; case MDS_OP_UTIME: out << "utime"; break; case MDS_OP_CHMOD: @@ -185,15 +291,18 @@ class MClientRequest : public Message { out << "truncate"; break; case MDS_OP_FSYNC: out << "fsync"; break; - case MDS_OP_RELEASE: - out << "release"; break; + // case MDS_OP_RELEASE: + //out << "release"; break; default: out << "unknown=" << get_op(); + assert(0); } if (get_path().length()) out << "=" << get_path(); if (get_sarg().length()) out << " " << get_sarg(); + if (st.retry_attempt) + out << " RETRY=" << st.retry_attempt; out << ")"; } diff --git a/trunk/ceph/messages/MClientRequestForward.h b/trunk/ceph/messages/MClientRequestForward.h new file mode 100644 index 0000000000000..69e2b889c6d22 --- /dev/null +++ b/trunk/ceph/messages/MClientRequestForward.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MCLIENTREQUESTFORWARD_H +#define __MCLIENTREQUESTFORWARD_H + +class MClientRequestForward : public Message { + tid_t tid; + int dest_mds; + int num_fwd; + + public: + MClientRequestForward() : Message(MSG_CLIENT_REQUEST_FORWARD) {} + MClientRequestForward(tid_t t, int dm, int nf) : + Message(MSG_CLIENT_REQUEST_FORWARD), + tid(t), dest_mds(dm), num_fwd(nf) { } + + tid_t get_tid() { return tid; } + int get_dest_mds() { return dest_mds; } + int get_num_fwd() { return num_fwd; } + + char *get_type_name() { return "cfwd"; } + void print(ostream& o) { + o << "client_request_forward(" << tid + << " to " << dest_mds + << " num_fwd=" << num_fwd + << ")"; + } + + void encode_payload() { + payload.append((char*)&tid, sizeof(tid)); + payload.append((char*)&dest_mds, sizeof(dest_mds)); + payload.append((char*)&num_fwd, sizeof(num_fwd)); + } + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(tid), (char*)&tid); + off += sizeof(tid); + payload.copy(off, sizeof(dest_mds), (char*)&dest_mds); + off += sizeof(dest_mds); + payload.copy(off, sizeof(num_fwd), (char*)&num_fwd); + off += sizeof(num_fwd); + } +}; + +#endif diff --git a/trunk/ceph/messages/MClientSession.h b/trunk/ceph/messages/MClientSession.h new file mode 100644 index 0000000000000..dbd05fa92fa2e --- /dev/null +++ b/trunk/ceph/messages/MClientSession.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MCLIENTSESSION_H +#define __MCLIENTSESSION_H + +#include "msg/Message.h" + +class MClientSession : public Message { +public: + const static int OP_OPEN = 1; + const static int OP_OPEN_ACK = 2; + const static int OP_CLOSE = 3; + const static int OP_CLOSE_ACK = 4; + static const char *get_opname(int o) { + switch (o) { + case OP_OPEN: return "open"; + case OP_OPEN_ACK: return "open_ack"; + case OP_CLOSE: return "close"; + case OP_CLOSE_ACK: return "close_ack"; + default: assert(0); return "???"; // with NDEBUG the assert vanishes; never fall off the end of a value-returning function + } + } + + __int32_t op; + + MClientSession() : Message(MSG_CLIENT_SESSION), op(0) { } // 0 = no op yet; decode_payload() fills it in + MClientSession(int o) : Message(MSG_CLIENT_SESSION), + op(o) { } + + char *get_type_name() { return "client_session"; } + void print(ostream& out) { + out << "client_session " << get_opname(op); + } + + void decode_payload() { + int off = 0; + ::_decode(op, payload, off); + } + void encode_payload() { + ::_encode(op, payload); + } +}; + +#endif diff --git a/trunk/ceph/messages/MClientBoot.h b/trunk/ceph/messages/MClientUnmount.h similarity index 70% rename from trunk/ceph/messages/MClientBoot.h rename to trunk/ceph/messages/MClientUnmount.h index 460f9f02e27f4..8066caeea3d8b 100644 --- a/trunk/ceph/messages/MClientBoot.h +++ b/trunk/ceph/messages/MClientUnmount.h @@ -11,21 +11,19 @@ * */ - -#ifndef __MCLIENTBOOT_H -#define __MCLIENTBOOT_H +#ifndef __MCLIENTUNMOUNT_H +#define __MCLIENTUNMOUNT_H #include "msg/Message.h" -class MClientBoot : public Message { - - public: - MClientBoot() : Message(MSG_CLIENT_BOOT) { } +class MClientUnmount : public Message { +public: + MClientUnmount() : Message(MSG_CLIENT_UNMOUNT) { } - char *get_type_name() { return "ClientBoot"; } + char *get_type_name() { return "client_unmount"; } - void encode_payload() { } void decode_payload() { } + void encode_payload() { } }; #endif diff --git a/trunk/ceph/messages/MDentryUnlink.h b/trunk/ceph/messages/MDentryUnlink.h index ec1503eeadf00..b1a2580dba4af 100644 --- a/trunk/ceph/messages/MDentryUnlink.h +++ 
b/trunk/ceph/messages/MDentryUnlink.h @@ -16,29 +16,63 @@ #define __MDENTRYUNLINK_H class MDentryUnlink : public Message { - inodeno_t dirino; + dirfrag_t dirfrag; string dn; public: - inodeno_t get_dirino() { return dirino; } + dirfrag_t get_dirfrag() { return dirfrag; } string& get_dn() { return dn; } + CInodeDiscover *strayin; + CDirDiscover *straydir; + CDentryDiscover *straydn; + - MDentryUnlink() {} + MDentryUnlink() : strayin(0), straydir(0), straydn(0) {} // decode_payload() only sets these when isstray; null keeps ~MDentryUnlink()'s deletes safe - MDentryUnlink(inodeno_t dirino, string& dn) : - Message(MSG_MDS_DENTRYUNLINK) { - this->dirino = dirino; - this->dn = dn; + MDentryUnlink(dirfrag_t df, string& n) : + Message(MSG_MDS_DENTRYUNLINK), + dirfrag(df), + dn(n), + strayin(0), straydir(0), straydn(0) { } + ~MDentryUnlink() { + delete strayin; + delete straydir; + delete straydn; + } + + char *get_type_name() { return "dentry_unlink";} + void print(ostream& o) { + o << "dentry_unlink(" << dirfrag << " " << dn << ")"; } - virtual char *get_type_name() { return "Dun";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - _unrope(dn, s, off); + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); + ::_decode(dn, payload, off); + + bool isstray; + payload.copy(off, sizeof(isstray), (char*)&isstray); + off += sizeof(isstray); + if (isstray) { + strayin = new CInodeDiscover; + strayin->_decode(payload, off); + straydir = new CDirDiscover; + straydir->_decode(payload, off); + straydn = new CDentryDiscover; + straydn->_decode(payload, off); + } } - virtual void encode_payload(crope& s) { - s.append((char*)&dirino,sizeof(dirino)); - _rope(dn, s); + void encode_payload() { + payload.append((char*)&dirfrag,sizeof(dirfrag)); + ::_encode(dn, payload); + + bool isstray = strayin ? 
true:false; + payload.append((char*)&isstray, sizeof(isstray)); + if (isstray) { + strayin->_encode(payload); + straydir->_encode(payload); + straydn->_encode(payload); + } } }; diff --git a/trunk/ceph/messages/MDirExpire.h b/trunk/ceph/messages/MDirExpire.h deleted file mode 100644 index a81de3d538365..0000000000000 --- a/trunk/ceph/messages/MDirExpire.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDIREXPIRE_H -#define __MDIREXPIRE_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MDirExpire_st; - -class MDirExpire : public Message { - MDirExpire_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MDirExpire() {} - MDirExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_DIREXPIRE) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "DirEx";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MDirExpireReq.h b/trunk/ceph/messages/MDirExpireReq.h deleted file mode 100644 index 604a55265c723..0000000000000 --- a/trunk/ceph/messages/MDirExpireReq.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of 
the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDIREXPIREREQ_H -#define __MDIREXPIREREQ_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MDirExpireReq_st; - -class MDirExpire : public Message { - MDirExpireReq_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MDirExpire() {} - MDirExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_DIREXPIREREQ) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "DirExR";} - - virtual void decode_payload(crope& s) { - s.copy(0, sizeof(st), (char*)&st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MDirUpdate.h b/trunk/ceph/messages/MDirUpdate.h index 9bac721654c22..f7a9c47c161cf 100644 --- a/trunk/ceph/messages/MDirUpdate.h +++ b/trunk/ceph/messages/MDirUpdate.h @@ -17,19 +17,17 @@ #include "msg/Message.h" -typedef struct { - inodeno_t ino; - int dir_rep; - int discover; -} MDirUpdate_st; - class MDirUpdate : public Message { - MDirUpdate_st st; + struct { + dirfrag_t dirfrag; + int dir_rep; + int discover; + } st; set dir_rep_by; string path; public: - inodeno_t get_ino() { return st.ino; } + dirfrag_t get_dirfrag() { return st.dirfrag; } int get_dir_rep() { return st.dir_rep; } set& get_dir_rep_by() { return dir_rep_by; } bool should_discover() { return st.discover > 0; } @@ -40,31 +38,32 @@ class MDirUpdate : public Message { } MDirUpdate() {} - MDirUpdate(inodeno_t ino, + MDirUpdate(dirfrag_t dirfrag, int dir_rep, set& dir_rep_by, string& path, bool discover = false) : Message(MSG_MDS_DIRUPDATE) { - this->st.ino = ino; + this->st.dirfrag = dirfrag; this->st.dir_rep = dir_rep; this->dir_rep_by = dir_rep_by; if (discover) this->st.discover = 5; this->path = path; } - virtual char 
*get_type_name() { return "dup"; } + virtual char *get_type_name() { return "dir_update"; } - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(st), (char*)&st); off += sizeof(st); - _unrope(dir_rep_by, s, off); - _unrope(path, s, off); + ::_decode(dir_rep_by, payload, off); + ::_decode(path, payload, off); } - virtual void encode_payload(crope& r) { - r.append((char*)&st, sizeof(st)); - _rope(dir_rep_by, r); - _rope(path, r); + virtual void encode_payload() { + payload.append((char*)&st, sizeof(st)); + ::_encode(dir_rep_by, payload); + ::_encode(path, payload); } }; diff --git a/trunk/ceph/messages/MDiscover.h b/trunk/ceph/messages/MDiscover.h index d207ab28cc143..8ba24c4cfa0ff 100644 --- a/trunk/ceph/messages/MDiscover.h +++ b/trunk/ceph/messages/MDiscover.h @@ -26,48 +26,72 @@ using namespace std; class MDiscover : public Message { int asker; - inodeno_t base_ino; // 0 -> none, want root + inodeno_t base_ino; // 1 -> root + frag_t base_dir_frag; bool want_base_dir; - bool want_root_inode; - + filepath want; // ... 
[/]need/this/stuff + inodeno_t want_ino; public: int get_asker() { return asker; } inodeno_t get_base_ino() { return base_ino; } + frag_t get_base_dir_frag() { return base_dir_frag; } filepath& get_want() { return want; } + inodeno_t get_want_ino() { return want_ino; } const string& get_dentry(int n) { return want[n]; } bool wants_base_dir() { return want_base_dir; } + void set_base_dir_frag(frag_t f) { base_dir_frag = f; } + MDiscover() { } MDiscover(int asker, inodeno_t base_ino, filepath& want, - bool want_base_dir = true, - bool want_root_inode = false) : + bool want_base_dir = true) : Message(MSG_MDS_DISCOVER) { this->asker = asker; this->base_ino = base_ino; this->want = want; + want_ino = 0; this->want_base_dir = want_base_dir; - this->want_root_inode = want_root_inode; } - virtual char *get_type_name() { return "Dis"; } + MDiscover(int asker, + dirfrag_t base_dirfrag, + inodeno_t want_ino, + bool want_base_dir = true) : + Message(MSG_MDS_DISCOVER) { + this->asker = asker; + this->base_ino = base_dirfrag.ino; + this->base_dir_frag = base_dirfrag.frag; + this->want_ino = want_ino; + this->want_base_dir = want_base_dir; + } + + char *get_type_name() { return "Dis"; } + void print(ostream &out) { + out << "discover(" << base_ino << "." 
<< base_dir_frag + << " " << want; + if (want_ino) out << want_ino; + out << ")"; + } - virtual void decode_payload(crope& r, int& off) { - r.copy(off, sizeof(asker), (char*)&asker); - off += sizeof(asker); - r.copy(off, sizeof(base_ino), (char*)&base_ino); - off += sizeof(base_ino); - r.copy(off, sizeof(bool), (char*)&want_base_dir); - off += sizeof(bool); - want._unrope(r, off); + void decode_payload() { + int off = 0; + ::_decode(asker, payload, off); + ::_decode(base_ino, payload, off); + ::_decode(base_dir_frag, payload, off); + ::_decode(want_base_dir, payload, off); + want._decode(payload, off); + ::_decode(want_ino, payload, off); } - virtual void encode_payload(crope& r) { - r.append((char*)&asker, sizeof(asker)); - r.append((char*)&base_ino, sizeof(base_ino)); - r.append((char*)&want_base_dir, sizeof(want_base_dir)); - want._rope(r); + void encode_payload() { + payload.append((char*)&asker, sizeof(asker)); + payload.append((char*)&base_ino, sizeof(base_ino)); + payload.append((char*)&base_dir_frag, sizeof(base_dir_frag)); + payload.append((char*)&want_base_dir, sizeof(want_base_dir)); + want._encode(payload); + ::_encode(want_ino, payload); } }; diff --git a/trunk/ceph/messages/MDiscoverReply.h b/trunk/ceph/messages/MDiscoverReply.h index c759bc9a76bd1..987be127c46d3 100644 --- a/trunk/ceph/messages/MDiscoverReply.h +++ b/trunk/ceph/messages/MDiscoverReply.h @@ -42,23 +42,28 @@ using namespace std; * error_flag_dn(string) - the specified dentry dne * error_flag_dir - the last item wasn't a dir, so we couldn't continue. * + * and sometimes, + * dir_auth_hint - where we think the dir auth is + * * depth() gives us the number of depth units/indices for which we have * information. this INCLUDES those for which we have errors but no data. * * see MDCache::handle_discover, handle_discover_reply. * - - old crap, maybe not accurate: - - // dir [ + ... ] : discover want_base_dir=true - - // dentry [ + inode [ + ... 
] ] : discover want_base_dir=false - // no_base_dir=true - // -> we only exclude inode if dentry is null+xlock - - // inode [ + ... ], base_ino = 0 : discover base_ino=0, start w/ root ino, - // no_base_dir=no_base_dentry=true - + * + * so basically, we get + * + * dir den ino i + * x 0 + * x x x 1 + * or + * x x 0 + * x x x 1 + * or + * x x x 0 + * x x x 1 + * ...and trail off however we want. + * * */ @@ -67,9 +72,10 @@ class MDiscoverReply : public Message { bool no_base_dir; // no base dir (but IS dentry+inode) bool no_base_dentry; // no base dentry (but IS inode) bool flag_error_dn; + bool flag_error_ino; bool flag_error_dir; string error_dentry; // dentry that was not found (to trigger waiters on asker) - + int dir_auth_hint; vector dirs; // not inode-aligned if no_base_dir = true. vector dentries; // not inode-aligned if no_base_dentry = true @@ -84,6 +90,10 @@ class MDiscoverReply : public Message { int get_num_dentries() { return dentries.size(); } int get_num_dirs() { return dirs.size(); } + int get_last_inode() { return inodes.size(); } + int get_last_dentry() { return dentries.size() + no_base_dentry; } + int get_last_dir() { return dirs.size() + no_base_dir; } + int get_depth() { // return depth of deepest object (in dir/dentry/inode units) return max( inodes.size(), // at least this many max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error @@ -93,24 +103,22 @@ class MDiscoverReply : public Message { bool has_base_dir() { return !no_base_dir && dirs.size(); } bool has_base_dentry() { return !no_base_dentry && dentries.size(); } bool has_root() { - if (base_ino == 0) { - assert(no_base_dir && no_base_dentry); - return true; - } - return false; + return (base_ino == MDS_INO_ROOT && no_base_dir && no_base_dentry); } const string& get_path() { return path; } // bool is_flag_forward() { return flag_forward; } bool is_flag_error_dn() { return flag_error_dn; } + bool is_flag_error_ino() { return flag_error_ino; } bool 
is_flag_error_dir() { return flag_error_dir; } string& get_error_dentry() { return error_dentry; } + int get_dir_auth_hint() { return dir_auth_hint; } // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. - CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } - CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } CInodeDiscover& get_inode(int n) { return *(inodes[n]); } + CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } + CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } // cons @@ -121,12 +129,17 @@ class MDiscoverReply : public Message { flag_error_dn = false; flag_error_dir = false; no_base_dir = no_base_dentry = false; + dir_auth_hint = CDIR_AUTH_UNKNOWN; } ~MDiscoverReply() { for (vector::iterator it = dirs.begin(); it != dirs.end(); it++) delete *it; + for (vector::iterator it = dentries.begin(); + it != dentries.end(); + it++) + delete *it; for (vector::iterator it = inodes.begin(); it != inodes.end(); it++) @@ -138,7 +151,8 @@ class MDiscoverReply : public Message { bool is_empty() { return dirs.empty() && dentries.empty() && inodes.empty() && !flag_error_dn && - !flag_error_dir; + !flag_error_dir && + dir_auth_hint == CDIR_AUTH_UNKNOWN; } void add_dentry(CDentryDiscover* ddis) { if (dentries.empty() && dirs.empty()) no_base_dir = true; @@ -161,28 +175,31 @@ class MDiscoverReply : public Message { flag_error_dn = true; error_dentry = dn; } + void set_flag_error_ino() { + flag_error_ino = true; + } void set_flag_error_dir() { flag_error_dir = true; } + void set_dir_auth_hint(int a) { + dir_auth_hint = a; + } + void set_error_dentry(const string& dn) { + error_dentry = dn; + } // ... 
virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(base_ino), (char*)&base_ino); - off += sizeof(base_ino); - payload.copy(off, sizeof(bool), (char*)&no_base_dir); - off += sizeof(bool); - payload.copy(off, sizeof(bool), (char*)&no_base_dentry); - off += sizeof(bool); - // payload.copy(off, sizeof(bool), (char*)&flag_forward); - //off += sizeof(bool); - payload.copy(off, sizeof(bool), (char*)&flag_error_dn); - off += sizeof(bool); - - _decode(error_dentry, payload, off); - payload.copy(off, sizeof(bool), (char*)&flag_error_dir); - off += sizeof(bool); + ::_decode(base_ino, payload, off); + ::_decode(no_base_dir, payload, off); + ::_decode(no_base_dentry, payload, off); + ::_decode(flag_error_dn, payload, off); + ::_decode(flag_error_ino, payload, off); + ::_decode(flag_error_dir, payload, off); + ::_decode(error_dentry, payload, off); + ::_decode(dir_auth_hint, payload, off); // dirs int n; @@ -212,14 +229,14 @@ class MDiscoverReply : public Message { } } void encode_payload() { - payload.append((char*)&base_ino, sizeof(base_ino)); - payload.append((char*)&no_base_dir, sizeof(bool)); - payload.append((char*)&no_base_dentry, sizeof(bool)); - // payload.append((char*)&flag_forward, sizeof(bool)); - payload.append((char*)&flag_error_dn, sizeof(bool)); - - _encode(error_dentry, payload); - payload.append((char*)&flag_error_dir, sizeof(bool)); + ::_encode(base_ino, payload); + ::_encode(no_base_dir, payload); + ::_encode(no_base_dentry, payload); + ::_encode(flag_error_dn, payload); + ::_encode(flag_error_ino, payload); + ::_encode(flag_error_dir, payload); + ::_encode(error_dentry, payload); + ::_encode(dir_auth_hint, payload); // dirs int n = dirs.size(); diff --git a/trunk/ceph/messages/MExportDir.h b/trunk/ceph/messages/MExportDir.h index 8fdda89466b1e..d8bc40838e8d4 100644 --- a/trunk/ceph/messages/MExportDir.h +++ b/trunk/ceph/messages/MExportDir.h @@ -19,22 +19,25 @@ class MExportDir : public Message { - inodeno_t ino; + dirfrag_t dirfrag; 
list dirstate; // a bl for reach dir - list exports; + list bounds; public: MExportDir() {} - MExportDir(inodeno_t dirino) : + MExportDir(dirfrag_t df) : Message(MSG_MDS_EXPORTDIR), - ino(dirino) { + dirfrag(df) { } virtual char *get_type_name() { return "Ex"; } + void print(ostream& o) { + o << "export(" << dirfrag << ")"; + } - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } list& get_dirstate() { return dirstate; } - list& get_exports() { return exports; } + list& get_bounds() { return bounds; } void add_dir(bufferlist& dir) { dirstate.push_back(dir); @@ -42,20 +45,20 @@ class MExportDir : public Message { void set_dirstate(const list& ls) { dirstate = ls; } - void add_export(inodeno_t dirino) { - exports.push_back(dirino); + void add_export(dirfrag_t df) { + bounds.push_back(df); } virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - ::_decode(exports, payload, off); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); + ::_decode(bounds, payload, off); ::_decode(dirstate, payload, off); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - ::_encode(exports, payload); + payload.append((char*)&dirfrag, sizeof(dirfrag)); + ::_encode(bounds, payload); ::_encode(dirstate, payload); } diff --git a/trunk/ceph/messages/MExportDirAck.h b/trunk/ceph/messages/MExportDirAck.h index 35691bf94e2a7..5ae7b6e9642f7 100644 --- a/trunk/ceph/messages/MExportDirAck.h +++ b/trunk/ceph/messages/MExportDirAck.h @@ -11,30 +11,33 @@ * */ - #ifndef __MEXPORTDIRACK_H #define __MEXPORTDIRACK_H #include "MExportDir.h" class MExportDirAck : public Message { - inodeno_t ino; + dirfrag_t dirfrag; public: - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } MExportDirAck() {} - MExportDirAck(MExportDir *req) : - Message(MSG_MDS_EXPORTDIRACK) { - ino = req->get_ino(); - } + MExportDirAck(dirfrag_t i) : + 
Message(MSG_MDS_EXPORTDIRACK), dirfrag(i) { } + virtual char *get_type_name() { return "ExAck"; } - - virtual void decode_payload(crope& s) { - s.copy(0, sizeof(ino), (char*)&ino); + void print(ostream& o) { + o << "export_ack(" << dirfrag << ")"; + } + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); + virtual void encode_payload() { + payload.append((char*)&dirfrag, sizeof(dirfrag)); } }; diff --git a/trunk/ceph/messages/MExportDirCancel.h b/trunk/ceph/messages/MExportDirCancel.h new file mode 100644 index 0000000000000..72afe2818a3ad --- /dev/null +++ b/trunk/ceph/messages/MExportDirCancel.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MEXPORTDIRCANCEL_H +#define __MEXPORTDIRCANCEL_H + +#include "msg/Message.h" +#include "mds/CInode.h" +#include "include/types.h" + +class MExportDirCancel : public Message { + dirfrag_t dirfrag; + + public: + dirfrag_t get_dirfrag() { return dirfrag; } + + MExportDirCancel() {} + MExportDirCancel(dirfrag_t df) : + Message(MSG_MDS_EXPORTDIRCANCEL), + dirfrag(df) { } + + virtual char *get_type_name() { return "ExCancel"; } + void print(ostream& o) { + o << "export_cancel(" << dirfrag << ")"; + } + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); + } + + virtual void encode_payload() { + payload.append((char*)&dirfrag, sizeof(dirfrag)); + } +}; + +#endif diff --git a/trunk/ceph/messages/MExportDirDiscover.h b/trunk/ceph/messages/MExportDirDiscover.h index 24f77036455f4..a4e609e70cd72 100644 --- a/trunk/ceph/messages/MExportDirDiscover.h +++ b/trunk/ceph/messages/MExportDirDiscover.h @@ -11,7 +11,6 @@ * */ - #ifndef __MEXPORTDIRDISCOVER_H #define __MEXPORTDIRDISCOVER_H @@ -20,31 +19,40 @@ #include "include/types.h" class MExportDirDiscover : public Message { - inodeno_t ino; + dirfrag_t dirfrag; string path; public: - inodeno_t get_ino() { return ino; } + inodeno_t get_ino() { return dirfrag.ino; } + dirfrag_t get_dirfrag() { return dirfrag; } string& get_path() { return path; } - MExportDirDiscover() {} - MExportDirDiscover(CInode *in) : - Message(MSG_MDS_EXPORTDIRDISCOVER) { - in->make_path(path); - ino = in->ino(); + bool started; + + MExportDirDiscover() : + Message(MSG_MDS_EXPORTDIRDISCOVER), + started(false) { } + MExportDirDiscover(CDir *dir) : + Message(MSG_MDS_EXPORTDIRDISCOVER), + started(false) { + dir->get_inode()->make_path(path); + dirfrag = dir->dirfrag(); } virtual char *get_type_name() { return "ExDis"; } + void print(ostream& o) { + o << "export_discover(" << dirfrag << " " << path << ")"; + } - - virtual void decode_payload(crope& s, int& off) { - 
s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - _unrope(path, s, off); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); + ::_decode(path, payload, off); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - _rope(path, s); + virtual void encode_payload() { + payload.append((char*)&dirfrag, sizeof(dirfrag)); + ::_encode(path, payload); } }; diff --git a/trunk/ceph/messages/MExportDirDiscoverAck.h b/trunk/ceph/messages/MExportDirDiscoverAck.h index a25e3b46672e3..44fa0872cf5de 100644 --- a/trunk/ceph/messages/MExportDirDiscoverAck.h +++ b/trunk/ceph/messages/MExportDirDiscoverAck.h @@ -11,7 +11,6 @@ * */ - #ifndef __MEXPORTDIRDISCOVERACK_H #define __MEXPORTDIRDISCOVERACK_H @@ -20,32 +19,40 @@ #include "include/types.h" class MExportDirDiscoverAck : public Message { - inodeno_t ino; + dirfrag_t dirfrag; bool success; public: - inodeno_t get_ino() { return ino; } + inodeno_t get_ino() { return dirfrag.ino; } + dirfrag_t get_dirfrag() { return dirfrag; } bool is_success() { return success; } MExportDirDiscoverAck() {} - MExportDirDiscoverAck(inodeno_t ino, bool success=true) : - Message(MSG_MDS_EXPORTDIRDISCOVERACK) { - this->ino = ino; - this->success = false; - } - virtual char *get_type_name() { return "ExDisA"; } + MExportDirDiscoverAck(dirfrag_t df, bool s=true) : + Message(MSG_MDS_EXPORTDIRDISCOVERACK), + dirfrag(df), + success(s) { } + virtual char *get_type_name() { return "ExDisA"; } + void print(ostream& o) { + o << "export_discover_ack(" << dirfrag; + if (success) + o << " success)"; + else + o << " failure)"; + } - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(success), (char*)&success); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); + payload.copy(off, 
sizeof(success), (char*)&success); off += sizeof(success); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - s.append((char*)&success, sizeof(success)); + virtual void encode_payload() { + payload.append((char*)&dirfrag, sizeof(dirfrag)); + payload.append((char*)&success, sizeof(success)); } }; diff --git a/trunk/ceph/messages/MExportDirFinish.h b/trunk/ceph/messages/MExportDirFinish.h index 89c9e5290c4b2..3ebe2ae759a3b 100644 --- a/trunk/ceph/messages/MExportDirFinish.h +++ b/trunk/ceph/messages/MExportDirFinish.h @@ -11,31 +11,33 @@ * */ - #ifndef __MEXPORTDIRFINISH_H #define __MEXPORTDIRFINISH_H -#include "MExportDir.h" +#include "msg/Message.h" class MExportDirFinish : public Message { - inodeno_t ino; + dirfrag_t dirfrag; public: - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } MExportDirFinish() {} - MExportDirFinish(inodeno_t ino) : + MExportDirFinish(dirfrag_t dirfrag) : Message(MSG_MDS_EXPORTDIRFINISH) { - this->ino = ino; + this->dirfrag = dirfrag; } virtual char *get_type_name() { return "ExFin"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + void print(ostream& o) { + o << "export_finish(" << dirfrag << ")"; + } + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); + virtual void encode_payload() { + payload.append((char*)&dirfrag, sizeof(dirfrag)); } }; diff --git a/trunk/ceph/messages/MExportDirNotify.h b/trunk/ceph/messages/MExportDirNotify.h index 9d6532cad478c..686d8052d396a 100644 --- a/trunk/ceph/messages/MExportDirNotify.h +++ b/trunk/ceph/messages/MExportDirNotify.h @@ -11,7 +11,6 @@ * */ - #ifndef __MEXPORTDIRNOTIFY_H #define __MEXPORTDIRNOTIFY_H @@ -20,91 +19,65 @@ using namespace std; class MExportDirNotify : public Message { - int new_auth; - 
int old_auth; - inodeno_t ino; - - list exports; // bounds; these dirs are _not_ included (tho the inodes are) - list subdirs; + dirfrag_t base; + bool ack; + pair old_auth, new_auth; + list bounds; // bounds; these dirs are _not_ included (tho the dirfragdes are) public: - inodeno_t get_ino() { return ino; } - int get_new_auth() { return new_auth; } - int get_old_auth() { return old_auth; } - list& get_exports() { return exports; } - list::iterator subdirs_begin() { return subdirs.begin(); } - list::iterator subdirs_end() { return subdirs.end(); } - int num_subdirs() { return subdirs.size(); } + dirfrag_t get_dirfrag() { return base; } + pair get_old_auth() { return old_auth; } + pair get_new_auth() { return new_auth; } + bool wants_ack() { return ack; } + list& get_bounds() { return bounds; } MExportDirNotify() {} - MExportDirNotify(inodeno_t ino, int old_auth, int new_auth) : - Message(MSG_MDS_EXPORTDIRNOTIFY) { - this->ino = ino; - this->old_auth = old_auth; - this->new_auth = new_auth; - } + MExportDirNotify(dirfrag_t i, bool a, pair oa, pair na) : + Message(MSG_MDS_EXPORTDIRNOTIFY), + base(i), ack(a), old_auth(oa), new_auth(na) { } + virtual char *get_type_name() { return "ExNot"; } + void print(ostream& o) { + o << "export_notify(" << base; + o << " " << old_auth << " -> " << new_auth; + if (ack) + o << " ack)"; + else + o << " no ack)"; + } - void copy_subdirs(list& s) { - this->subdirs = s; + void copy_bounds(list& ex) { + this->bounds = ex; } - void copy_exports(list& ex) { - this->exports = ex; + void copy_bounds(set& ex) { + for (set::iterator i = ex.begin(); + i != ex.end(); ++i) + bounds.push_back(*i); } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(int), (char*)&new_auth); - off += sizeof(int); - s.copy(off, sizeof(int), (char*)&old_auth); - off += sizeof(int); - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // notify - int n; - s.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; 
i& ex) { + for (set::iterator i = ex.begin(); + i != ex.end(); ++i) + bounds.push_back((*i)->dirfrag()); } - virtual void encode_payload(crope& s) { - s.append((char*)&new_auth, sizeof(int)); - s.append((char*)&old_auth, sizeof(int)); - s.append((char*)&ino, sizeof(ino)); - // notify - int n = exports.size(); - s.append((char*)&n, sizeof(int)); - for (list::iterator it = exports.begin(); - it != exports.end(); - it++) { - inodeno_t ino = *it; - s.append((char*)&ino, sizeof(ino)); - } - - // subdirs - n = subdirs.size(); - s.append((char*)&n, sizeof(int)); - for (list::iterator it = subdirs.begin(); - it != subdirs.end(); - it++) { - inodeno_t ino = *it; - s.append((char*)&ino, sizeof(ino)); - } + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(base), (char*)&base); + off += sizeof(base); + payload.copy(off, sizeof(ack), (char*)&ack); + off += sizeof(ack); + payload.copy(off, sizeof(old_auth), (char*)&old_auth); + off += sizeof(old_auth); + payload.copy(off, sizeof(new_auth), (char*)&new_auth); + off += sizeof(new_auth); + ::_decode(bounds, payload, off); + } + virtual void encode_payload() { + payload.append((char*)&base, sizeof(base)); + payload.append((char*)&ack, sizeof(ack)); + payload.append((char*)&old_auth, sizeof(old_auth)); + payload.append((char*)&new_auth, sizeof(new_auth)); + ::_encode(bounds, payload); } }; diff --git a/trunk/ceph/messages/MExportDirNotifyAck.h b/trunk/ceph/messages/MExportDirNotifyAck.h index 3179fd4f544f1..f53100a2e053c 100644 --- a/trunk/ceph/messages/MExportDirNotifyAck.h +++ b/trunk/ceph/messages/MExportDirNotifyAck.h @@ -11,7 +11,6 @@ * */ - #ifndef __MEXPORTDIRNOTIFYACK_H #define __MEXPORTDIRNOTIFYACK_H @@ -20,25 +19,29 @@ using namespace std; class MExportDirNotifyAck : public Message { - inodeno_t ino; + dirfrag_t dirfrag; public: - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } MExportDirNotifyAck() {} - MExportDirNotifyAck(inodeno_t ino) : + 
MExportDirNotifyAck(dirfrag_t dirfrag) : Message(MSG_MDS_EXPORTDIRNOTIFYACK) { - this->ino = ino; + this->dirfrag = dirfrag; } virtual char *get_type_name() { return "ExNotA"; } + void print(ostream& o) { + o << "export_notify_ack(" << dirfrag << ")"; + } - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); + virtual void encode_payload() { + payload.append((char*)&dirfrag, sizeof(dirfrag)); } }; diff --git a/trunk/ceph/messages/MExportDirPrep.h b/trunk/ceph/messages/MExportDirPrep.h index 6967d950afad9..fce07df7958b7 100644 --- a/trunk/ceph/messages/MExportDirPrep.h +++ b/trunk/ceph/messages/MExportDirPrep.h @@ -20,39 +20,46 @@ #include "include/types.h" class MExportDirPrep : public Message { - inodeno_t ino; + dirfrag_t dirfrag; /* nested export discover payload. not all inodes will have dirs; they may require a separate discover. dentries are the links to each inode. 
dirs map includes base dir (ino) */ - list exports; + list bounds; list inodes; - map inode_dirino; + map inode_dirfrag; map inode_dentry; - map dirs; + map > frags_by_ino; + map dirfrags; + + set bystanders; bool b_did_assim; public: - inodeno_t get_ino() { return ino; } - list& get_exports() { return exports; } + dirfrag_t get_dirfrag() { return dirfrag; } + list& get_bounds() { return bounds; } list& get_inodes() { return inodes; } - inodeno_t get_containing_dirino(inodeno_t ino) { - return inode_dirino[ino]; + list& get_inode_dirfrags(inodeno_t ino) { + return frags_by_ino[ino]; + } + dirfrag_t get_containing_dirfrag(inodeno_t ino) { + return inode_dirfrag[ino]; } string& get_dentry(inodeno_t ino) { return inode_dentry[ino]; } - bool have_dir(inodeno_t ino) { - return dirs.count(ino); + bool have_dirfrag(dirfrag_t df) { + return dirfrags.count(df); } - CDirDiscover* get_dir(inodeno_t ino) { - return dirs[ino]; + CDirDiscover* get_dirfrag_discover(dirfrag_t df) { + return dirfrags[df]; } + set &get_bystanders() { return bystanders; } bool did_assim() { return b_did_assim; } void mark_assim() { b_did_assim = true; } @@ -60,57 +67,50 @@ class MExportDirPrep : public Message { MExportDirPrep() { b_did_assim = false; } - MExportDirPrep(CInode *in) : - Message(MSG_MDS_EXPORTDIRPREP) { - ino = in->ino(); - b_did_assim = false; - } + MExportDirPrep(dirfrag_t df) : + Message(MSG_MDS_EXPORTDIRPREP), + dirfrag(df), + b_did_assim(false) { } ~MExportDirPrep() { for (list::iterator iit = inodes.begin(); iit != inodes.end(); iit++) delete *iit; - for (map::iterator dit = dirs.begin(); - dit != dirs.end(); + for (map::iterator dit = dirfrags.begin(); + dit != dirfrags.end(); dit++) delete dit->second; } virtual char *get_type_name() { return "ExP"; } + void print(ostream& o) { + o << "export_prep(" << dirfrag << ")"; + } - - - - void add_export(inodeno_t dirino) { - exports.push_back( dirino ); + void add_export(dirfrag_t df) { + bounds.push_back( df ); } - void 
add_inode(inodeno_t dirino, const string& dentry, CInodeDiscover *in) { + void add_inode(dirfrag_t df, const string& dentry, CInodeDiscover *in) { inodes.push_back(in); - inode_dirino.insert(pair(in->get_ino(), dirino)); - inode_dentry.insert(pair(in->get_ino(), dentry)); + inode_dirfrag[in->get_ino()] = df; + inode_dentry[in->get_ino()] = dentry; } - void add_dir(CDirDiscover *dir) { - dirs.insert(pair(dir->get_ino(), dir)); + void add_dirfrag(CDirDiscover *dir) { + dirfrags[dir->get_dirfrag()] = dir; + frags_by_ino[dir->get_dirfrag().ino].push_back(dir->get_dirfrag().frag); + } + void add_bystander(int who) { + bystanders.insert(who); } - virtual void decode_payload() { int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); + + ::_decode(bounds, payload, off); - // exports - int ne; - payload.copy(off, sizeof(int), (char*)&ne); - off += sizeof(int); - for (int i=0; iget_ino()] = d; // dir ino - inodeno_t dino; - payload.copy(off, sizeof(dino), (char*)&dino); - off += sizeof(dino); - inode_dirino[in->get_ino()] = dino; + dirfrag_t df; + payload.copy(off, sizeof(df), (char*)&df); + off += sizeof(df); + inode_dirfrag[in->get_ino()] = df; + + // child frags + ::_decode(frags_by_ino[in->get_ino()], payload, off); } // dirs @@ -140,22 +143,16 @@ class MExportDirPrep : public Message { for (int i=0; i_decode(payload, off); - dirs[dir->get_ino()] = dir; + dirfrags[dir->get_dirfrag()] = dir; } + + ::_decode(bystanders, payload, off); } virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - - // exports - int ne = exports.size(); - payload.append((char*)&ne, sizeof(int)); - for (list::iterator it = exports.begin(); - it != exports.end(); - it++) { - inodeno_t ino = *it; - payload.append((char*)&ino, sizeof(ino)); - } + payload.append((char*)&dirfrag, sizeof(dirfrag)); + + ::_encode(bounds, payload); // inodes int ni = inodes.size(); @@ -169,17 
+166,22 @@ class MExportDirPrep : public Message { _encode(inode_dentry[(*iit)->get_ino()], payload); // dir ino - inodeno_t ino = inode_dirino[(*iit)->get_ino()]; - payload.append((char*)&ino, sizeof(ino)); + dirfrag_t df = inode_dirfrag[(*iit)->get_ino()]; + payload.append((char*)&df, sizeof(df)); + + // child frags + ::_encode(frags_by_ino[(*iit)->get_ino()], payload); } // dirs - int nd = dirs.size(); + int nd = dirfrags.size(); payload.append((char*)&nd, sizeof(int)); - for (map::iterator dit = dirs.begin(); - dit != dirs.end(); + for (map::iterator dit = dirfrags.begin(); + dit != dirfrags.end(); dit++) dit->second->_encode(payload); + + ::_encode(bystanders, payload); } }; diff --git a/trunk/ceph/messages/MExportDirPrepAck.h b/trunk/ceph/messages/MExportDirPrepAck.h index c32d7255c5074..38735d263f3e8 100644 --- a/trunk/ceph/messages/MExportDirPrepAck.h +++ b/trunk/ceph/messages/MExportDirPrepAck.h @@ -11,7 +11,6 @@ * */ - #ifndef __MEXPORTDIRPREPACK_H #define __MEXPORTDIRPREPACK_H @@ -19,25 +18,28 @@ #include "include/types.h" class MExportDirPrepAck : public Message { - inodeno_t ino; + dirfrag_t dirfrag; public: - inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } MExportDirPrepAck() {} - MExportDirPrepAck(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRPREPACK) { - this->ino = ino; - } + MExportDirPrepAck(dirfrag_t df) : + Message(MSG_MDS_EXPORTDIRPREPACK), + dirfrag(df) { } virtual char *get_type_name() { return "ExPAck"; } + void print(ostream& o) { + o << "export_prep_ack(" << dirfrag << ")"; + } - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); + virtual void encode_payload() { + payload.append((char*)&dirfrag, sizeof(dirfrag)); } }; diff --git 
a/trunk/ceph/messages/MExportDirWarning.h b/trunk/ceph/messages/MExportDirWarning.h index 6f2fdf55dde4f..40303adf9a139 100644 --- a/trunk/ceph/messages/MExportDirWarning.h +++ b/trunk/ceph/messages/MExportDirWarning.h @@ -11,7 +11,6 @@ * */ - #ifndef __MEXPORTDIRWARNING_H #define __MEXPORTDIRWARNING_H @@ -21,24 +20,29 @@ class MExportDirWarning : public Message { inodeno_t ino; + int new_dir_auth; public: inodeno_t get_ino() { return ino; } + int get_new_dir_auth() { return new_dir_auth; } MExportDirWarning() {} - MExportDirWarning(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRWARNING) { - this->ino = ino; - } + MExportDirWarning(inodeno_t i, int nda) : + Message(MSG_MDS_EXPORTDIRWARNING), + ino(i), new_dir_auth(nda) {} virtual char *get_type_name() { return "ExW"; } - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); + payload.copy(off, sizeof(new_dir_auth), (char*)&new_dir_auth); + off += sizeof(new_dir_auth); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); + virtual void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&new_dir_auth, sizeof(new_dir_auth)); } }; diff --git a/trunk/ceph/messages/MExportDirWarningAck.h b/trunk/ceph/messages/MExportDirWarningAck.h new file mode 100644 index 0000000000000..2cefa478befdf --- /dev/null +++ b/trunk/ceph/messages/MExportDirWarningAck.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MEXPORTDIRWARNINGACK_H +#define __MEXPORTDIRWARNINGACK_H + +#include "msg/Message.h" +#include "mds/CInode.h" +#include "include/types.h" + +class MExportDirWarningAck : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MExportDirWarningAck() {} + MExportDirWarningAck(inodeno_t i) : + Message(MSG_MDS_EXPORTDIRWARNINGACK), + ino(i) {} + + virtual char *get_type_name() { return "ExWAck"; } + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + } + virtual void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + } +}; + +#endif diff --git a/trunk/ceph/messages/MFailure.h b/trunk/ceph/messages/MFailure.h deleted file mode 100644 index 0ec53f6e36b18..0000000000000 --- a/trunk/ceph/messages/MFailure.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MFAILURE_H -#define __MFAILURE_H - -#include "msg/Message.h" - - -class MFailure : public Message { - public: - entity_name_t failed; - entity_inst_t inst; - - MFailure() {} - MFailure(entity_name_t f, entity_inst_t& i) : - Message(MSG_FAILURE), - failed(f), inst(i) {} - - entity_name_t get_failed() { return failed; } - entity_inst_t& get_inst() { return inst; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(failed), (char*)&failed); - off += sizeof(failed); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } - void encode_payload() { - payload.append((char*)&failed, sizeof(failed)); - payload.append((char*)&inst, sizeof(inst)); - } - - virtual char *get_type_name() { return "fail"; } -}; - -#endif diff --git a/trunk/ceph/messages/MFailureAck.h b/trunk/ceph/messages/MFailureAck.h deleted file mode 100644 index ec0036dcdac55..0000000000000 --- a/trunk/ceph/messages/MFailureAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MFAILUREACK_H -#define __MFAILUREACK_H - -#include "MFailure.h" - - -class MFailureAck : public Message { - public: - entity_name_t failed; - MFailureAck(MFailure *m) : Message(MSG_FAILURE_ACK) { - this->failed = m->get_failed(); - } - MFailureAck() {} - - entity_name_t get_failed() { return failed; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(failed), (char*)&failed); - off += sizeof(failed); - } - virtual void encode_payload(crope& s) { - s.append((char*)&failed, sizeof(failed)); - } - - virtual char *get_type_name() { return "faila"; } -}; - -#endif diff --git a/trunk/ceph/messages/MHeartbeat.h b/trunk/ceph/messages/MHeartbeat.h index 55455f406ef18..5af9d9dd2d0bf 100644 --- a/trunk/ceph/messages/MHeartbeat.h +++ b/trunk/ceph/messages/MHeartbeat.h @@ -40,40 +40,18 @@ class MHeartbeat : public Message { virtual char *get_type_name() { return "HB"; } - virtual void decode_payload(crope& s, int& off) { - s.copy(off,sizeof(load), (char*)&load); + virtual void decode_payload() { + int off = 0; + payload.copy(off,sizeof(load), (char*)&load); off += sizeof(load); - s.copy(off, sizeof(beat), (char*)&beat); + payload.copy(off, sizeof(beat), (char*)&beat); off += sizeof(beat); - - int n; - s.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - while (n--) { - int f; - s.copy(off, sizeof(f), (char*)&f); - off += sizeof(f); - float v; - s.copy(off, sizeof(v), (char*)&v); - off += sizeof(v); - import_map[f] = v; - } + ::_decode(import_map, payload, off); } - virtual void encode_payload(crope& s) { - s.append((char*)&load, sizeof(load)); - s.append((char*)&beat, sizeof(beat)); - - int n = import_map.size(); - s.append((char*)&n, sizeof(n)); - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - int f = it->first; - s.append((char*)&f, sizeof(f)); - float v = it->second; - s.append((char*)&v, sizeof(v)); - } - + virtual void encode_payload() { + payload.append((char*)&load, sizeof(load)); + 
payload.append((char*)&beat, sizeof(beat)); + ::_encode(import_map, payload); } }; diff --git a/trunk/ceph/messages/MInodeExpire.h b/trunk/ceph/messages/MInodeExpire.h deleted file mode 100644 index 637f378324022..0000000000000 --- a/trunk/ceph/messages/MInodeExpire.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MINODEEXPIRE_H -#define __MINODEEXPIRE_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MInodeExpire_st; - -class MInodeExpire : public Message { - MInodeExpire_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MInodeExpire() {} - MInodeExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_INODEEXPIRE) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "InEx";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MInodeFileCaps.h b/trunk/ceph/messages/MInodeFileCaps.h index 5bd51be0e347b..397f7f86307dd 100644 --- a/trunk/ceph/messages/MInodeFileCaps.h +++ b/trunk/ceph/messages/MInodeFileCaps.h @@ -37,18 +37,19 @@ class MInodeFileCaps : public Message { virtual char *get_type_name() { return "Icap";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(from), (char*)&from); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(from), (char*)&from); off += sizeof(from); - 
s.copy(off, sizeof(ino), (char*)&ino); + payload.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); - s.copy(off, sizeof(caps), (char*)&caps); + payload.copy(off, sizeof(caps), (char*)&caps); off += sizeof(caps); } - virtual void encode_payload(crope& s) { - s.append((char*)&from, sizeof(from)); - s.append((char*)&ino, sizeof(ino)); - s.append((char*)&caps, sizeof(caps)); + virtual void encode_payload() { + payload.append((char*)&from, sizeof(from)); + payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&caps, sizeof(caps)); } }; diff --git a/trunk/ceph/messages/MInodeLink.h b/trunk/ceph/messages/MInodeLink.h index feefc4ea21c7b..3ca0ad6df5438 100644 --- a/trunk/ceph/messages/MInodeLink.h +++ b/trunk/ceph/messages/MInodeLink.h @@ -15,32 +15,67 @@ #ifndef __MINODELINK_H #define __MINODELINK_H -typedef struct { - inodeno_t ino; - int from; -} MInodeLink_st; - class MInodeLink : public Message { - MInodeLink_st st; +public: + static const int OP_PREPARE = 1; + static const int OP_AGREE = 2; + static const int OP_COMMIT = 3; + static const int OP_ACK = 4; + static const int OP_ROLLBACK = 5; + + const char *get_opname(int o) { + switch (o) { + case OP_PREPARE: return "prepare"; + case OP_AGREE: return "agree"; + case OP_COMMIT: return "commit"; + case OP_ACK: return "ack"; + case OP_ROLLBACK: return "rollback"; + default: assert(0); + } + } + +private: + struct _st { + inodeno_t ino; // inode to nlink++ + metareqid_t reqid; // relevant request + int op; // see above + bool inc; // true == ++, false == -- - public: + utime_t ctime; + } st; + +public: inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } + metareqid_t get_reqid() { return st.reqid; } + int get_op() { return st.op; } + bool get_inc() { return st.inc; } + + utime_t get_ctime() { return st.ctime; } + void set_ctime(utime_t ct) { st.ctime = ct; } MInodeLink() {} - MInodeLink(inodeno_t ino, int from) : + MInodeLink(int op, inodeno_t ino, bool inc, metareqid_t ri) : 
Message(MSG_MDS_INODELINK) { + st.op = op; st.ino = ino; - st.from = from; + st.inc = inc; + st.reqid = ri; + } + + virtual char *get_type_name() { return "inode_link"; } + void print(ostream& o) { + o << "inode_link(" << get_opname(st.op) + << " " << st.ino + << " nlink" << (st.inc ? "++":"--") + << " " << st.reqid << ")"; } - virtual char *get_type_name() { return "InL";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); + virtual void decode_payload() { + int off = 0; + _decoderaw(st, payload, off); } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); + virtual void encode_payload() { + _encode(st, payload); } }; diff --git a/trunk/ceph/messages/MInodeLinkAck.h b/trunk/ceph/messages/MInodeLinkAck.h index 987b70741edcb..f2b984e7b6249 100644 --- a/trunk/ceph/messages/MInodeLinkAck.h +++ b/trunk/ceph/messages/MInodeLinkAck.h @@ -35,12 +35,13 @@ class MInodeLinkAck : public Message { } virtual char *get_type_name() { return "InLA";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(st), (char*)&st); off += sizeof(st); } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); + virtual void encode_payload() { + payload.append((char*)&st,sizeof(st)); } }; diff --git a/trunk/ceph/messages/MInodeUnlink.h b/trunk/ceph/messages/MInodeUnlink.h index e1aa463153c26..32eb4eb20e9ea 100644 --- a/trunk/ceph/messages/MInodeUnlink.h +++ b/trunk/ceph/messages/MInodeUnlink.h @@ -35,12 +35,13 @@ class MInodeUnlink : public Message { } virtual char *get_type_name() { return "InUl";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(st), (char*)&st); off += sizeof(st); } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); + 
virtual void encode_payload() { + payload.append((char*)&st,sizeof(st)); } }; diff --git a/trunk/ceph/messages/MInodeUnlinkAck.h b/trunk/ceph/messages/MInodeUnlinkAck.h index 283c016f2bec9..45371e6158702 100644 --- a/trunk/ceph/messages/MInodeUnlinkAck.h +++ b/trunk/ceph/messages/MInodeUnlinkAck.h @@ -32,12 +32,13 @@ class MInodeUnlinkAck : public Message { } virtual char *get_type_name() { return "InUlA";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(st), (char*)&st); off += sizeof(st); } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); + virtual void encode_payload() { + payload.append((char*)&st,sizeof(st)); } }; diff --git a/trunk/ceph/messages/MInodeUpdate.h b/trunk/ceph/messages/MInodeUpdate.h deleted file mode 100644 index bbab924089aa5..0000000000000 --- a/trunk/ceph/messages/MInodeUpdate.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEUPDATE_H -#define __MINODEUPDATE_H - -#include "msg/Message.h" - -#include -using namespace std; - -class MInodeUpdate : public Message { - int nonce; - crope inode_basic_state; - - public: - inodeno_t get_ino() { - inodeno_t ino; - inode_basic_state.copy(0, sizeof(inodeno_t), (char*)&ino); - return ino; - } - int get_nonce() { return nonce; } - - MInodeUpdate() {} - MInodeUpdate(CInode *in, int nonce) : - Message(MSG_MDS_INODEUPDATE) { - inode_basic_state = in->encode_basic_state(); - this->nonce = nonce; - } - virtual char *get_type_name() { return "Iup"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(int), (char*)&nonce); - off += sizeof(int); - size_t len; - s.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - inode_basic_state = s.substr(off, len); - off += len; - } - virtual void encode_payload(crope& s) { - s.append((char*)&nonce, sizeof(int)); - size_t len = inode_basic_state.length(); - s.append((char*)&len, sizeof(len)); - s.append(inode_basic_state); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MLock.h b/trunk/ceph/messages/MLock.h index 1d22d297d79d4..774e4ddf70b20 100644 --- a/trunk/ceph/messages/MLock.h +++ b/trunk/ceph/messages/MLock.h @@ -17,39 +17,32 @@ #include "msg/Message.h" -#define LOCK_OTYPE_IHARD 1 -#define LOCK_OTYPE_IFILE 2 -#define LOCK_OTYPE_DIR 3 -#define LOCK_OTYPE_DN 4 // for replicas -#define LOCK_AC_SYNC 0 -#define LOCK_AC_MIXED 1 -#define LOCK_AC_LOCK 2 +#define LOCK_AC_SYNC -1 +#define LOCK_AC_MIXED -2 +#define LOCK_AC_LOCK -3 -#define LOCK_AC_REQXLOCKACK 9 // req dentry xlock -#define LOCK_AC_REQXLOCKNAK 10 // req dentry xlock -#define LOCK_AC_LOCKNAK 12 // for dentry xlock +#define LOCK_AC_REQXLOCKACK -4 // req dentry xlock +#define LOCK_AC_REQXLOCKNAK -5 // req dentry xlock - -#define LOCK_AC_FOR_REPLICA(a) ((a) <= 10) -#define LOCK_AC_FOR_AUTH(a) ((a) >= 11) +#define LOCK_AC_SCATTER -6 // for auth +#define LOCK_AC_SYNCACK 1 +#define LOCK_AC_MIXEDACK 
2 +#define LOCK_AC_LOCKACK 3 -#define LOCK_AC_SYNCACK 13 -#define LOCK_AC_MIXEDACK 14 -#define LOCK_AC_LOCKACK 15 - +#define LOCK_AC_REQREAD 4 +#define LOCK_AC_REQWRITE 5 -#define LOCK_AC_REQREAD 19 -#define LOCK_AC_REQWRITE 20 +#define LOCK_AC_REQXLOCK 6 +#define LOCK_AC_UNXLOCK 7 +#define LOCK_AC_FINISH 8 -#define LOCK_AC_REQXLOCK 21 -#define LOCK_AC_REQXLOCKC 22 // create if necessary -#define LOCK_AC_UNXLOCK 23 -#define lock_ac_name(x) +#define LOCK_AC_FOR_REPLICA(a) ((a) < 0) +#define LOCK_AC_FOR_AUTH(a) ((a) > 0) class MLock : public Message { @@ -58,18 +51,22 @@ class MLock : public Message { char otype; // lock object type inodeno_t ino; // ino ref, or possibly + dirfrag_t dirfrag; string dn; // dentry name - bufferlist data; // and possibly some data - string path; // possibly a path too (for dentry lock discovers) + + metareqid_t reqid; // for remote lock requests + + bufferlist data; // and possibly some data public: inodeno_t get_ino() { return ino; } + dirfrag_t get_dirfrag() { return dirfrag; } string& get_dn() { return dn; } bufferlist& get_data() { return data; } int get_asker() { return asker; } int get_action() { return action; } int get_otype() { return otype; } - string& get_path() { return path; } + metareqid_t get_reqid() { return reqid; } MLock() {} MLock(int action, int asker) : @@ -77,27 +74,45 @@ class MLock : public Message { this->action = action; this->asker = asker; } + MLock(SimpleLock *lock, int action, int asker) : + Message(MSG_MDS_LOCK) { + this->otype = lock->get_type(); + lock->get_parent()->set_mlock_info(this); + this->action = action; + this->asker = asker; + } + MLock(SimpleLock *lock, int action, int asker, bufferlist& bl) : + Message(MSG_MDS_LOCK) { + this->otype = lock->get_type(); + lock->get_parent()->set_mlock_info(this); + this->action = action; + this->asker = asker; + data.claim(bl); + } virtual char *get_type_name() { return "ILock"; } void set_ino(inodeno_t ino, char ot) { otype = ot; this->ino = ino; } - void 
set_dirino(inodeno_t dirino) { - otype = LOCK_OTYPE_DIR; + void set_ino(inodeno_t ino) { this->ino = ino; } - void set_dn(inodeno_t dirino, string& dn) { + /* + void set_dirfrag(dirfrag_t df) { + otype = LOCK_OTYPE_DIR; + this->dirfrag = df; + } + */ + void set_dn(dirfrag_t df, const string& dn) { otype = LOCK_OTYPE_DN; - this->ino = dirino; + this->dirfrag = df; this->dn = dn; } + void set_reqid(metareqid_t ri) { reqid = ri; } void set_data(bufferlist& data) { this->data.claim( data ); } - void set_path(const string& p) { - path = p; - } void decode_payload() { int off = 0; @@ -109,17 +124,20 @@ class MLock : public Message { off += sizeof(otype); payload.copy(off,sizeof(ino), (char*)&ino); off += sizeof(ino); + payload.copy(off,sizeof(dirfrag), (char*)&dirfrag); + off += sizeof(dirfrag); + ::_decode(reqid, payload, off); ::_decode(dn, payload, off); - ::_decode(path, payload, off); ::_decode(data, payload, off); } virtual void encode_payload() { payload.append((char*)&action, sizeof(action)); payload.append((char*)&asker, sizeof(asker)); payload.append((char*)&otype, sizeof(otype)); - payload.append((char*)&ino, sizeof(inodeno_t)); + payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&dirfrag, sizeof(dirfrag)); + ::_encode(reqid, payload); ::_encode(dn, payload); - ::_encode(path, payload); ::_encode(data, payload); } diff --git a/trunk/ceph/messages/MMDSCacheRejoin.h b/trunk/ceph/messages/MMDSCacheRejoin.h index 2789e30844743..dc220aacece84 100644 --- a/trunk/ceph/messages/MMDSCacheRejoin.h +++ b/trunk/ceph/messages/MMDSCacheRejoin.h @@ -22,41 +22,161 @@ class MMDSCacheRejoin : public Message { public: - map inodes; // ino -> caps_wanted - set dirs; - map > dentries; // dir -> (dentries...) + static const int OP_REJOIN = 1; // replica -> auth, i exist. and maybe my lock state. + static const int OP_ACK = 3; // auth -> replica, here is your lock state. 
+ static const int OP_MISSING = 4; // auth -> replica, i am missing these items + static const int OP_FULL = 5; // replica -> auth, here is the full object. + static const char *get_opname(int op) { + switch (op) { + case OP_REJOIN: return "rejoin"; + case OP_ACK: return "ack"; + case OP_MISSING: return "missing"; + case OP_FULL: return "full"; + default: assert(0); + } + } + + // -- types -- + struct inode_strong { + __int32_t caps_wanted; + __int32_t nonce; + __int32_t authlock; + __int32_t linklock; + __int32_t dirfragtreelock; + __int32_t filelock; + inode_strong() {} + inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0) : + caps_wanted(cw), + nonce(n), + authlock(a), linklock(l), dirfragtreelock(dft), filelock(f) { } + }; + struct inode_full { + inode_t inode; + string symlink; + fragtree_t dirfragtree; + inode_full() {} + inode_full(const inode_t& i, const string& s, const fragtree_t& f) : + inode(i), symlink(s), dirfragtree(f) {} + inode_full(bufferlist& bl, int& off) { + ::_decode(inode, bl, off); + ::_decode(symlink, bl, off); + ::_decode(dirfragtree, bl, off); + } + void _encode(bufferlist& bl) { + ::_encode(inode, bl); + ::_encode(symlink, bl); + ::_encode(dirfragtree, bl); + } + }; + + struct dirfrag_strong { + __int32_t nonce; + dirfrag_strong() {} + dirfrag_strong(int n) : nonce(n) {} + }; + struct dn_strong { + __int32_t nonce; + __int32_t lock; + dn_strong() {} + dn_strong(int n, int l) : nonce(n), lock(l) {} + }; + + // -- data -- + __int32_t op; + + set weak_inodes; + map strong_inodes; + list full_inodes; + map > xlocked_inodes; + + set weak_dirfrags; + map strong_dirfrags; + + map > weak_dentries; + map > strong_dentries; + map > xlocked_dentries; MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {} + MMDSCacheRejoin(int o) : + Message(MSG_MDS_CACHEREJOIN), + op(o) {} char *get_type_name() { return "cache_rejoin"; } - void print(ostream& out) { - out << "cache_rejoin" << endl; + out << "cache_rejoin " << get_opname(op); } - void 
add_dir(inodeno_t dirino) { - dirs.insert(dirino); + // -- builders -- + // inodes + void add_weak_inode(inodeno_t ino) { + weak_inodes.insert(ino); } - void add_dentry(inodeno_t dirino, const string& dn) { - dentries[dirino].insert(dn); + void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f) { + strong_inodes[i] = inode_strong(n, cw, a, l, dft, f); } - void add_inode(inodeno_t ino, int cw) { - inodes[ino] = cw; + void add_full_inode(inode_t &i, const string& s, const fragtree_t &f) { + full_inodes.push_back(inode_full(i, s, f)); + } + void add_inode_xlock(inodeno_t ino, int lt, const metareqid_t& ri) { + xlocked_inodes[ino][lt] = ri; } + // dirfrags + void add_weak_dirfrag(dirfrag_t df) { + weak_dirfrags.insert(df); + } + void add_strong_dirfrag(dirfrag_t df, int n) { + strong_dirfrags[df] = dirfrag_strong(n); + } + + // dentries + void add_weak_dentry(dirfrag_t df, const string& dname) { + weak_dentries[df].insert(dname); + } + void add_strong_dentry(dirfrag_t df, const string& dname, int n, int ls) { + strong_dentries[df][dname] = dn_strong(n, ls); + } + void add_dentry_xlock(dirfrag_t df, const string& dname, const metareqid_t& ri) { + xlocked_dentries[df][dname] = ri; + } + + // -- encoding -- void encode_payload() { - ::_encode(inodes, payload); - ::_encode(dirs, payload); - for (set::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_encode(dentries[*p], payload); + ::_encode(op, payload); + ::_encode(weak_inodes, payload); + ::_encode(strong_inodes, payload); + + __uint32_t nfull = full_inodes.size(); + ::_encode(nfull, payload); + for (list::iterator p = full_inodes.begin(); p != full_inodes.end(); ++p) + p->_encode(payload); + + ::_encode(xlocked_inodes, payload); + ::_encode(weak_dirfrags, payload); + ::_encode(strong_dirfrags, payload); + ::_encode(weak_dentries, payload); + ::_encode(strong_dentries, payload); + ::_encode(xlocked_dentries, payload); } void decode_payload() { int off = 0; - ::_decode(inodes, payload, off); - 
::_decode(dirs, payload, off); - for (set::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_decode(dentries[*p], payload, off); + ::_decode(op, payload, off); + ::_decode(weak_inodes, payload, off); + ::_decode(strong_inodes, payload, off); + + __uint32_t nfull; + ::_decode(nfull, payload, off); + for (unsigned i=0; i inodes; - map > dentries; - list dirs; + map > dentries; + list dirfrags; MMDSCacheRejoinAck() : Message(MSG_MDS_CACHEREJOINACK) {} char *get_type_name() { return "cache_rejoin_ack"; } - void print(ostream& out) { - out << "cache_rejoin" << endl; + void add_dirfrag(dirfrag_t dirfrag, int nonce) { + dirfrags.push_back(dirinfo(dirfrag,nonce)); } - - void add_dir(inodeno_t dirino, int nonce) { - dirs.push_back(dirinfo(dirino,nonce)); - } - void add_dentry(inodeno_t dirino, const string& dn, int ls, int nonce) { - dentries[dirino][dn] = dninfo(ls, nonce); + void add_dentry(dirfrag_t dirfrag, const string& dn, int ls, int nonce) { + dentries[dirfrag][dn] = dninfo(ls, nonce); } - void add_inode(inodeno_t ino, int hl, int fl, int nonce) { - inodes.push_back(inodeinfo(ino, hl, fl, nonce)); + void add_inode(inodeno_t ino, int authl, int linkl, int dftl, int fl, int nonce) { + inodes.push_back(inodeinfo(ino, authl, linkl, dftl, fl, nonce)); } void encode_payload() { ::_encode(inodes, payload); - ::_encode(dirs, payload); - for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_encode(dentries[p->dirino], payload); + ::_encode(dirfrags, payload); + for (list::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) + ::_encode(dentries[p->dirfrag], payload); } void decode_payload() { int off = 0; ::_decode(inodes, payload, off); - ::_decode(dirs, payload, off); - for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_decode(dentries[p->dirino], payload, off); - } + ::_decode(dirfrags, payload, off); + for (list::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) + ::_decode(dentries[p->dirfrag], payload, off); + } }; #endif diff 
--git a/trunk/ceph/messages/MMDSImportMap.h b/trunk/ceph/messages/MMDSImportMap.h index 22774cdabc2ec..abf728878e6c6 100644 --- a/trunk/ceph/messages/MMDSImportMap.h +++ b/trunk/ceph/messages/MMDSImportMap.h @@ -21,8 +21,8 @@ class MMDSImportMap : public Message { public: - map > imap; - map > ambiguous_imap; + map > imap; + map > ambiguous_imap; MMDSImportMap() : Message(MSG_MDS_IMPORTMAP) {} @@ -34,14 +34,14 @@ class MMDSImportMap : public Message { << " imports)"; } - void add_import(inodeno_t im) { + void add_import(dirfrag_t im) { imap[im].clear(); } - void add_import_export(inodeno_t im, inodeno_t ex) { - imap[im].insert(ex); + void add_import_export(dirfrag_t im, dirfrag_t ex) { + imap[im].push_back(ex); } - void add_ambiguous_import(inodeno_t im, const set& m) { + void add_ambiguous_import(dirfrag_t im, const list& m) { ambiguous_imap[im] = m; } diff --git a/trunk/ceph/messages/MMDSMap.h b/trunk/ceph/messages/MMDSMap.h index 701ba9a050cc3..7781d6cfd68b6 100644 --- a/trunk/ceph/messages/MMDSMap.h +++ b/trunk/ceph/messages/MMDSMap.h @@ -18,7 +18,6 @@ #include "msg/Message.h" #include "mds/MDSMap.h" - class MMDSMap : public Message { public: /* @@ -59,20 +58,21 @@ class MMDSMap : public Message { mm->encode(encoded); } + char *get_type_name() { return "mdsmap"; } + void print(ostream& out) { + out << "mdsmap(e " << epoch << ")"; + } // marshalling - virtual void decode_payload() { + void decode_payload() { int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); + ::_decode(epoch, payload, off); ::_decode(encoded, payload, off); } - virtual void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); + void encode_payload() { + ::_encode(epoch, payload); ::_encode(encoded, payload); } - - virtual char *get_type_name() { return "mdsmap"; } }; #endif diff --git a/trunk/ceph/messages/MNSLookup.h b/trunk/ceph/messages/MMonCommand.h similarity index 53% rename from trunk/ceph/messages/MNSLookup.h rename to 
trunk/ceph/messages/MMonCommand.h index b6df663a15a88..f8a41b7b3e03b 100644 --- a/trunk/ceph/messages/MNSLookup.h +++ b/trunk/ceph/messages/MMonCommand.h @@ -11,36 +11,37 @@ * */ - -#ifndef __MNSLOOKUP_H -#define __MNSLOOKUP_H +#ifndef __MMONCOMMAND_H +#define __MMONCOMMAND_H #include "msg/Message.h" -class MNSLookup : public Message { - entity_name_t entity; +#include +using std::vector; +class MMonCommand : public Message { public: - MNSLookup() {} - MNSLookup(entity_name_t e) : - Message(MSG_NS_LOOKUP) { - entity = e; + vector cmd; + + MMonCommand() : Message(MSG_MON_COMMAND) {} + + virtual char *get_type_name() { return "mon_command"; } + void print(ostream& o) { + o << "mon_command("; + for (unsigned i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MMONCOMMANDACK_H +#define __MMONCOMMANDACK_H + +#include "msg/Message.h" + +class MMonCommandAck : public Message { + public: + int r; + string rs; + + MMonCommandAck() : Message(MSG_MON_COMMAND_ACK) {} + MMonCommandAck(int _r, string s) : Message(MSG_MON_COMMAND_ACK), + r(_r), rs(s) { } + + virtual char *get_type_name() { return "mon_command"; } + void print(ostream& o) { + o << "mon_command_ack(" << r << " " << rs << ")"; + } + + void encode_payload() { + payload.append((char*)&r, sizeof(r)); + ::_encode(rs, payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(r), (char*)&r); + off += sizeof(r); + ::_decode(rs, payload, off); + } +}; + +#endif diff --git a/trunk/ceph/messages/MNSConnect.h b/trunk/ceph/messages/MNSConnect.h deleted file mode 100644 index 28150f79d8476..0000000000000 --- a/trunk/ceph/messages/MNSConnect.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 
2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MNSCONNECT_H -#define __MNSCONNECT_H - -#include "msg/Message.h" -#include "msg/tcp.h" - -class MNSConnect : public Message { - tcpaddr_t tcpaddr; - - public: - MNSConnect() {} - MNSConnect(tcpaddr_t t) : - Message(MSG_NS_CONNECT) { - tcpaddr = t; - } - - char *get_type_name() { return "NSCon"; } - - tcpaddr_t& get_addr() { return tcpaddr; } - - void encode_payload() { - payload.append((char*)&tcpaddr, sizeof(tcpaddr)); - } - void decode_payload() { - payload.copy(0, sizeof(tcpaddr), (char*)&tcpaddr); - } -}; - - -#endif - diff --git a/trunk/ceph/messages/MNSConnectAck.h b/trunk/ceph/messages/MNSConnectAck.h deleted file mode 100644 index 696b13f2a41e6..0000000000000 --- a/trunk/ceph/messages/MNSConnectAck.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSCONNECTACK_H -#define __MNSCONNECTACK_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSConnectAck : public Message { - int rank; - int inst; - - public: - MNSConnectAck() {} - MNSConnectAck(int r, int g=0) : - Message(MSG_NS_CONNECTACK) { - rank = r; - inst = g; - } - - char *get_type_name() { return "NSConA"; } - - int get_rank() { return rank; } - int get_inst() { return inst; } - - void encode_payload() { - payload.append((char*)&rank, sizeof(rank)); - payload.append((char*)&inst, sizeof(inst)); - } - void decode_payload() { - unsigned off = 0; - payload.copy(off, sizeof(rank), (char*)&rank); - off += sizeof(rank); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } -}; - - -#endif - diff --git a/trunk/ceph/messages/MNSFailure.h b/trunk/ceph/messages/MNSFailure.h deleted file mode 100644 index 405bfcfd2dacb..0000000000000 --- a/trunk/ceph/messages/MNSFailure.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSFAILURE_H -#define __MNSFAILURE_H - -#include "msg/Message.h" -#include "msg/tcp.h" - -class MNSFailure : public Message { - //msg_addr_t entity; - entity_inst_t inst; - - public: - MNSFailure() {} - MNSFailure(entity_inst_t& i) : - Message(MSG_NS_FAILURE), - //entity(w), - inst(i) {} - - char *get_type_name() { return "NSFail"; } - - //msg_addr_t &get_entity() { return entity; } - entity_inst_t &get_inst() { return inst; } - - void encode_payload() { - //payload.append((char*)&entity, sizeof(entity)); - payload.append((char*)&inst, sizeof(inst)); - } - void decode_payload() { - unsigned off = 0; - //payload.copy(off, sizeof(entity), (char*)&entity); - //off += sizeof(entity); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } -}; - - -#endif - diff --git a/trunk/ceph/messages/MNSLookupReply.h b/trunk/ceph/messages/MNSLookupReply.h deleted file mode 100644 index e6720eba397d8..0000000000000 --- a/trunk/ceph/messages/MNSLookupReply.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSLOOKUPREPLY_H -#define __MNSLOOKUPREPLY_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSLookupReply : public Message { - public: - map entity_map; - - public: - MNSLookupReply() {} - MNSLookupReply(MNSLookup *m) : - Message(MSG_NS_LOOKUPREPLY) { - } - - char *get_type_name() { return "NSLookR"; } - - void encode_payload() { - ::_encode(entity_map, payload); - } - void decode_payload() { - int off = 0; - ::_decode(entity_map, payload, off); - } -}; - - -#endif - diff --git a/trunk/ceph/messages/MNSRegister.h b/trunk/ceph/messages/MNSRegister.h deleted file mode 100644 index 01d29a2315fa9..0000000000000 --- a/trunk/ceph/messages/MNSRegister.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSREGISTER_H -#define __MNSREGISTER_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSRegister : public Message { - entity_name_t addr; - int rank; - long tid; - - public: - MNSRegister() {} - MNSRegister(entity_name_t a, int r, int ti) : - Message(MSG_NS_REGISTER) { - addr = a; - rank = r; - tid = ti; - } - - char *get_type_name() { return "NSReg"; } - - entity_name_t get_entity() { return addr; } - int get_rank() { return rank; } - long get_tid() { return tid; } - - void encode_payload() { - payload.append((char*)&addr, sizeof(addr)); - payload.append((char*)&rank, sizeof(rank)); - payload.append((char*)&tid, sizeof(tid)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(addr), (char*)&addr); - off += sizeof(addr); - payload.copy(off, sizeof(rank), (char*)&rank); - off += sizeof(rank); - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - } -}; - - -#endif - diff --git a/trunk/ceph/messages/MNSRegisterAck.h b/trunk/ceph/messages/MNSRegisterAck.h deleted file mode 100644 index fa2f88ac10e82..0000000000000 --- a/trunk/ceph/messages/MNSRegisterAck.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSREGISTERACK_H -#define __MNSREGISTERACK_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSRegisterAck : public Message { - entity_name_t entity; - long tid; - - public: - MNSRegisterAck() {} - MNSRegisterAck(long t, entity_name_t e) : - Message(MSG_NS_REGISTERACK) { - entity = e; - tid = t; - } - - char *get_type_name() { return "NSRegA"; } - - entity_name_t get_entity() { return entity; } - long get_tid() { return tid; } - - void encode_payload() { - payload.append((char*)&entity, sizeof(entity)); - payload.append((char*)&tid, sizeof(tid)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(entity), (char*)&entity); - off += sizeof(entity); - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - } -}; - - -#endif - diff --git a/trunk/ceph/messages/MOSDOp.h b/trunk/ceph/messages/MOSDOp.h index d16b02e8aad51..ff82774e7b1e9 100644 --- a/trunk/ceph/messages/MOSDOp.h +++ b/trunk/ceph/messages/MOSDOp.h @@ -75,11 +75,9 @@ public: private: struct { - long pcid; - // who's asking? entity_inst_t client; - reqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. + osdreqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. 
// for replication tid_t rep_tid; @@ -101,6 +99,7 @@ private: bool want_ack; bool want_commit; + bool retry_attempt; } st; bufferlist data; @@ -109,7 +108,7 @@ private: friend class MOSDOpReply; public: - const reqid_t& get_reqid() { return st.reqid; } + const osdreqid_t& get_reqid() { return st.reqid; } const tid_t get_client_tid() { return st.reqid.tid; } int get_client_inc() { return st.reqid.inc; } @@ -120,6 +119,9 @@ private: const tid_t get_rep_tid() { return st.rep_tid; } void set_rep_tid(tid_t t) { st.rep_tid = t; } + bool get_retry_attempt() const { return st.retry_attempt; } + void set_retry_attempt(bool a) { st.retry_attempt = a; } + const object_t get_oid() { return st.oid; } const pg_t get_pg() { return st.pg; } const epoch_t get_map_epoch() { return st.map_epoch; } @@ -156,10 +158,6 @@ private: size_t get_data_len() { return data.length(); } - // keep a pcid (procedure call id) to match up request+reply - void set_pcid(long pcid) { this->st.pcid = pcid; } - long get_pcid() { return st.pcid; } - MOSDOp(entity_inst_t asker, int inc, long tid, object_t oid, pg_t pg, epoch_t mapepoch, int op) : Message(MSG_OSD_OP) { @@ -206,14 +204,13 @@ private: ::_encode(data, payload); } - virtual char *get_type_name() { return "oop"; } - + virtual char *get_type_name() { return "osd_op"; } void print(ostream& out) { out << "osd_op(" << st.reqid << " " << get_opname(st.op) - << " " << st.oid - //<< " " << this - << ")"; + << " " << st.oid; + if (st.retry_attempt) out << " RETRY"; + out << ")"; } }; diff --git a/trunk/ceph/messages/MOSDOpReply.h b/trunk/ceph/messages/MOSDOpReply.h index 05106e096d176..5cb62f077ff2f 100644 --- a/trunk/ceph/messages/MOSDOpReply.h +++ b/trunk/ceph/messages/MOSDOpReply.h @@ -31,7 +31,7 @@ class MOSDOpReply : public Message { struct { // req - reqid_t reqid; + osdreqid_t reqid; tid_t rep_tid; @@ -56,7 +56,7 @@ class MOSDOpReply : public Message { map attrset; public: - const reqid_t& get_reqid() { return st.reqid; } + const osdreqid_t& 
get_reqid() { return st.reqid; } long get_tid() { return st.reqid.tid; } long get_rep_tid() { return st.rep_tid; } object_t get_oid() { return st.oid; } @@ -137,9 +137,13 @@ public: void print(ostream& out) { out << "osd_op_reply(" << st.reqid << " " << MOSDOp::get_opname(st.op) - << " " << st.oid << " = " << st.result - //<< " " << this - << ")"; + << " " << st.oid; + if (st.commit) + out << " commit"; + else + out << " ack"; + out << " = " << st.result; + out << ")"; } }; diff --git a/trunk/ceph/messages/MPing.h b/trunk/ceph/messages/MPing.h index 65b65a738cd66..a4e8d02ce19c7 100644 --- a/trunk/ceph/messages/MPing.h +++ b/trunk/ceph/messages/MPing.h @@ -27,12 +27,13 @@ class MPing : public Message { } MPing() : Message(MSG_PING) {} - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(seq), (char*)&seq); + virtual void decode_payload() { + int off = 0; + payload.copy(0, sizeof(seq), (char*)&seq); off += sizeof(seq); } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); + virtual void encode_payload() { + payload.append((char*)&seq, sizeof(seq)); } virtual char *get_type_name() { return "ping"; } diff --git a/trunk/ceph/messages/MPingAck.h b/trunk/ceph/messages/MPingAck.h index 0ee385b7a2b80..88472f16766a9 100644 --- a/trunk/ceph/messages/MPingAck.h +++ b/trunk/ceph/messages/MPingAck.h @@ -26,12 +26,13 @@ class MPingAck : public Message { this->seq = p->seq; } - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(seq), (char*)&seq); + virtual void decode_payload() { + int off = 0; + payload.copy(0, sizeof(seq), (char*)&seq); off += sizeof(seq); } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); + virtual void encode_payload() { + payload.append((char*)&seq, sizeof(seq)); } virtual char *get_type_name() { return "pinga"; } diff --git a/trunk/ceph/messages/MRenameAck.h b/trunk/ceph/messages/MRenameAck.h index 14843cef5f616..d81e357d6bf6a 100644 --- 
a/trunk/ceph/messages/MRenameAck.h +++ b/trunk/ceph/messages/MRenameAck.h @@ -30,12 +30,13 @@ class MRenameAck : public Message { } virtual char *get_type_name() { return "RnAck";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); + virtual void encode_payload() { + payload.append((char*)&ino,sizeof(ino)); } }; diff --git a/trunk/ceph/messages/MRenameNotify.h b/trunk/ceph/messages/MRenameNotify.h index bc32300b82e3a..600e430048696 100644 --- a/trunk/ceph/messages/MRenameNotify.h +++ b/trunk/ceph/messages/MRenameNotify.h @@ -53,27 +53,28 @@ class MRenameNotify : public Message { } virtual char *get_type_name() { return "Rnot";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); + payload.copy(off, sizeof(srcdirino), (char*)&srcdirino); off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); + payload.copy(off, sizeof(destdirino), (char*)&destdirino); off += sizeof(destdirino); - _unrope(srcname, s, off); - _unrope(destname, s, off); - _unrope(destdirpath, s, off); - s.copy(off, sizeof(srcauth), (char*)&srcauth); + ::_decode(srcname, payload, off); + ::_decode(destname, payload, off); + ::_decode(destdirpath, payload, off); + payload.copy(off, sizeof(srcauth), (char*)&srcauth); off += sizeof(srcauth); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(destname, s); - _rope(destdirpath, s); - s.append((char*)&srcauth, sizeof(srcauth)); + virtual 
void encode_payload() { + payload.append((char*)&ino,sizeof(ino)); + payload.append((char*)&srcdirino,sizeof(srcdirino)); + payload.append((char*)&destdirino,sizeof(destdirino)); + ::_encode(srcname, payload); + ::_encode(destname, payload); + ::_encode(destdirpath, payload); + payload.append((char*)&srcauth, sizeof(srcauth)); } }; diff --git a/trunk/ceph/messages/MRenameNotifyAck.h b/trunk/ceph/messages/MRenameNotifyAck.h index d1a01339cd97a..e03d6d16c3498 100644 --- a/trunk/ceph/messages/MRenameNotifyAck.h +++ b/trunk/ceph/messages/MRenameNotifyAck.h @@ -28,12 +28,13 @@ class MRenameNotifyAck : public Message { } virtual char *get_type_name() { return "RnotA";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); + virtual void encode_payload() { + payload.append((char*)&ino,sizeof(ino)); } }; diff --git a/trunk/ceph/messages/MRenamePrep.h b/trunk/ceph/messages/MRenamePrep.h index 1af798c674489..acee82f3a757a 100644 --- a/trunk/ceph/messages/MRenamePrep.h +++ b/trunk/ceph/messages/MRenamePrep.h @@ -56,29 +56,30 @@ class MRenamePrep : public Message { } virtual char *get_type_name() { return "RnP";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(initiator), (char*)&initiator); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(initiator), (char*)&initiator); off += sizeof(initiator); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); + payload.copy(off, sizeof(srcdirino), (char*)&srcdirino); off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); + payload.copy(off, sizeof(destdirino), (char*)&destdirino); off += sizeof(destdirino); - _unrope(srcname, s, off); - _unrope(srcpath, s, off); - _unrope(destname, s, off); - _unrope(destpath, s, off); - 
s.copy(off, sizeof(srcauth), (char*)&srcauth); + ::_decode(srcname, payload, off); + ::_decode(srcpath, payload, off); + ::_decode(destname, payload, off); + ::_decode(destpath, payload, off); + payload.copy(off, sizeof(srcauth), (char*)&srcauth); off += sizeof(srcauth); } - virtual void encode_payload(crope& s) { - s.append((char*)&initiator,sizeof(initiator)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(srcpath, s); - _rope(destname, s); - _rope(destpath, s); - s.append((char*)&srcauth, sizeof(srcauth)); + virtual void encode_payload() { + payload.append((char*)&initiator,sizeof(initiator)); + payload.append((char*)&srcdirino,sizeof(srcdirino)); + payload.append((char*)&destdirino,sizeof(destdirino)); + ::_encode(srcname, payload); + ::_encode(srcpath, payload); + ::_encode(destname, payload); + ::_encode(destpath, payload); + payload.append((char*)&srcauth, sizeof(srcauth)); } }; diff --git a/trunk/ceph/messages/MRenameReq.h b/trunk/ceph/messages/MRenameReq.h index b70e96a38203b..10d5f6135e43a 100644 --- a/trunk/ceph/messages/MRenameReq.h +++ b/trunk/ceph/messages/MRenameReq.h @@ -52,27 +52,28 @@ class MRenameReq : public Message { } virtual char *get_type_name() { return "RnReq";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(initiator), (char*)&initiator); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(initiator), (char*)&initiator); off += sizeof(initiator); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); + payload.copy(off, sizeof(srcdirino), (char*)&srcdirino); off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); + payload.copy(off, sizeof(destdirino), (char*)&destdirino); off += sizeof(destdirino); - _unrope(srcname, s, off); - _unrope(destname, s, off); - _unrope(destpath, s, off); - s.copy(off, sizeof(destauth), (char*)&destauth); + ::_decode(srcname, payload, off); + 
::_decode(destname, payload, off); + ::_decode(destpath, payload, off); + payload.copy(off, sizeof(destauth), (char*)&destauth); off += sizeof(destauth); } - virtual void encode_payload(crope& s) { - s.append((char*)&initiator,sizeof(initiator)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(destname, s); - _rope(destpath, s); - s.append((char*)&destauth, sizeof(destauth)); + virtual void encode_payload() { + payload.append((char*)&initiator,sizeof(initiator)); + payload.append((char*)&srcdirino,sizeof(srcdirino)); + payload.append((char*)&destdirino,sizeof(destdirino)); + ::_encode(srcname, payload); + ::_encode(destname, payload); + ::_encode(destpath, payload); + payload.append((char*)&destauth, sizeof(destauth)); } }; diff --git a/trunk/ceph/messages/MRenameWarning.h b/trunk/ceph/messages/MRenameWarning.h index 85463dfd2c179..3dc7ad8733332 100644 --- a/trunk/ceph/messages/MRenameWarning.h +++ b/trunk/ceph/messages/MRenameWarning.h @@ -28,12 +28,13 @@ class MRenameWarning : public Message { } virtual char *get_type_name() { return "RnW";} - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); + virtual void encode_payload() { + payload.append((char*)&ino,sizeof(ino)); } }; diff --git a/trunk/ceph/mon/ClientMonitor.cc b/trunk/ceph/mon/ClientMonitor.cc index 8ab59504d4bae..08163ee540aa3 100644 --- a/trunk/ceph/mon/ClientMonitor.cc +++ b/trunk/ceph/mon/ClientMonitor.cc @@ -15,10 +15,10 @@ #include "ClientMonitor.h" #include "Monitor.h" #include "MDSMonitor.h" +#include "OSDMonitor.h" -#include "messages/MClientBoot.h" -#include "messages/MMDSMap.h" -//#include "messages/MMDSFailure.h" +#include "messages/MClientMount.h" +#include 
"messages/MClientUnmount.h" #include "common/Timer.h" @@ -34,43 +34,67 @@ void ClientMonitor::dispatch(Message *m) { switch (m->get_type()) { - case MSG_CLIENT_BOOT: - handle_client_boot((MClientBoot*)m); + case MSG_CLIENT_MOUNT: + handle_client_mount((MClientMount*)m); + break; + + case MSG_CLIENT_UNMOUNT: + handle_client_unmount((MClientUnmount*)m); break; - /* - case MSG_client_FAILURE: - handle_client_failure((MClientFailure*)m); - break; - */ - + default: assert(0); } } -void ClientMonitor::handle_client_boot(MClientBoot *m) +void ClientMonitor::handle_client_mount(MClientMount *m) { - dout(7) << "client_boot from " << m->get_source() << " at " << m->get_source_inst() << endl; + dout(7) << "client_mount from " << m->get_source_inst() << endl; assert(m->get_source().is_client()); int from = m->get_source().num(); // choose a client id if (from < 0 || - (client_map.count(m->get_source()) && client_map[m->get_source()] != m->get_source_addr())) { - from = ++num_clients; - dout(10) << "client_boot assigned client" << from << endl; + (client_map.count(from) && + client_map[from] != m->get_source_addr())) { + from = num_clients++; + dout(10) << "client_mount assigned client" << from << endl; } - - client_map[MSG_ADDR_CLIENT(from)] = m->get_source_addr(); - + + client_map[from] = m->get_source_addr(); + // reply with latest mds map entity_inst_t to = m->get_source_inst(); to.name = MSG_ADDR_CLIENT(from); mon->mdsmon->send_latest(to); + mon->osdmon->send_latest(to); delete m; } +void ClientMonitor::handle_client_unmount(MClientUnmount *m) +{ + dout(7) << "client_unmount from " << m->get_source() + << " at " << m->get_source_inst() << endl; + assert(m->get_source().is_client()); + int from = m->get_source().num(); + + if (client_map.count(from)) { + client_map.erase(from); + + if (client_map.empty() && + g_conf.mds_shutdown_on_last_unmount) { + dout(1) << "last client unmounted" << endl; + mon->do_stop(); + } + } + + // reply with (same) unmount message to ack + 
mon->messenger->send_message(m, m->get_source_inst()); +} + + + /* void ClientMonitor::handle_mds_shutdown(Message *m) { diff --git a/trunk/ceph/mon/ClientMonitor.h b/trunk/ceph/mon/ClientMonitor.h index c3ea253bafc48..9ecd180b1fe74 100644 --- a/trunk/ceph/mon/ClientMonitor.h +++ b/trunk/ceph/mon/ClientMonitor.h @@ -32,14 +32,15 @@ class ClientMonitor : public Dispatcher { private: int num_clients; - map client_map; + map client_map; void bcast_latest_mds(); //void accept_pending(); // accept pending, new map. //void send_incremental(epoch_t since, msg_addr_t dest); - void handle_client_boot(class MClientBoot *m); + void handle_client_mount(class MClientMount *m); + void handle_client_unmount(class MClientUnmount *m); public: ClientMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l), diff --git a/trunk/ceph/mon/MDSMonitor.cc b/trunk/ceph/mon/MDSMonitor.cc index 24beadf85e9f0..bc65e0828916e 100644 --- a/trunk/ceph/mon/MDSMonitor.cc +++ b/trunk/ceph/mon/MDSMonitor.cc @@ -20,8 +20,12 @@ #include "messages/MMDSGetMap.h" #include "messages/MMDSBeacon.h" +#include "messages/MMonCommand.h" + #include "common/Timer.h" +#include + #include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " @@ -69,7 +73,7 @@ void MDSMonitor::election_finished() void MDSMonitor::create_initial() { mdsmap.epoch = 0; // until everyone boots - mdsmap.ctime = g_clock.now(); + mdsmap.created = g_clock.now(); mdsmap.encode(encoded_map); @@ -94,7 +98,7 @@ void MDSMonitor::save_map() void MDSMonitor::print_map() { - dout(7) << "print_map epoch " << mdsmap.get_epoch() << endl; + dout(7) << "print_map epoch " << mdsmap.get_epoch() << " num_mds " << g_conf.num_mds << endl; entity_inst_t blank; set all; mdsmap.get_mds_set(all); @@ -108,7 +112,6 @@ void MDSMonitor::print_map() } } - void MDSMonitor::issue_map() { mdsmap.inc_epoch(); @@ -126,9 +129,40 @@ void MDSMonitor::issue_map() } +void MDSMonitor::handle_command(MMonCommand *m, int& r, string& rs) +{ + stringstream ss; + if (m->cmd.size() > 1) { + if (m->cmd[1] == "stop" && m->cmd.size() > 2) { + int who = atoi(m->cmd[2].c_str()); + if (mdsmap.is_active(who)) { + r = 0; + ss << "telling mds" << who << " to stop"; + getline(ss,rs); + + // hack + mdsmap.mds_state[who] = MDSMap::STATE_STOPPING; + issue_map(); + + } else { + ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; + getline(ss,rs); + } + } + else if (m->cmd[1] == "setnum" && m->cmd.size() > 2) { + g_conf.num_mds = atoi(m->cmd[2].c_str()); + ss << "g_conf.num_mds = " << g_conf.num_mds << endl; + getline(ss,rs); + print_map(); + } + } +} + + + void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) { - dout(7) << "mds_beacon " << *m + dout(12) << "mds_beacon " << *m << " from " << m->get_source() << " " << m->get_source_inst() << endl; @@ -195,12 +229,21 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from); mdsmap.mds_inc[from]++; + // someone (new) joined the cluster + mdsmap.same_inst_since = mdsmap.epoch+1; + // starting -> creating|starting|replay if (mdsmap.is_degraded() && !mdsmap.is_failed(from)) 
{ dout(10) << "mds_beacon currently degraded, mds" << from << " will be standby" << endl; state = MDSMap::STATE_STANDBY; + } + /* + else if (from >= g_conf.num_mds) { + dout(10) << "mds_beacon already have " << g_conf.num_mds << " mds's, standby (increase with 'mds setnum xxx')" << endl; + state = MDSMap::STATE_STANDBY; } + */ else if (state == MDSMap::STATE_STARTING) { if (mdsmap.is_failed(from)) { dout(10) << "mds_beacon will recover mds" << from << endl; @@ -236,6 +279,11 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) << " -> " << MDSMap::get_state_name(state) << endl; + // did someone leave the cluster? + if (state == MDSMap::STATE_OUT && mdsmap.mds_state[from] != MDSMap::STATE_OUT) + mdsmap.same_inst_since = mdsmap.epoch+1; + + // change the state mdsmap.mds_state[from] = state; if (mdsmap.is_up(from)) mdsmap.mds_state_seq[from] = seq; @@ -333,6 +381,7 @@ void MDSMonitor::tick() break; case MDSMap::STATE_REPLAY: + case MDSMap::STATE_RESOLVE: case MDSMap::STATE_REJOIN: case MDSMap::STATE_ACTIVE: case MDSMap::STATE_STOPPING: @@ -368,3 +417,15 @@ void MDSMonitor::tick() } } } + + +void MDSMonitor::do_stop() +{ + for (map::iterator p = mdsmap.mds_state.begin(); + p != mdsmap.mds_state.end(); + ++p) + if (mdsmap.is_active(p->first)) + mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING; + + issue_map(); +} diff --git a/trunk/ceph/mon/MDSMonitor.h b/trunk/ceph/mon/MDSMonitor.h index c3bc3d165883c..00df0da443788 100644 --- a/trunk/ceph/mon/MDSMonitor.h +++ b/trunk/ceph/mon/MDSMonitor.h @@ -82,6 +82,10 @@ class MDSMonitor : public Dispatcher { void send_latest(entity_inst_t dest); + void handle_command(class MMonCommand *m, int& r, string& rs); + + void do_stop(); + }; #endif diff --git a/trunk/ceph/mon/Monitor.cc b/trunk/ceph/mon/Monitor.cc index 8bf1d2f0cfe21..418f330ddd7ac 100644 --- a/trunk/ceph/mon/Monitor.cc +++ b/trunk/ceph/mon/Monitor.cc @@ -25,6 +25,8 @@ 
#include "messages/MPing.h" #include "messages/MPingAck.h" #include "messages/MGenericMessage.h" +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" #include "messages/MMonPaxos.h" @@ -161,6 +163,39 @@ void Monitor::lose_election(int l) } +void Monitor::handle_command(MMonCommand *m) +{ + dout(0) << "handle_command " << *m << endl; + + int r = -1; + string rs = "unrecognized command"; + + if (!m->cmd.empty()) { + if (m->cmd[0] == "stop") { + r = 0; + rs = "stopping"; + do_stop(); + } + else if (m->cmd[0] == "mds") { + mdsmon->handle_command(m, r, rs); + } + else if (m->cmd[0] == "osd") { + + } + } + + // reply + messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); + delete m; +} + + +void Monitor::do_stop() +{ + dout(0) << "do_stop -- shutting down" << endl; + mdsmon->do_stop(); +} + void Monitor::dispatch(Message *m) { @@ -178,6 +213,10 @@ void Monitor::dispatch(Message *m) osdmon->dispatch(m); break; + case MSG_MON_COMMAND: + handle_command((MMonCommand*)m); + break; + // OSDs case MSG_OSD_GETMAP: @@ -202,7 +241,8 @@ void Monitor::dispatch(Message *m) break; // clients - case MSG_CLIENT_BOOT: + case MSG_CLIENT_MOUNT: + case MSG_CLIENT_UNMOUNT: clientmon->dispatch(m); break; diff --git a/trunk/ceph/mon/Monitor.h b/trunk/ceph/mon/Monitor.h index 6554ad36239b1..34c30668613a0 100644 --- a/trunk/ceph/mon/Monitor.h +++ b/trunk/ceph/mon/Monitor.h @@ -96,6 +96,7 @@ protected: // messages void handle_shutdown(Message *m); void handle_ping_ack(class MPingAck *m); + void handle_command(class MMonCommand *m); friend class OSDMonitor; friend class MDSMonitor; @@ -134,6 +135,8 @@ protected: void dispatch(Message *m); void tick(); + void do_stop(); + }; #endif diff --git a/trunk/ceph/mon/OSDMonitor.h b/trunk/ceph/mon/OSDMonitor.h index bf393f17d9f7a..d8350b4ee62b4 100644 --- a/trunk/ceph/mon/OSDMonitor.h +++ b/trunk/ceph/mon/OSDMonitor.h @@ -102,6 +102,10 @@ private: void mark_all_down(); + void send_latest(entity_inst_t i) { + 
send_full(i); + } + void fake_osd_failure(int osd, bool down); void fake_osdmap_update(); void fake_reorg(); diff --git a/trunk/ceph/msg/FakeMessenger.cc b/trunk/ceph/msg/FakeMessenger.cc index 2aa6c6b06b75b..6c563cf58078f 100644 --- a/trunk/ceph/msg/FakeMessenger.cc +++ b/trunk/ceph/msg/FakeMessenger.cc @@ -157,6 +157,7 @@ int fakemessenger_do_loop_2() dout(1) << "---- " << m->get_dest() << " <- " << m->get_source() << " ---- " << *m + << " ---- " << m << endl; if (g_conf.fakemessenger_serialize) { @@ -310,11 +311,13 @@ int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fr // queue if (directory.count(inst.addr)) { - dout(1) << "--> " << get_myname() << " -> " << inst.name << " " << *m << endl; + dout(1) << "--> " << get_myname() << " -> " << inst.name << " " << *m << " -- " << m + << endl; directory[inst.addr]->queue_incoming(m); } else { - dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m - << " *** destination DNE ***" << endl; + dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m << " -- " << m + << " *** destination DNE ***" + << endl; for (map::iterator p = directory.begin(); p != directory.end(); ++p) { diff --git a/trunk/ceph/msg/MPIMessenger.cc b/trunk/ceph/msg/MPIMessenger.cc deleted file mode 100644 index 6c4e65d063fc9..0000000000000 --- a/trunk/ceph/msg/MPIMessenger.cc +++ /dev/null @@ -1,608 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "config.h" -#include "include/error.h" - -#include "common/Timer.h" -#include "common/Mutex.h" - -#include "MPIMessenger.h" -#include "Message.h" - -#include -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -#include - -/* - * We make a directory, so that we can have multiple Messengers in the - * same process (rank). This is useful for benchmarking and creating lots of - * simulated clients, e.g. - */ - -hash_map directory; -list outgoing, incoming; -list unfinished_sends; -map unfinished_send_message; - -/* this process */ -int mpi_world; -int mpi_rank; -bool mpi_done = false; // set this flag to stop the event loop - - -#define FUNNEL_MPI // if we want to funnel mpi through a single thread -#define TAG_UNSOLICITED 0 -#define DBLVL 18 - -// the key used to fetch the tag for the current thread. -pthread_key_t tag_key; -pthread_t thread_id = 0; // thread id of the event loop. init value == nobody - -Mutex sender_lock; -Mutex out_queue_lock; - -bool pending_timer; - - -// our lock for any common data; it's okay to have only the one global mutex -// because our common data isn't a whole lot. -//static pthread_mutex_t mutex; - -// the number of distinct threads we've seen so far; used to generate -// a unique tag for each thread. -//static int nthreads = 10; - -//#define TAG_UNSOLICITED 0 - -// debug -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "[MPI " << mpi_rank << "/" << mpi_world << " " << getpid() << "." << pthread_self() << "] " - - - -/***** - * MPI global methods for process-wide startup, shutdown. 
- */ - -int mpimessenger_init(int& argc, char**& argv) -{ - MPI_Init(&argc, &argv); - - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - dout(12) << "init: i am " << hostname << " pid " << pid << endl; - - assert(mpi_world > g_conf.num_osd+g_conf.num_mds); - - return mpi_rank; -} - -int mpimessenger_shutdown() -{ - dout(5) << "mpimessenger_shutdown barrier waiting for all to finish" << endl; - MPI_Barrier (MPI_COMM_WORLD); - dout(1) << "mpimessenger_shutdown all done, MPI_Finalize()" << endl; - MPI_Finalize(); - return 0; -} - -int mpimessenger_world() -{ - return mpi_world; -} - - - -/*** - * internal send/recv - */ - - -/* - * get fresh MPI_Request* (on heap) for a new async MPI_Isend - */ - -MPI_Request *mpi_prep_send_req() { - MPI_Request *req = new MPI_Request; - unfinished_sends.push_back(req); - dout(DBLVL) << "prep_send_req " << req << endl; - return req; -} - - -/* - * clean up MPI_Request*'s for Isends that have completed. - * also, hose any associated Message*'s for Messages that are completely sent. - * - * if wait=true, block and wait for sends to finish. 
- */ - -void mpi_reap_sends(bool wait=false) { - sender_lock.Lock(); - - list::iterator it = unfinished_sends.begin(); - while (it != unfinished_sends.end()) { - MPI_Status status; - int flag; - - if (wait) { - MPI_Wait(*it, &status); - } else { - MPI_Test(*it, &flag, &status); - if (!flag) break; // not finished yet - } - - dout(DBLVL) << "send " << *it << " completed" << endl; - - if (unfinished_send_message.count(*it)) { - dout(DBLVL) << "send message " << unfinished_send_message[*it] << " completed" << endl; - delete unfinished_send_message[*it]; - unfinished_send_message.erase(*it); - } - - delete *it; - it++; - unfinished_sends.pop_front(); - } - - dout(DBLVL) << "reap has " << unfinished_sends.size() << " Isends outstanding, " << unfinished_send_message.size() << " messages" << endl; - - sender_lock.Unlock(); -} - - -void mpi_finish_sends() { - mpi_reap_sends(true); -} - - -/* - * recv a Message* - */ -Message *mpi_recv(int tag) -{ - // envelope - dout(DBLVL) << "mpi_recv waiting for message tag " << tag << endl; - - MPI_Status status; - msg_envelope_t env; - - ASSERT(MPI_Recv((void*)&env, - sizeof(env), - MPI_CHAR, - MPI_ANY_SOURCE,// status.MPI_SOURCE,//MPI_ANY_SOURCE, - tag, - MPI_COMM_WORLD, - &status/*, - &recv_env_req*/) == MPI_SUCCESS); - assert(status.count == MSG_ENVELOPE_LEN); - - if (env.type == 0) { - dout(DBLVL) << "mpi_recv got type 0 message, kicked!" << endl; - return 0; - } - - dout(DBLVL) << "mpi_recv got envelope " << status.count << ", type=" << env.type << " src " << env.source << " dst " << env.dest << " nchunks=" << env.nchunks << " from " << status.MPI_SOURCE << endl; - - // payload - bufferlist blist; - for (int i=0; iget_dest(), mpi_world); - - // local? 
- if (rank == mpi_rank) { - dout(DBLVL) << "queuing local delivery" << endl; - incoming.push_back(m); - return 0; - } - - // marshall - if (m->empty_payload()) - m->encode_payload(); - msg_envelope_t *env = &m->get_envelope(); - env->nchunks = m->get_payload().buffers().size(); - - dout(7) << "sending " << *m << " to " << MSG_ADDR_NICE(env->dest) << " (rank " << rank << ")" << endl; - -#ifndef FUNNEL_MPI - sender_lock.Lock(); -#endif - - // send envelope - ASSERT(MPI_Isend((void*)env, - sizeof(*env), - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - - // payload - int i = 0; - for (list::iterator it = m->get_payload().buffers().begin(); - it != m->get_payload().buffers().end(); - it++) { - dout(DBLVL) << "mpi_sending frag " << i << " len " << (*it).length() << endl; - //MPI_Request *req = new MPI_Request; - ASSERT(MPI_Isend((void*)(*it).c_str(), - (*it).length(), - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - i++; - } - - // attach message to last send, so we can free it later - MPI_Request *req = unfinished_sends.back(); - unfinished_send_message[req] = m; - - dout(DBLVL) << "mpi_send done, attached message to Isend " << req << endl; - -#ifndef FUNNEL_MPI - sender_lock.Unlock(); -#endif - return 0; -} - - - -// get the tag for this thread - -#ifndef FUNNEL_MPI -static int get_thread_tag() -{ - int tag = (int)pthread_getspecific(tag_key); - - if (tag == 0) { - // first time this thread has performed MPI messaging - - if (pthread_mutex_lock(&mutex) < 0) - SYSERROR(); - - tag = ++nthreads; - - if (pthread_mutex_unlock(&mutex) < 0) - SYSERROR(); - - if (pthread_setspecific(tag_key, (void*)tag) < 0) - SYSERROR(); - } - - return tag; -} -#endif - - - -// recv event loop, for unsolicited messages. 
- -void* mpimessenger_loop(void*) -{ - dout(5) << "mpimessenger_loop start pid " << getpid() << endl; - - while (1) { - - // outgoing - mpi_reap_sends(); - -#ifdef FUNNEL_MPI - // check outgoing queue - out_queue_lock.Lock(); - if (outgoing.size()) { - dout(10) << outgoing.size() << " outgoing messages" << endl; - for (list::iterator it = outgoing.begin(); - it != outgoing.end(); - it++) { - mpi_send(*it, TAG_UNSOLICITED); - } - } - outgoing.clear(); - out_queue_lock.Unlock(); -#endif - - - // timer events? - if (pending_timer) { - dout(DBLVL) << "pending timer" << endl; - g_timer.execute_pending(); - } - - // done? - if (mpi_done && - incoming.empty() && - outgoing.empty() && - !pending_timer) break; - - - // incoming - Message *m = 0; - - if (incoming.size()) { - dout(12) << "loop pulling message off incoming" << endl; - m = incoming.front(); - incoming.pop_front(); - } - else { - // check mpi - dout(12) << "loop waiting for incoming messages" << endl; - - // get message - m = mpi_recv(TAG_UNSOLICITED); - } - - // dispatch? - if (m) { - int dest = m->get_dest(); - if (directory.count(dest)) { - Messenger *who = directory[ dest ]; - - dout(4) << "---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << m - << endl; - - who->dispatch(m); - } else { - dout (1) << "---- i don't know who " << dest << " is." 
<< endl; - assert(0); - break; - } - } - - } - - dout(5) << "finishing async sends" << endl; - mpi_finish_sends(); - - g_timer.shutdown(); - - dout(5) << "mpimessenger_loop exiting loop" << endl; - return 0; -} - - -// start/stop mpi receiver thread (for unsolicited messages) -int mpimessenger_start() -{ - dout(5) << "starting thread" << endl; - - // start a thread - pthread_create(&thread_id, - NULL, - mpimessenger_loop, - 0); - return 0; -} - - -/* - * kick and wake up _loop (to pick up new outgoing message, or quit) - */ - -MPI_Request kick_req; -msg_envelope_t kick_env; - -void mpimessenger_kick_loop() -{ - // if we're same thread as the loop, no kicking necessary - if (pthread_self() == thread_id) return; - - kick_env.type = 0; - - sender_lock.Lock(); - ASSERT(MPI_Isend(&kick_env, // kick sync for now, but ONLY because it makes me feel safer. - sizeof(kick_env), - MPI_CHAR, - mpi_rank, - TAG_UNSOLICITED, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - sender_lock.Unlock(); -} - - -// stop thread - -void mpimessenger_stop() -{ - dout(5) << "mpimessenger_stop stopping thread" << endl; - - if (mpi_done) { - dout(1) << "mpimessenger_stop called, but already done!" << endl; - assert(!mpi_done); - } - - // set finish flag - mpi_done = true; - mpimessenger_kick_loop(); - - // wait for thread to stop - mpimessenger_wait(); -} - - -// wait for thread to finish - -void mpimessenger_wait() -{ - void *returnval; - dout(10) << "mpimessenger_wait waiting for thread to finished." << endl; - pthread_join(thread_id, &returnval); - dout(10) << "mpimessenger_wait thread finished." 
<< endl; -} - - - - -/*********** - * MPIMessenger class implementation - */ - -class C_MPIKicker : public Context { - void finish(int r) { - dout(DBLVL) << "timer kick" << endl; - mpimessenger_kick_loop(); - } -}; - -MPIMessenger::MPIMessenger(entity_name_t myaddr) : Messenger(myaddr) -{ - // my address - this->myaddr = myaddr; - - // register myself in the messenger directory - directory[myaddr] = this; - - // register to execute timer events - g_timer.set_messenger_kicker(new C_MPIKicker()); - - // logger - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(whoami); - int w = MSG_ADDR_NUM(whoami); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - logger = new Logger(name, (LogType*)&mpimsg_logtype); - loggers[ whoami ] = logger; - */ -} - -MPIMessenger::~MPIMessenger() -{ - //delete logger; -} - - -int MPIMessenger::shutdown() -{ - // remove me from the directory - directory.erase(myaddr); - - // no more timer events - g_timer.unset_messenger_kicker(); - - // last one? - if (directory.empty()) { - dout(10) << "shutdown last mpimessenger on rank " << mpi_rank << " shut down" << endl; - pthread_t whoami = pthread_self(); - - dout(15) << "whoami = " << whoami << ", thread = " << thread_id << endl; - if (whoami == thread_id) { - // i am the event loop thread, just set flag! - dout(15) << " set mpi_done=true" << endl; - mpi_done = true; - } else { - // i am a different thread, tell the event loop to stop. 
- dout(15) << " calling mpimessenger_stop()" << endl; - mpimessenger_stop(); - } - } else { - dout(10) << "shutdown still " << directory.size() << " other messengers on rank " << mpi_rank << endl; - } - return 0; -} - - - - -/*** - * public messaging interface - */ - - -/* note: send_message _MUST_ be non-blocking */ -int MPIMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(myaddr, fromport); - m->set_dest(dest, port); - -#ifdef FUNNEL_MPI - - // queue up - out_queue_lock.Lock(); - dout(DBLVL) << "queuing outgoing message " << *m << endl; - outgoing.push_back(m); - out_queue_lock.Unlock(); - - mpimessenger_kick_loop(); - -#else - - // send in this thread - mpi_send(m, m->get_pcid()); - -#endif - return 0; -} - - - - - - diff --git a/trunk/ceph/msg/MPIMessenger.h b/trunk/ceph/msg/MPIMessenger.h deleted file mode 100644 index 88e753de89749..0000000000000 --- a/trunk/ceph/msg/MPIMessenger.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MPIMESSENGER_H -#define __MPIMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define MPI_DEST_TO_RANK(dest,world) ((dest)<(NUMMDS+NUMOSD) ? \ - (dest) : \ - ((NUMMDS+NUMOSD)+(((dest)-NUMMDS-NUMOSD) % ((world)-NUMMDS-NUMOSD)))) - -class Timer; - -class MPIMessenger : public Messenger { - protected: - entity_name_t myaddr; // my address - //class Logger *logger; // for logging - - public: - MPIMessenger(entity_name_t myaddr); - ~MPIMessenger(); - - // init, shutdown MPI and associated event loop thread. 
- virtual int shutdown(); - - // message interface - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); -}; - -/** - * these are all ONE per process. - */ -extern int mpimessenger_world(); // get world size -extern int mpimessenger_init(int& argc, char**& argv); // init mpi -extern int mpimessenger_start(); // start thread -extern void mpimessenger_stop(); // stop thread. -extern void mpimessenger_wait(); // wait for thread to finish. -extern int mpimessenger_shutdown(); // finalize MPI - - -#endif diff --git a/trunk/ceph/msg/MTMessenger.cc b/trunk/ceph/msg/MTMessenger.cc deleted file mode 100644 index 02ab9981ff353..0000000000000 --- a/trunk/ceph/msg/MTMessenger.cc +++ /dev/null @@ -1,197 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include -#include "mpi.h" - -#include "include/config.h" -#include "include/error.h" -#include "Messenger.h" -#include "MTMessenger.h" - -// This module uses MPI to implement a blocking sendrecv function that -// feels more like a procedure call and less like event processesing. -// -// Threads are not independently addressable in MPI, only processes -// are. However, MPI does include a user defined tag in the message -// envelope, and a reader may selectively read only messages with a -// matching tag. The modules assign an integer to each thread to use -// as the tag. -// - -// our lock for any common data; it's okay to have only the one global mutex -// because our common data isn't a whole lot. -static pthread_mutex_t mutex; - -// the key used to fetch the tag for the current thread. 
-pthread_key_t tag_key; - -// the number of distinct threads we've seen so far; used to generate -// a unique tag for each thread. -static int nthreads; - -// the MPI identity of this process -static int mpi_rank; - - -// get the tag for this thread -static int get_tag() -{ - int tag = (int)pthread_getspecific(tag_key); - - if (tag == 0) { - // first time this thread has performed MPI messaging - - if (pthread_mutex_lock(&mutex) < 0) - SYSERROR(); - - tag = ++nthreads; - - if (pthread_mutex_unlock(&mutex) < 0) - SYSERROR(); - - if (pthread_setspecific(tag_key, (void*)tag) < 0) - SYSERROR(); - } - - return tag; -} - - -// marshall a message and send it over MPI -static void send(Message *m, int rank, int tag) -{ - // marshall the message - crope r; - m->encode(r); - int size = r.length(); - - char *buf = (char*)r.c_str(); - ASSERT(MPI_Send(buf, - size, - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD) == MPI_SUCCESS); -} - -// read a message from MPI and unmarshall it -static Message *receive(int tag) -{ - MPI_Status status; - - // get message size - ASSERT(MPI_Probe(MPI_ANY_SOURCE, - tag, - MPI_COMM_WORLD, - &status) == MPI_SUCCESS); - - // get message; there may be multiple messages on the queue, we - // need to be sure to read the one which corresponds to size - // obtained above. - char *buf = new char[status.count]; - ASSERT(MPI_Recv(buf, - status.count, - MPI_CHAR, - status.MPI_SOURCE, - status.MPI_TAG, - MPI_COMM_WORLD, - &status) == MPI_SUCCESS); - - // unmarshall message - crope r(buf, status.count); - delete[] buf; - Message *m = decode_message(r); - - return m; -} - -MTMessenger::MTMessenger(int& argc, char**& argv) -{ - // setup MPI; MPI errors will probably invoke the default MPI error - // handler, which aborts the program with a friendly message rather - // than returning from a function; just in case, we abort the - // program if we get an MPI error. 
- - int provided; - ASSERT(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided) - == MPI_SUCCESS); - - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) == MPI_SUCCESS); - - if (pthread_mutex_init(&mutex, NULL) < 0) - SYSERROR(); - - if (pthread_key_create(&tag_key, NULL) < 0) - SYSERROR(); - - nthreads = 0; -} - -MTMessenger::~MTMessenger() -{ - // ignore shutdown errors - - pthread_key_delete(tag_key); - - pthread_mutex_destroy(&mutex); - - MPI_Finalize(); -} - -// send a request and wait for the response -Message *MTMessenger::sendrecv(Message *m, entity_name_t dest) -{ - int dest_tag = 0; // servers listen for any tag - int my_tag = get_tag(); - - // set our envelope (not to be confused with the MPI envelope) - m->set_source(mpi_rank, my_tag); - m->set_dest(dest, dest_tag); - - send(m, dest, dest_tag); - - return receive(my_tag); -} - -// receive a request from anyone -Message *MTMessenger::recvreq() -{ - return receive(MPI_ANY_TAG); -} - -// forward request, masquerading as original source -void MTMessenger::fwdreq(Message *req, int dest) -{ - int dest_tag = 0; // servers listen for any tag - - // set our envelope (not to be confused with the MPI envelope) - req->set_dest(dest, dest_tag); - - send(req, dest, dest_tag); -} - -// send a response to the originator of the request -void MTMessenger::sendresp(Message *req, Message *resp) -{ - int req_rank = req->get_source(); - int req_tag = req->get_source_port(); - int my_tag = get_tag(); - - // set our envelope (not to be confused with the MPI envelope) - resp->set_source(mpi_rank, my_tag); - resp->set_dest(req_rank, req_tag); - - send(resp, req_rank, req_tag); -} diff --git a/trunk/ceph/msg/MTMessenger.h b/trunk/ceph/msg/MTMessenger.h deleted file mode 100644 index 477a39c60561d..0000000000000 --- a/trunk/ceph/msg/MTMessenger.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 
2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MTMESSENGER_H -#define __MTMESSENGER_H - -#include "Message.h" -#include "SerialMessenger.h" - -// Marshall and unmarshall OBFS messages, send and receive them over -// MPI. - -class MTMessenger -{ -public: - // sets up the queues and internal thread; the MPI initialization - // will scan argc/argv for MPI specific flags and remove them from - // argc/argv. - MTMessenger(int &argc, char **&argv); - - // tears it all down - ~MTMessenger(); - - // send a request to a server and wait (block) for the response; - virtual Message *sendrecv(Message *m, entity_name_t dest); - - // wait (block) for a request from anyone - Message *recvreq(); - - // forward request, masquerading as original source - void fwdreq(Message *req, int dest); - - // send the response to the originator of the request - virtual void sendresp(Message *req, Message *resp); - - -}; // class MTMessenger - -#endif // __MTMESSENGER_H diff --git a/trunk/ceph/msg/Message.cc b/trunk/ceph/msg/Message.cc index ae01d9106ddaf..e96f076380459 100644 --- a/trunk/ceph/msg/Message.cc +++ b/trunk/ceph/msg/Message.cc @@ -9,16 +9,8 @@ using namespace std; #include "messages/MGenericMessage.h" -/* -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" -*/ - +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" #include "messages/MMonPaxos.h" #include "messages/MMonElectionAck.h" @@ -44,10 +36,12 @@ using namespace std; #include "messages/MOSDPGLog.h" #include "messages/MOSDPGRemove.h" -#include "messages/MClientBoot.h" #include "messages/MClientMount.h" 
-#include "messages/MClientMountAck.h" +#include "messages/MClientUnmount.h" +#include "messages/MClientSession.h" +#include "messages/MClientReconnect.h" #include "messages/MClientRequest.h" +#include "messages/MClientRequestForward.h" #include "messages/MClientReply.h" #include "messages/MClientFileCaps.h" @@ -56,7 +50,7 @@ using namespace std; #include "messages/MMDSBeacon.h" #include "messages/MMDSImportMap.h" #include "messages/MMDSCacheRejoin.h" -#include "messages/MMDSCacheRejoinAck.h" +//#include "messages/MMDSCacheRejoinAck.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -64,32 +58,17 @@ using namespace std; #include "messages/MExportDirDiscover.h" #include "messages/MExportDirDiscoverAck.h" +#include "messages/MExportDirCancel.h" #include "messages/MExportDirPrep.h" #include "messages/MExportDirPrepAck.h" #include "messages/MExportDirWarning.h" +#include "messages/MExportDirWarningAck.h" #include "messages/MExportDir.h" +#include "messages/MExportDirAck.h" #include "messages/MExportDirNotify.h" #include "messages/MExportDirNotifyAck.h" #include "messages/MExportDirFinish.h" -#include "messages/MHashReaddir.h" -#include "messages/MHashReaddirReply.h" - -#include "messages/MHashDirDiscover.h" -#include "messages/MHashDirDiscoverAck.h" -#include "messages/MHashDirPrep.h" -#include "messages/MHashDirPrepAck.h" -#include "messages/MHashDir.h" -#include "messages/MHashDirAck.h" -#include "messages/MHashDirNotify.h" - -#include "messages/MUnhashDirPrep.h" -#include "messages/MUnhashDirPrepAck.h" -#include "messages/MUnhashDir.h" -#include "messages/MUnhashDirAck.h" -#include "messages/MUnhashDirNotify.h" -#include "messages/MUnhashDirNotifyAck.h" - #include "messages/MRenameWarning.h" #include "messages/MRenameNotify.h" #include "messages/MRenameNotifyAck.h" @@ -101,14 +80,11 @@ using namespace std; #include "messages/MHeartbeat.h" -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" +#include "messages/MAnchor.h" 
#include "messages/MInodeLink.h" #include "messages/MInodeLinkAck.h" //#include "messages/MInodeUpdate.h" -#include "messages/MInodeExpire.h" -#include "messages/MDirExpire.h" #include "messages/MCacheExpire.h" #include "messages/MInodeFileCaps.h" @@ -134,30 +110,12 @@ decode_message(msg_envelope_t& env, bufferlist& payload) // -- with payload -- - /* - case MSG_NS_CONNECT: - m = new MNSConnect(); - break; - case MSG_NS_CONNECTACK: - m = new MNSConnectAck(); + case MSG_MON_COMMAND: + m = new MMonCommand; break; - case MSG_NS_REGISTER: - m = new MNSRegister(); + case MSG_MON_COMMAND_ACK: + m = new MMonCommandAck; break; - case MSG_NS_REGISTERACK: - m = new MNSRegisterAck(); - break; - case MSG_NS_LOOKUP: - m = new MNSLookup(); - break; - case MSG_NS_LOOKUPREPLY: - m = new MNSLookupReply(); - break; - case MSG_NS_FAILURE: - m = new MNSFailure(); - break; - */ - case MSG_MON_PAXOS: m = new MMonPaxos; break; @@ -230,23 +188,29 @@ decode_message(msg_envelope_t& env, bufferlist& payload) break; // clients - case MSG_CLIENT_BOOT: - m = new MClientBoot(); - break; case MSG_CLIENT_MOUNT: - m = new MClientMount(); + m = new MClientMount; + break; + case MSG_CLIENT_UNMOUNT: + m = new MClientUnmount; + break; + case MSG_CLIENT_SESSION: + m = new MClientSession; break; - case MSG_CLIENT_MOUNTACK: - m = new MClientMountAck(); + case MSG_CLIENT_RECONNECT: + m = new MClientReconnect; break; case MSG_CLIENT_REQUEST: - m = new MClientRequest(); + m = new MClientRequest; + break; + case MSG_CLIENT_REQUEST_FORWARD: + m = new MClientRequestForward; break; case MSG_CLIENT_REPLY: - m = new MClientReply(); + m = new MClientReply; break; case MSG_CLIENT_FILECAPS: - m = new MClientFileCaps(); + m = new MClientFileCaps; break; // mds @@ -265,9 +229,11 @@ decode_message(msg_envelope_t& env, bufferlist& payload) case MSG_MDS_CACHEREJOIN: m = new MMDSCacheRejoin; break; + /* case MSG_MDS_CACHEREJOINACK: m = new MMDSCacheRejoinAck; break; + */ case MSG_MDS_DIRUPDATE: m = new MDirUpdate(); @@ 
-286,13 +252,18 @@ decode_message(msg_envelope_t& env, bufferlist& payload) case MSG_MDS_EXPORTDIRDISCOVERACK: m = new MExportDirDiscoverAck(); break; + case MSG_MDS_EXPORTDIRCANCEL: + m = new MExportDirCancel(); + break; case MSG_MDS_EXPORTDIR: - m = new MExportDir(); + m = new MExportDir; + break; + case MSG_MDS_EXPORTDIRACK: + m = new MExportDirAck; break; - case MSG_MDS_EXPORTDIRFINISH: - m = new MExportDirFinish(); + m = new MExportDirFinish; break; case MSG_MDS_EXPORTDIRNOTIFY: @@ -312,57 +283,12 @@ decode_message(msg_envelope_t& env, bufferlist& payload) break; case MSG_MDS_EXPORTDIRWARNING: - m = new MExportDirWarning(); + m = new MExportDirWarning; break; - - - case MSG_MDS_HASHREADDIR: - m = new MHashReaddir(); - break; - case MSG_MDS_HASHREADDIRREPLY: - m = new MHashReaddirReply(); - break; - - case MSG_MDS_HASHDIRDISCOVER: - m = new MHashDirDiscover(); - break; - case MSG_MDS_HASHDIRDISCOVERACK: - m = new MHashDirDiscoverAck(); - break; - case MSG_MDS_HASHDIRPREP: - m = new MHashDirPrep(); - break; - case MSG_MDS_HASHDIRPREPACK: - m = new MHashDirPrepAck(); - break; - case MSG_MDS_HASHDIR: - m = new MHashDir(); - break; - case MSG_MDS_HASHDIRACK: - m = new MHashDirAck(); - break; - case MSG_MDS_HASHDIRNOTIFY: - m = new MHashDirNotify(); + case MSG_MDS_EXPORTDIRWARNINGACK: + m = new MExportDirWarningAck; break; - case MSG_MDS_UNHASHDIRPREP: - m = new MUnhashDirPrep(); - break; - case MSG_MDS_UNHASHDIRPREPACK: - m = new MUnhashDirPrepAck(); - break; - case MSG_MDS_UNHASHDIR: - m = new MUnhashDir(); - break; - case MSG_MDS_UNHASHDIRACK: - m = new MUnhashDirAck(); - break; - case MSG_MDS_UNHASHDIRNOTIFY: - m = new MUnhashDirNotify(); - break; - case MSG_MDS_UNHASHDIRNOTIFYACK: - m = new MUnhashDirNotifyAck(); - break; case MSG_MDS_RENAMEWARNING: m = new MRenameWarning(); @@ -398,11 +324,8 @@ decode_message(msg_envelope_t& env, bufferlist& payload) m = new MCacheExpire(); break; - case MSG_MDS_ANCHORREQUEST: - m = new MAnchorRequest(); - break; - case 
MSG_MDS_ANCHORREPLY: - m = new MAnchorReply(); + case MSG_MDS_ANCHOR: + m = new MAnchor(); break; case MSG_MDS_INODELINK: @@ -417,18 +340,10 @@ decode_message(msg_envelope_t& env, bufferlist& payload) break; */ - case MSG_MDS_INODEEXPIRE: - m = new MInodeExpire(); - break; - case MSG_MDS_INODEFILECAPS: m = new MInodeFileCaps(); break; - case MSG_MDS_DIREXPIRE: - m = new MDirExpire(); - break; - case MSG_MDS_LOCK: m = new MLock(); break; @@ -437,12 +352,9 @@ decode_message(msg_envelope_t& env, bufferlist& payload) // -- simple messages without payload -- case MSG_CLOSE: - case MSG_NS_STARTED: - case MSG_NS_UNREGISTER: case MSG_SHUTDOWN: case MSG_MDS_SHUTDOWNSTART: case MSG_MDS_SHUTDOWNFINISH: - case MSG_CLIENT_UNMOUNT: case MSG_OSD_MKFS_ACK: m = new MGenericMessage(env.type); break; diff --git a/trunk/ceph/msg/Message.h b/trunk/ceph/msg/Message.h index 80e1b9feaac28..42115808f5a2d 100644 --- a/trunk/ceph/msg/Message.h +++ b/trunk/ceph/msg/Message.h @@ -16,25 +16,13 @@ #define MSG_CLOSE 0 -#define MSG_NS_CONNECT 1 -#define MSG_NS_CONNECTACK 2 -#define MSG_NS_REGISTER 3 -#define MSG_NS_REGISTERACK 4 -#define MSG_NS_STARTED 5 -#define MSG_NS_UNREGISTER 6 -#define MSG_NS_LOOKUP 7 -#define MSG_NS_LOOKUPREPLY 8 -#define MSG_NS_FAILURE 9 - - #define MSG_PING 10 #define MSG_PING_ACK 11 -#define MSG_FAILURE 12 -#define MSG_FAILURE_ACK 13 - #define MSG_SHUTDOWN 99999 +#define MSG_MON_COMMAND 13 +#define MSG_MON_COMMAND_ACK 14 #define MSG_MON_ELECTION_ACK 15 @@ -73,16 +61,20 @@ #define MSG_OSD_PG_LOG 53 #define MSG_OSD_PG_REMOVE 54 -#define MSG_CLIENT_REQUEST 60 -#define MSG_CLIENT_REPLY 61 -//#define MSG_CLIENT_DONE 62 -#define MSG_CLIENT_FILECAPS 63 -#define MSG_CLIENT_INODEAUTHUPDATE 64 +// -- client -- +// to monitor +#define MSG_CLIENT_MOUNT 60 +#define MSG_CLIENT_UNMOUNT 61 + +// to mds +#define MSG_CLIENT_SESSION 70 // start or stop +#define MSG_CLIENT_RECONNECT 71 + +#define MSG_CLIENT_REQUEST 80 +#define MSG_CLIENT_REQUEST_FORWARD 81 +#define MSG_CLIENT_REPLY 82 
+#define MSG_CLIENT_FILECAPS 83 -#define MSG_CLIENT_BOOT 70 -#define MSG_CLIENT_MOUNT 71 -#define MSG_CLIENT_MOUNTACK 72 -#define MSG_CLIENT_UNMOUNT 73 // *** MDS *** @@ -94,7 +86,6 @@ #define MSG_MDS_IMPORTMAP 106 #define MSG_MDS_CACHEREJOIN 107 -#define MSG_MDS_CACHEREJOINACK 108 #define MSG_MDS_DISCOVER 110 #define MSG_MDS_DISCOVERREPLY 111 @@ -113,42 +104,44 @@ #define MSG_MDS_CACHEEXPIRE 125 -#define MSG_MDS_ANCHORREQUEST 130 -#define MSG_MDS_ANCHORREPLY 131 +#define MSG_MDS_ANCHOR 130 #define MSG_MDS_INODELINK 140 #define MSG_MDS_INODELINKACK 141 #define MSG_MDS_INODEUNLINK 142 #define MSG_MDS_INODEUNLINKACK 143 -#define MSG_MDS_EXPORTDIRDISCOVER 150 -#define MSG_MDS_EXPORTDIRDISCOVERACK 151 -#define MSG_MDS_EXPORTDIRPREP 152 -#define MSG_MDS_EXPORTDIRPREPACK 153 -#define MSG_MDS_EXPORTDIRWARNING 154 -#define MSG_MDS_EXPORTDIR 155 -#define MSG_MDS_EXPORTDIRNOTIFY 156 -#define MSG_MDS_EXPORTDIRNOTIFYACK 157 -#define MSG_MDS_EXPORTDIRFINISH 158 - - -#define MSG_MDS_HASHDIRDISCOVER 160 -#define MSG_MDS_HASHDIRDISCOVERACK 161 -#define MSG_MDS_HASHDIRPREP 162 -#define MSG_MDS_HASHDIRPREPACK 163 -#define MSG_MDS_HASHDIR 164 -#define MSG_MDS_HASHDIRACK 165 -#define MSG_MDS_HASHDIRNOTIFY 166 - -#define MSG_MDS_HASHREADDIR 168 -#define MSG_MDS_HASHREADDIRREPLY 169 - -#define MSG_MDS_UNHASHDIRPREP 170 -#define MSG_MDS_UNHASHDIRPREPACK 171 -#define MSG_MDS_UNHASHDIR 172 -#define MSG_MDS_UNHASHDIRACK 173 -#define MSG_MDS_UNHASHDIRNOTIFY 174 -#define MSG_MDS_UNHASHDIRNOTIFYACK 175 +#define MSG_MDS_EXPORTDIRDISCOVER 149 +#define MSG_MDS_EXPORTDIRDISCOVERACK 150 +#define MSG_MDS_EXPORTDIRCANCEL 151 +#define MSG_MDS_EXPORTDIRPREP 152 +#define MSG_MDS_EXPORTDIRPREPACK 153 +#define MSG_MDS_EXPORTDIRWARNING 154 +#define MSG_MDS_EXPORTDIRWARNINGACK 155 +#define MSG_MDS_EXPORTDIR 156 +#define MSG_MDS_EXPORTDIRACK 157 +#define MSG_MDS_EXPORTDIRNOTIFY 158 +#define MSG_MDS_EXPORTDIRNOTIFYACK 159 +#define MSG_MDS_EXPORTDIRFINISH 160 + + +#define MSG_MDS_HASHDIRDISCOVER 170 +#define 
MSG_MDS_HASHDIRDISCOVERACK 171 +#define MSG_MDS_HASHDIRPREP 172 +#define MSG_MDS_HASHDIRPREPACK 173 +#define MSG_MDS_HASHDIR 174 +#define MSG_MDS_HASHDIRACK 175 +#define MSG_MDS_HASHDIRNOTIFY 176 + +#define MSG_MDS_HASHREADDIR 178 +#define MSG_MDS_HASHREADDIRREPLY 179 + +#define MSG_MDS_UNHASHDIRPREP 180 +#define MSG_MDS_UNHASHDIRPREPACK 181 +#define MSG_MDS_UNHASHDIR 182 +#define MSG_MDS_UNHASHDIRACK 183 +#define MSG_MDS_UNHASHDIRNOTIFY 184 +#define MSG_MDS_UNHASHDIRNOTIFYACK 185 #define MSG_MDS_DENTRYUNLINK 200 @@ -175,9 +168,7 @@ using std::list; #include -#include -using __gnu_cxx::crope; #include "include/types.h" #include "include/buffer.h" @@ -238,6 +229,9 @@ public: void set_payload(bufferlist& bl) { payload.claim(bl); } + void copy_payload(bufferlist& bl) { + payload = bl; + } msg_envelope_t& get_envelope() { return env; } @@ -277,33 +271,8 @@ public: payload.clear(); } - // overload either the rope version (easier!) - virtual void encode_payload(crope& s) { assert(0); } - virtual void decode_payload(crope& s, int& off) { assert(0); } - - // of the bufferlist versions (faster!) - virtual void decode_payload() { - // use a crope for convenience, small messages, etc. FIXME someday. - crope ser; - for (list::const_iterator it = payload.buffers().begin(); - it != payload.buffers().end(); - it++) - ser.append((*it).c_str(), (*it).length()); - - int off = 0; - decode_payload(ser, off); - assert((unsigned)off == payload.length()); - } - virtual void encode_payload() { - assert(payload.length() == 0); // caller should reset payload - - // use crope for convenience, small messages. FIXME someday. 
- crope r; - encode_payload(r); - - // copy payload - payload.push_back( buffer::copy(r.c_str(), r.length()) ); - } + virtual void decode_payload() = 0; + virtual void encode_payload() = 0; virtual void print(ostream& out) { out << get_type_name(); diff --git a/trunk/ceph/msg/NewMessenger.cc b/trunk/ceph/msg/NewMessenger.cc deleted file mode 100644 index 1455c31724c68..0000000000000 --- a/trunk/ceph/msg/NewMessenger.cc +++ /dev/null @@ -1,1714 +0,0 @@ - -#include "NewMessenger.h" - -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " " - - - -#include "tcp.cc" - - -Rank rank; - - -/******************************************** - * Namer - */ - -Rank::Namer::Namer(EntityMessenger *msgr) : - messenger(msgr), - nrank(0), nclient(0), nmds(0), nosd(0), nmon(0) -{ - assert(rank.my_rank == 0); - nrank = g_conf.num_mon; - - // announce myself - /* - cerr << "ceph ns is " << rank.accepter.listen_addr << endl; - cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl; - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - ::close(fd); - */ - - // ok - messenger->set_dispatcher(this); -} - -Rank::Namer::~Namer() -{ - //::unlink(".ceph_ns"); -} - - -void Rank::Namer::dispatch(Message *m) -{ - rank.lock.Lock(); - int type = m->get_type(); - switch (type) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; 
- case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - case MSG_NS_FAILURE: - handle_failure((class MNSFailure*)m); - break; - - case MSG_FAILURE_ACK: - delete m; - break; - - default: - assert(0); - } - rank.lock.Unlock(); -} - -void Rank::Namer::handle_connect(MNSConnect *m) -{ - int newrank = nrank++; - dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl; - - rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr(); - rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank; - rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank)); - - messenger->send_message(new MNSConnectAck(newrank), - MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]); - delete m; -} - -void Rank::Namer::manual_insert_inst(const entity_inst_t &inst) -{ - rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst; -} - -void Rank::Namer::handle_register(MNSRegister *m) -{ - dout(10) << "namer.handle_register from rank " << m->get_rank() - << " addr " << m->get_entity() << endl; - - // pick id - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! - switch (entity.type()) { - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! 
- } - - - // register - if (rank.entity_map.count(entity)) { - dout(1) << "namer.handle_register re-registering " << entity - << " inst " << m->get_source_inst() - << " (was " << rank.entity_map[entity] << ")" - << endl; - } else { - dout(1) << "namer.handle_register registering " << entity - << " inst " << m->get_source_inst() - << endl; - } - rank.entity_map[entity] = m->get_source_inst(); - rank.entity_unstarted.insert(entity); - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - m->get_source(), rank.entity_map[entity]); - - delete m; -} - -void Rank::Namer::handle_started(Message *m) -{ - entity_name_t who = m->get_source(); - dout(10) << "namer.handle_started from entity " << who << endl; - - assert(rank.entity_unstarted.count(who)); - rank.entity_unstarted.erase(who); - - // anybody waiting? - if (waiting.count(who)) { - list ls; - ls.swap(waiting[who]); - waiting.erase(who); - - dout(10) << "doing waiters on " << who << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) - dispatch(*it); - } - -} - -void Rank::Namer::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(1) << "namer.handle_unregister entity " << who << endl; - - rank.show_dir(); - - assert(rank.entity_map.count(who)); - rank.entity_map.erase(who); - - rank.show_dir(); - - // shut myself down? kick watcher. - if (rank.entity_map.size() == 2) { - dout(10) << "namer.handle_unregister stopping namer" << endl; - rank.lock.Unlock(); - messenger->shutdown(); - delete messenger; - rank.lock.Lock(); - } - - delete m; -} - - -void Rank::Namer::handle_lookup(MNSLookup *m) -{ - // have it? 
- if (rank.entity_map.count(m->get_entity()) == 0) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - if (rank.entity_unstarted.count(m->get_entity())) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! - MNSLookupReply *reply = new MNSLookupReply(m); - - reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()]; - - dout(10) << "namer " << m->get_source() - << " lookup '" << m->get_entity() - << "' -> " << rank.entity_map[m->get_entity()] << endl; - - messenger->send_message(reply, m->get_source(), m->get_source_inst()); - delete m; -} - -void Rank::Namer::handle_failure(MNSFailure *m) -{ - dout(10) << "namer.handle_failure inst " << m->get_inst() - << endl; - - // search for entities on this instance - list rm; - for (hash_map::iterator i = rank.entity_map.begin(); - i != rank.entity_map.end(); - i++) { - if (i->second != m->get_inst()) continue; - rm.push_back(i->first); - } - for (list::iterator i = rm.begin(); - i != rm.end(); - i++) { - dout(10) << "namer.handle_failure inst " << m->get_inst() - << ", removing " << *i << endl; - - rank.entity_map.erase(*i); - rank.entity_unstarted.erase(*i); - - /* - if ((*i).is_osd()) { - // tell the monitor - messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0)); - } - */ - } - - delete m; -} - - - -/******************************************** - * Accepter - */ - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; 
- - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - //dout(10) << "accepter.start listening on " << myport << endl; - - // my address is... - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(10) << "accepter.start listen addr is " << listen_addr << endl; - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - Receiver *r = new Receiver(sd); - r->create(); - - rank.lock.Lock(); - rank.receivers.insert(r); - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" 
<< endl; - break; - } - } - - return 0; -} - - -/************************************** - * Receiver - */ - -void *Rank::Receiver::entry() -{ - while (!done) { - Message *m = read_message(); - if (!m) { - ::close(sd); - break; - } - - dout(10) << "receiver.entry got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (rank.down.count(m->get_dest())) { - dout(0) << "receiver.entry dest " << m->get_dest() << " down, dropping " << *m << endl; - delete m; - - if (rank.looking_up.count(m->get_dest()) == 0) - rank.lookup(m->get_dest()); - } - else if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "receiver.entry source " << m->get_source() - << " inst " << m->get_source_inst() - << " < " << rank.entity_map[m->get_source()] - << ", dropping " << *m << endl; - delete m; - } - else { - if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "receiver.entry source " << m->get_source() - << " inst " << m->get_source_inst() - << " > " << rank.entity_map[m->get_source()] - << ", WATCH OUT " << *m << endl; - rank.entity_map[m->get_source()] = m->get_source_inst(); - } - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) { - // ours. 
- rank.dispatch(m); - } else { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - derr(0) << "got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - rank.waiting_for_lookup[m->get_dest()].push_back(m); - } - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - // add to reap queue - rank.lock.Lock(); - rank.receiver_reap_queue.push_back(this); - rank.wait_cond.Signal(); - rank.lock.Unlock(); - - return 0; -} - -Message *Rank::Receiver::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - if (env.type == 0) { - dout(10) << "receiver got dummy env, bailing" << endl; - return 0; - } - - dout(20) << "receiver got envelope type=" << env.type - << " src " << env.source << " dst " << env.dest - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - -/************************************** - * Sender - */ - -int Rank::Sender::connect() -{ - dout(10) << "sender(" << inst << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - int r = ::connect(sd, (sockaddr*)&inst.addr, sizeof(myAddr)); - if (r < 0) return r; - - // identify myself - // FIXME - - return 0; -} - - -void Rank::Sender::finish() -{ - dout(10) << "sender(" << inst << ").finish" << endl; - - // make sure i get reaped. 
- rank.lock.Lock(); - rank.sender_reap_queue.push_back(this); - rank.wait_cond.Signal(); - rank.lock.Unlock(); -} - -void Rank::Sender::fail_and_requeue(list& out) -{ - dout(10) << "sender(" << inst << ").fail" << endl;// and requeue" << endl; - - // tell namer - if (!rank.messenger) { - derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl; - assert(0); - } - - // old and unnecessary? - if (0) - rank.messenger->send_message(new MNSFailure(inst), - MSG_ADDR_NAMER(0)); - - - // FIXME: possible race before i reclaim lock here? - - Dispatcher *dis = 0; - entity_name_t dis_dest; - - list lost; - - // requeue my messages - rank.lock.Lock(); - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - dout(10) << "sender(" << inst << ").fail " - << q.size() << " messages" << endl; - - if (0) { - lost.swap(q); - } else { - - while (!q.empty()) { - // don't keep reconnecting.. - if (rank.entity_map.count(q.front()->get_dest()) && - rank.entity_map[q.front()->get_dest()] == inst) - rank.down.insert(q.front()->get_dest()); - //rank.entity_map.erase(q.front()->get_dest()); - - if (!dis && - rank.local.count(q.front()->get_source())) { - dis_dest = q.front()->get_dest(); - dis = rank.local[q.front()->get_source()]->get_dispatcher(); - } - - if (g_conf.ms_requeue_on_sender_fail) - rank.submit_message( q.front() ); - else - lost.push_back( q.front() ); - q.pop_front(); - } - } - - // deactivate myself - if (rank.rank_sender.count(inst.rank) && - rank.rank_sender[inst.rank] == this) - rank.rank_sender.erase(inst.rank); - - // stop sender loop - done = true; - } - lock.Unlock(); - - - // send special failure msg? 
- if (dis) { - for (list::iterator p = lost.begin(); - p != lost.end(); - p++) - dis->ms_handle_failure(*p, dis_dest, inst); - } - - rank.lock.Unlock(); -} - -void *Rank::Sender::entry() -{ - // connect - if (sd == 0) { - int rc = connect(); - if (rc < 0) { - list out; - derr(0) << "error connecting to " << inst << endl; - fail_and_requeue(out); - finish(); - return 0; - } - } - - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "sender(" << inst << ") grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "sender(" << inst << ") sending " << *m << endl; - - // stamp. - m->set_source_inst(rank.my_inst); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(0) << "error sending to " << m->get_dest() << " on " << inst << endl; - out.push_front(m); - fail_and_requeue(out); - break; - } - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "sender(" << inst << ") sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - finish(); - return 0; -} - - -int Rank::Sender::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20)// << g_clock.now() - << " sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(20) << "error sending envelope for " << *m - << " to " << m->get_dest() << endl; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << 
"tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(20) << "error sending chunk len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(20) << "error sending data chunk for " << *m << " to " << m->get_dest() << endl; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(20) << "error sending data len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - r = tcp_write( sd, (*it).c_str(), (*it).length() ); - if (r < 0) { - derr(20) << "error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - return -1; - } - } -#endif - - // delete message - delete m; - return 0; -} - - - -/******************************************** - * Rank - */ - -Rank::Rank(int r) : - single_dispatcher(this), - my_rank(r), - namer(0) { -} -Rank::~Rank() -{ - //FIXME - if (namer) delete namer; -} - - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) //<< g_clock.now() - << "---- " - << m->get_source() << ':' << m->get_source_port() - << " to " << m->get_dest() << ':' << m->get_dest_port() - << 
" ---- " << m->get_type_name() - << " ---- " << m - << endl; - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) - rank.dispatch(m); - else { - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - assert(lock.is_locked()); - - while (!receiver_reap_queue.empty()) { - Receiver *r = receiver_reap_queue.front(); - receiver_reap_queue.pop_front(); - //dout(10) << "reaper reaping receiver sd " << r->sd << endl; - receivers.erase(r); - r->join(); - dout(10) << "reaper reaped receiver sd " << r->sd << endl; - delete r; - } - - while (!sender_reap_queue.empty()) { - Sender *s = sender_reap_queue.front(); - sender_reap_queue.pop_front(); - //dout(10) << "reaper reaping sender rank " << s->dest_rank << " at " << s->tcpaddr << endl; - if (rank_sender.count(s->inst.rank) && - rank_sender[s->inst.rank] == s) - rank_sender.erase(s->inst.rank); - s->join(); - dout(10) << "reaper reaped sender " << s->inst << endl; - delete s; - } -} - - -int Rank::start_rank() -{ - dout(10) << "start_rank" << endl; - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? 
- if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - if (my_rank < 0) { - dout(10) << "start_rank connecting to namer0" << endl; - - // connect to namer - assert(entity_map.count(MSG_ADDR_NAMER(0))); - Sender *sender = connect_rank(entity_map[MSG_ADDR_NAMER(0)]); - - // send - Message *m = new MNSConnect(accepter.listen_addr); - m->set_dest(MSG_ADDR_NAMER(0), 0); - sender->send(m); - - // wait - while (my_rank < 0) - waiting_for_rank.Wait(lock); - assert(my_rank >= 0); - - dout(10) << "start_rank got rank " << my_rank << endl; - - // create rank entity - entity_map[MSG_ADDR_RANK(my_rank)] = my_inst; - local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank)); - messenger->set_dispatcher(this); - } else { - // my_inst - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - // create my rank - entity_name_t raddr = MSG_ADDR_RANK(my_rank); - entity_map[raddr] = my_inst; - entity_unstarted.insert(raddr); - local[raddr] = messenger = new EntityMessenger(raddr); - messenger->set_dispatcher(this); - - dout(1) << "start_rank " << my_rank << " at " << my_inst << endl; - } - - lock.Unlock(); - return 0; -} - -void Rank::start_namer() -{ - // create namer0 - entity_name_t naddr = MSG_ADDR_NAMER(0); - entity_map[naddr] = my_inst; - local[naddr] = new EntityMessenger(naddr); - namer = new Namer(local[naddr]); -} - -void Rank::set_namer(const tcpaddr_t& ns) -{ - entity_map[MSG_ADDR_NAMER(0)].addr = ns; - entity_map[MSG_ADDR_NAMER(0)].rank = 0; -} - -/* connect_rank - * NOTE: assumes rank.lock held. - */ -Rank::Sender *Rank::connect_rank(const entity_inst_t& inst) -{ - assert(rank.lock.is_locked()); - assert(inst != rank.my_inst); - - dout(10) << "connect_rank to " << inst << endl; - - // create sender - Sender *sender = new Sender(inst); - //int rc = sender->connect(); - //assert(rc >= 0); - - // start thread. - sender->create(); - - // old sender? 
- assert(rank.rank_sender.count(inst.rank) == 0); - //if (rank.rank_sender.count(r)) - //rank.rank_sender[r]->stop(); - - // ok! - rank.rank_sender[inst.rank] = sender; - return sender; -} - - - - - -void Rank::show_dir() -{ - dout(10) << "show_dir ---" << endl; - - for (hash_map::iterator i = entity_map.begin(); - i != entity_map.end(); - i++) { - if (local.count(i->first)) { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; - } else { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; - } - } -} - - -/* lookup - * NOTE: assumes directory.lock held - */ -void Rank::lookup(entity_name_t addr) -{ - dout(10) << "lookup " << addr << endl; - assert(lock.is_locked()); - - assert(looking_up.count(addr) == 0); - looking_up.insert(addr); - - MNSLookup *r = new MNSLookup(addr); - messenger->send_message(r, MSG_ADDR_DIRECTORY); -} - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t addr) -{ - dout(10) << "register_entity " << addr << endl; - lock.Lock(); - - // register with namer - static long reg_attempt = 0; - long id = ++reg_attempt; - - Message *reg = new MNSRegister(addr, my_rank, id); - reg->set_source(MSG_ADDR_RANK(my_rank), 0); - reg->set_source_inst(my_inst); - reg->set_dest(MSG_ADDR_DIRECTORY, 0); - - // prepare cond - Cond cond; - waiting_for_register_cond[id] = &cond; - - // send request - lock.Unlock(); - submit_message(reg); - lock.Lock(); - - // wait - while (!waiting_for_register_result.count(id)) - cond.Wait(lock); - - // grab result - addr = waiting_for_register_result[id]; - dout(10) << "register_entity got " << addr << endl; - - // clean up - waiting_for_register_cond.erase(id); - waiting_for_register_result.erase(id); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(addr); - - // add to directory - entity_map[addr] = my_inst; - local[addr] = msgr; - - // was anyone waiting? 
- if (waiting_for_lookup.count(addr)) { - submit_messages(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - - lock.Unlock(); - return msgr; -} - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; - - // remove from local directory. - assert(local.count(msgr->get_myaddr())); - local.erase(msgr->get_myaddr()); - - if (my_rank > 0) { - assert(entity_map.count(msgr->get_myaddr())); - entity_map.erase(msgr->get_myaddr()); - } // else namer will do it. - - // tell namer. - if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) && - msgr->get_myaddr() != MSG_ADDR_RANK(0)) - msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_NAMER(0)); - - // kick wait()? - if (local.size() <= 2) - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_messages(list& ls) -{ - for (list::iterator i = ls.begin(); i != ls.end(); i++) - submit_message(*i); - ls.clear(); -} - - -void Rank::prepare_dest(entity_name_t dest) -{ - lock.Lock(); - - if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - //dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - //waiting_for_lookup[dest].push_back(m); - } - else if (rank_sender.count( inst.rank ) && - rank_sender[inst.rank]->inst == inst) { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - //sender = rank_sender[ inst.rank ]; - } else { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. 
- if (looking_up.count(dest) == 0) { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - } - - lock.Unlock(); -} - -void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Sender *sender = 0; - - lock.Lock(); - { - // local? - if (dest_inst.rank == my_inst.rank) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - // mid-register - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - assert(0); - waiting_for_lookup[dest].push_back(m); - } - } - else { - // remote. - if (rank_sender.count( dest_inst.rank )) { - //&& - //rank_sender[dest_inst.rank]->inst == dest_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connected." << endl; - // connected. - sender = rank_sender[ dest_inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; - // not connected. - sender = connect_rank( dest_inst ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (sender) { - // remote! 
- dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - sender->send(m); - } -} - - -void Rank::submit_message(Message *m) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Sender *sender = 0; - - lock.Lock(); - { - if (down.count(dest)) { - // black hole. - dout(0) << "submit_message " << *m << " dest " << dest << " down, dropping" << endl; - delete m; - - if (looking_up.count(dest) == 0) - lookup(dest); - - } else if (local.count(dest)) { - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - - // local - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - waiting_for_lookup[dest].push_back(m); - } - else if (rank_sender.count( inst.rank ) && - rank_sender[inst.rank]->inst == inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - sender = rank_sender[ inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - sender = connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. - if (looking_up.count(dest) == 0) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - waiting_for_lookup[dest].push_back(m); - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! 
- dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (sender) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - sender->send(m); - } -} - - - - -void Rank::dispatch(Message *m) -{ - lock.Lock(); - - dout(10) << "dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lock.Unlock(); -} - -void Rank::handle_connect_ack(MNSConnectAck *m) -{ - dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl; - my_rank = m->get_rank(); - - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - waiting_for_rank.SignalAll(); - delete m; - - // logger! - /*dout(10) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - */ -} - - -void Rank::handle_register_ack(MNSRegisterAck *m) -{ - dout(10) << "handle_register_ack " << m->get_entity() << endl; - - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; -} - -void Rank::handle_lookup_reply(MNSLookupReply *m) -{ - list waiting; - dout(10) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_map.begin(); - it != m->entity_map.end(); - it++) { - dout(10) << "lookup got " << it->first << " at " << it->second << endl; - entity_name_t addr = 
it->first; - entity_inst_t inst = it->second; - - if (down.count(addr)) { - // ignore - dout(10) << "ignoring lookup results for " << addr << ", who is down" << endl; - //assert(entity_map.count(addr) == 0); - continue; - } - - if (entity_map.count(addr) && - entity_map[addr] > inst) { - dout(10) << "ignoring lookup results for " << addr << ", " \ - << entity_map[addr] << " > " << inst << endl; - continue; - } - - // update map. - entity_map[addr] = inst; - - if (inst.rank == my_rank) { - // local - dout(10) << "delivering lookup results locally" << endl; - if (local.count(addr)) { - if (g_conf.ms_single_dispatch) { - single_dispatch_queue.splice(single_dispatch_queue.end(), - waiting_for_lookup[addr]); - } else { - local[addr]->queue_messages(waiting_for_lookup[addr]); - } - waiting_for_lookup.erase(addr); - } else - lookup(addr); // try again! - - } else { - // remote - if (rank_sender.count(inst.rank) == 0) - connect_rank(inst); - else if (rank_sender[inst.rank]->inst != inst) { - dout(0) << "lookup got rank addr change, WATCH OUT" << endl; - // FIXME BUG possible message loss weirdness? - rank_sender[inst.rank]->stop(); - rank_sender.erase(inst.rank); - connect_rank(inst); - } - - // take waiters - Sender *sender = rank_sender[inst.rank]; - assert(sender); - - if (waiting_for_lookup.count(addr)) { - sender->send(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - } - } - - delete m; -} - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead senders, receivers. - reaper(); - - if (local.size() == 0) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } - - if (local.size() == 1 && - !messenger->is_stopped()) { - dout(10) << "wait: stopping rank" << endl; - lock.Unlock(); - messenger->shutdown(); - delete messenger; - lock.Lock(); - continue; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. 
- - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap senders and receivers - lock.Lock(); - { - dout(10) << "wait: stopping senders" << endl; - for (hash_map::iterator i = rank_sender.begin(); - i != rank_sender.end(); - i++) - i->second->stop(); - while (!rank_sender.empty()) { - wait_cond.Wait(lock); - reaper(); - } - - if (0) { // stop() no worky on receivers! we leak, but who cares. - dout(10) << "wait: stopping receivers" << endl; - for (set::iterator i = receivers.begin(); - i != receivers.end(); - i++) - (*i)->stop(); - while (!receivers.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - - } - lock.Unlock(); - - dout(10) << "wait: done." << endl; -} - - - -int Rank::find_ns_addr(tcpaddr_t &nsa) -{ - // file? - int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - cout << "ceph ns is " << nsa << endl; - return 0; - } - - // env var? 
- char *nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - - if (tcp_hostlookup(nsaddr, nsa) < 0) { - cout << "can't resolve " << nsaddr << endl; - return -1; - } - - cout << "ceph ns is " << nsa << endl; - return 0; - } - - cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl; - return -1; -} - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - dout(1) //<< g_clock.now() - << "---- " - << m->get_source() << ':' << m->get_source_port() - << " to " << m->get_dest() << ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << m->get_source_inst() - << " ---- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myaddr())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myaddr()]); - rank.waiting_for_ready.erase(get_myaddr()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } - - // tell namer - if (get_myaddr() != MSG_ADDR_NAMER(0)) - send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0)); -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << 
endl; - - // deregister - rank.unregister_entity(this); - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - - -void Rank::EntityMessenger::prepare_send_message(entity_name_t dest) -{ - rank.prepare_dest(dest); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - // set envelope - m->set_source(get_myaddr(), 0); - m->set_dest(dest, 0); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> " << inst - << " ---- " << m - << endl; - - rank.submit_message(m, inst); - - return 0; -} - - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> ?" - << " ---- " << m - << endl; - - rank.submit_message(m); - - return 0; -} - - -void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_down(a,i); -} - -void Rank::mark_down(entity_name_t a, entity_inst_t& inst) -{ - if (my_rank == 0) return; // ugh.. 
rank0 already handles this stuff in the namer - lock.Lock(); - if (down.count(a) == 0) { - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - down.insert(a); - - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - - waiting_for_lookup.erase(a); - looking_up.erase(a); - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_sender.count(inst.rank)) { - rank_sender[inst.rank]->stop(); - rank_sender.erase(inst.rank); - } - } - } - } - lock.Unlock(); -} - -void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_up(a, i); -} - -void Rank::mark_up(entity_name_t a, entity_inst_t& i) -{ - if (my_rank == 0) return; - lock.Lock(); - { - dout(10) << "mark_up " << a << " inst " << i << endl; - derr(10) << "mark_up " << a << " inst " << i << endl; - - down.erase(a); - - assert(i.rank != my_rank); // hrm? - - if (entity_map.count(a) == 0 || - entity_map[a] < i) { - entity_map[a] = i; - connect_rank(i); - } else if (entity_map[a] == i) { - dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; - derr(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; - } else { - dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - } - - //if (waiting_for_lookup.count(a)) - //lookup(a); - } - lock.Unlock(); -} - diff --git a/trunk/ceph/msg/NewMessenger.h b/trunk/ceph/msg/NewMessenger.h deleted file mode 100644 index 0e04315a10883..0000000000000 --- a/trunk/ceph/msg/NewMessenger.h +++ /dev/null @@ -1,305 +0,0 @@ -#ifndef __NEWMESSENGER_H -#define __NEWMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank : public Dispatcher { - - class EntityMessenger; - class Sender; - class Receiver; - - // namer - class Namer : public Dispatcher { - public: - EntityMessenger *messenger; // namerN - - int nrank; - int nclient, nmds, nosd, nmon; - - map > waiting; - - Namer(EntityMessenger *msgr); - ~Namer(); - - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - void handle_failure(class MNSFailure *m); - - void dispatch(Message *m); - - void manual_insert_inst(const entity_inst_t &inst); - - }; - - // incoming - class Accepter : public Thread { - public: - bool done; - - tcpaddr_t listen_addr; - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - - class Receiver : public Thread { - public: - int sd; - bool done; - - Receiver(int _sd) : sd(_sd), done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(sd); - //join(); - } - Message *read_message(); - }; - - - 
// outgoing - class Sender : public Thread { - public: - entity_inst_t inst; - bool done; - int sd; - - set entities; - list q; - - Mutex lock; - Cond cond; - - Sender(const entity_inst_t& i, int s=0) : inst(i), done(false), sd(s) {} - virtual ~Sender() {} - - void *entry(); - - int connect(); - void fail_and_requeue(list& ls); - void finish(); - - void stop() { - lock.Lock(); - done = true; - cond.Signal(); - lock.Unlock(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - int write_message(Message *m); - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - virtual void callback_kick() {} - virtual int shutdown(); - virtual void prepare_send_message(entity_name_t dest); - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); - virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst); - - virtual void mark_down(entity_name_t a, entity_inst_t& i); - virtual void mark_up(entity_name_t a, entity_inst_t& i); - //virtual void reset(msg_addr_t a); - }; - - - class SingleDispatcher : public Thread { - Rank 
*rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - - // my rank - int my_rank; - Cond waiting_for_rank; - - // my instance - entity_inst_t my_inst; - - // lookup - hash_map entity_map; - hash_set entity_unstarted; - - map > waiting_for_lookup; - set looking_up; - - hash_set down; - - // register - map waiting_for_register_cond; - map waiting_for_register_result; - - // local - map local; - - // remote - hash_map rank_sender; - - set receivers; - - list sender_reap_queue; - list receiver_reap_queue; - - EntityMessenger *messenger; // rankN - Namer *namer; - - - void show_dir(); - - void lookup(entity_name_t addr); - - void dispatch(Message *m); - void handle_connect_ack(class MNSConnectAck *m); - void handle_register_ack(class MNSRegisterAck *m); - void handle_lookup_reply(class MNSLookupReply *m); - - Sender *connect_rank(const entity_inst_t& inst); - - void mark_down(entity_name_t addr, entity_inst_t& i); - void mark_up(entity_name_t addr, entity_inst_t& i); - - tcpaddr_t get_listen_addr() { return accepter.listen_addr; } - - void reaper(); - - -public: - Rank(int r=-1); - ~Rank(); - - int find_ns_addr(tcpaddr_t &tcpaddr); - - void set_namer(const tcpaddr_t& ns); - void start_namer(); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_inst_t& inst); - void prepare_dest(entity_name_t dest); - void submit_message(Message *m); - void submit_messages(list& ls); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - -extern Rank 
rank; - -#endif diff --git a/trunk/ceph/msg/NewerMessenger.cc b/trunk/ceph/msg/NewerMessenger.cc deleted file mode 100644 index c277eea4b409b..0000000000000 --- a/trunk/ceph/msg/NewerMessenger.cc +++ /dev/null @@ -1,1791 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "NewerMessenger.h" - -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " " - - - -#include "tcp.cc" - - -Rank rank; - - -/******************************************** - * Namer - */ - -Rank::Namer::Namer(EntityMessenger *msgr) : - messenger(msgr), - nrank(0), nclient(0), nmds(0), nosd(0), nmon(0) -{ - assert(rank.my_rank == 0); - nrank = g_conf.num_mon; - - // announce myself - /* - cerr << "ceph ns is " << rank.accepter.listen_addr << endl; - cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl; - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - ::close(fd); - */ - - // ok - messenger->set_dispatcher(this); -} - -Rank::Namer::~Namer() -{ - //::unlink(".ceph_ns"); -} - - -void 
Rank::Namer::dispatch(Message *m) -{ - rank.lock.Lock(); - int type = m->get_type(); - switch (type) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; - case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - case MSG_NS_FAILURE: - handle_failure((class MNSFailure*)m); - break; - - case MSG_FAILURE_ACK: - delete m; - break; - - default: - assert(0); - } - rank.lock.Unlock(); -} - -void Rank::Namer::handle_connect(MNSConnect *m) -{ - int newrank = nrank++; - dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl; - - rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr(); - rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank; - rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank)); - - messenger->send_message(new MNSConnectAck(newrank), - MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]); - delete m; -} - -void Rank::Namer::manual_insert_inst(const entity_inst_t &inst) -{ - rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst; -} - -void Rank::Namer::handle_register(MNSRegister *m) -{ - dout(10) << "namer.handle_register from rank " << m->get_rank() - << " addr " << m->get_entity() << endl; - - // pick id - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! - switch (entity.type()) { - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! 
- } - - - // register - if (rank.entity_map.count(entity)) { - dout(1) << "namer.handle_register re-registering " << entity - << " inst " << m->get_source_inst() - << " (was " << rank.entity_map[entity] << ")" - << endl; - } else { - dout(1) << "namer.handle_register registering " << entity - << " inst " << m->get_source_inst() - << endl; - } - rank.entity_map[entity] = m->get_source_inst(); - rank.entity_unstarted.insert(entity); - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - m->get_source(), rank.entity_map[entity]); - - delete m; -} - -void Rank::Namer::handle_started(Message *m) -{ - entity_name_t who = m->get_source(); - dout(10) << "namer.handle_started from entity " << who << endl; - - assert(rank.entity_unstarted.count(who)); - rank.entity_unstarted.erase(who); - - // anybody waiting? - if (waiting.count(who)) { - list ls; - ls.swap(waiting[who]); - waiting.erase(who); - - dout(10) << "doing waiters on " << who << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) - dispatch(*it); - } - -} - -void Rank::Namer::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(1) << "namer.handle_unregister entity " << who << endl; - - rank.show_dir(); - - assert(rank.entity_map.count(who)); - rank.entity_map.erase(who); - - rank.show_dir(); - - // shut myself down? kick watcher. - if (rank.entity_map.size() == 2) { - dout(10) << "namer.handle_unregister stopping namer" << endl; - rank.lock.Unlock(); - messenger->shutdown(); - delete messenger; - rank.lock.Lock(); - } - - delete m; -} - - -void Rank::Namer::handle_lookup(MNSLookup *m) -{ - // have it? 
- if (rank.entity_map.count(m->get_entity()) == 0) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - if (rank.entity_unstarted.count(m->get_entity())) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! - MNSLookupReply *reply = new MNSLookupReply(m); - - reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()]; - - dout(10) << "namer " << m->get_source() - << " lookup '" << m->get_entity() - << "' -> " << rank.entity_map[m->get_entity()] << endl; - - messenger->send_message(reply, m->get_source(), m->get_source_inst()); - delete m; -} - -void Rank::Namer::handle_failure(MNSFailure *m) -{ - dout(10) << "namer.handle_failure inst " << m->get_inst() - << endl; - - // search for entities on this instance - list rm; - for (hash_map::iterator i = rank.entity_map.begin(); - i != rank.entity_map.end(); - i++) { - if (i->second != m->get_inst()) continue; - rm.push_back(i->first); - } - for (list::iterator i = rm.begin(); - i != rm.end(); - i++) { - dout(10) << "namer.handle_failure inst " << m->get_inst() - << ", removing " << *i << endl; - - rank.entity_map.erase(*i); - rank.entity_unstarted.erase(*i); - - /* - if ((*i).is_osd()) { - // tell the monitor - messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0)); - } - */ - } - - delete m; -} - - - -/******************************************** - * Accepter - */ - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; 
- - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - //dout(10) << "accepter.start listening on " << myport << endl; - - // my address is... - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(10) << "accepter.start listen addr is " << listen_addr << endl; - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - rank.lock.Lock(); - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" << endl; - break; - } - } - - return 0; -} - - - -/************************************** - * Pipe - */ - -int Rank::Pipe::accept() -{ - // my creater gave me sd via accept() - - // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); - if (rc < 0) { - ::close(sd); - done = true; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst)); - if (rc < 0) { - dout(10) << "pipe(? 
" << this << ").accept couldn't read peer inst" << endl; - ::close(sd); - done = true; - return -1; - } - - // create writer thread. - writer_running = true; - writer_thread.create(); - - // register pipe. - if (peer_inst.rank >= 0) { - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) == 0) { - // install a pipe! - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl; - rank.rank_pipe[peer_inst.rank] = this; - } else { - // low ranks' Pipes "win" - if (peer_inst.rank < rank.my_inst.rank || - rank.my_inst.rank < 0) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst - << ", already had pipe, but switching to this new one" << endl; - // switch to this new Pipe - rank.rank_pipe[peer_inst.rank]->close(); // close old one - rank.rank_pipe[peer_inst.rank] = this; - } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst - << ", already had pipe, sticking with it" << endl; - } - } - } - rank.lock.Unlock(); - } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl; - } - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(myAddr)); - if (rc < 0) return rc; - - // identify peer - entity_inst_t inst; - rc = tcp_read(sd, (char*)&inst, sizeof(inst)); - if (inst.rank < 0) - inst = peer_inst; // i know better than they do. 
- if (peer_inst != inst && inst.rank > 0) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl; - assert(0); - return -1; - } - - // identify myself - rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); - if (rc < 0) - return -1; - - // register pipe - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) == 0) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl; - rank.rank_pipe[peer_inst.rank] = this; - } else { - // this is normal. - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl; - } - } - rank.lock.Unlock(); - - // start reader - reader_running = true; - reader_thread.create(); - - return 0; -} - - -void Rank::Pipe::close() -{ - if (sent_close) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl; - return; - } - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl; - - // unreg ourselves - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) && - rank.rank_pipe[peer_inst.rank] == this) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl; - rank.rank_pipe.erase(peer_inst.rank); - } - } - rank.lock.Unlock(); - - // queue close message. - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl; - lock.Lock(); - q.push_back(new MGenericMessage(MSG_CLOSE)); - cond.Signal(); - sent_close = true; - lock.Unlock(); -} - - -/* read msgs from socket. - * also, server. - * - */ -void Rank::Pipe::reader() -{ - if (server) - accept(); - - // loop. 
- while (!done) { - Message *m = read_message(); - if (!m || m->get_type() == 0) { - if (m) { - delete m; - dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl; - } else { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl; - } - - if (!sent_close) - close(); - - done = true; - cond.Signal(); // wake up writer too. - break; - } - - dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source() - << " inst " << m->get_source_inst() - << " > " << rank.entity_map[m->get_source()] - << ", WATCH OUT " << *m << endl; - assert(0); - } - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) { - // ours. - rank.dispatch(m); - } else { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - assert(0); // FIXME do this differently - //rank.waiting_for_lookup[m->get_dest()].push_back(m); - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - - // reap? - bool reap = false; - lock.Lock(); - { - reader_running = false; - if (!writer_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -/* write msgs to socket. 
- * also, client. - */ -void Rank::Pipe::writer() -{ - if (!server) { - int rc = connect(); - if (rc < 0) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl; - done = true; - list out; - fail(out); - } - } - - // loop. - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl; - - // stamp. - m->set_source_inst(rank.my_inst); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() << endl; - out.push_front(m); - fail(out); - done = true; - break; - } - - // did i just send a close? - if (m->get_type() == MSG_CLOSE) - done = true; - - // clean up - delete m; - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl; - - // reap? 
- bool reap = false; - lock.Lock(); - { - writer_running = false; - if (!reader_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.source << " dst " << env.dest - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20)// << g_clock.now() - << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m - << " to " << m->get_dest() << endl; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error 
sending chunk len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; - - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. - r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - return -1; - } - } -#endif - - return 0; -} - - -void Rank::Pipe::fail(list& out) -{ - derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl; - - // tell namer - if (!rank.messenger) { - derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl; - assert(0); - } - - // FIXME: possible race before i reclaim lock here? - - // deactivate myself - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) && - rank.rank_pipe[peer_inst.rank] == this) - rank.rank_pipe.erase(peer_inst.rank); - } - rank.lock.Unlock(); - - // what do i do about reader()? FIXME - - // sort my messages by (source) dispatcher, dest. 
- map > > by_dis; - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - - // sort - while (!q.empty()) { - if (q.front()->get_type() == MSG_CLOSE) { - delete q.front(); - } - else if (rank.local.count(q.front()->get_source())) { - Dispatcher *dis = rank.local[q.front()->get_source()]->get_dispatcher(); - by_dis[dis][q.front()->get_dest()].push_back(q.front()); - } - else { - // oh well. sending entity musta just shut down? - assert(0); - delete q.front(); - } - q.pop_front(); - } - } - lock.Unlock(); - - // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); - i != by_dis.end(); - ++i) - for (map >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) - for (list::iterator k = j->second.begin(); - k != j->second.end(); - ++k) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_inst << endl; - i->first->ms_handle_failure(*k, j->first, peer_inst); - } -} - - - - - - -/******************************************** - * Rank - */ - -Rank::Rank(int r) : - single_dispatcher(this), - my_rank(r), - namer(0) { -} -Rank::~Rank() -{ - //FIXME - if (namer) delete namer; -} - - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) //<< g_clock.now() - << "---- " - << m->get_source()// << ':' << m->get_source_port() - << " to " << m->get_dest()// << ':' << m->get_dest_port() - << " ---- " << 
m->get_type_name() - << " ---- " << m - << endl; - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) - rank.dispatch(m); - else { - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << endl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_inst() << endl; - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_inst() << endl; - delete p; - } -} - - -int Rank::start_rank() -{ - dout(10) << "start_rank" << endl; - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? - if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - // my_inst - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - if (my_rank < 0) { - dout(10) << "start_rank connecting to namer0" << endl; - - // connect to namer - assert(entity_map.count(MSG_ADDR_NAMER(0))); - Pipe *pipe = connect_rank(entity_map[MSG_ADDR_NAMER(0)]); - - // send - Message *m = new MNSConnect(accepter.listen_addr); - m->set_dest(MSG_ADDR_NAMER(0), 0); - pipe->send(m); - - // wait - while (my_rank < 0) - waiting_for_rank.Wait(lock); - assert(my_rank >= 0); - - dout(10) << "start_rank got rank " << my_rank << endl; - - // create rank entity - entity_map[MSG_ADDR_RANK(my_rank)] = my_inst; - local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank)); - messenger->set_dispatcher(this); - } else { - // create my rank - entity_name_t raddr = MSG_ADDR_RANK(my_rank); - entity_map[raddr] = my_inst; - entity_unstarted.insert(raddr); - local[raddr] = messenger = new 
EntityMessenger(raddr); - messenger->set_dispatcher(this); - - dout(1) << "start_rank " << my_rank << " at " << my_inst << endl; - } - - lock.Unlock(); - return 0; -} - -void Rank::start_namer() -{ - // create namer0 - entity_name_t naddr = MSG_ADDR_NAMER(0); - entity_map[naddr] = my_inst; - local[naddr] = new EntityMessenger(naddr); - namer = new Namer(local[naddr]); - namer_inst = my_inst; -} - -void Rank::set_namer(const tcpaddr_t& ns) -{ - namer_inst.addr = entity_map[MSG_ADDR_NAMER(0)].addr = ns; - namer_inst.rank = entity_map[MSG_ADDR_NAMER(0)].rank = 0; -} - -/* connect_rank - * NOTE: assumes rank.lock held. - */ -Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst) -{ - assert(rank.lock.is_locked()); - assert(inst != rank.my_inst); - - dout(10) << "connect_rank to " << inst << endl; - - // create pipe - Pipe *pipe = new Pipe(inst); - rank.rank_pipe[inst.rank] = pipe; - pipes.insert(pipe); - - return pipe; -} - - - - - -void Rank::show_dir() -{ - dout(10) << "show_dir ---" << endl; - - for (hash_map::iterator i = entity_map.begin(); - i != entity_map.end(); - i++) { - if (local.count(i->first)) { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; - } else { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; - } - } -} - - -/* lookup - * NOTE: assumes directory.lock held - */ -void Rank::lookup(entity_name_t addr) -{ - dout(10) << "lookup " << addr << endl; - assert(lock.is_locked()); - - assert(looking_up.count(addr) == 0); - looking_up.insert(addr); - - MNSLookup *r = new MNSLookup(addr); - messenger->send_message(r, MSG_ADDR_NAMER(0), namer_inst); -} - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t addr) -{ - dout(10) << "register_entity " << addr << endl; - lock.Lock(); - - // register with namer - static long reg_attempt = 0; - long id = ++reg_attempt; - - Message *reg = new MNSRegister(addr, my_rank, id); - 
reg->set_source(MSG_ADDR_RANK(my_rank), 0); - reg->set_source_inst(my_inst); - reg->set_dest(MSG_ADDR_DIRECTORY, 0); - - // prepare cond - Cond cond; - waiting_for_register_cond[id] = &cond; - - // send request - lock.Unlock(); - submit_message(reg); - lock.Lock(); - - // wait - while (!waiting_for_register_result.count(id)) - cond.Wait(lock); - - // grab result - addr = waiting_for_register_result[id]; - dout(10) << "register_entity got " << addr << endl; - - // clean up - waiting_for_register_cond.erase(id); - waiting_for_register_result.erase(id); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(addr); - - // add to directory - entity_map[addr] = my_inst; - local[addr] = msgr; - - // was anyone waiting? - if (waiting_for_lookup.count(addr)) { - submit_messages(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - - lock.Unlock(); - return msgr; -} - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; - - // remove from local directory. - assert(local.count(msgr->get_myaddr())); - local.erase(msgr->get_myaddr()); - - if (my_rank > 0) { - assert(entity_map.count(msgr->get_myaddr())); - entity_map.erase(msgr->get_myaddr()); - } // else namer will do it. - - // tell namer. - if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) && - msgr->get_myaddr() != MSG_ADDR_RANK(0)) - msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_NAMER(0), namer_inst); - - // kick wait()? - if (local.size() <= 2) - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_messages(list& ls) -{ - for (list::iterator i = ls.begin(); i != ls.end(); i++) - submit_message(*i); - ls.clear(); -} - - - -void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - // local? 
- if (dest_inst.rank == my_inst.rank) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - // mid-register - dout(20) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but mid-register, waiting." << endl; - assert(0); // hmpf - waiting_for_lookup[dest].push_back(m); - } - } - else { - // remote. - if (rank_pipe.count( dest_inst.rank )) { - //&& - //rank_pipe[dest_inst.rank]->inst == dest_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl; - // connected. - pipe = rank_pipe[ dest_inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; - // not connected. - pipe = connect_rank( dest_inst ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - -void Rank::submit_message(Message *m) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - if (local.count(dest)) { - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - - // local - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." 
<< endl; - waiting_for_lookup[dest].push_back(m); - } - else if (rank_pipe.count( inst.rank ) && - rank_pipe[inst.rank]->get_peer_inst() == inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - pipe = rank_pipe[ inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - pipe = connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. - if (looking_up.count(dest) == 0) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - waiting_for_lookup[dest].push_back(m); - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - - - -void Rank::dispatch(Message *m) -{ - lock.Lock(); - - dout(10) << "dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lock.Unlock(); -} - -void Rank::handle_connect_ack(MNSConnectAck *m) -{ - dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl; - my_rank = m->get_rank(); - - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - waiting_for_rank.SignalAll(); - delete m; - - // logger! 
- /*dout(10) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - */ -} - - -void Rank::handle_register_ack(MNSRegisterAck *m) -{ - dout(10) << "handle_register_ack " << m->get_entity() << endl; - - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; -} - -void Rank::handle_lookup_reply(MNSLookupReply *m) -{ - list waiting; - dout(10) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_map.begin(); - it != m->entity_map.end(); - it++) { - dout(10) << "lookup got " << it->first << " at " << it->second << endl; - entity_name_t addr = it->first; - entity_inst_t inst = it->second; - - if (entity_map.count(addr) && - entity_map[addr] > inst) { - dout(10) << "ignoring lookup results for " << addr << ", " \ - << entity_map[addr] << " > " << inst << endl; - continue; - } - - // update map. - entity_map[addr] = inst; - - if (inst.rank == my_rank) { - // local - dout(10) << "delivering lookup results locally" << endl; - if (local.count(addr)) { - if (g_conf.ms_single_dispatch) { - single_dispatch_queue.splice(single_dispatch_queue.end(), - waiting_for_lookup[addr]); - } else { - local[addr]->queue_messages(waiting_for_lookup[addr]); - } - waiting_for_lookup.erase(addr); - } else - lookup(addr); // try again! - - } else { - // remote - if (rank_pipe.count(inst.rank) == 0) - connect_rank(inst); - else if (rank_pipe[inst.rank]->get_peer_inst() != inst) { - dout(0) << "lookup got rank addr change, WATCH OUT" << endl; - // FIXME BUG possible message loss weirdness? 
- rank_pipe[inst.rank]->close(); - rank_pipe.erase(inst.rank); - connect_rank(inst); - } - - // take waiters - Pipe *pipe = rank_pipe[inst.rank]; - assert(pipe); - - if (waiting_for_lookup.count(addr)) { - pipe->send(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - } - } - - delete m; -} - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (local.size() == 0) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } - - if (local.size() == 1 && - !messenger->is_stopped()) { - dout(10) << "wait: stopping rank" << endl; - lock.Unlock(); - messenger->shutdown(); - delete messenger; - lock.Lock(); - continue; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. - - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << endl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) - (*i)->close(); - - dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." << endl; -} - - - -int Rank::find_ns_addr(tcpaddr_t &nsa) -{ - // file? - int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - cout << "ceph ns is " << nsa << endl; - return 0; - } - - // env var? 
- char *nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - - if (tcp_hostlookup(nsaddr, nsa) < 0) { - cout << "can't resolve " << nsaddr << endl; - return -1; - } - - cout << "ceph ns is " << nsa << endl; - return 0; - } - - cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl; - return -1; -} - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - dout(1) //<< g_clock.now() - << "---- " - << m->get_source()// << ':' << m->get_source_port() - << " to " << m->get_dest()// << ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << m->get_source_inst() - << " ---- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myaddr())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myaddr()]); - rank.waiting_for_ready.erase(get_myaddr()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } - - // tell namer - if (get_myaddr() != MSG_ADDR_NAMER(0) && - get_myaddr() != MSG_ADDR_RANK(0)) - send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0), rank.namer_inst); -} - - -int 
Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << endl; - - // deregister - rank.unregister_entity(this); - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - - -void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(inst.rank) == 0) - rank.connect_rank(inst); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst, - int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> " << inst - << " ---- " << m - << endl; - - rank.submit_message(m, inst); - - return 0; -} - - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> ? 
(DEPRECATED)" - << " ---- " << m - << endl; - - rank.submit_message(m); - - return 0; -} - - -void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_down(a,i); -} - -void Rank::mark_down(entity_name_t a, entity_inst_t& inst) -{ - //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer - lock.Lock(); - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - - waiting_for_lookup.erase(a); - looking_up.erase(a); - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_pipe.count(inst.rank)) { - rank_pipe[inst.rank]->close(); - rank_pipe.erase(inst.rank); - } - - // kill rank# too? only if i'm the namer. - if (my_rank == 0) { - entity_map.erase(MSG_ADDR_RANK(inst.rank)); - } - } - } - lock.Unlock(); -} - -void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_up(a, i); -} - -void Rank::mark_up(entity_name_t a, entity_inst_t& i) -{ - if (my_rank == 0) return; - lock.Lock(); - { - dout(10) << "mark_up " << a << " inst " << i << endl; - derr(10) << "mark_up " << a << " inst " << i << endl; - - assert(i.rank != my_rank); // hrm? - - if (entity_map.count(a) == 0 || - entity_map[a] < i) { - entity_map[a] = i; - connect_rank(i); - } else if (entity_map[a] == i) { - dout(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; - derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; - } else { - dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - } - - //if (waiting_for_lookup.count(a)) - //lookup(a); - } - lock.Unlock(); -} - diff --git a/trunk/ceph/msg/NewerMessenger.h b/trunk/ceph/msg/NewerMessenger.h deleted file mode 100644 index 29b885745df48..0000000000000 --- a/trunk/ceph/msg/NewerMessenger.h +++ /dev/null @@ -1,343 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __NEWMESSENGER_H -#define __NEWMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank : public Dispatcher { - - class EntityMessenger; - class Pipe; - - // namer - class Namer : public Dispatcher { - public: - EntityMessenger *messenger; // namerN - - int nrank; - int nclient, nmds, nosd, nmon; - - map > waiting; - - Namer(EntityMessenger *msgr); - ~Namer(); - - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - void handle_failure(class MNSFailure *m); - - void dispatch(Message *m); - - void manual_insert_inst(const entity_inst_t &inst); - - }; - - // incoming - class Accepter : public Thread { - 
public: - bool done; - - tcpaddr_t listen_addr; - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - - - class Pipe { - protected: - int sd; - bool done; - entity_inst_t peer_inst; - bool server; - bool sent_close; - - bool reader_running; - bool writer_running; - - list q; - Mutex lock; - Cond cond; - - int accept(); // server handshake - int connect(); // client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - void fail(list& ls); - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { pipe->writer(); return 0; } - } writer_thread; - friend class Writer; - - public: - Pipe(int s) : sd(s), - done(false), server(true), - sent_close(false), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // server - reader_running = true; - reader_thread.create(); - } - Pipe(const entity_inst_t &pi) : sd(0), - done(false), peer_inst(pi), server(false), - sent_close(false), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // client - writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_inst_t& pi); - - entity_inst_t& get_peer_inst() { return peer_inst; } - - void close(); - void join() { - writer_thread.join(); - reader_thread.join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - }; - - - - // messenger interface - class 
EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - virtual void callback_kick() {} - virtual int shutdown(); - virtual void prepare_dest(const entity_inst_t& inst); - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); - virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst, - int port=0, int fromport=0); - - virtual void mark_down(entity_name_t a, entity_inst_t& i); - virtual void mark_up(entity_name_t a, entity_inst_t& i); - //virtual void reset(msg_addr_t a); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - - // my rank - int my_rank; - Cond waiting_for_rank; - - // my instance - entity_inst_t my_inst; - - // lookup - hash_map entity_map; - hash_set entity_unstarted; - - map > waiting_for_lookup; - set looking_up; - - // register - map 
waiting_for_register_cond; - map waiting_for_register_result; - - // local - map local; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - EntityMessenger *messenger; // rankN - Namer *namer; - - entity_inst_t namer_inst; - - void show_dir(); - - void lookup(entity_name_t addr); - - void dispatch(Message *m); - void handle_connect_ack(class MNSConnectAck *m); - void handle_register_ack(class MNSRegisterAck *m); - void handle_lookup_reply(class MNSLookupReply *m); - - Pipe *connect_rank(const entity_inst_t& inst); - - void mark_down(entity_name_t addr, entity_inst_t& i); - void mark_up(entity_name_t addr, entity_inst_t& i); - - tcpaddr_t get_listen_addr() { return accepter.listen_addr; } - - void reaper(); - - -public: - Rank(int r=-1); - ~Rank(); - - int find_ns_addr(tcpaddr_t &tcpaddr); - - void set_namer(const tcpaddr_t& ns); - void start_namer(); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_inst_t& inst); - void prepare_dest(const entity_inst_t& inst); - void submit_message(Message *m); - void submit_messages(list& ls); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/trunk/ceph/msg/SimpleMessenger.cc b/trunk/ceph/msg/SimpleMessenger.cc index de90acaafd6ac..35b141c1dc0fb 100644 --- a/trunk/ceph/msg/SimpleMessenger.cc +++ b/trunk/ceph/msg/SimpleMessenger.cc @@ -40,6 +40,8 @@ Rank rank; +sighandler_t old_sigint_handler; + /******************************************** * Accepter @@ -48,14 +50,23 @@ Rank rank; void simplemessenger_sigint(int r) { rank.sigint(); + old_sigint_handler(r); } void Rank::sigint() { lock.Lock(); derr(0) << "got control-c, exiting" << endl; + + // force close listener socket ::close(accepter.listen_sd); - _exit(-1); + + // force close all pipe sockets, too + for (hash_map::iterator p 
= rank_pipe.begin(); + p != rank_pipe.end(); + ++p) + p->second->force_close(); + lock.Unlock(); } @@ -117,7 +128,7 @@ int Rank::Accepter::start() dout(10) << "accepter.start my addr is " << rank.my_addr << endl; // set up signal handler - signal(SIGINT, simplemessenger_sigint); + old_sigint_handler = signal(SIGINT, simplemessenger_sigint); // start thread create(); @@ -236,7 +247,7 @@ int Rank::Pipe::connect() return rc; } - // identify peer + // identify peer ..... FIXME entity_addr_t paddr; rc = tcp_read(sd, (char*)&paddr, sizeof(paddr)); if (!rc) { // bool @@ -244,8 +255,9 @@ int Rank::Pipe::connect() return -1; } if (peer_addr != paddr) { - derr(0) << "pipe(" << peer_addr << ' ' << this << ").connect peer is " << paddr << ", wtf" << endl; - assert(0); + dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect peer identifies itself as " << paddr << ", wrong guy!" << endl; + ::close(sd); + sd = 0; return -1; } @@ -683,7 +695,8 @@ void Rank::Pipe::fail(list& out) k != j->second.end(); ++k) { derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << (*k)->get_dest_inst() << endl; - i->first->ms_handle_failure(*k, (*k)->get_dest_inst()); + if (i->first) + i->first->ms_handle_failure(*k, (*k)->get_dest_inst()); } } @@ -1004,6 +1017,7 @@ void Rank::wait() lock.Unlock(); dout(10) << "wait: done." << endl; + dout(1) << "shutdown complete." << endl; } @@ -1089,10 +1103,10 @@ int Rank::EntityMessenger::shutdown() // stop my dispatch thread if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; + dout(10) << "shutdown i am dispatch, setting stop flag" << endl; stop = true; } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; + dout(10) << "shutdown i am not dispatch, setting stop flag and joining thread." 
<< endl; lock.Lock(); stop = true; cond.Signal(); diff --git a/trunk/ceph/msg/SimpleMessenger.h b/trunk/ceph/msg/SimpleMessenger.h index f4cdcd67a84eb..d49b4118bec73 100644 --- a/trunk/ceph/msg/SimpleMessenger.h +++ b/trunk/ceph/msg/SimpleMessenger.h @@ -137,8 +137,8 @@ private: void close(); void join() { - writer_thread.join(); - reader_thread.join(); + if (writer_thread.is_started()) writer_thread.join(); + if (reader_thread.is_started()) reader_thread.join(); } void send(Message *m) { @@ -153,6 +153,10 @@ private: cond.Signal(); lock.Unlock(); } + + void force_close() { + ::close(sd); + } }; diff --git a/trunk/ceph/msg/TCPDirectory.cc b/trunk/ceph/msg/TCPDirectory.cc deleted file mode 100644 index 57000ac30d74c..0000000000000 --- a/trunk/ceph/msg/TCPDirectory.cc +++ /dev/null @@ -1,178 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "TCPDirectory.h" - -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -//#include "messages/MNSUnregister.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_ns) cout << "nameserver: " - -void tcp_open(int rank); - - -void TCPDirectory::handle_connect(MNSConnect *m) -{ - int rank = nrank++; - dout(2) << "connect from new rank " << rank << " " << m->get_addr() << endl; - - dir[MSG_ADDR_RANK(rank)] = rank; - messenger->map_entity_rank(MSG_ADDR_RANK(rank), rank); - - rank_addr[rank] = m->get_addr(); - messenger->map_rank_addr(rank, m->get_addr()); - - messenger->send_message(new MNSConnectAck(rank), - MSG_ADDR_RANK(rank)); - delete m; -} - - - -void TCPDirectory::handle_register(MNSRegister *m) -{ - dout(10) << "register from rank " << m->get_rank() << " addr " << MSG_ADDR_NICE(m->get_entity()) << endl; - - // pick id - int rank = m->get_rank(); - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! - switch (entity.type()) { - - case MSG_ADDR_RANK_BASE: // stupid client should be able to figure this out - entity = MSG_ADDR_RANK(rank); - break; - - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! - assert(dir.count(entity) == 0); // make sure it doesn't exist yet. 
- } - - dout(2) << "registered " << MSG_ADDR_NICE(entity) << endl; - - // register - dir[entity] = rank; - - if (entity == MSG_ADDR_RANK(rank)) // map this locally now so we can reply - messenger->map_entity_rank(entity, rank); // otherwise wait until they send STARTED msg - - hold.insert(entity); - - ++version; - update_log[version] = entity; - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - MSG_ADDR_RANK(rank)); - delete m; -} - -void TCPDirectory::handle_started(Message *m) -{ - entity_name_t entity = m->get_source(); - - dout(3) << "start signal from " << MSG_ADDR_NICE(entity) << endl; - hold.erase(entity); - messenger->map_entity_rank(entity, dir[entity]); - - // waiters? - if (waiting.count(entity)) { - list ls; - ls.splice(ls.begin(), waiting[entity]); - waiting.erase(entity); - - dout(10) << "doing waiter on " << MSG_ADDR_NICE(entity) << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - dispatch(*it); - } - } -} - -void TCPDirectory::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(2) << "unregister from entity " << MSG_ADDR_NICE(who) << endl; - - assert(dir.count(who)); - dir.erase(who); - - // shutdown? - if (dir.size() <= 2) { - dout(2) << "dir is empty except for me, shutting down" << endl; - tcpmessenger_stop_nameserver(); - } - else { - if (0) { - dout(10) << "dir size now " << dir.size() << endl; - for (hash_map::iterator it = dir.begin(); - it != dir.end(); - it++) { - dout(10) << " dir: " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; - } - } - } - -} - - -void TCPDirectory::handle_lookup(MNSLookup *m) -{ - // have it? - if (dir.count(m->get_entity()) == 0 || - hold.count(m->get_entity())) { - dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> dne or on hold" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! 
- MNSLookupReply *reply = new MNSLookupReply(m); - - int rank = dir[m->get_entity()]; - reply->entity_map[m->get_entity()] = rank; - reply->rank_addr[rank] = rank_addr[rank]; - - dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> rank " << rank << endl; - - messenger->send_message(reply, - m->get_source(), m->get_source_port()); - delete m; -} diff --git a/trunk/ceph/msg/TCPDirectory.h b/trunk/ceph/msg/TCPDirectory.h deleted file mode 100644 index 7f450e9a64be5..0000000000000 --- a/trunk/ceph/msg/TCPDirectory.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TCPDIRECTORY_H -#define __TCPDIRECTORY_H - -/* - * rank -- a process (listening on some host:port) - * entity -- a logical entity (osd123, mds3, client3245, etc.) - * - * multiple entities can coexist on a single rank. - */ - -#include "Dispatcher.h" -#include "TCPMessenger.h" - -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -//#include -#include - -class TCPDirectory : public Dispatcher { - protected: - // how i communicate - TCPMessenger *messenger; - - // directory - hash_map dir; // entity -> rank - hash_map rank_addr; // rank -> ADDR (e.g. 
host:port) - - __uint64_t version; - map<__uint64_t, entity_name_t> update_log; - - int nrank; - int nclient, nmds, nosd; - - set hold; - map > waiting; - - // messages - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - - public: - TCPDirectory(TCPMessenger *m) : - messenger(m), - version(0), - nrank(0), nclient(0), nmds(0), nosd(0) { - messenger->set_dispatcher(this); - - // i am rank 0! - dir[MSG_ADDR_DIRECTORY] = 0; - rank_addr[0] = m->get_tcpaddr(); - ++nrank; - - // announce nameserver - cout << "export CEPH_NAMESERVER=" << m->get_tcpaddr() << endl; - - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&m->get_tcpaddr(), sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - ::close(fd); - } - ~TCPDirectory() { - ::unlink(".ceph_ns"); - } - - void dispatch(Message *m) { - switch (m->get_type()) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; - case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - - default: - assert(0); - } - } -}; - -#endif diff --git a/trunk/ceph/msg/TCPMessenger.cc b/trunk/ceph/msg/TCPMessenger.cc deleted file mode 100644 index f40ea9b162e6b..0000000000000 --- a/trunk/ceph/msg/TCPMessenger.cc +++ /dev/null @@ -1,1454 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "config.h" -#include "include/error.h" - -#include "common/Timer.h" -#include "common/Mutex.h" - -#include "TCPMessenger.h" -#include "Message.h" - -#include -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -# include -# include -# include -# include -#include -#include -#include -#include - -#include - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" - -#include "TCPDirectory.h" - -#include "common/Logger.h" - -#define DBL 18 - -//#define TCP_SERIALMARSHALL // do NOT turn this off until you check messages/* encode_payload methods -//#define TCP_SERIALOUT // be paranoid/annoying and send messages in same thread - - -TCPMessenger *rankmessenger = 0; // - -TCPDirectory *nameserver = 0; // only defined on rank 0 -TCPMessenger *nsmessenger = 0; - - -/***************************/ -LogType rank_logtype; -Logger *logger; - -int stat_num = 0; -off_t stat_inq = 0, stat_inqb = 0; -off_t stat_disq = 0, stat_disqb = 0; -off_t stat_outq = 0, stat_outqb = 0; -/***************************/ - - -// local directory -hash_map directory; // local -hash_set directory_ready; -Mutex directory_lock; - -// connecting -struct sockaddr_in listen_addr; // my listen addr -int listen_sd = 0; -int my_rank = -1; -Cond waiting_for_rank; - -// register -long regid = 0; -map waiting_for_register_cond; -map waiting_for_register_result; - -// incoming messages -list incoming; -Mutex incoming_lock; -Cond incoming_cond; - -// outgoing messages -/* -list outgoing; -Mutex outgoing_lock; -Cond outgoing_cond; -*/ - -class OutThread : public Thread { -public: - Mutex lock; - Cond cond; - list q; - bool done; - - OutThread() : done(false) {} - virtual ~OutThread() {} - - void *entry(); - - void stop() { - lock.Lock(); - done = true; - cond.Signal(); 
- lock.Unlock(); - join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } -} single_out_thread; - -Mutex lookup_lock; // -hash_map entity_rank; // entity -> rank -hash_map rank_sd; // outgoing sockets, rank -> sd -hash_map rank_out; -hash_map rank_addr; // rank -> tcpaddr -map > waiting_for_lookup; - - -/* this process */ -bool tcp_done = false; // set this flag to stop the event loop - - -// threads -pthread_t dispatch_thread_id = 0; // thread id of the event loop. init value == nobody -pthread_t out_thread_id = 0; // thread id of the event loop. init value == nobody -pthread_t listen_thread_id = 0; -map in_threads; // sd -> threadid - -//bool pending_timer = false; - -// per-rank fun - - -// debug -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "[TCP " << my_rank /*(<< " " << getpid() << "." << pthread_self() */ << "] " - - -#include "tcp.cc" - -// some declarations -void tcp_open(int rank); -int tcp_send(Message *m); -void tcpmessenger_kick_dispatch_loop(); -OutThread *tcp_lookup(Message *m); - -int tcpmessenger_get_rank() -{ - return my_rank; -} - - -int tcpmessenger_findns(tcpaddr_t &nsa) -{ - char *nsaddr = 0; - bool have_nsa = false; - - // env var? - /*int e_len = 0; - for (int i=0; envp[i]; i++) - e_len += strlen(envp[i]) + 1; - */ - nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - } - - else { - // file? - int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - have_nsa = true; - nsaddr = "from .ceph_ns"; - } - } - - if (!nsaddr && !have_nsa) { - cerr << "i need ceph ns addr.. either CEPH_NAMESERVER env var or --ns blah" << endl; - return -1; - //exit(-1); - } - - // look up nsaddr? 
- if (!have_nsa && tcpmessenger_lookup(nsaddr, nsa) < 0) { - return -1; - } - - dout(2) << "ceph ns is " << nsaddr << " or " << nsa << endl; - return 0; -} - - - -/** rankserver - * - * one per rank. handles entity->rank lookup replies. - */ - -class RankServer : public Dispatcher { -public: - void dispatch(Message *m) { - lookup_lock.Lock(); - - dout(DBL) << "rankserver dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lookup_lock.Unlock(); - } - - void handle_connect_ack(MNSConnectAck *m) { - dout(DBL) << "my rank is " << m->get_rank(); - my_rank = m->get_rank(); - - // now that i know my rank, - entity_rank[MSG_ADDR_RANK(my_rank)] = my_rank; - rank_addr[my_rank] = listen_addr; - - waiting_for_rank.SignalAll(); - - delete m; - - // logger! 
- dout(DBL) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - - } - - void handle_register_ack(MNSRegisterAck *m) { - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; - } - - void handle_lookup_reply(MNSLookupReply *m) { - list waiting; - dout(DBL) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_rank.begin(); - it != m->entity_rank.end(); - it++) { - dout(DBL) << "lookup got " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; - entity_rank[it->first] = it->second; - - if (it->second == my_rank) { - // deliver locally - dout(-DBL) << "delivering lookup results locally" << endl; - incoming_lock.Lock(); - - for (list::iterator i = waiting_for_lookup[it->first].begin(); - i != waiting_for_lookup[it->first].end(); - i++) { - stat_inq++; - stat_inqb += (*i)->get_payload().length(); - (*i)->decode_payload(); - incoming.push_back(*i); - } - incoming_cond.Signal(); - incoming_lock.Unlock(); - } else { - // take waiters - waiting.splice(waiting.begin(), waiting_for_lookup[it->first]); - } - waiting_for_lookup.erase(it->first); - - } - - for (map::iterator it = m->rank_addr.begin(); - it != m->rank_addr.end(); - it++) { - dout(DBL) << "lookup got rank " << it->first << " addr " << it->second << endl; - rank_addr[it->first] = it->second; - - // open it now - if (rank_sd.count(it->first) == 0) - tcp_open(it->first); - } - - // send waiting messages -#ifdef TCP_SERIALOUT - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - OutThread *outt = 
tcp_lookup(*it); - assert(outt); - tcp_send(*it); - } -#else - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - OutThread *outt = tcp_lookup(*it); - assert(outt); - outt->send(*it); -// dout(0) << "lookup done, splicing in " << *it << endl; - } -#endif - - delete m; - } - -} rankserver; - - -class C_TCPKicker : public Context { - void finish(int r) { - dout(DBL) << "timer kick" << endl; - tcpmessenger_kick_dispatch_loop(); - } -}; - -void TCPMessenger::callback_kick() -{ - tcpmessenger_kick_dispatch_loop(); -} - - -extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << endl; - - return 0; -} - - - -/***** - * global methods for process-wide startup, shutdown. 
- */ - -int tcpmessenger_init() -{ - // LISTEN - dout(DBL) << "binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; - - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - dout(DBL) << "listening on " << myport << endl; - - // my address is... - char host[100]; - gethostname(host, 100); - dout(DBL) << "my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(DBL) << "listen addr is " << listen_addr << endl; - - // register to execute timer events - //g_timer.set_messenger_kicker(new C_TCPKicker()); - - - dout(DBL) << "init done" << endl; - return 0; -} - - -// on first rank only -void tcpmessenger_start_nameserver(tcpaddr_t& diraddr) -{ - dout(DBL) << "starting nameserver on " << MSG_ADDR_NICE(MSG_ADDR_DIRECTORY) << endl; - - // i am rank 0. - nsmessenger = new TCPMessenger(MSG_ADDR_DIRECTORY); - - // start name server - nameserver = new TCPDirectory(nsmessenger); - - // diraddr is my addr! 
- diraddr = rank_addr[0] = listen_addr; - my_rank = 0; - entity_rank[MSG_ADDR_DIRECTORY] = 0; -} -void tcpmessenger_stop_nameserver() -{ - if (nsmessenger) { - dout(DBL) << "shutting down nsmessenger" << endl; - TCPMessenger *m = nsmessenger; - nsmessenger = 0; - m->shutdown(); - delete m; - } -} - -// on all ranks -void tcpmessenger_start_rankserver(tcpaddr_t& ns) -{ - // connect to nameserver - entity_rank[MSG_ADDR_DIRECTORY] = 0; - rank_addr[0] = ns; - tcp_open(0); - - if (my_rank >= 0) { - // i know my rank - rankmessenger = new TCPMessenger(MSG_ADDR_RANK(my_rank)); - } else { - // start rank messenger, and discover my rank. - rankmessenger = new TCPMessenger(MSG_ADDR_RANK_NEW); - } -} -void tcpmessenger_stop_rankserver() -{ - if (rankmessenger) { - dout(DBL) << "shutting down rankmessenger" << endl; - rankmessenger->shutdown(); - delete rankmessenger; - rankmessenger = 0; - } -} - - - - - - -int tcpmessenger_shutdown() -{ - dout(DBL) << "tcpmessenger_shutdown barrier" << endl; - - - dout(2) << "tcpmessenger_shutdown closing all sockets etc" << endl; - - // bleh - for (hash_map::iterator it = rank_sd.begin(); - it != rank_sd.end(); - it++) { - ::close(it->second); - } - - return 0; -} - - - - -/*** - * internal send/recv - */ - - - - -/* - * recv a Message* - */ - - - -Message *tcp_recv(int sd) -{ - // envelope - dout(DBL) << "tcp_recv receiving message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - if (env.type == 0) { - dout(DBL) << "got dummy env, bailing" << endl; - return 0; - } - - dout(DBL) << "tcp_recv got envelope type=" << env.type << " src " << MSG_ADDR_NICE(env.source) << " dst " << MSG_ADDR_NICE(env.dest) << " nchunks=" << env.nchunks << endl; - - // payload - bufferlist blist; - for (int i=0; iinc("in"); - logger->inc("inb", s+sizeof(env)); - } - - dout(DBL) << "tcp_recv got " << s << " byte message from " << MSG_ADDR_NICE(m->get_source()) << endl; - - return m; -} - - - - -void 
tcp_open(int rank) -{ - dout(DBL) << "tcp_open to rank " << rank << " at " << rank_addr[rank] << endl; - - // create socket? - int sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - int r = connect(sd, (sockaddr*)&rank_addr[rank], sizeof(myAddr)); - assert(r >= 0); - - //dout(DBL) << "tcp_open connected to " << who << endl; - assert(rank_sd.count(rank) == 0); - rank_sd[rank] = sd; - - if (g_conf.tcp_multi_out) { - rank_out[rank] = new OutThread(); - rank_out[rank]->create(); - } else { - rank_out[rank] = &single_out_thread; - if (!single_out_thread.is_started()) - single_out_thread.create(); - } -} - - -void tcp_marshall(Message *m) -{ - // marshall - if (m->empty_payload()) - m->encode_payload(); -} - -OutThread *tcp_lookup(Message *m) -{ - entity_name_t addr = m->get_dest(); - - if (!entity_rank.count(m->get_dest())) { - // lookup and wait. - if (waiting_for_lookup.count(addr)) { - dout(DBL) << "already looking up " << MSG_ADDR_NICE(addr) << endl; - } else { - dout(DBL) << "lookup on " << MSG_ADDR_NICE(addr) << " for " << m << endl; - MNSLookup *r = new MNSLookup(addr); - rankmessenger->send_message(r, MSG_ADDR_DIRECTORY); - } - - // add waiter - waiting_for_lookup[addr].push_back(m); - return 0; - } - - int rank = entity_rank[m->get_dest()]; - - if (rank_sd.count(rank) == 0) { // should only happen on rank0? - tcp_open(rank); - } - assert(rank_sd.count(rank)); - m->set_tcp_sd( rank_sd[rank] ); - return rank_out[rank]; -} - - -/* - * send a Message* over the wire. ** do not block **. 
- */ -int tcp_send(Message *m) -{ - /*int rank = entity_rank[m->get_dest()]; - //if (rank_sd.count(rank) == 0) tcp_open(rank); - assert(rank_sd.count(rank)); - - int sd = rank_sd[rank]; - assert(sd); - */ - int sd = m->get_tcp_sd(); - assert(sd); - - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - // HACK osd -> client only - //if (m->get_source() >= MSG_ADDR_OSD(0) && m->get_source() < MSG_ADDR_CLIENT(0) && - // m->get_dest() >= MSG_ADDR_CLIENT(0)) - dout(DBL) << g_clock.now() << " sending " << m << " " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) - //<< " rank " << rank - << " sd " << sd << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { cerr << "error sending envelope for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(DBL) << "tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { cerr << "error sending chunk len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { cerr << "error sending data chunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { cerr << "error sending data len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - r = tcp_write( sd, (*it).c_str(), (*it).length() ); - if (r 
< 0) { cerr << "error sending data megachunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << " : len " << (*it).length() << endl; assert(0); } - } -#endif - - // hose message - delete m; - return 0; -} - - - - - -/** tcp_outthread - * this thread watching the outgoing queue, and encodes+sends any queued messages - */ - -void* OutThread::entry() -{ - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(DBL) << "outthread grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.splice(out.begin(), q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(DBL) << "outthread sending " << m << endl; - - if (!g_conf.tcp_serial_marshall) - tcp_marshall(m); - - tcp_send(m); - } - - lock.Lock(); - continue; - } - - // wait - dout(DBL) << "outthread sleeping" << endl; - cond.Wait(lock); - } - dout(DBL) << "outthread done" << endl; - - lock.Unlock(); - return 0; -} - - - -/** tcp_inthread - * read incoming messages from a given peer. - * give received and decoded messages to dispatch loop. 
- */ -void *tcp_inthread(void *r) -{ - int sd = (int)r; - - dout(DBL) << "tcp_inthread reading on sd " << sd << endl; - - while (!tcp_done) { - Message *m = tcp_recv(sd); - if (!m) break; - entity_name_t who = m->get_source(); - - dout(20) << g_clock.now() << " inthread got " << m << " from sd " << sd << " who is " << who << endl; - - // give to dispatch loop - size_t sz = m->get_payload().length(); - - if (g_conf.tcp_multi_dispatch) { - const entity_name_t dest = m->get_dest(); - directory_lock.Lock(); - TCPMessenger *messenger = directory[ dest ]; - directory_lock.Unlock(); - - if (messenger) - messenger->dispatch_queue(m); - else - dout(0) << "dest " << dest << " dne" << endl; - - } else { - // single dispatch queue - incoming_lock.Lock(); - { - //dout(-20) << "in1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - //assert(stat_inq == incoming.size()); - incoming.push_back(m); - incoming_cond.Signal(); - - stat_inq++; - //assert(stat_inq == incoming.size()); - //dout(-20) << "in2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - stat_inqb += sz; - } - incoming_lock.Unlock(); - } - - if (logger) { - //logger->inc("in"); - //logger->inc("inb", sz); - } - } - - dout(DBL) << "tcp_inthread closing " << sd << endl; - - //::close(sd); - return 0; -} - -/** tcp_accepthread - * accept incoming connections from peers. - * start a tcp_inthread for each. - */ -void *tcp_acceptthread(void *) -{ - dout(DBL) << "tcp_acceptthread starting" << endl; - - while (!tcp_done) { - //dout(DBL) << "accepting, left = " << left << endl; - - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(DBL) << "accepted incoming on sd " << sd << endl; - - pthread_t th; - pthread_create(&th, - NULL, - tcp_inthread, - (void*)sd); - in_threads[sd] = th; - } else { - dout(DBL) << "no incoming connection?" 
<< endl; - break; - } - } - return 0; -} - - - - -/** tcp_dispatchthread - * wait for pending timers, incoming messages. dispatch them. - */ -void TCPMessenger::dispatch_entry() -{ - incoming_lock.Lock(); - while (!incoming.empty() || !incoming_stop) { - if (!incoming.empty()) { - // grab incoming messages - list in; - in.splice(in.begin(), incoming); - - assert(stat_disq == 0); - stat_disq = stat_inq; - stat_disqb = stat_inqb; - stat_inq = 0; - stat_inqb = 0; - - // drop lock while we deliver - //assert(stat_inq == incoming.size()); - incoming_lock.Unlock(); - - // dispatch! - while (!in.empty()) { - Message *m = in.front(); - in.pop_front(); - - stat_disq--; - stat_disqb -= m->get_payload().length(); - if (logger) { - logger->set("inq", stat_inq+stat_disq); - logger->set("inqb", stat_inqb+stat_disq); - logger->inc("dis"); - } - - dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << m - << endl; - - dispatch(m); - } - - continue; - } - - // sleep - dout(DBL) << "dispatch: waiting for incoming messages" << endl; - incoming_cond.Wait(incoming_lock); - dout(DBL) << "dispatch: woke up" << endl; - } - incoming_lock.Unlock(); -} - - -void* tcp_dispatchthread(void*) -{ - dout(5) << "tcp_dispatchthread start pid " << getpid() << endl; - - while (1) { - // inq? - incoming_lock.Lock(); - - // done? - if (tcp_done && incoming.empty()) { - incoming_lock.Unlock(); - break; - } - - // wait? 
- if (incoming.empty()) { - // wait - dout(DBL) << "dispatch: incoming empty, waiting for incoming messages" << endl; - incoming_cond.Wait(incoming_lock); - dout(DBL) << "dispatch: woke up" << endl; - } - - // grab incoming messages - //dout(-20) << "dis1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - //assert(stat_inq == incoming.size()); - - list in; - in.splice(in.begin(), incoming); - - assert(stat_disq == 0); - stat_disq = stat_inq; - stat_disqb = stat_inqb; - stat_inq = 0; - stat_inqb = 0; - //assert(stat_inq == incoming.size()); - //dout(-20) << "dis2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - - // drop lock while we deliver - incoming_lock.Unlock(); - - // dispatch! - while (!in.empty()) { - Message *m = in.front(); - in.pop_front(); - - stat_disq--; - stat_disqb -= m->get_payload().length(); - if (logger) { - logger->set("inq", stat_inq+stat_disq); - logger->set("inqb", stat_inqb+stat_disq); - logger->inc("dis"); - } - - dout(DBL) << "dispatch doing " << *m << endl; - - // for rankserver? - if (m->get_type() == MSG_NS_CONNECTACK || // i just connected - m->get_dest() == MSG_ADDR_RANK(my_rank)) { - dout(DBL) << " giving to rankserver" << endl; - rankserver.dispatch(m); - continue; - } - - // ok - entity_name_t dest = m->get_dest(); - directory_lock.Lock(); - if (directory.count(dest)) { - Messenger *who = directory[ dest ]; - directory_lock.Unlock(); - - dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << *m - << endl; - - who->dispatch(m); - } else { - directory_lock.Unlock(); - dout (1) << "---- i don't know who " << MSG_ADDR_NICE(dest) << " " << dest << " is." 
<< endl; - assert(0); - } - } - assert(stat_disq == 0); - - } - - - g_timer.shutdown(); - - dout(5) << "tcp_dispatchthread exiting loop" << endl; - return 0; -} - - -// start/stop mpi receiver thread (for unsolicited messages) -int tcpmessenger_start() -{ - dout(5) << "starting accept thread" << endl; - pthread_create(&listen_thread_id, - NULL, - tcp_acceptthread, - 0); - - dout(5) << "starting dispatch thread" << endl; - - // start a thread - pthread_create(&dispatch_thread_id, - NULL, - tcp_dispatchthread, - 0); - - - /* - dout(5) << "starting outgoing thread" << endl; - pthread_create(&out_thread_id, - NULL, - tcp_outthread, - 0); - */ - if (!g_conf.tcp_multi_out) - single_out_thread.create(); - return 0; -} - - -/* - * kick and wake up _loop (to pick up new outgoing message, or quit) - */ - -void tcpmessenger_kick_dispatch_loop() -{ - if (g_conf.tcp_multi_dispatch) { - assert(0); - // all of them - /*for (hash_map::iterator i = directory.begin(); - i != directory.end(); - i++) - i->second->dispatch_kick(); - */ - } else { - // just one - dout(DBL) << "kicking" << endl; - incoming_lock.Lock(); - dout(DBL) << "prekick" << endl; - incoming_cond.Signal(); - incoming_lock.Unlock(); - dout(DBL) << "kicked" << endl; - } -} - -/* -void tcpmessenger_kick_outgoing_loop() -{ - outgoing_lock.Lock(); - outgoing_cond.Signal(); - outgoing_lock.Unlock(); -} -*/ - - -// wait for thread to finish - -void tcpmessenger_wait() -{ - if (g_conf.tcp_multi_dispatch) { - // new way - incoming_lock.Lock(); - while (!tcp_done) - incoming_cond.Wait(incoming_lock); - incoming_lock.Unlock(); - } else { - // old way - dout(10) << "tcpmessenger_wait waking up dispatch loop" << endl; - tcpmessenger_kick_dispatch_loop(); - - void *returnval; - dout(10) << "tcpmessenger_wait waiting for thread to finished." << endl; - pthread_join(dispatch_thread_id, &returnval); - dout(10) << "tcpmessenger_wait thread finished." 
<< endl; - } -} - - - - -entity_name_t register_entity(entity_name_t addr) -{ - lookup_lock.Lock(); - - // prepare to wait - long id = ++regid; - Cond cond; - waiting_for_register_cond[id] = &cond; - - if (my_rank < 0) { - dout(DBL) << "register_entity don't know my rank, connecting" << endl; - - // connect to nameserver; discover my rank. - Message *m = new MNSConnect(listen_addr); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - - // wait for reply - while (my_rank < 0) - waiting_for_rank.Wait(lookup_lock); - assert(my_rank > 0); - } - - // send req - dout(DBL) << "register_entity " << MSG_ADDR_NICE(addr) << endl; - Message *m = new MNSRegister(addr, my_rank, id); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - - // wait? - while (!waiting_for_register_result.count(id)) - cond.Wait(lookup_lock); - - // get result, clean up - entity_name_t entity = waiting_for_register_result[id]; - waiting_for_register_result.erase(id); - waiting_for_register_cond.erase(id); - - dout(DBL) << "register_entity got " << MSG_ADDR_NICE(entity) << endl; - - lookup_lock.Unlock(); - - // ok! - return entity; -} - - - -/*********** - * Tcpmessenger class implementation - */ - - -TCPMessenger::TCPMessenger(entity_name_t myaddr) : - Messenger(myaddr), - dispatch_thread(this) -{ - if (myaddr != MSG_ADDR_DIRECTORY) { - // register! 
- myaddr = register_entity(myaddr); - } - - - // my address - set_myaddr( myaddr ); - - // register myself in the messenger directory - directory_lock.Lock(); - { - directory[myaddr] = this; - - stat_num++; - if (logger) logger->set("num", stat_num); - } - directory_lock.Unlock(); - - // register to execute timer events - //g_timer.set_messenger_kicker(new C_TCPKicker()); - // g_timer.set_messenger(this); -} - - -void TCPMessenger::ready() -{ - directory_lock.Lock(); - directory_ready.insert(get_myaddr()); - directory_lock.Unlock(); - - if (get_myaddr() != MSG_ADDR_DIRECTORY) { - // started! tell namer we are up and running. - lookup_lock.Lock(); - { - Message *m = new MGenericMessage(MSG_NS_STARTED); - m->set_source(get_myaddr(), 0); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - } - lookup_lock.Unlock(); - } -} - - -TCPMessenger::~TCPMessenger() -{ - //delete logger; -} - -tcpaddr_t& TCPMessenger::get_tcpaddr() -{ - return listen_addr; -} - -void TCPMessenger::map_entity_rank(entity_name_t e, int r) -{ - lookup_lock.Lock(); - entity_rank[e] = r; - lookup_lock.Unlock(); -} - -void TCPMessenger::map_rank_addr(int r, tcpaddr_t a) -{ - lookup_lock.Lock(); - rank_addr[r] = a; - lookup_lock.Unlock(); -} - - -int TCPMessenger::get_dispatch_queue_len() -{ - return stat_inq+stat_disq; -} - - -int TCPMessenger::shutdown() -{ - dout(DBL) << "shutdown by " << MSG_ADDR_NICE(get_myaddr()) << endl; - - // dont' send unregistery from nsmessenger shutdown! - if (this != nsmessenger && - (my_rank > 0 || nsmessenger)) { - dout(DBL) << "sending unregister from " << MSG_ADDR_NICE(get_myaddr()) << " to ns" << endl; - send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_DIRECTORY); - } - - // remove me from the directory - directory_lock.Lock(); - directory.erase(get_myaddr()); - - // last one? - bool lastone = directory.empty(); - //dout(1) << "lastone = " << lastone << " .. 
" << directory.size() << endl; - - - // or almost last one? - if (rankmessenger && directory.size() == 1) { - directory_lock.Unlock(); - tcpmessenger_stop_rankserver(); - directory_lock.Lock(); - } - - stat_num--; - if (logger) logger->set("num", stat_num); - - directory_lock.Unlock(); - - // last one? - if (lastone) { - dout(2) << "shutdown last tcpmessenger on rank " << my_rank << " shut down" << endl; - //pthread_t whoami = pthread_self(); - - // no more timer events - //g_timer.unset_messenger(); - - // close incoming sockets - //void *r; - for (map::iterator it = in_threads.begin(); - it != in_threads.end(); - it++) { - dout(DBL) << "closing reader on sd " << it->first << endl; - ::close(it->first); - //pthread_join(it->second, &r); - } - - if (g_conf.tcp_multi_dispatch) { - // kill off dispatch threads - dout(DBL) << "killing dispatch threads" << endl; - for (hash_map::iterator it = directory.begin(); - it != directory.end(); - it++) - it->second->dispatch_stop(); - } - - dout(DBL) << "setting tcp_done" << endl; - - // kick/kill incoming thread - incoming_lock.Lock(); - tcp_done = true; - incoming_cond.Signal(); - incoming_lock.Unlock(); - - // finish off outgoing thread - dout(10) << "waiting for outgoing to finish" << endl; - if (g_conf.tcp_multi_out) { - for (hash_map::iterator it = rank_out.begin(); - it != rank_out.end(); - it++) { - it->second->stop(); - delete it->second; - } - } else { - single_out_thread.stop(); - } - - - /* - - dout(15) << "whoami = " << whoami << ", thread = " << dispatch_thread_id << endl; - if (whoami == thread_id) { - // i am the event loop thread, just set flag! 
- dout(15) << " set tcp_done=true" << endl; - tcp_done = true; - } - */ - } - return 0; -} - - - - -/*** - * public messaging interface - */ - - -/* note: send_message _MUST_ be non-blocking */ -int TCPMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - m->set_lamport_send_stamp( get_lamport() ); - - dout(4) << "--> " << m->get_type_name() - << " from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() - << " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() - << " ---- " << m - << endl; - - // local? - TCPMessenger *entity = 0; - directory_lock.Lock(); - if (directory.count(dest) && - directory_ready.count(dest)) entity = directory[dest]; - directory_lock.Unlock(); - - if (entity) { - // local! - ::incoming_lock.Lock(); - { - dout(20) << " queueing locally for " << dest << " " << m << endl; //", stat_inq " << stat_inq << ", incomign " << ::incoming.size() << endl; - //assert(stat_inq == ::incoming.size()); - ::incoming.push_back(m); - ::incoming_cond.Signal(); - stat_inq++; - //assert(stat_inq == ::incoming.size()); - //dout(-20) << " stat_inq " << stat_inq << ", incoming " << ::incoming.size() << endl; - stat_inqb += m->get_payload().length(); - } - ::incoming_lock.Unlock(); - } else { - // remote! 
- - if (g_conf.tcp_serial_marshall) - tcp_marshall(m); - - if (g_conf.tcp_serial_out) { - lookup_lock.Lock(); - // send in this thread - if (tcp_lookup(m)) - tcp_send(m); - lookup_lock.Unlock(); - } else { - lookup_lock.Lock(); - OutThread *outt = tcp_lookup(m); - lookup_lock.Unlock(); - - if (outt) outt->send(m); - } - } - - return 0; -} - - - - diff --git a/trunk/ceph/msg/TCPMessenger.h b/trunk/ceph/msg/TCPMessenger.h deleted file mode 100644 index 414e50f5fef87..0000000000000 --- a/trunk/ceph/msg/TCPMessenger.h +++ /dev/null @@ -1,115 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TCPMESSENGER_H -#define __TCPMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" -#include "common/Thread.h" - -#include "tcp.h" - -class Timer; - - -class TCPMessenger : public Messenger { - protected: - - //class Logger *logger; // for logging - - bool incoming_stop; - Mutex incoming_lock; - list incoming; - Cond incoming_cond; - - class DispatchThread : public Thread { - TCPMessenger *m; - public: - DispatchThread(TCPMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - - void dispatch_entry(); - -public: - void dispatch_start() { - incoming_stop = false; - dispatch_thread.create(); - } - /* void dispatch_kick() { - incoming_lock.Lock(); - incoming_cond.Signal(); - incoming_lock.Unlock(); - }*/ - void dispatch_stop() { - incoming_lock.Lock(); - incoming_stop = true; - incoming_cond.Signal(); - incoming_lock.Unlock(); - dispatch_thread.join(); - } - void dispatch_queue(Message *m) { - incoming_lock.Lock(); - incoming.push_back(m); - incoming_cond.Signal(); - 
incoming_lock.Unlock(); - } - - public: - TCPMessenger(entity_name_t myaddr); - ~TCPMessenger(); - - void ready(); - - tcpaddr_t& get_tcpaddr(); - void map_entity_rank(entity_name_t e, int r); - void map_rank_addr(int r, tcpaddr_t a); - - int get_dispatch_queue_len(); - - void callback_kick(); - - // init, shutdown MPI and associated event loop thread. - virtual int shutdown(); - - // message interface - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); -}; - -/** - * these are all ONE per process. - */ - -extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta); - -extern int tcpmessenger_findns(tcpaddr_t &nsa); - -extern int tcpmessenger_init(); -extern int tcpmessenger_start(); // start thread -extern void tcpmessenger_wait(); // wait for thread to finish. -extern int tcpmessenger_shutdown(); // finalize MPI - -extern void tcpmessenger_start_nameserver(tcpaddr_t& ta); // on rank 0 -extern void tcpmessenger_stop_nameserver(); // on rank 0 -extern void tcpmessenger_start_rankserver(tcpaddr_t& ta); // on all ranks -extern void tcpmessenger_stop_rankserver(); // on all ranks - -extern int tcpmessenger_get_rank(); - - -#endif diff --git a/trunk/ceph/msg/error.c b/trunk/ceph/msg/error.c deleted file mode 100644 index 15cd16a2ca9da..0000000000000 --- a/trunk/ceph/msg/error.c +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include - -#include "include/error.h" - -#define EXIT_USAGE_ERROR -1 /* error codes for program exit */ -#define EXIT_SYSTEM_ERROR -2 -#define EXIT_GENERIC_ERROR -3 -#define MSGSIZ 1024 /* maximum error message length */ - -/* print usage error message and exit */ -void userror(const char *use, const char *fmt, ...) 
-{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - len += snprintf(msg+len, MSGSIZ-len, "\n"); - len += snprintf(msg+len, MSGSIZ-len, use); - fprintf(stderr, "%s\n", msg); - exit(EXIT_USAGE_ERROR); - - va_end(ap); -} - -/* print system error message and exit */ -void syserror(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - len += snprintf(msg+len, MSGSIZ-len, ": %s\n", strerror(errno)); - fprintf(stderr, "%s", msg); - exit(EXIT_SYSTEM_ERROR); - - va_end(ap); -} - -/* print error message and exit */ -void exiterror(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - fprintf(stderr, "%s\n", msg); - exit(EXIT_GENERIC_ERROR); - - va_end(ap); -} - -/* print error message */ -void error(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - fprintf(stderr, "%s\n", msg); - - va_end(ap); -} diff --git a/trunk/ceph/msg/msg_types.h b/trunk/ceph/msg/msg_types.h index 0b92df47020d0..11912582bf6cd 100644 --- a/trunk/ceph/msg/msg_types.h +++ b/trunk/ceph/msg/msg_types.h @@ -27,12 +27,13 @@ public: static const int TYPE_MDS = 2; static const int TYPE_OSD = 3; static const int TYPE_CLIENT = 4; + static const int TYPE_ADMIN = 5; static const int NEW = -1; // cons entity_name_t() : _type(0), _num(0) {} - entity_name_t(int t, int n) : _type(t), _num(n) {} + entity_name_t(int t, int n=NEW) : _type(t), _num(n) {} int num() const { return _num; } int type() const { return _type; } @@ -42,6 +43,7 @@ public: case TYPE_OSD: return "osd"; case TYPE_MON: return "mon"; case TYPE_CLIENT: return "client"; + case TYPE_ADMIN: return "admin"; default: return "unknown"; } } @@ -52,6 +54,7 @@ public: bool is_mds() const { return type() == TYPE_MDS; } 
bool is_osd() const { return type() == TYPE_OSD; } bool is_mon() const { return type() == TYPE_MON; } + bool is_admin() const { return type() == TYPE_ADMIN; } }; inline bool operator== (const entity_name_t& l, const entity_name_t& r) { diff --git a/trunk/ceph/osd/PG.h b/trunk/ceph/osd/PG.h index f3b00cf935f91..aa3908a5df468 100644 --- a/trunk/ceph/osd/PG.h +++ b/trunk/ceph/osd/PG.h @@ -213,11 +213,11 @@ public: eversion_t version; objectrev_t rev; - reqid_t reqid; // caller+tid to uniquely identify request + osdreqid_t reqid; // caller+tid to uniquely identify request Entry() : op(0) {} Entry(int _op, object_t _oid, const eversion_t& v, - const reqid_t& rid) : + const osdreqid_t& rid) : op(_op), oid(_oid), version(v), reqid(rid) {} bool is_delete() const { return op == DELETE; } @@ -270,7 +270,7 @@ public: class IndexedLog : public Log { public: hash_map objects; // ptrs into log. be careful! - hash_set caller_ops; + hash_set caller_ops; // recovery pointers list::iterator requested_to; // not inclusive of referenced item @@ -288,7 +288,7 @@ public: bool logged_object(object_t oid) { return objects.count(oid); } - bool logged_req(const reqid_t &r) { + bool logged_req(const osdreqid_t &r) { return caller_ops.count(r); } diff --git a/trunk/ceph/osd/osd_types.h b/trunk/ceph/osd/osd_types.h index f8656e1f3e178..05899374ddaee 100644 --- a/trunk/ceph/osd/osd_types.h +++ b/trunk/ceph/osd/osd_types.h @@ -14,15 +14,60 @@ #ifndef __OSD_TYPES_H #define __OSD_TYPES_H -#include "include/reqid.h" -#define PG_INO 1 +#include "msg/msg_types.h" + +/* osdreqid_t - caller name + incarnation# + tid to unique identify this request + * use for metadata and osd ops. + */ +class osdreqid_t { +public: + entity_name_t name; // who + int inc; // incarnation + tid_t tid; + osdreqid_t() : inc(0), tid(0) {} + osdreqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} +}; + +inline ostream& operator<<(ostream& out, const osdreqid_t& r) { + return out << r.name << "." 
<< r.inc << ":" << r.tid; +} + +inline bool operator==(const osdreqid_t& l, const osdreqid_t& r) { + return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); +} +inline bool operator!=(const osdreqid_t& l, const osdreqid_t& r) { + return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); +} +inline bool operator<(const osdreqid_t& l, const osdreqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid < r.tid); +} +inline bool operator<=(const osdreqid_t& l, const osdreqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); +} +inline bool operator>(const osdreqid_t& l, const osdreqid_t& r) { return !(l <= r); } +inline bool operator>=(const osdreqid_t& l, const osdreqid_t& r) { return !(l < r); } + +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const osdreqid_t &r) const { + static blobhash H; + return H((const char*)&r, sizeof(r)); + } + }; +} + // osd types typedef __uint64_t coll_t; // collection id // pg stuff + +#define PG_INO 1 + typedef __uint16_t ps_t; typedef __uint8_t pruleset_t; diff --git a/trunk/ceph/osdc/Filer.h b/trunk/ceph/osdc/Filer.h index 161bfec304531..6a052601a08af 100644 --- a/trunk/ceph/osdc/Filer.h +++ b/trunk/ceph/osdc/Filer.h @@ -30,7 +30,6 @@ using namespace std; #include -#include using namespace __gnu_cxx; #include "include/types.h" diff --git a/trunk/ceph/osdc/Objecter.cc b/trunk/ceph/osdc/Objecter.cc index 9e49a43ace89b..edbc2d741f66f 100644 --- a/trunk/ceph/osdc/Objecter.cc +++ b/trunk/ceph/osdc/Objecter.cc @@ -195,7 +195,7 @@ void Objecter::kick_requests(set& changed_pgs) dout(0) << "kick_requests resub read " << tid << endl; // resubmit - readx_submit(rd, rd->ops[tid]); + readx_submit(rd, rd->ops[tid], true); rd->ops.erase(tid); } @@ -380,7 +380,7 @@ tid_t Objecter::readx(OSDRead *rd, Context *onfinish) return last_tid; } -tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent 
&ex) +tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex, bool retry) { // find OSD PG &pg = get_pg( ex.pgid ); @@ -409,6 +409,7 @@ tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) OSD_OP_READ); m->set_length(ex.length); m->set_offset(ex.start); + m->set_retry_attempt(retry); messenger->send_message(m, osdmap->get_inst(pg.acker())); } @@ -444,7 +445,7 @@ void Objecter::handle_osd_read_reply(MOSDOpReply *m) // success? if (m->get_result() == -EAGAIN) { dout(7) << " got -EAGAIN, resubmitting" << endl; - readx_submit(rd, rd->ops[tid]); + readx_submit(rd, rd->ops[tid], true); delete m; return; } @@ -655,6 +656,7 @@ tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) tid = usetid; else tid = ++last_tid; + assert(client_inc >= 0); // add to gather set wr->waitfor_ack[tid] = ex; @@ -679,6 +681,8 @@ tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) m->set_length(ex.length); m->set_offset(ex.start); m->set_rev(ex.rev); + if (usetid > 0) + m->set_retry_attempt(true); if (wr->tid_version.count(tid)) m->set_version(wr->tid_version[tid]); // we're replaying this op! 
diff --git a/trunk/ceph/osdc/Objecter.h b/trunk/ceph/osdc/Objecter.h index 741db052a21ea..01a65b1be90b6 100644 --- a/trunk/ceph/osdc/Objecter.h +++ b/trunk/ceph/osdc/Objecter.h @@ -154,7 +154,7 @@ class Objecter { void handle_osd_map(class MOSDMap *m); private: - tid_t readx_submit(OSDRead *rd, ObjectExtent& ex); + tid_t readx_submit(OSDRead *rd, ObjectExtent& ex, bool retry=false); tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0); tid_t stat_submit(OSDStat *st); diff --git a/trunk/ceph/script/check_cache_dumps.pl b/trunk/ceph/script/check_cache_dumps.pl new file mode 100755 index 0000000000000..95bd28a474991 --- /dev/null +++ b/trunk/ceph/script/check_cache_dumps.pl @@ -0,0 +1,56 @@ +#!/usr/bin/perl + +my $epoch = shift || die "specify epoch"; + +my %auth; # mds -> id -> replica -> nonce +my %replica; # mds -> id -> auth -> nonce + +print "reading\n"; +for (my $i=0; -e "cachedump.$epoch.mds$i"; $i++) { + open(O,"cachedump.$epoch.mds$i"); + while () { + my ($name,$s); + ($name,$s) = /^\[(inode \d+) \S+ (\S+)/; + ($name,$s) = /^\[(dir \d+) \S+ (\S+)/ unless $name; + ($name,$s) = /^\[dentry (\S+) (\S+)/ unless $name; + if ($name) { + if ($s =~ /^auth/) { + $auth{$i}->{$name} = {}; + my ($rl) = $s =~ /\{(.*)\}/; + for my $r (split(/,/,$rl)) { + my ($who,$nonce) = $r =~ /(\d+)\=(\d+)/; + $auth{$i}->{$name}->{$who} = $nonce; + #print "auth $name rep by $who $nonce $s\n"; + } + } + else { + my ($a,$b,$n) = $s =~ /rep@(\d+)\,([\-\d]+)\.(\d+)/; + die $_ unless $a >= 0; + $replica{$i}->{$name}->{$a} = $n; + if ($b >= 0) { + $replica{$i}->{$name}->{$b} = $n; + } + } + } + } +} + +print "verifying replicas\n"; +for my $mds (keys %replica) { + for my $name (keys %{$replica{$mds}}) { + for my $auth (keys %{$replica{$mds}->{$name}}) { + if ($auth{$auth}->{$name}->{$mds}) { + if ($auth{$auth}->{$name}->{$mds} < $replica{$mds}->{$name}->{$auth}) { + print "problem: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce 
$auth{$auth}->{$name}->{$mds}\n"; + } else { + print "ok: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; + } + } else { + print "??: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has no nonce\n"; + } + + } + } +} + + -- 2.39.5