From f1f9e9cd8022d4329c55479c35dbfade20b465df Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 6 Mar 2007 18:44:56 +0000 Subject: [PATCH] sharing of mdsmap with clients as appropriate; mds recovery bugfix; some mds cache documentation git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1174 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 41 +- branches/sage/cephmds2/client/Client.cc | 4 +- branches/sage/cephmds2/doc/Replication.txt | 19 - branches/sage/cephmds2/doc/caching.txt | 418 ++++++++++++--------- branches/sage/cephmds2/doc/performance.txt | 36 -- branches/sage/cephmds2/include/frag.h | 159 ++++++++ branches/sage/cephmds2/mds/MDCache.cc | 40 +- branches/sage/cephmds2/mds/MDS.cc | 39 +- branches/sage/cephmds2/mds/MDS.h | 1 + branches/sage/cephmds2/mds/MDSMap.h | 17 +- branches/sage/cephmds2/mds/mdstypes.h | 73 +--- branches/sage/cephmds2/mon/MDSMonitor.cc | 10 +- 12 files changed, 494 insertions(+), 363 deletions(-) delete mode 100644 branches/sage/cephmds2/doc/Replication.txt delete mode 100644 branches/sage/cephmds2/doc/performance.txt create mode 100644 branches/sage/cephmds2/include/frag.h diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 6e22da94feedf..befb6e56aabd6 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -1,9 +1,7 @@ monday - retest with 3+ - no failures - full failure document cache - pg rewrite + tag osd ops, objects with filelayout + mds diropen doc - mdsmonitor beacon semantics @@ -13,7 +11,7 @@ doc - journal content - importmaps and up:resolve - metablob version semantics - + mds - bystanders should avoid contacting auth when it is ambiguous. @@ -30,7 +28,6 @@ mds - failures during recovery stages (resolve, rejoin)... make sure rejoin still works! 
- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) - incremental mdsmaps -- client mount logging - client failure - EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) @@ -44,6 +41,19 @@ mds - link - rename - dirslices. +- dirslice vs readdir +- redo hard links +- anchortable +- sync clients on stat + - will need to ditch 10s client metadata caching before this is useful + - implement truncate +- statfs? +- btree directories (for efficient large directories) +- consistency points/snapshots + +- fix MExportAck and others to use dir+dentry, not inode + (otherwise this all breaks with hard links.. altho it probably needs reworking already?) + monitor @@ -198,31 +208,12 @@ mds client - fstat -- make_request: cope with mds failure - mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. - test client caps migration w/ mds exports - some heuristic behavior to consolidate caps to inode auth? -MDS TODO -- fix hashed readdir: should (optionally) do a lock on dir namespace? -- fix hard links - - they mostly work, but they're fragile -- sync clients on stat - - will need to ditch 10s client metadata caching before this is useful - - implement truncate -- implement hashed directories -- statfs? -- rewrite journal + recovery -- figure out online failure recovery -- more distributed fh management? -- btree directories (for efficient large directories) -- consistency points/snapshots - -- fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already?) 
- diff --git a/branches/sage/cephmds2/client/Client.cc b/branches/sage/cephmds2/client/Client.cc index cef5f6129ef87..ecdf82193be27 100644 --- a/branches/sage/cephmds2/client/Client.cc +++ b/branches/sage/cephmds2/client/Client.cc @@ -643,13 +643,13 @@ void Client::handle_client_request_forward(MClientRequestForward *fwd) request->mds.insert(fwd->get_source().num()); request->mds.insert(fwd->get_dest_mds()); request->num_fwd = fwd->get_num_fwd(); - dout(-10) << "handle_client_request tid " << tid + dout(10) << "handle_client_request tid " << tid << " fwd " << fwd->get_num_fwd() << " to mds" << fwd->get_dest_mds() << ", mds set now " << request->mds << endl; } else { - dout(-10) << "handle_client_request tid " << tid + dout(10) << "handle_client_request tid " << tid << " previously forwarded to mds" << fwd->get_dest_mds() << ", mds still " << request->mds << endl; diff --git a/branches/sage/cephmds2/doc/Replication.txt b/branches/sage/cephmds2/doc/Replication.txt deleted file mode 100644 index 0f8d4c9079e4d..0000000000000 --- a/branches/sage/cephmds2/doc/Replication.txt +++ /dev/null @@ -1,19 +0,0 @@ - -Primary copy replication. - -Inodes: - -- The primary's list of replicas (cached_by) is inclusive at all times. -- The primary's list never includes the local node. -- The primary's list of replicas will only include non-replicas when the relevant CacheExpire notifications are in-flight. - -- Replicas can be created in two ways: - - via a Discover + DiscoverReply - - via an export and import. (The old auth keeps a copy, and adds itself to the replica list as it exports.) - - -Directories (and their dentries): - -- The primary has an open_by list that is inclusive at all times. -- ..Never includes local node -- No per-dentry replica lists. All dentry lock operations (for unlink, etc.) are sent to all nodes in open_by list. 
\ No newline at end of file diff --git a/branches/sage/cephmds2/doc/caching.txt b/branches/sage/cephmds2/doc/caching.txt index a2791bdb5fbfa..fe0c78331bd86 100644 --- a/branches/sage/cephmds2/doc/caching.txt +++ b/branches/sage/cephmds2/doc/caching.txt @@ -1,234 +1,302 @@ +SPANNING TREE PROPERTY + +All metadata that exists in the cache is attached directly or +indirectly to the root inode. That is, if the /usr/bin/vi inode is in +the cache, then /usr/bin, /usr, and / are too, including the inodes, +directory objects, and dentries. + AUTHORITY The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a serial (normally 0) to +Additionally, each replica is assigned a nonce (initial 0) to disambiguate multiple replicas of the same item (see below). - set cached_by; - map cached_by_serial; + map replicas; // maps replicating mds# to nonce The cached_by set _always_ includes all nodes that cache the -partcuarly inode, but may additionally include nodes that used to +particular object, but may additionally include nodes that used to cache it but no longer do. In those cases, an expire message should -be in transit. - - -REPLICA - -The replica maintains a notion of who it believes is the authority for -each replicated inode. There are two possibilities: - - - Ordinarily, this notion is correct. - - If the part of the file system in question was recently exported to - a new MDS, the inodes old authority is acting as a CACHEPROXY, - and will forward relevant messages on to the authority. - -When a repica is expired from cache, and expire is sent to the -authority. The expire includes the serial number issued when the -replica was originally created to disambiguate potentially concurrent -replication activity. - - -EXPORTS - -- The old authority suddenly becomes a replica. It's serial is well - defined. It also becomes a CACHEPROXY, which means its cached_by - remains defined (with an alternate meaning!).
While a proxy, the - node will forward relevant messages from the replica to the - authority (but not the other way around--the authority knows all - replicas). - -- Once the export is acked, the old authority sends a - message to the replica notifying it of the new authority. As soon - as all replicas acknowedge receipt of this notice, the old authority - can cease CACHEPROXY responsibilities and become a regular replica. - At this point it's cached_by is no longer defined. - -- Replicas always know who the authority for the inode is, OR they - know prior owner acting as a CACHEPROXY. (They don't know which it - is.) - - -CACHED_BY - -The authority always has an inclusive list of nodes who cache an item. -As such it can confidently send updates to replicas for locking, -invalidating, etc. When a replica is expired from cache, an expire is -sent to the authority. If the serial matches, the node is removed -from the cached_by list. +be in transit. That is, we have two invariants: + 1) the authority's replica set will always include all actual + replicas, and + 2) cache expiration notices will be reliably delivered to the + authority. +The second invariant is particularly important because the presence of +replicas will pin the metadata object in memory on the authority, +preventing it from being trimmed from the cache. Notification of +expiration of the replicas is required to allow previously replicated +objects to eventually be trimmed from the cache as well. +Each metadata object has an authority bit that indicates whether it is +authoritative or a replica. -SUBTREE AUTHORITY DELEGATION: imports versus hashing -Authority is generally defined recursively: an inode's authority -matches the containing directory, and a directory's authority matches -the directory inode's.
Thus the authority delegation chain can be -broken/redefined in two ways: +REPLICA NONCE + +Each replicated object maintains a "nonce" value, issued by the +authority at the time the replica was created. If the authority has +already created a replica for the given MDS, the new replica will be +issued a new (incremented) nonce. This nonce is attached +to cache expirations, and allows the authority to disambiguate +expirations when multiple replicas of the same object are created and +cache expiration is coincident with replication. That is, when an +old replica is expired from the replicating MDS at the same time that +a new replica is issued by the authority and the resulting messages +cross paths, the authority can tell that it was the old replica that +was expired and effectively ignore the expiration message. The +replica is removed from the replicas map only if the nonce matches. - - Imports and exports redefine the directory inode -> directory - linkage, such that the directory authority is explicitly specified - via dir.dir_auth: - dir.dir_auth == -1 -> directory matches its inode - dir.dir_auth >= 0 -> directory authority is dir.dir_auth +SUBTREE PARTITION - - Hashed directories redefine the directory -> inode linkage. In - non-hashed directories, inodes match their containing directory. - In hashed directories, each dentry's authority is defined by a hash - function. +Authority of the file system namespace is partitioned using a +subtree-based partitioning strategy. This strategy effectively +separates directory inodes from directory contents, such that the +directory contents are the unit of redelegation. That is, if / is +assigned to mds0 and /usr to mds1, the inode for /usr will be managed +by mds0 (it is part of the / directory), while the contents of /usr +(and everything nested beneath it) will be managed by mds1.
- inode.hash_seed == 0 -> inode matches containing directory - inode.hash_seed > 0 -> defined by hash(hash_seed, dentry) +The description for this partition exists solely in the collective +memory of the MDS cluster and in the individual MDS journals. It is +not described in the regular on-disk metadata structures. This is +related to the fact that authority delegation is a property of the +{\it directory} and not the directory's {\it inode}. -A directory's "containing_import" (bad name, FIXME) is either the -import or hashed directory that is responsible for delegating a -subtree. Note that the containing_import of a directory may be itself -because it is an import, but it cannot be itself because it is hashed. +Subsequently, if an MDS is authoritative for a directory inode and does +not yet have any state associated with the directory in its cache, +then it can assume that it is also authoritative for the directory. -Thus: +Directory state consists of a data object that describes any cached +dentries contained in the directory, information about the +relationship between the cached contents and what appears on disk, and +any delegation of authority. That is, each CDir object has a dir_auth +element. Normally dir_auth has a value of AUTH_PARENT, meaning that +the authority for the directory is the same as the directory's inode. +When dir_auth specifies another metadata server, that directory is +point of authority delegation and becomes a {\it subtree root}. A +CDir is a subtree root iff its dir_auth specifies an MDS id (and is not +AUTH_PARENT). - - Import and export operations' manipulation of dir_auth is - completely orthogonal to hashing operations. Hashing methods can - ignore dir_auth, except when they create imports/exports (and break - the inode<->dir auth linkage). + - A dir is a subtree root iff dir_auth != AUTH_PARENT. - - Hashdirs act sort of like imports in that they bound an - authoritative region. 
That is, either hashdirs or imports can be - the key for nested_exports. In some cases, a dir may be both an - import and a hash. + - If dir_auth = AUTH_PARENT then the inode auth == dir auth, but the + converse may not be true. - - Export_dir won't export a hashdir. This is because it's tricky - (tho not necessarily impossible) due to the way nested_exports is - used with imports versus hashdirs. +The authority for any metadata object in the cache can be determined +by following the parent pointers toward the root until a subtree root +CDir object is reached, at which point the authority is specified by +its dir_auth. +Each MDS cache maintains a subtree data structure that describes the +subtree partition for all objects currently in the cache: + map< CDir*, set > subtrees; + - A dir will appear in the subtree map (as a key) IFF it is a subtree + root. -FREEZING +Each subtree root will have an entry in the map. The map value is a +set of all other subtree roots nested beneath that point. Nested +subtree roots effectively bound or prune a subtree. For example, if +we had the following partition: -There are two types of freezing: + mds0 / + mds1 /usr + mds0 /usr/local + mds0 /home - - TREE: recursively freezes everything nested beneath a directory, - until an export of edge of cache is reached. - - DIR: freezes the contents of a single directory. +The subtree map on mds0 would be -Some notes: + / -> (/usr, /home) + /home -> () - - Occurs on the authoritative node only. +and on mds1: - - Used for suspending critical operations while migrating authority - between nodes or hashing/unhashing directories. + /usr -> (/usr/local) - - Freezes the contents of the cache such that items may not be added, - items cannot be auth pinned, and/or subsequently reexported. The - namespace of the affected portions of the hierarchy may not change. - The content of inodes and other orthogonal operations - (e.g. replication, inode locking and modification) are unaffected. 
-Two states are defined: freezing and frozen. The freezing state is -used while waiting for auth_pins to be removed. Once all auth_pins -are gone, the state is changed to frozen. New auth_pins cannot be -added while freezing or frozen. +AMBIGUOUS DIR_AUTH + +While metadata for a subtree is being migrated between two MDS nodes, +the dir_auth for the subtree root is allowed to be ambiguous. That +is, it will specify both the old and new MDS ids, indicating that a +migration is in progress. + +If a replicated metadata object is expired from the cache from a +subtree whose authority is ambiguous, the cache expiration is sent to +both potential authorities. This ensures that the message will be +reliably delivered, even if either of those nodes fails. A number of +alternative strategies were considered. Sending the expiration to the +old or new authority and having it forwarded if authority has been +delegated can result in message loss if the forwarding node fails. +Pinning ambiguous metadata in cache is computationally expensive for +implementation reasons, and while delaying the transmission of expiration +messages is difficult to implement because the replicating node must send +the final expiration messages when the subtree authority is +disambiguated, forcing it to keep certain elements of its cache in +memory. Although duplicated expirations incur a small communications +overhead, the implementation is much simpler. AUTH PINS -An auth pin keeps a given item on the authoritative node until it is -removed. The pins are tracked recursively, so that a subtree cannot -be frozen if it contains any auth pins. - -If a pin is placed on a non-authoritative item, the item is allowed to -become authoritative; the specific restriction is it cannot be frozen, -which only happens during export-type operations. - - -TYPES OF EXPORTS - -- Actual export of a subtree from one node to another -- A rename between directories on different nodes exports the renamed -_inode_.
(If it is a directory, it becomes an export such that the -directory itself does not move.) -- A hash or unhash operation will migrate inodes within the directory -either to or from the directory's main authority. - -EXPORT PROCESS - - +Most operations that modify metadata must allow some amount of time to +pass in order for the operation to be journaled or for communication +to take place between the object's authority and any replicas. For +this reason it must not only be pinned in the authority's metadata +cache, but also be locked such that the object's authority is not +allowed to change until the operation completes. This is accomplished +using {\it auth pins}, which increment a reference counter on the +object in question, as well as all parent metadata objects up to the +root of the subtree. As long as the pin is in place, it is impossible +for that subtree (or any fragment of it that contains one or more +pins) to be migrated to a different MDS node. Pins can be placed on +both inodes and directories. +Auth pins can only exist for authoritative metadata, because they are +only created if the object is authoritative, and their presence +prevents the migration of authority. -HASHING -- All nodes discover and open directory - -- Prep message distributes subdir inode replicas for exports so that - peers can open those dirs. This is necessary because subdirs are - converted into exports or imports as needed to avoid migrating - anything except the hashed dir itself. The prep is needed for the - same reasons its important with exports: the inode authority must - always have the exported dir open so that it gets accurate dir - authority updates, and can keep the inode->dir_auth up to date. - -- MHashDir messsage distributes the directory contents. - -- While auth is frozen_dir, we can't get_or_open_dir. Otherwise the - Prep messages won't be inclusive of all dirs, and the - imports/exports won't get set up properly.
- -TODO -readdir - - -- subtrees stop at hashed dir. hashed dir's dir_auth follows parent - subtree, unless the dir is also an explicit import. thus a hashed - dir can also be an import dir. +FREEZING +More specifically, auth pins prevent a subtree from being frozen. +When a subtree is frozen, all updates to metadata are forbidden. This +includes updates to the replicas map that describes which replicas +(and nonces) exist for each object. + +In order for metadata to be migrated between MDS nodes, it must first +be frozen. The root of the subtree is initially marked as {\it +freezing}. This prevents the creation of any new auth pins within the +subtree. After all existing auth pins are removed, the subtree is +then marked as {\it frozen}, at which point all updates are +forbidden. This allows metadata state to be packaged up in a message +and transmitted to the new authority, without worrying about +intervening updates. + +If the directory at the base of a freezing or frozen subtree is not +also a subtree root (that is, it has dir_auth == AUTH_PARENT), the +directory's parent inode is auth pinned. + + - a frozen tree root dir will auth_pin its inode IFF it is auth AND + not a subtree root. + +This prevents a parent directory from being concurrently frozen, and a +range of resulting implementation complications relating metadata +migration. + + +CACHE EXPIRATION FOR FROZEN SUBTREES + +Cache expiration messages that are received for a subtree that is +frozen are temporarily set aside instead of being processed. Only +when the subtree is unfrozen are the expirations either processed (if +the MDS is authoritative) or discarded (if it is not). Because either +the exporting or importing metadata can fail during the migration +process, the MDS cannot tell whether it will be authoritative or not +until the process completes. 
+ +During a migration, the subtree will first be frozen on both the +exporter and importer, and then all other replicas will be informed of +a subtree's ambiguous authority. This ensures that all expirations +during migration will go to both parties, and nothing will be lost in +the event of a failure. + + + + +NORMAL MIGRATION + +The exporter begins by doing some checks in export_dir() to verify +that it is permissible to export the subtree at this time. In +particular, the cluster must not be degraded, the subtree root may not +be freezing or frozen, and the path must be pinned (\ie not conflicted +with a rename). If these conditions are met, the subtree root +directory is temporarily auth pinned, the subtree freeze is initiated, +and the exporter is committed to the subtree migration, barring an +intervening failure of the importer or itself. + +The MExportDiscover serves simply to ensure that the inode for the +base directory being exported is open on the destination node. It is +pinned by the importer to prevent it from being trimmed. This occurs +before the exporter completes the freeze of the subtree to ensure that +the importer is able to replicate the necessary metadata. When the +exporter receives the MDiscoverAck, it allows the freeze to proceed by +removing its temporary auth pin. + +The MExportPrep message then follows to populate the importer with a +spanning tree that includes all dirs, inodes, and dentries necessary +to reach any nested subtrees within the exported region. This +replicates metadata as well, but it is pushed out by the exporter, +avoiding deadlock with the regular discover and replication process. +The importer is responsible for opening the bounding directories from +any third parties authoritative for those subtrees before +acknowledging. This ensures that the importer has correct dir_auth +information about where authority is redelegated for all points nested +beneath the subtree being migrated.
While processing the MExportPrep, +the importer freezes the entire subtree region to prevent any new +replication or cache expiration. + +A warning stage occurs only if the base subtree directory is open by +nodes other than the importer and exporter. If it is not, then this +implies that no metadata within or nested beneath the subtree is +replicated by any node other than the importer and exporter. If it is, +then an MExportWarning message informs any bystanders that the +authority for the region is temporarily ambiguous, and lists both the +exporter and importer as authoritative MDS nodes. In particular, +bystanders who are trimming items from their cache must send +MCacheExpire messages to both the old and new authorities. This is +necessary to ensure that the surviving authority reliably receives all +expirations even if the importer or exporter fails. While the subtree +is frozen (on both the importer and exporter), expirations will not be +immediately processed; instead, they will be queued until the region +is unfrozen and it can be determined that the node is or is not +authoritative. + +The exporter walks the subtree hierarchy and packages up an MExport +message containing all metadata and important state (\eg, information +about metadata replicas). At the same time, the exporter's metadata +objects are flagged as non-authoritative. The MExport message sends +the actual subtree metadata to the importer. Upon receipt, the +importer inserts the data into its cache, marks all objects as +authoritative, and logs a copy of all metadata in an EImportStart +journal message. Once that has safely flushed, it replies with an +MExportAck. The exporter can now log an EExport journal entry, which +ultimately specifies that the export was a success. In the presence +of failures, it is the existence of the EExport entry only that +disambiguates authority during recovery.
+ +Once logged, the exporter will send an MExportNotify to any +bystanders, informing them that the authority is no longer ambiguous +and cache expirations should be sent only to the new authority (the +importer). Once these are acknowledged back to the exporter, +implicitly flushing the bystander to exporter message streams of any +stray expiration notices, the exporter unfreezes the subtree, cleans +up its migration-related state, and sends a final MExportFinish to the +importer. Upon receipt, the importer logs an EImportFinish(true) +(noting locally that the export was indeed a success), unfreezes its +subtree, processes any queued cache expirations, and cleans up its +state. -bananas -apples -blueberries -green pepper -carrots -celery +PARTIAL FAILURE RECOVERY -dir is a subtree root iff dir_auth.first != parent. -if dir_auth.first = parent then inode auth == dir auth, but the converse may not be true. -that is, you may have inode auth = dir auth, but dir_auth.first = whoami. e.g., - /usr is import. - /usr/bin is export. - /usr frozen for export, with bound /usr/bin. - /usr/bin imports completely. /usr/bin.dir_auth = whoami, not parent, because inode is not authpinnable. -this is safe because the /usr bound is known, and an abort can adjust the bound's dir_auth. +RECOVERY FROM JOURNAL -- if i am auth, any subtree bound will be a subtree root, and an export, frozen, or both. -- if i am auth and unfrozen/freezing, any subtree bound will be an export. and subtree root. -- if i am auth and frozen, any subtree bound will be an export, or subtree root noted in export_bounds/import_bounds. -- if a dir is a subtree root, it is - - auth, import - - auth, export - - nonauth, frozen, importing - - auth, frozen, imported - - auth, parent is auth+frozen for import|export, i am known bound. - - auth, parent is auth+frozen for import - - frozen and exporting - -- a frozen tree root dir will auth_pin it's inode IFF it is auth AND not a subtree root.
--------------------------- -dir is a subtree root iff dir_auth.first != parent. -if subtree root and not root, will appear in subtree_bounds[parent subtree root]. diff --git a/branches/sage/cephmds2/doc/performance.txt b/branches/sage/cephmds2/doc/performance.txt deleted file mode 100644 index 7ca278bd284b1..0000000000000 --- a/branches/sage/cephmds2/doc/performance.txt +++ /dev/null @@ -1,36 +0,0 @@ - - -quick performance test 2005-05-11. fakemds, 100x100, asdf/asdf, debug 13 - -g marshalling -real 3m8.697s -user 2m53.282s -sys 0m6.291s - -real 3m3.337s -user 2m49.467s -sys 0m6.243s - - -g no marshalling -real 2m1.464s -user 1m42.680s -sys 0m8.128s - -real 1m49.469s -user 1m34.523s -sys 0m6.410s - - -O3 marshalling -real 1m29.833s -user 1m11.474s -sys 0m7.588s - -real 1m9.439s -user 0m56.071s -sys 0m5.643s - - - -O3 no marshalling -real 1m2.739s -user 0m46.578s -sys 0m7.882s - diff --git a/branches/sage/cephmds2/include/frag.h b/branches/sage/cephmds2/include/frag.h new file mode 100644 index 0000000000000..b92de3ca8c1ea --- /dev/null +++ b/branches/sage/cephmds2/include/frag.h @@ -0,0 +1,159 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __FRAG_H +#define __FRAG_H + +#include +#include +#include "buffer.h" + +/* + * + * the goal here is to use a binary split strategy to partition a namespace. + * frag_t represents a particular fragment. bits() tells you the size of the + * fragment, and value() it's name. this is roughly analogous to an ip address + * and netmask. + * + * fragtree_t represents an entire namespace and it's partition. 
it essentially + * tells you where fragments are split into other fragments, and by how much + * (i.e. by how many bits, resulting in a power of 2 number of child fragments). + * + * this vaguely resembles a btree, in that when a fragment becomes large or small + * we can split or merge, except that there is no guarantee of being balanced. + * presumably we are partitioning the output of a (perhaps specialized) hash + * function. + * + */ + +/** + * frag_t + * + * description of an individual fragment. that is, a particular piece + * of the overall namespace. + * + * this is conceptually analogous to an ip address and netmask. + * + * we write it as v/b, where v is a value and b is the number of bits. + * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, + * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. + */ +class frag_t { + /* encoded value. + * 8 upper bits = "bits" + * 24 lower bits = "value" + */ + __uint32_t _enc; + + public: + frag_t() : _enc(0) { } + frag_t(unsigned v, unsigned b) : _enc((b << 24) + v) { } + + // accessors + unsigned value() const { return _enc & 0xffffff; } + unsigned bits() const { return _enc >> 24; } + unsigned mask() const { return 0xffffffff >> (32-bits()); } + operator unsigned() const { return _enc; } + + // tests + bool contains(frag_t sub) const { + return (sub.bits() >= bits() && // they are more specific than us, + (sub.value() & mask()) == value()); // and they are contained by us. + } + bool root() const { + return bits() == 0; + } + frag_t parent() const { + assert(bits() > 0); + return frag_t(value() & (mask() >> 1), bits()-1); + } + + // splitting + frag_t left_half() const { + return frag_t(value(), bits()+1); + } + frag_t right_half() const { + return frag_t(value() | (1<& frag_tments) const { + assert(nb > 0); + unsigned nway = 1 << (nb-1); + for (unsigned i=0; i: + // frag_t f is split by b bits. + // if child frag_t does not appear, it is not split. 
+ std::map _splits; + + public: + // accessors + int get_split(frag_t hb) { + if (_splits.count(hb)) + return _splits[hb]; + else + return 0; + } + + // modifiers + void split(frag_t hb, int b) { + assert(_splits.count(hb) == 0); + _splits[hb] = b; + } + void merge(frag_t hb, int b) { + assert(_splits[hb] == b); + _splits.erase(hb); + } + + // verify that we describe a legal partition of the namespace. + void verify() { + std::map copy; + std::list q; + q.push_back(frag_t()); + + while (1) { + frag_t cur = q.front(); + q.pop_front(); + int b = get_split(cur); + if (!b) continue; + copy[cur] = b; + cur.split(b, q); + } + + assert(copy == _splits); + } + + // encoding + void _encode(bufferlist& bl) { + ::_encode(_splits, bl); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(_splits, bl, off); + } +}; + +#endif diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 9ee894224ebe6..771a73532f96b 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -993,33 +993,35 @@ void MDCache::handle_import_map(MMDSImportMap *m) } } - // note ambiguous imports too.. unless i'm already active - if (!mds->is_active() && !mds->is_stopping()) { + show_subtrees(); + + + // recovering? + if (!mds->is_rejoin() && !mds->is_active() && !mds->is_stopping()) { + // note ambiguous imports too.. unless i'm already active for (map >::iterator pi = m->ambiguous_imap.begin(); pi != m->ambiguous_imap.end(); ++pi) { dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << endl; other_ambiguous_imports[from][pi->first].swap( pi->second ); } - } - show_subtrees(); - - // did i get them all? - got_import_map.insert(from); - - if (got_import_map == recovery_set) { - dout(10) << "got all import maps, ready to rejoin" << endl; - disambiguate_imports(); - recalc_auth_bits(); - trim_non_auth(); + // did i get them all? 
+ got_import_map.insert(from); - // move to rejoin state - mds->set_want_state(MDSMap::STATE_REJOIN); - - } else { - dout(10) << "still waiting for more importmaps, got " << got_import_map - << ", need " << recovery_set << endl; + if (got_import_map == recovery_set) { + dout(10) << "got all import maps, ready to rejoin" << endl; + disambiguate_imports(); + recalc_auth_bits(); + trim_non_auth(); + + // move to rejoin state + mds->set_want_state(MDSMap::STATE_REJOIN); + + } else { + dout(10) << "still waiting for more importmaps, got " << got_import_map + << ", need " << recovery_set << endl; + } } delete m; diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index 227e52f644338..71f2148e32afe 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -93,6 +93,9 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { server = new Server(this); locker = new Locker(this, mdcache); + + // clients + last_client_mdsmap_bcast = 0; // beacon beacon_last_seq = 0; @@ -106,7 +109,6 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { want_state = state = MDSMap::STATE_DNE; - logger = logger2 = 0; // i'm ready! @@ -440,6 +442,10 @@ void MDS::handle_mds_map(MMDSMap *m) mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); set oldactive; mdsmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); + set oldcreating; + mdsmap->get_mds_set(oldcreating, MDSMap::STATE_CREATING); + set oldout; + mdsmap->get_mds_set(oldout, MDSMap::STATE_OUT); // decode and process mdsmap->decode(m->get_encoded()); @@ -571,21 +577,17 @@ void MDS::handle_mds_map(MMDSMap *m) } } + // we need to make sure clients find out about (new) mds addresses. + // clients don't care about mds state. + bool share_with_clients = false; + // REJOIN // is everybody finally rejoining? if (is_rejoin() || is_active() || is_stopping()) { // did we start? 
if (!wasrejoining && mdsmap->is_rejoining()) { mdcache->send_cache_rejoins(); - - // share the map with mounted clients - dout(10) << "sharing mdsmap with mounted clients" << endl; - for (set::const_iterator p = clientmap.get_mount_set().begin(); - p != clientmap.get_mount_set().end(); - ++p) { - messenger->send_message(new MMDSMap(mdsmap), - clientmap.get_inst(*p)); - } + share_with_clients = true; } // did we finish? if (wasrejoining && !mdsmap->is_rejoining()) { @@ -616,6 +618,23 @@ void MDS::handle_mds_map(MMDSMap *m) } } + // inst set changed? + if (mdsmap->get_same_inst_since() > last_client_mdsmap_bcast) + share_with_clients = true; + + // share map with clients? + if (share_with_clients) { + // share the map with mounted clients + dout(10) << "sharing mdsmap with mounted clients" << endl; + for (set::const_iterator p = clientmap.get_mount_set().begin(); + p != clientmap.get_mount_set().end(); + ++p) { + messenger->send_message(new MMDSMap(mdsmap), + clientmap.get_inst(*p)); + } + last_client_mdsmap_bcast = mdsmap->get_epoch(); + } + delete m; } diff --git a/branches/sage/cephmds2/mds/MDS.h b/branches/sage/cephmds2/mds/MDS.h index 98d1785fd3cbe..fa2aa536cc7f3 100644 --- a/branches/sage/cephmds2/mds/MDS.h +++ b/branches/sage/cephmds2/mds/MDS.h @@ -184,6 +184,7 @@ class MDS : public Dispatcher { // -- client map -- ClientMap clientmap; + epoch_t last_client_mdsmap_bcast; void log_clientmap(Context *c); diff --git a/branches/sage/cephmds2/mds/MDSMap.h b/branches/sage/cephmds2/mds/MDSMap.h index e146b3029fdf3..f19ee448acfc2 100644 --- a/branches/sage/cephmds2/mds/MDSMap.h +++ b/branches/sage/cephmds2/mds/MDSMap.h @@ -65,7 +65,8 @@ class MDSMap { protected: epoch_t epoch; - utime_t ctime; + utime_t created; + epoch_t same_inst_since; int anchortable; // which MDS has anchortable (fixme someday) int root; // which MDS has root directory @@ -79,12 +80,13 @@ class MDSMap { friend class MDSMonitor; public: - MDSMap() : epoch(0), anchortable(0), root(0) {} + MDSMap() 
: epoch(0), same_inst_since(0), anchortable(0), root(0) {} epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } - const utime_t& get_ctime() const { return ctime; } + const utime_t& get_create() const { return created; } + epoch_t get_same_inst_since() const { return same_inst_since; } int get_anchortable() const { return anchortable; } int get_root() const { return root; } @@ -249,7 +251,8 @@ class MDSMap { // serialize, unserialize void encode(bufferlist& blist) { blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&ctime, sizeof(ctime)); + blist.append((char*)&created, sizeof(created)); + blist.append((char*)&same_inst_since, sizeof(same_inst_since)); blist.append((char*)&anchortable, sizeof(anchortable)); blist.append((char*)&root, sizeof(root)); @@ -263,8 +266,10 @@ class MDSMap { int off = 0; blist.copy(off, sizeof(epoch), (char*)&epoch); off += sizeof(epoch); - blist.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); + blist.copy(off, sizeof(created), (char*)&created); + off += sizeof(created); + blist.copy(off, sizeof(same_inst_since), (char*)&same_inst_since); + off += sizeof(same_inst_since); blist.copy(off, sizeof(anchortable), (char*)&anchortable); off += sizeof(anchortable); blist.copy(off, sizeof(root), (char*)&root); diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h index 750d928a1d68d..c2457d6077154 100644 --- a/branches/sage/cephmds2/mds/mdstypes.h +++ b/branches/sage/cephmds2/mds/mdstypes.h @@ -13,6 +13,7 @@ using namespace std; #include +#include "include/frag.h" // md ops @@ -169,76 +170,8 @@ inline mds_load_t operator/( mds_load_t& a, double d ) // dir slices struct dirslice_t { - short hash_mask; - short hash_val; -}; - -/* - * hashbit - */ -class hashbit { - unsigned _value:24; - unsigned _bits:8; - - public: - hashbit() : - _value(0), _bits(0) { } - hashbit(unsigned v, unsigned b) : - _value(v), _bits(b) { } - - // accessors - unsigned value() const { 
return _value; } - unsigned bits() const { return _bits; } - unsigned mask() const { return 0xffffffff >> (32-_bits); } - operator unsigned() const { - return (_bits << 24) + _value; - } - - // tests - bool contains(hashbit sub) { - return (sub.bits() >= bits() && // they are more specific than us, - (sub.value() & mask()) == value()); // and they are contained by us. - } - - // splitting - hashbit left_half() { - return hashbit(_value, _bits+1); - } - hashbit right_half() { - return hashbit(_value | (1<<_bits), _bits+1); - } - void split(int nb, vector& fragments) { - assert(nb > 0); - unsigned nway = 1 << (nb-1); - fragments.clear(); - fragments.reserve(nway); - for (unsigned i=0; i: - // hashbit hb is split n ways. - // if child hashbit does not appear, it is not split. - map _splits; - - public: - void split(hashbit hb, int n) { - assert(_splits.count(hb) == 0); - _splits[hb] = n; - } - void merge(hashbit hb, int n) { - assert(_splits[hb] == n); - _splits.erase(hb); - } - + inodeno_t ino; + frag_t fg; }; diff --git a/branches/sage/cephmds2/mon/MDSMonitor.cc b/branches/sage/cephmds2/mon/MDSMonitor.cc index 73387f07a8df6..b12dc493d84f2 100644 --- a/branches/sage/cephmds2/mon/MDSMonitor.cc +++ b/branches/sage/cephmds2/mon/MDSMonitor.cc @@ -73,7 +73,7 @@ void MDSMonitor::election_finished() void MDSMonitor::create_initial() { mdsmap.epoch = 0; // until everyone boots - mdsmap.ctime = g_clock.now(); + mdsmap.created = g_clock.now(); mdsmap.encode(encoded_map); @@ -229,6 +229,9 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from); mdsmap.mds_inc[from]++; + // someone (new) joined the cluster + mdsmap.same_inst_since = mdsmap.epoch+1; + // starting -> creating|starting|replay if (mdsmap.is_degraded() && !mdsmap.is_failed(from)) { @@ -276,6 +279,11 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) << " -> " << 
MDSMap::get_state_name(state) << endl; + // did someone leave the cluster? + if (state == MDSMap::STATE_OUT && mdsmap.mds_state[from] != MDSMap::STATE_OUT) + mdsmap.same_inst_since = mdsmap.epoch+1; + + // change the state mdsmap.mds_state[from] = state; if (mdsmap.is_up(from)) mdsmap.mds_state_seq[from] = seq; -- 2.39.5